
batch_processing

StateParser

Bases: object

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| hdx_spec | dict | Dictionary with HDX-MS state specification. | required |
| data_src | Union[PathLike[str], str, dict[str, DataFile], None] | Optional data source with input data files. If not specified, the current directory is used. Otherwise, either a data source path can be given, or data can be given as a dictionary, where keys are data file names and values are DataFile objects with the file contents. | required |
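
The expected layout of `hdx_spec` follows from the parser source shown below. The sketch here is illustrative only: the keys (`data_files`, `states`, `peptides`, `metadata`, and the per-peptide filter fields) are taken from the code, while the file names, state names, and values are placeholders.

```python
# Illustrative hdx_spec sketch; keys mirror those read by StateParser below,
# all values are placeholders.
hdx_spec = {
    "data_files": {
        "data_1": {
            "filename": "peptides.csv",  # additional keys here are forwarded to DataFile
        },
    },
    "metadata": {"d_percentage": 90.0},  # optional global metadata
    "states": {
        "SecB WT apo": {
            "peptides": {
                "experiment": {
                    "data_file": "data_1",
                    "state": "SecB WT apo",
                    "exposure": {"values": [0.167, 10.0], "unit": "min"},
                },
                "FD_control": {  # optional; an 'ND_control' entry is analogous
                    "data_file": "data_1",
                    "state": "Full deuteration control",
                    "exposure": {"value": 0.167, "unit": "min"},
                },
            },
            "metadata": {"d_percentage": 90.0},  # per-state metadata
        },
    },
}
```
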
Source code in pyhdx/batch_processing.py
class StateParser(object):
    """

    Args:
        hdx_spec: Dictionary with HDX-MS state specification.
        data_src: Optional data source with input data files. If not specified, current
            directory is used. Otherwise, either a data source path can be specified or
            data can be given as a dictionary, where keys are filenames and values are
            :class:`~io.StringIO` with file contents.
    """

    def __init__(
        self,
        hdx_spec: dict,
        data_src: Union[os.PathLike[str], str, dict[str, DataFile], None],
        # filter_kwargs: Optional[dict[str, Any]] = None,
        # correction_kwargs: Optional[dict[str, Any]] = None,
    ) -> None:
        warnings.warn(
            "Will be removed in favour of the `hdxms-datasets` package ", DeprecationWarning
        )
        self.hdx_spec = hdx_spec
        self.data_files: dict[str, DataFile] = {}

        if data_src is None:
            data_src = Path(".")  # default to the current working directory

        if isinstance(data_src, (os.PathLike, str)):
            data_src = Path(data_src)
            for name, spec in self.hdx_spec["data_files"].items():
                datafile = DataFile(
                    name=name,
                    filepath_or_buffer=data_src / spec["filename"],
                    **{k: v for k, v in spec.items() if k != "filename"},
                )
                self.data_files[name] = datafile

        elif isinstance(data_src, dict):
            self.data_files = data_src
        else:
            raise TypeError(f"Invalid data type {type(data_src)!r}, must be path or dict")

    def load_hdxmset(self) -> HDXMeasurementSet:
        hdxm_list = [self.load_hdxm(state) for state in self.hdx_spec["states"].keys()]
        return HDXMeasurementSet(hdxm_list)

    def load_peptides(self, state: Union[str, int], peptides: str) -> pd.DataFrame:
        state = self.states[state] if isinstance(state, int) else state
        peptide_spec = self.hdx_spec["states"][state]["peptides"][peptides]

        df = self.data_files[peptide_spec["data_file"]].data

        # filter_fields = {"state", "exposure", "query", "dropna"}
        # peptide_df = filter_peptides(
        #     df, **{k: v for k, v in peptide_spec.items() if k in filter_fields}
        # )

        filter_fields = {"state", "exposure", "query", "dropna"}
        peptide_df = batch_filter_peptides(
            df, **{k: v for k, v in peptide_spec.items() if k in filter_fields}
        )

        return peptide_df

    # -> function as monkey patch dataset parser; OR perhaps add them to internal dict of loaders ?
    def load_hdxm(self, state: Union[str, int]) -> HDXMeasurement:
        state = self.states[state] if isinstance(state, int) else state
        peptide_spec = self.hdx_spec["states"][state]["peptides"]
        metadata = self.hdx_spec["states"][state]["metadata"]

        peptides = self.load_peptides(state, "experiment")
        fd_peptides = (
            self.load_peptides(state, "FD_control") if "FD_control" in peptide_spec else None
        )
        nd_peptides = (
            self.load_peptides(state, "ND_control") if "ND_control" in peptide_spec else None
        )

        if fd_peptides is None and "be_percent" in metadata:
            peptides = correct_d_uptake(peptides, d_percentage=metadata.get("d_percentage", 100.0))
            back_exchange = metadata["be_percent"] / 100.0
            peptides["rfu"] = peptides["uptake"] / ((1 - back_exchange) * peptides["ex_residues"])
            peptides["uptake_corrected"] = peptides["uptake"] / (1 - back_exchange)
        elif isinstance(fd_peptides, pd.DataFrame):
            peptides = apply_control(peptides, fd_peptides, nd_peptides)
            peptides = correct_d_uptake(
                peptides,
                drop_first=cfg.analysis.drop_first,
                d_percentage=metadata.get("d_percentage", 100.0),
            )

        # merge global and per-state metadata without mutating the spec dictionary
        global_metadata = {**self.hdx_spec.get("metadata", {}), **metadata}
        hdxm = HDXMeasurement(peptides, name=state, **global_metadata)

        return hdxm

    @property
    def correction_kwargs(self):
        kwargs = {
            "drop_first": cfg.analysis.drop_first,
            "d_percentage": self.hdx_spec["metadata"].get("d_percentage", 100.0),
        }

        # todo:
        # if 'corrections' in self.hdx_spec:
        # ...

        return kwargs

    @property
    def states(self) -> list[str]:
        return list(self.hdx_spec["states"].keys())
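
A minimal usage sketch, assuming a specification like the one above and the referenced CSV files in a local `data/` directory (the YAML file name is hypothetical; any dict with the same layout works):

```python
from pathlib import Path

import yaml  # assumes the spec is authored as YAML; any dict with this layout works

from pyhdx.batch_processing import StateParser

hdx_spec = yaml.safe_load(Path("hdx_spec.yaml").read_text())

parser = StateParser(hdx_spec, data_src="data/")
hdxm = parser.load_hdxm("SecB WT apo")  # one state as an HDXMeasurement
hdxm_set = parser.load_hdxmset()        # all states as an HDXMeasurementSet
```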

batch_convert_time(time_dict, target_unit='s')

Convenience function to convert time values.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| time_dict | dict | Dictionary with time value(s) and unit. | required |
| target_unit | Literal['s', 'min', 'h'] | Target unit for time. | 's' |

Returns:

| Type | Description |
|------|-------------|
| Union[float, list[float]] | Converted time value(s). |

Source code in pyhdx/batch_processing.py
def batch_convert_time(
    time_dict: dict, target_unit: Literal["s", "min", "h"] = "s"
) -> Union[float, list[float]]:
    """
    Convenience function to convert time values.

    Args:
        time_dict: Dictionary with time value(s) and unit.
        target_unit: Target unit for time.

    Returns:
        Converted time value(s).
    """

    warnings.warn("Will be removed in favour of the `hdxms-datasets` package ", DeprecationWarning)
    src_unit = time_dict["unit"]

    time_factor = time_factors[src_unit] / time_factors[target_unit]
    if (values := time_dict.get("values")) is not None:
        return [v * time_factor for v in values]
    elif (value := time_dict.get("value")) is not None:
        return value * time_factor
    else:
        raise ValueError("Invalid time dictionary")
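
For example, converting spec-style time dictionaries (a sketch; the shown return values follow directly from the unit conversion in the code above):

```python
from pyhdx.batch_processing import batch_convert_time

# a list under "values" -> list of converted values
batch_convert_time({"values": [0.5, 10.0], "unit": "min"}, target_unit="s")
# [30.0, 600.0]

# a single value under "value" -> single converted value
batch_convert_time({"value": 2.0, "unit": "h"}, target_unit="min")
# 120.0
```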

batch_filter_peptides(df, state=None, exposure=None, query=None, dropna=True)

Convenience function to filter a peptides DataFrame.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| df | DataFrame | Input dataframe. | required |
| state | Optional[str] | Name of protein state to select. | None |
| exposure | Optional[dict] | Exposure value(s) to select. Exposure is given as a dict with keys "value" or "values" for the exposure value(s), and "unit" for the time unit. | None |
| query | Optional[list[str]] | Additional queries to pass to pandas.DataFrame.query. | None |
| dropna | bool | Drop rows with NaN uptake entries. | True |

Examples:

Filter peptides for a specific protein state and exposure time:

>>> d = {"state": "SecB WT apo", "exposure": {"value": 0.167, "unit": "min"}}
>>> filtered_df = batch_filter_peptides(df, **d)

Returns:

| Type | Description |
|------|-------------|
| DataFrame | Filtered dataframe. |

Source code in pyhdx/batch_processing.py
def batch_filter_peptides(
    df: pd.DataFrame,
    state: Optional[str] = None,
    exposure: Optional[dict] = None,
    query: Optional[list[str]] = None,
    dropna: bool = True,
) -> pd.DataFrame:
    """
    Convenience function to filter a peptides DataFrame.

    Args:
        df: Input dataframe.
        state: Name of protein state to select.
        exposure: Exposure value(s) to select. Exposure is given as a :obj:`dict`, with keys "value" or "values" for
            exposure value, and "unit" for the time unit.
        query: Additional queries to pass to [pandas.DataFrame.query][].
        dropna: Drop rows with `NaN` uptake entries.

    Examples:
        Filter peptides for a specific protein state and exposure time:

        >>> d = {"state": "SecB WT apo", "exposure": {"value": 0.167, "unit": "min"}}
        >>> filtered_df = batch_filter_peptides(df, **d)

    Returns:
        Filtered dataframe.
    """
    warnings.warn("Will be removed in favour of the `hdxms-datasets` package ", DeprecationWarning)

    if state is not None:
        df = df[df["state"] == state]

    if exposure is not None:
        t_val = batch_convert_time(exposure, target_unit="s")
        if isinstance(t_val, list):
            df = df[df["exposure"].isin(t_val)]
        else:
            df = df[df["exposure"] == t_val]

    if query:
        for q in query:
            df = df.query(q)

    if dropna:
        df = df.dropna(subset=["uptake"])

    return df
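
A short usage sketch (the input file name and the extra query string are placeholders; the `state`, `exposure`, and `uptake` columns are the ones the function filters on):

```python
import pandas as pd

from pyhdx.batch_processing import batch_filter_peptides

df = pd.read_csv("peptides.csv")  # placeholder peptide table

filtered = batch_filter_peptides(
    df,
    state="SecB WT apo",
    exposure={"value": 0.167, "unit": "min"},  # converted to seconds internally
    query=["uptake > 0"],                      # optional extra pandas query strings
    dropna=True,                               # drop rows with NaN uptake
)
```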