Skip to content

process

correct_d_uptake(peptides, drop_first=1, d_percentage=100, deepcopy=False)

Corrects for back exchange, percentage deuterium in solution and prolines. Adds the number of effective exchanging residues as well as corrected deuterium uptake (requires the field 'rfu')

Modifies the 'sequence' field, where N-terminal non-exchanging residues are marked with 'x' and prolines with lower case 'p'. Adds the fields '_start' and '_stop', which are the start and stop residue numbers for each peptide minus non-exchanging residues.

Parameters:

Name Type Description Default
peptides DataFrame

DataFrame with peptides

required
drop_first int

Number of n-terminal residues to consider as fully back-exchanging.

1
d_percentage float

Percentage deuterium in the exchange buffer.

100
deepcopy bool

Set to True to make a deep copy of the input DataFrame, otherwise a shallow copy is made.

False

Returns:

Source code in pyhdx/process.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def correct_d_uptake(
    peptides: pd.DataFrame,
    drop_first: int = 1,
    d_percentage: float = 100,
    deepcopy: bool = False,
):
    """
    Apply back-exchange, deuterium-percentage and proline corrections to a peptide table.

    The function works on a copy of the input. On that copy it:

    * stores the untouched sequences in a new '_sequence' column,
    * rewrites 'sequence' so that every proline is a lower case 'p' and the leading non-exchanging residues
      (the first `drop_first` residues plus any prolines directly following them) are replaced by 'x',
    * adds '_start' ('start' shifted by the number of leading non-exchanging residues) and '_stop' ('stop'
      reduced by the number of trailing prolines),
    * adds 'ex_residues', the count of residues that are neither 'x' nor 'p', scaled by `d_percentage` / 100,
    * adds 'uptake_corrected' ('rfu' multiplied by 'ex_residues') when an 'rfu' column is present.

    Args:
        peptides: DataFrame with peptides. Must have the columns 'sequence', 'start' and 'stop'.
        drop_first: Number of N-terminal residues to consider as fully back-exchanging.
        d_percentage: Percentage deuterium in the exchange buffer.
        deepcopy: Set to `True` to make a deep copy of the input DataFrame, otherwise a shallow copy is made.

    Returns:
        The copied DataFrame with the additional and modified columns described above.

    Raises:
        ValueError: If `d_percentage` is not within the range 0-100.
    """

    def _leading_blocked(seq: str) -> int:
        # Residues removed at the N-terminus: `drop_first` plus the prolines right behind them.
        return len(seq) - len(seq[drop_first:].lstrip("p"))

    def _trailing_prolines(seq: str) -> int:
        # Prolines at the very C-terminal end of the peptide.
        return len(seq) - len(seq.rstrip("p"))

    table = peptides.copy(deep=deepcopy)

    in_range = 0.0 <= d_percentage <= 100.0
    if not in_range:
        raise ValueError(f"Deuterium percentage must be 0-100, got {d_percentage}")

    # Keep the original sequences around before they are rewritten.
    table["_sequence"] = table["sequence"].copy()
    lowered = [seq.replace("P", "p") for seq in table["sequence"]]

    # Find the total number of n_terminal / c_terminal residues to remove
    n_blocked = np.array([_leading_blocked(seq) for seq in lowered])
    c_blocked = np.array([_trailing_prolines(seq) for seq in lowered])

    masked = ["x" * count + seq[count:] for count, seq in zip(n_blocked, lowered)]
    table["sequence"] = masked
    table["_start"] = table["start"] + n_blocked
    table["_stop"] = table["stop"] - c_blocked

    # Everything that is neither masked ('x') nor a proline ('p') can exchange.
    exchanging = np.array([len(seq) - seq.count("x") - seq.count("p") for seq in masked])
    ex_residues = exchanging * d_percentage / 100.0
    table["ex_residues"] = ex_residues

    if "rfu" in table:
        table["uptake_corrected"] = table["rfu"] * ex_residues

    return table

filter_peptides(df, state=None, exposure=None, query=None, dropna=True)

Convenience function to filter a peptides DataFrame.

Parameters:

Name Type Description Default
df DataFrame

Input :class:pandas.DataFrame

required
state Optional[str]

Name of protein state to select.

None
exposure Union[dict, float, None]

Exposure value(s) to select. Exposure is given as a :obj:dict, with keys "value" or "values" for exposure value, and "unit" for the time unit.

None
query Optional[list[str]]

Additional queries to pass to :meth:pandas.DataFrame.query.

None
dropna bool

Drop rows with NaN uptake entries.

True
Example

::

d = {"state": "SecB WT apo", "exposure": {"value": 0.167, "unit": "min"}}
filtered_df = filter_peptides(df, **d)

Returns:

Source code in pyhdx/process.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
def filter_peptides(
    df: pd.DataFrame,
    state: Optional[str] = None,
    exposure: Union[dict, float, None] = None,
    query: Optional[list[str]] = None,
    dropna: bool = True,
) -> pd.DataFrame:
    """
    Convenience function to filter a peptides DataFrame.

    Emits a :class:`DeprecationWarning` on every call.

    Args:
        df: Input :class:`pandas.DataFrame`
        state: Name of protein state to select. Rows are kept when their 'state' column equals this value.
        exposure: Exposure value(s) to select. Exposure is given as a :obj:`dict`, with keys "value" or "values" for
            exposure value, and "unit" for the time unit (defaults to "s"); the value(s) are passed through
            ``convert_time(..., unit, "s")`` before being compared to the 'exposure' column. "values" takes
            precedence over "value". A bare number is compared to the 'exposure' column as-is, without conversion.
        query: Additional queries to pass to :meth:`pandas.DataFrame.query`, applied one after the other.
        dropna: Drop rows with NaN uptake entries (column 'uptake').

    Example:
        ::

            d = {"state": "SecB WT apo", "exposure": {"value": 0.167, "unit": "min"}}
            filtered_df = filter_peptides(df, **d)

    Returns:
        The filtered DataFrame.
    """

    # stacklevel=2 attributes the warning to the calling code rather than to this module.
    warnings.warn(
        "`filter_peptides` will be moved to the `hdxms-datasets` package",
        DeprecationWarning,
        stacklevel=2,
    )
    if state:
        df = df[df["state"] == state]

    if isinstance(exposure, dict):
        unit = exposure.get("unit", "s")
        if values := exposure.get("values"):
            values = convert_time(values, unit, "s")
            df = df[df["exposure"].isin(values)]
        # Compare against None explicitly: an exposure value of 0 is falsy but is a
        # legitimate selection (the zero timepoint) and must not be skipped.
        elif (value := exposure.get("value")) is not None:
            value = convert_time(value, unit, "s")
            df = df[df["exposure"] == value]
    # Accept integers as well as floats (e.g. `exposure=10`); bool is excluded because
    # it is an int subclass but never a meaningful exposure time.
    elif isinstance(exposure, (int, float)) and not isinstance(exposure, bool):
        df = df[df["exposure"] == exposure]

    if query:
        for q in query:
            df = df.query(q)

    if dropna:
        df = df.dropna(subset=["uptake"])

    return df

sort_columns(df, column_order=None)

Sorts columns in DataFrame by a given column order. Columns not in the supplied order are appended to the end of the dataframe.

Parameters:

Name Type Description Default
df DataFrame

DataFrame to sort.

required
column_order Optional[list[str]]

Order of columns to use. If None, a default order is used.

None

Returns:

Type Description
DataFrame

The column-sorted DataFrame.

Source code in pyhdx/process.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def sort_columns(df: pd.DataFrame, column_order: Optional[list[str]] = None) -> pd.DataFrame:
    """
    Rearrange the columns of a DataFrame according to a preferred order.

    Columns named in the preferred order come first, in that order; names in the preferred order which are
    absent from the DataFrame are skipped. All remaining columns follow, keeping their original relative order.

    Args:
        df: DataFrame to sort.
        column_order: Order of columns to use. If `None` (or empty), the module-level `COLUMN_ORDER` is used.

    Returns:
        The column-sorted DataFrame.
    """

    # https://stackoverflow.com/questions/41968732/set-order-of-columns-in-pandas-dataframe
    preferred = column_order if column_order else COLUMN_ORDER

    # Preferred names that actually exist in the frame, in preferred order.
    leading = []
    for name in preferred:
        if name in df.columns:
            leading.append(name)

    # Whatever is left keeps the order it already had in the frame.
    trailing = []
    for name in df.columns:
        if name not in leading:
            trailing.append(name)

    return df[leading + trailing]

verify_sequence(df, sequence=None, n_term=None, c_term=None)

Verify if sequence information in the dataframe is compatible with an externally supplied sequence and/or the residue numbers of the N terminal and C terminal residues.

Parameters:

Name Type Description Default
df DataFrame

Peptide dataframe. Must have columns 'start', 'stop' and 'sequence'

required
sequence Optional[str]

Sequence to check as FASTA string.

None
n_term Optional[int]

Optional residue number of N terminal residue. Can be negative to include purification tags.

None
c_term Optional[int]

Optional residue number of C terminal residue.

None

Returns:

Type Description
tuple[Series, Series]

Tuple of pandas series with full and reconstructed (+ lower case prolines)

Source code in pyhdx/process.py
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
def verify_sequence(
    df: pd.DataFrame,
    sequence: Optional[str] = None,
    n_term: Optional[int] = None,
    c_term: Optional[int] = None,
) -> tuple[pd.Series, pd.Series]:
    """
    Verify if sequence information in the dataframe is compatible with an externally supplied sequence and/or the residue
    numbers of the N terminal and C terminal residues.

    Args:
        df: Peptide dataframe. Must have columns 'start', 'stop' and 'sequence', where 'stop' is exclusive (a peptide
            covers residues 'start' up to and including 'stop' - 1). If a '_sequence' column is present it is used
            as the unmodified peptide sequence; otherwise 'sequence' is used for both returned series.
        sequence: Sequence to check as FASTA string.
        n_term: Optional residue number of N terminal residue. Can be negative to include purification tags.
            Defaults to 1.
        c_term: Optional residue number of C terminal residue. If omitted it is derived from the length of
            `sequence` and `n_term`.

    Returns:
        Tuple of pandas series with full and reconstructed (+ lower case prolines) sequence, both indexed by residue
        number. Residues not covered by any peptide are 'X'. When `sequence` is supplied, the full series holds the
        supplied sequence.

    Raises:
        ValueError: If neither `sequence` nor `c_term` is given, if peptides fall outside the
            `n_term` - `c_term` range, if the length of `sequence` does not match that range, or if `sequence`
            disagrees with the peptide sequences.
    """

    # TODO return single pd series
    # Returns:
    #     Pandas Series with (reconstructed) sequence with residue numbers as index.

    n_term = n_term if n_term is not None else 1

    if sequence is None and c_term is None:
        raise ValueError("Must provide either 'c_term' or 'sequence'")
    elif c_term is None:
        c_term = len(sequence) + n_term - 1

    r_number = pd.RangeIndex(n_term, c_term + 1, name="r_number")

    if df["start"].min() < n_term:
        raise ValueError(
            f"Peptide dataframe contains peptides with start residue number below supplied 'n_term' ({n_term})"
        )
    # 'stop' is exclusive, so the last residue covered by a peptide is 'stop' - 1. Using 'stop' here (rather
    # than a separate 'end' column) keeps the check consistent with the pasting loop below.
    if df["stop"].max() - 1 > c_term:
        raise ValueError(
            f"Peptide dataframe contains peptides with end residue number above supplied 'c_term' ({c_term})"
        )

    seq_full = pd.Series("X", index=r_number, dtype=object)
    seq_reconstruct = pd.Series("X", index=r_number, dtype=object)

    full_column = "_sequence" if "_sequence" in df.columns else "sequence"

    # Walk the rows in reverse order and paste sequence information at the correct positions, so that where
    # peptides overlap the rows appearing first in the dataframe have the final say. Iterating positionally
    # (instead of via index labels) also works when the dataframe index is not unique.
    rows = list(zip(df["start"], df["stop"], df[full_column], df["sequence"]))
    for start, stop, full, reconstructed in reversed(rows):
        # .loc slicing is label based and inclusive on both ends
        seq_full.loc[start : stop - 1] = list(full)
        seq_reconstruct.loc[start : stop - 1] = list(reconstructed)

    if sequence:
        # Check the length first: zip() below silently truncates to the shortest input.
        if len(sequence) != len(seq_full):
            raise ValueError(
                "Invalid length of supplied sequence. Please check 'n_term' and 'c_term' parameters"
            )
        for r, s1, s2 in zip(r_number, sequence, seq_full):
            if s2 != "X" and s1 != s2:
                raise ValueError(
                    f"Mismatch in supplied sequence and peptides sequence at residue {r}, expected '{s2}', got '{s1}'"
                )
        seq_full = pd.Series(index=r_number, data=list(sequence))

    return seq_full, seq_reconstruct