
support

array_intersection(arrays, fields)

Find and return the intersecting entries in multiple arrays.

Parameters:

Name    Type               Description                                                Default
arrays  Iterable[ndarray]  Iterable of input structured arrays                        required
fields  Iterable[str]      Iterable of fields used to decide if entries intersect     required

Returns:

Name      Type           Description
selected  list[ndarray]  Output list of arrays with only intersecting entries.

Source code in pyhdx/support.py
def array_intersection(arrays: Iterable[np.ndarray], fields: Iterable[str]) -> list[np.ndarray]:
    """
    Find and return the intersecting entries in multiple arrays.

    Args:
        arrays: Iterable of input structured arrays
        fields: Iterable of fields to use to decide if entries are intersecting

    Returns:
        selected: Output iterable of arrays with only intersecting entries.
    """

    intersection = reduce(np.intersect1d, [fields_view(d, fields) for d in arrays])
    selected = [elem[np.isin(fields_view(elem, fields), intersection)] for elem in arrays]

    return selected
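
A minimal usage sketch (the dtype and field names below are made up for illustration): two structured arrays sharing one ('start', 'end') pair are reduced to that shared entry.

import numpy as np
from pyhdx.support import array_intersection

dtype = [("start", int), ("end", int), ("score", float)]  # hypothetical dtype
a = np.array([(1, 10, 0.5), (11, 20, 0.7)], dtype=dtype)
b = np.array([(1, 10, 0.9), (21, 30, 0.1)], dtype=dtype)

a_sel, b_sel = array_intersection([a, b], fields=["start", "end"])
# both returned arrays now contain only the (start=1, end=10) entry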

autowrap(start, end, margin=4, step=5)

Automatically finds a wrap value for the coverage such that peptides do not overlap within margin.

Parameters

start, end : vectors of peptide start and end positions (equal length)
margin : minimum spacing between peptides on the same row
step : increment used when searching for a suitable wrap value

Returns

Source code in pyhdx/support.py
def autowrap(start, end, margin=4, step=5):
    """
    Automatically finds a wrap value for the coverage such that peptides do not overlap within margin

    Parameters
    ----------
    start
    end
    margin

    Returns
    -------

    """
    assert len(start) == len(end), "Unequal length of 'start' and 'end' vectors"

    wrap = step
    while True:
        wraps = try_wrap(start, end, wrap, margin=margin)
        wrap += step
        if wraps or wrap > len(start):
            break
    return wrap
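
An illustrative call with made-up peptide intervals; the returned wrap value can then be used to lay out the peptides in rows without overlap within the margin.

from pyhdx.support import autowrap

start = [1, 5, 10, 30]  # hypothetical peptide start positions
end = [12, 18, 25, 40]  # corresponding end positions
wrap = autowrap(start, end, margin=4, step=5)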

clean_types(d)

Cleans up a nested dict/list/tuple/other d for exporting as yaml.

Converts library-specific types to Python native types, including numpy dtypes, OrderedDict, and numpy arrays.

https://stackoverflow.com/questions/59605943/python-convert-types-in-deeply-nested-dictionary-or-array

Source code in pyhdx/support.py
def clean_types(d: Any) -> Any:
    """cleans up nested dict/list/tuple/other `d` for exporting as yaml

    Converts library specific types to python native types, including numpy dtypes,
    OrderedDict, numpy arrays

    # https://stackoverflow.com/questions/59605943/python-convert-types-in-deeply-nested-dictionary-or-array

    """
    if isinstance(d, np.floating):
        return float(d)

    if isinstance(d, np.integer):
        return int(d)

    if isinstance(d, np.ndarray):
        return d.tolist()

    if isinstance(d, list):
        return [clean_types(item) for item in d]

    if isinstance(d, tuple):
        return tuple(clean_types(item) for item in d)

    if isinstance(d, OrderedDict):
        return clean_types(dict(d))

    if isinstance(d, dict):
        return {k: clean_types(v) for k, v in d.items()}

    else:
        return d
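
A minimal sketch of the conversion: numpy scalars and arrays become plain Python types, so the result can be serialized with a YAML dumper.

import numpy as np
from collections import OrderedDict
from pyhdx.support import clean_types

nested = OrderedDict(rate=np.float64(1.5), counts=np.array([1, 2, 3]))
clean_types(nested)
# {'rate': 1.5, 'counts': [1, 2, 3]}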

colors_to_pymol(r_number, color_arr, c_term=None, no_coverage='#8c8c8c')

Converts colors (hexadecimal format) and corresponding residue numbers to a pml script to color structures in PyMOL. Residue ranges in the output are inclusive, inclusive.

c_term

optional residue number of the C-terminus, for when the last peptide does not cover the C-terminal residue

Source code in pyhdx/support.py
def colors_to_pymol(r_number, color_arr, c_term=None, no_coverage="#8c8c8c"):
    """coverts colors (hexadecimal format) and corresponding residue numbers to pml
    script to color structures in pymol
    residue ranges in output are inclusive, incluive

    c_term:
        optional residue number of the c terminal of the last peptide doedsnt cover the c terminal
    """

    # todo replace with pandas dataframe magic

    c_term = c_term or np.max(r_number)
    pd_series = pd.Series(color_arr, index=r_number)
    pd_series = pd_series.reindex(np.arange(1, c_term + 1))
    pd_series = pd_series.replace("nan", no_coverage)  # No coverage at nan entries
    pd_series = pd_series.replace(np.nan, no_coverage)  # Numpy NaNs

    return series_to_pymol(pd_series)
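
An illustrative sketch with made-up residue numbers and colors; residues missing from r_number are filled with the no_coverage color up to c_term.

import numpy as np
from pyhdx.support import colors_to_pymol

r_number = np.array([5, 6, 7])
color_arr = np.array(["#ff0000", "#ff0000", "#00ff00"])
pml_script = colors_to_pymol(r_number, color_arr, c_term=10)
# pml_script is a string of PyMOL set_color / color commands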

dataframe_intersection(dataframes, by, reset_index=True)

Return a list of dataframes whose entries are limited to the intersection of rows of selected columns.

Parameters:

Name         Type              Description                                                Default
dataframes   list[DataFrame]   List of dataframes to intersect                            required
by           Union[list, str]  Column name or list of column names to intersect on.      required
reset_index  bool              If True, the index is reset to a default integer index.   True

Returns:

Type             Description
list[DataFrame]  List of dataframes with intersected rows

Source code in pyhdx/support.py
def dataframe_intersection(
    dataframes: list[pd.DataFrame],
    by: Union[list, str],
    reset_index: bool = True,
) -> list[pd.DataFrame]:
    """Return a list of dataframes whos entries are limited to the intersection of rows of selected columns.

    Args:
        dataframes: List of dataframes to intersect
        by: Column name or list of column names to intersect on.
        reset_index: If True, the index is reset to a default integer index.

    Returns:
        List of dataframes with intersected rows

    """
    set_index = [d.set_index(by) for d in dataframes]
    index_intersection = reduce(pd.Index.intersection, (d.index for d in set_index))
    intersected = [df.loc[index_intersection] for df in set_index]

    if reset_index:
        return [df.reset_index() for df in intersected]
    else:
        return intersected
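
A minimal sketch with a hypothetical 'peptide_id' column: only rows whose key occurs in every frame are kept.

import pandas as pd
from pyhdx.support import dataframe_intersection

df1 = pd.DataFrame({"peptide_id": [1, 2, 3], "uptake": [0.1, 0.2, 0.3]})
df2 = pd.DataFrame({"peptide_id": [2, 3, 4], "uptake": [0.5, 0.6, 0.7]})

out1, out2 = dataframe_intersection([df1, df2], by="peptide_id")
# both frames now contain only the rows with peptide_id 2 and 3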

gen_subclasses(cls)

Recursively find all subclasses of cls

Source code in pyhdx/support.py
def gen_subclasses(cls):
    """Recursively find all subclasses of cls"""
    for sub_cls in cls.__subclasses__():
        yield sub_cls
        yield from gen_subclasses(sub_cls)
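
For example (a sketch with throwaway classes), the generator walks the full subclass tree:

from pyhdx.support import gen_subclasses

class Base: ...
class Child(Base): ...
class GrandChild(Child): ...

list(gen_subclasses(Base))
# [Child, GrandChild]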

grouper(n, iterable, padvalue=None)

grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')

Source code in pyhdx/support.py
def grouper(n, iterable, padvalue=None):
    "grouper(3, 'abcdefg', 'x') --> ('a','b','c'), ('d','e','f'), ('g','x','x')"
    return itertools.zip_longest(*[iter(iterable)] * n, fillvalue=padvalue)

hex_to_rgb(h)

returns rgb as int 0-255

Source code in pyhdx/support.py
def hex_to_rgb(h):
    """returns rgb as int 0-255"""
    r, g, b = tuple(int(h.lstrip("#")[2 * i : 2 * i + 2], 16) for i in range(3))
    return r, g, b
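
A quick example of the conversion:

from pyhdx.support import hex_to_rgb

hex_to_rgb("#ff8000")
# (255, 128, 0)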

make_color_array(rates, colors, thds, no_coverage='#8c8c8c')

rates: array of rates
colors: list of colors (slow to fast)
thds: list of thresholds
no_coverage: color value for no coverage

Source code in pyhdx/support.py
def make_color_array(rates, colors, thds, no_coverage="#8c8c8c"):
    """

    :param rates: array of rates
    :param colors: list of colors (slow to fast)
    :param thds: list of thresholds
    :param no_coverage: color value for no coverage
    :return:
    """

    output = np.full_like(rates, fill_value=no_coverage, dtype="U7")
    full_thds = [-np.inf] + list(thds) + [np.inf]
    for lower, upper, color in zip(full_thds[:-1], full_thds[1:], colors):
        b = (rates > lower) & (rates <= upper)

        output[b] = color

    return output
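
A sketch with made-up rates and two thresholds splitting the data into three classes (colors are ordered slow to fast):

import numpy as np
from pyhdx.support import make_color_array

rates = np.array([0.01, 0.5, 20.0])
colors = ["#0000ff", "#00ff00", "#ff0000"]  # slow, intermediate, fast
thds = [0.1, 10.0]
make_color_array(rates, colors, thds)
# array(['#0000ff', '#00ff00', '#ff0000'], dtype='<U7')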

make_monomer(input_file, output_file)

Reads the input_file PDB file and removes all chains except chain A, as well as all water molecules.

Source code in pyhdx/support.py
def make_monomer(input_file, output_file):
    """reads input_file pdb file and removes all chains except chain A and all water"""
    with open(input_file, "r") as f_in:
        with open(output_file, "w") as f_out:
            for line in iter(f_in.readline, ""):
                if line.startswith("COMPND") and "CHAIN" in line:
                    res = re.findall(":(.*);", line)[0]
                    line = line.replace(res + ";", " A;" + " " * (len(res) - 2))
                if line.startswith("ATOM") and " A " not in line:
                    continue
                elif line.startswith("HETATM") and "HOH" in line:
                    continue
                f_out.write(line)
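
An illustrative call with hypothetical file paths:

from pyhdx.support import make_monomer

make_monomer("structure.pdb", "structure_chainA.pdb")
# the output PDB keeps only chain A ATOM records and drops waters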

multi_otsu(*rates, classes=3)

Global Otsu thresholding of multiple rate arrays in log space.

Parameters

rates : iterable
    Iterable of numpy structured arrays with a 'rate' field
classes : int
    Number of classes to divide the data into

Returns

thds : tuple
    Tuple with thresholds

Source code in pyhdx/support.py
def multi_otsu(*rates, classes=3):
    """
    Global Otsu thresholding of multiple rate arrays in log space

    Parameters
    ----------
    rates : iterable
        iterable of numpy structured arrays with  a 'rate' field
    classes : :obj:`int`
        Number of classes to divide the data into

    Returns
    -------
    thds : :obj:`tuple`
        tuple with thresholds

    """
    all_rates = np.concatenate([data["rate"] for data in rates])
    thd_rates = np.log(all_rates[~np.isnan(all_rates)])
    thds = threshold_multiotsu(thd_rates, classes=classes)
    return tuple(np.e**thd for thd in thds)
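
A hedged sketch with two small structured arrays carrying a 'rate' field; the thresholds are returned in linear (non-log) space.

import numpy as np
from pyhdx.support import multi_otsu

dtype = [("rate", float)]
r1 = np.array([(0.01,), (0.05,), (1.0,)], dtype=dtype)
r2 = np.array([(0.02,), (5.0,), (50.0,)], dtype=dtype)

thd_low, thd_high = multi_otsu(r1, r2, classes=3)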

pbar_decorator(pbar)

Wraps a progress bar around a function, updating the progress bar with each function call

Source code in pyhdx/support.py
def pbar_decorator(pbar):
    """Wraps a progress bar around a function, updating the progress bar with each function call"""

    def func_wrapper(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            pbar.update()
            return result

        return wrapper

    return func_wrapper
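
A sketch using tqdm as the progress bar (an assumption; any object with an update() method should work):

from tqdm import tqdm
from pyhdx.support import pbar_decorator

pbar = tqdm(total=3)

@pbar_decorator(pbar)
def process(item):
    return item * 2

results = [process(i) for i in range(3)]  # the bar advances by one per call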

pprint_df_to_file(df, file_path_or_obj)

Pretty print (human-readable) a dataframe to a file

Parameters

df : pandas.DataFrame
file_path_or_obj : str, Path or io.StringIO

Source code in pyhdx/support.py
def pprint_df_to_file(df, file_path_or_obj):
    """
    Pretty print (human-readable) a dataframe to a file

    Parameters
    ----------
    df : :class:`~pandas.DataFrame`
    file_path_or_obj : :obj:`str`, Path or :class:`~io.StringIO`

    """
    with pd.option_context(
        "display.max_rows",
        None,
        "display.max_columns",
        None,
        "display.expand_frame_repr",
        False,
    ):  # more options can be specified also
        if isinstance(file_path_or_obj, str):
            pth = Path(file_path_or_obj)
            pth.write_text(df.__str__())
        elif isinstance(file_path_or_obj, Path):
            file_path_or_obj.write_text(df.__str__())
        elif isinstance(file_path_or_obj, StringIO):
            file_path_or_obj.write(df.__str__())
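
A minimal sketch writing the full, untruncated frame to a (hypothetical) text file:

import pandas as pd
from pyhdx.support import pprint_df_to_file

df = pd.DataFrame({"start": [1, 5], "end": [10, 18]})
pprint_df_to_file(df, "peptides.txt")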

reduce_inter(args, gap_size=-1)

Reduce overlapping intervals to their non-overlapping interval parts.

Author: Brent Pedersen
Source: https://github.com/brentp/interlap/blob/3c4a5923c97a5d9a11571e0c9ea5bb7ea4e784ee/interlap.py#L224

gap_size : int
    Gaps of this size between adjacent peptides are not considered to overlap. A value of -1 means that peptides with exactly zero overlap are separated. With gap_size=0, peptides with exactly zero overlap are not separated, and larger values tolerate larger gaps.

>>> reduce_inter([(2, 4), (4, 9)])
[(2, 4), (4, 9)]
>>> reduce_inter([(2, 6), (4, 10)])
[(2, 10)]

Source code in pyhdx/support.py
def reduce_inter(args: list[tuple[int, int]], gap_size: int = -1) -> list[tuple[int, int]]:
    """Reduce overlapping intervals to its non-overlapping intveral parts

    Author: Brent Pedersen
    Source: https://github.com/brentp/interlap/blob/3c4a5923c97a5d9a11571e0c9ea5bb7ea4e784ee/interlap.py#L224

    gap_size : :obj:`int`
        Gaps of this size between adjacent peptides are not considered to overlap. A value of -1 means that peptides
        with exactly zero overlap are separated. With gap_size=0, peptides with exactly zero overlap are not separated,
        and larger values tolerate larger gap sizes.

    >>> reduce_inter([(2, 4), (4, 9)])
    [(2, 4), (4, 9)]
    >>> reduce_inter([(2, 6), (4, 10)])
    [(2, 10)]
    """

    gap_size += 1

    if len(args) < 2:
        return args
    args.sort()
    ret = [args[0]]
    for next_i, (s, e) in enumerate(args, start=1):
        if next_i == len(args):
            ret[-1] = ret[-1][0], max(ret[-1][1], e)
            break

        ns, ne = args[next_i]  # next start, next end
        if (
            e + gap_size > ns or ret[-1][1] + gap_size > ns
        ):  # if current end is further than next start (overlap), OR the current interval ends later than the next start
            ret[-1] = (
                ret[-1][0],
                max(e, ne, ret[-1][1]),
            )  # extend the end value of the current interval to the new end
        else:
            ret.append((ns, ne))
    return ret
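
To illustrate the gap_size parameter specifically (a small sketch in addition to the doctest above): with gap_size=1, intervals separated by a single-position gap are merged as well.

from pyhdx.support import reduce_inter

reduce_inter([(2, 4), (5, 9)])              # [(2, 4), (5, 9)]
reduce_inter([(2, 4), (5, 9)], gap_size=1)  # [(2, 9)]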

rgb_to_hex(rgb_a)

Converts RGBA values to hexadecimal color strings. Input values are in the range [0, 255].

Alpha is set to zero.

Returns colors in '#000000' format.

Source code in pyhdx/support.py
def rgb_to_hex(rgb_a):
    """Converts rgba
    input values are [0, 255]

    alpha is set to zero

    returns as '#000000'

    """
    # Single value
    if isinstance(rgb_a, tuple):
        try:
            r, g, b, a = rgb_a
        except ValueError:
            r, g, b = rgb_a
        return f"#{r:02x}{g:02x}{b:02x}"

    elif isinstance(rgb_a, list):
        try:
            rgba_array = np.array([[b, g, r, 0] for r, g, b, a in rgb_a], dtype=np.uint8)
        except ValueError:
            # todo this only works with lists of lists and gives the wrong result? tests needed
            rgba_array = np.array([[b, g, r, 0] for r, g, b in rgb_a], dtype=np.uint8)

    elif isinstance(rgb_a, np.ndarray):
        # todo: allow rgb arrays
        assert rgb_a.shape[-1] == 4
        if rgb_a.data.c_contiguous:
            # todo check for c-contiguous
            rgba_array = rgb_a
        else:
            rgba_array = np.array(rgb_a)
    else:
        raise TypeError(f"Invalid type for 'rgb_a': {rgb_a}")

    ints = rgba_array.astype(np.uint8).view(dtype=np.uint32).byteswap()
    padded = np.char.rjust(base_v(ints // 2**8, 16), 6, "0")
    result = np.char.add("#", padded).squeeze()

    return result
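
A quick example of the single-tuple case:

from pyhdx.support import rgb_to_hex

rgb_to_hex((255, 128, 0))
# '#ff8000'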

scale(x, out_range=(-1, 1))

rescale input array x to range out_range

Source code in pyhdx/support.py
def scale(x, out_range=(-1, 1)):
    """rescale input array x to range `out_range`"""
    domain = np.nanmin(x), np.nanmax(x)
    y = (x - (domain[1] + domain[0]) / 2) / (domain[1] - domain[0])
    return y * (out_range[1] - out_range[0]) + (out_range[1] + out_range[0]) / 2
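
For example, rescaling to the interval [0, 1]:

import numpy as np
from pyhdx.support import scale

scale(np.array([2.0, 4.0, 6.0]), out_range=(0, 1))
# array([0. , 0.5, 1. ])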

select_config()

When the .pyhdx directory has multiple config files, prompts the user for which config to use and subsequently loads it.

Source code in pyhdx/support.py
def select_config() -> None:
    """When the .pyhdx directory has multiple config files, prompts the users
    for which config to use and subsequently loads it.

    """
    pyhdx_dir = Path().home() / ".pyhdx"
    config_options = list(pyhdx_dir.glob("*.yaml"))

    if len(config_options) > 1:
        s = "Found multiple configuration files:\n"
        for i, cfg_file in enumerate(config_options, start=1):
            s += f"{i}: {cfg_file.stem}\n"

        print(s)

        choice = typer.prompt("Which config file to use?", type=int)

        if choice < 1 or choice > len(config_options):
            print(f"Invalid option: {choice}")
        else:
            cfg.load_config(config_options[choice - 1])

series_to_pymol(pd_series)

Converts a pandas series to a PyMOL script to color protein structures in PyMOL. The series must have hexadecimal color values and residue numbers as index.

Parameters

pd_series : pandas.Series

Returns

s_out : str

Source code in pyhdx/support.py
def series_to_pymol(pd_series):
    """
    Converts a pandas series to a pymol script to color protein structures in pymol.
    Series must have hexadecimal color values and residue numbers as index.

    Parameters
    ----------
    pd_series : :class:`~pandas.Series`

    Returns
    -------

    s_out : :obj:`str`

    """

    # https://stackoverflow.com/questions/33483670/how-to-group-a-series-by-values-in-pandas
    grp = pd_series.groupby(pd_series)

    s_out = ""
    for c, pd_series in grp:
        r, g, b = hex_to_rgb(c)
        s_out += f"set_color color_{c}, [{r},{g},{b}]\n"

    # https://stackoverflow.com/questions/30993182/how-to-get-the-index-range-of-a-list-that-the-values-satisfy-some-criterion-in-p
    for c, pd_series in grp:
        result = [
            list(g) for _, g in groupby(pd_series.index, key=lambda n, c=count(): n - next(c))
        ]
        residues = [f"resi {g[0]}-{g[-1]}" for g in result]

        s_out += f"color color_{c}, " + " + ".join(residues) + "\n"

    return s_out
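
A sketch with a tiny three-residue series; the output defines one PyMOL color per unique hex value and then assigns it to the corresponding residue ranges.

import pandas as pd
from pyhdx.support import series_to_pymol

colors = pd.Series(["#ff0000", "#ff0000", "#00ff00"], index=[1, 2, 3])
print(series_to_pymol(colors))
# output (roughly):
# set_color color_#00ff00, [0,255,0]
# set_color color_#ff0000, [255,0,0]
# color color_#00ff00, resi 3-3
# color color_#ff0000, resi 1-2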

try_wrap(start, end, wrap, margin=4)

Checks, for a given coverage, whether the value of wrap is high enough that no peptides overlap within margin.

The start, end interval is inclusive, exclusive.

Source code in pyhdx/support.py
def try_wrap(start, end, wrap, margin=4):
    """Check for a given coverage if the value of wrap is high enough to not have peptides overlapping within margin

    start, end interval is inclusive, exclusive

    """
    assert len(start) == len(end), "Unequal length of 'start' and 'end' vectors"

    offset = np.min(start)
    start = np.array(start) - offset
    end = np.array(end) - offset

    x = np.zeros((wrap, len(start) + margin))
    wrap_gen = itertools.cycle(range(wrap))
    for i, s, e in zip(wrap_gen, start, end):
        section = x[i, s : e + margin]
        if np.any(section):
            return False
        section[:] = 1

    return True
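
A small sketch: two overlapping peptides fit on two rows but not on one.

from pyhdx.support import try_wrap

try_wrap([1, 5], [10, 18], wrap=1, margin=4)  # False: the peptides clash on a single row
try_wrap([1, 5], [10, 18], wrap=2, margin=4)  # True: two rows are enough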