# Source code for pmaf.biome.survey._survey

import warnings

# Silence FutureWarning for the whole process. This call sits before the
# remaining imports, so the filter is already installed while they execute.
warnings.simplefilter("ignore", category=FutureWarning)
from ._metakit import BiomeSurveyBackboneMetabase
from pmaf.biome.essentials._metakit import EssentialBackboneMetabase
from pmaf.biome.essentials._frequency import FrequencyTable
from pmaf.biome.essentials._samplemeta import SampleMetadata
from pmaf.biome.essentials._taxonomy import RepTaxonomy
from pmaf.biome.assembly._assembly import BiomeAssembly
from pmaf.biome._base import BiomeBackboneBase
from pmaf.biome.essentials._controller import EssentialsController
from collections import defaultdict
from ._shared import (
    mergeRepTaxonmy,
    mergeFrequencyTable,
    mergeSampleMetadata,
    parse_assembly_maps,
)
import numpy as np
import pandas as pd
from typing import Union, Sequence, Any, Optional, Tuple, List, Dict
from pmaf.internal._typing import AnyGenericIdentifier, AggFunc


class BiomeSurvey(BiomeBackboneBase, BiomeSurveyBackboneMetabase):
    """Assembly-like Survey class for merging instances of
    :class:`~pmaf.biome.assembly._assembly.BiomeAssembly`"""

    _SUPPORTED_ESSENTIALS = (RepTaxonomy, FrequencyTable, SampleMetadata)

    def __init__(
        self,
        assembiles: Optional[Sequence[BiomeAssembly]] = None,
        *args: Any,
        aggfunc: Union[
            AggFunc,
            Tuple[AggFunc, AggFunc],
            Dict[
                Union[str, int],
                Union[AggFunc, Dict[Union[EssentialBackboneMetabase, None], AggFunc]],
            ],
        ] = "mean",
        groupby: Union[str, Tuple[str, str], Dict[Union[int, str], str]] = "label",
        **kwargs: Any
    ):
        """This class performs merging/pooling of multiple independent studies,
        i.e. instances of :class:`~pmaf.biome.assembly._assembly.BiomeAssembly`
        and the *essentials* they carry, into a single
        :class:`~pmaf.biome.assembly._assembly.BiomeAssembly` -like instance of
        :class:`~pmaf.biome.survey._survey.BiomeSurvey`.

        Parameters
        ----------
        assembiles
            Assemblies to pool. Either a single assembly or a list/tuple of
            assemblies. The passed sequence is never modified.
        *args
            Additional unpacked assemblies to pool. (Convenience)
        aggfunc
            Aggregation method. The parameter accepts multiple variations of
            the aggregation approach. If `str` or `Callable` then `aggfunc` will be
            applied to both axes (feature and sample) and to any *essentials*
            regardless of their type. To apply aggregation for each axis separately
            use a `tuple` (for example, *aggfunc=('sum', 'mean')*) where the first
            aggregation method refers to the feature axis and the second to the
            sample axis. To apply more complex aggregation use a `Dict`, where keys
            refer to the axis, like *0/feature* for the feature axis or *1/sample*
            for the sample axis. Values of the dictionary can follow two approaches.
            First is when values are simply `str` or `Callable`, which is similar
            to using a `tuple`. Second is when values are of type `Dict`, where
            dictionary values are `str` or `Callable` aggregating functions and
            keys are types/classes of *essentials* (must have base abstract class
            :class:`~pmaf.biome.essentials._metakit.EssentialBackboneMetabase` ).
            Using this method each type of *essential* will be processed differently
            among instances of *assemblies*. Lastly, when using the approach
            Dict[axis, Dict[*essential-type*, *agg-func*]], the `None` key is
            required for both axes and refers to all *remaining-types*.
        groupby
            Grouping method. The parameter accepts multiple variations
            similar to `aggfunc`. Variations are the same as for `aggfunc` with the
            exception that values can be either `label` for both feature-axis and
            sample-axis, like *groupby='label'* or *groupby=('label', 'label')*,
            or *taxonomy* for the feature-axis only. Grouping by *taxonomy* will
            merge features with the same consensus lineage and requires every
            assembly to contain a
            :class:`~pmaf.biome.essentials._taxonomy.RepTaxonomy`.
        **kwargs
            Compatibility. The private ``_copyself`` key is used internally by
            :meth:`copy` to bypass merging.

        Raises
        ------
        ValueError
            If no assemblies are provided, if `aggfunc` or `groupby` is
            malformed, or if an assembly lacks the *essentials* required by the
            selected grouping.
        TypeError
            If `aggfunc` or `groupby` has an unsupported type.
        """
        copy_data = kwargs.pop("_copyself", None)
        if copy_data is not None:
            # Internal path used by `copy`: essentials are already merged, so
            # they are installed as they are and no merging is performed.
            tmp_assembiles = copy_data["_assemblies"]
            new_essentials = copy_data["_essentials"]
            new_metadata = {}
        else:
            if assembiles is None:
                tmp_assembiles = []
            elif isinstance(assembiles, (tuple, list)):
                # Always work on a fresh list: tuples cannot be extended and the
                # caller's list must not be mutated by the *args below.
                tmp_assembiles = list(assembiles)
            else:
                tmp_assembiles = [assembiles]
            tmp_assembiles.extend(args)
            if len(tmp_assembiles) == 0:
                raise ValueError("No assemblies were provided.")

            tmp_aggfuncs = self._resolve_aggfuncs(aggfunc)
            feature_groupby, sample_groupby = self._resolve_groupby(groupby)

            if feature_groupby == "taxonomy" and sample_groupby in ["index", "label"]:
                must_have_essentials = (RepTaxonomy,)
            else:
                # `object` matches any essential, i.e. the assembly only has to
                # contain at least one essential.
                must_have_essentials = (object,)

            # Approve valid essentials and build the assembly map, keyed by the
            # position of the assembly in the input.
            # A defaultdict without a factory behaves like a plain dict here.
            assembly_map = defaultdict(None)
            for label, asmbly in enumerate(tmp_assembiles):
                if not all(
                    any(
                        isinstance(essential, must_type)
                        for essential in asmbly.essentials
                    )
                    for must_type in must_have_essentials
                ):
                    raise ValueError(
                        "Assembly {} does not satisfy merging requirements.".format(
                            label
                        )
                    )
                assembly_map[label] = asmbly

            # Parse assemblies and distribute them into groups with indices.
            features_map, samples_map = parse_assembly_maps(
                feature_groupby, sample_groupby, assembly_map
            )

            # Transform the assembly map into an essential map:
            # {essential-type: {assembly-label: essential}}
            essentials_map = defaultdict(dict)
            for label, asmbly in assembly_map.items():
                for essential in asmbly.essentials:
                    if isinstance(essential, self._SUPPORTED_ESSENTIALS):
                        essentials_map[type(essential)].update({label: essential})
            essentials_map = dict(essentials_map)

            new_essentials = []
            if RepTaxonomy in essentials_map:
                new_essentials.append(
                    mergeRepTaxonmy(
                        feature_groupby, features_map, essentials_map, tmp_aggfuncs
                    )
                )
            if FrequencyTable in essentials_map:
                new_essentials.append(
                    mergeFrequencyTable(
                        feature_groupby,
                        sample_groupby,
                        features_map,
                        samples_map,
                        essentials_map,
                        tmp_aggfuncs,
                    )
                )
            if SampleMetadata in essentials_map:
                new_essentials.append(
                    mergeSampleMetadata(
                        sample_groupby, samples_map, essentials_map, tmp_aggfuncs
                    )
                )

            new_metadata = {
                "groupby": {
                    "feature": feature_groupby,
                    "sample": sample_groupby,
                    "agg": {"aggfunc": aggfunc, "aggmap": tmp_aggfuncs},
                }
            }
        # NOTE(review): kwargs are forwarded to the controller before `metadata`
        # is popped below, so the controller also receives `metadata`/`name`
        # when they are given - confirm this is intended.
        tmp_controller = EssentialsController(**kwargs)
        for essential in new_essentials:
            tmp_controller.insert_essential(essential)

        self.__assembiles = tmp_assembiles
        self.__controller = tmp_controller
        # `or {}` tolerates an explicit `metadata=None`.
        tmp_metadata = {**(kwargs.pop("metadata", None) or {}), **new_metadata}
        super().__init__(metadata=tmp_metadata, **kwargs)

    @classmethod
    def _resolve_aggfuncs(cls, aggfunc):
        """Expand `aggfunc` into a ``{essential-type: {0: func, 1: func}}`` map."""
        supported = cls._SUPPORTED_ESSENTIALS
        invalid_dict_msg = (
            "`aggfunc` when dict must have values either "
            "callables/func-names or dicts with keys as essential types "
            "and value as callables or func-names"
        )
        if isinstance(aggfunc, str) or callable(aggfunc):
            # One function (name or callable) for both axes and every type.
            return {
                essential_type: {0: aggfunc, 1: aggfunc}
                for essential_type in supported
            }
        if isinstance(aggfunc, tuple):
            if len(aggfunc) != 2:
                raise ValueError("`aggfunc` when tuple must have length of 2.")
            return {
                essential_type: {0: aggfunc[0], 1: aggfunc[1]}
                for essential_type in supported
            }
        if not isinstance(aggfunc, dict):
            raise TypeError("`aggfunc` has invalid type.")

        if not (
            sum(k in [0, "feature"] for k in aggfunc) == 1
            and sum(k in [1, "sample"] for k in aggfunc) == 1
        ):
            raise ValueError(
                "When `aggfunc` have type Dict then it's keys must can be "
                "either feature/0 or sample/1."
            )
        # Valid feature key (fkey) and sample key (skey) to access the
        # correct values of `aggfunc`.
        fkey = 0 if 0 in aggfunc else "feature"
        skey = 1 if 1 in aggfunc else "sample"
        values = list(aggfunc.values())
        if all(isinstance(v, str) or callable(v) for v in values):
            return {
                essential_type: {0: aggfunc[fkey], 1: aggfunc[skey]}
                for essential_type in supported
            }
        # Remaining supported form: every value is a per-essential-type dict.
        # Mixed values or non-class keys are rejected with the documented
        # ValueError instead of leaking AttributeError/TypeError.
        if not all(isinstance(v, dict) for v in values):
            raise ValueError(invalid_dict_msg)
        if not all(
            isinstance(k, type) and issubclass(k, EssentialBackboneMetabase)
            for edict in values
            for k in edict
            if k is not None
        ):
            raise ValueError(invalid_dict_msg)
        if None not in aggfunc[fkey]:
            raise ValueError("`aggfunc` must contain None key in <feature> values.")
        if None not in aggfunc[skey]:
            raise ValueError("`aggfunc` must contain None key in <sample> values.")

        resolved = defaultdict(dict)
        # The `None` entry is the default for every supported type ...
        for essential_type in supported:
            resolved[essential_type].update(
                {0: aggfunc[fkey][None], 1: aggfunc[skey][None]}
            )
        # ... and explicitly listed types override it per axis.
        for axis, key in ((0, fkey), (1, skey)):
            for essential_type, func in aggfunc[key].items():
                if essential_type is not None:
                    resolved[essential_type].update({axis: func})
        if not all(
            isinstance(func, str) or callable(func)
            for fdict in resolved.values()
            for func in fdict.values()
        ):
            raise ValueError(invalid_dict_msg)
        return resolved

    @staticmethod
    def _resolve_groupby(groupby):
        """Split `groupby` into a ``(feature_groupby, sample_groupby)`` pair."""
        if isinstance(groupby, str):
            return groupby, groupby
        if isinstance(groupby, tuple):
            if len(groupby) != 2:
                raise ValueError("`groupby` when tuple must have length of 2.")
            return groupby[0], groupby[1]
        if isinstance(groupby, dict):
            if not (
                sum(k in [0, "feature"] for k in groupby) == 1
                and sum(k in [1, "sample"] for k in groupby) == 1
            ):
                raise ValueError(
                    "`groupby` when dict must feature or 0 and sample or 1"
                )
            return (
                groupby[0 if 0 in groupby else "feature"],
                groupby[1 if 1 in groupby else "sample"],
            )
        raise TypeError("`groupby` has invalid type.")

    def __getattr__(self, attribute: str) -> EssentialBackboneMetabase:
        """Provides attribute lookup for installed *essentials*.

        Called only when regular attribute lookup fails.

        Parameters
        ----------
        attribute
            Class name of the *essential*.

        Returns
        -------
            Instance of :class:`~pmaf.biome.essentials._base.EssentialBackboneBase`

        Raises
        ------
        AttributeError
            If no installed *essential* has class name `attribute`.
        """
        try:
            # Fetch the controller without re-entering __getattr__. Reading
            # `self.__controller` here would recurse forever when it is not
            # set yet (e.g. during construction or copy/pickle protocols).
            # The name is spelled in its mangled form because string literals
            # are not name-mangled.
            controller = object.__getattribute__(self, "_BiomeSurvey__controller")
        except AttributeError:
            raise AttributeError(
                "'{}' object has no attribute '{}'".format(
                    type(self).__name__, attribute
                )
            ) from None
        for essential in controller.essentials:
            if attribute == type(essential).__name__:
                return essential
        # Regular lookup already failed, so this raises the standard
        # AttributeError; `return` keeps the result if it ever succeeds.
        return super().__getattribute__(attribute)

    def __dir__(self):
        """Sorted attribute names for the built-in :func:`dir`: the names
        defined on the class plus the class name of every installed
        *essential*."""
        names = dir(type(self))
        for essential in self.__controller.essentials:
            names.append(type(essential).__name__)
        names.sort()
        return names

    def _repr_appendage__(self):
        """Extra entries for the `__repr__` method of class
        :class:`~pmaf.biome.BiomeBackboneBase`; the survey adds none."""
        return dict()

[docs]    def copy(self) -> "BiomeSurvey":
        """Copy of the instance."""
        copied_essentials = [
            essential.copy() for essential in self.__controller.essentials
        ]
        refs_assemblies = self.__assembiles
        return type(self)(
            _copyself={
                "_assemblies": refs_assemblies,
                "_essentials": copied_essentials,
            },
            name=self.name,
            metadata=self.metadata,
        )

[docs]    def to_assembly(self) -> BiomeAssembly:
        """Converts to the
        :class:`~pmaf.biome.assembly._assembly.BiomeAssembly` instance."""
        return BiomeAssembly(
            self.__controller.essentials,
            copy=True,
            name=self.name,
            metadata=self.metadata,
        )

    @property
    def essentials(self) -> List[EssentialBackboneMetabase]:
        """Installed *essentials*, as reported by the controller."""
        controller = self.__controller
        return controller.essentials

    @property
    def assemblies(self) -> Tuple[BiomeAssembly, ...]:
        """Tuple of surveyed assemblies.

        A new tuple is built on every access, so the internal sequence cannot
        be modified through the returned value.
        """
        return tuple(self.__assembiles)

    @property
    def xrid(self) -> AnyGenericIdentifier:
        """Feature identifiers as a :class:`pandas.Index`; empty when the
        controller reports none."""
        feature_ids = self.__controller.xrid
        if feature_ids is None:
            feature_ids = np.array([], dtype=object)
        return pd.Index(feature_ids)

    @property
    def xsid(self) -> AnyGenericIdentifier:
        """Sample identifiers as a :class:`pandas.Index`; empty when the
        controller reports none."""
        sample_ids = self.__controller.xsid
        if sample_ids is None:
            sample_ids = np.array([], dtype=object)
        return pd.Index(sample_ids)

    @property
    def controller(self) -> EssentialsController:
        """The :class:`~pmaf.biome.essentials._controller.EssentialsController`
        instance that holds the installed *essentials*."""
        essentials_controller = self.__controller
        return essentials_controller