import warnings
warnings.simplefilter("ignore", category=FutureWarning)
from pmaf.biome.essentials._metakit import EssentialFeatureMetabase
from pmaf.biome.essentials._base import EssentialBackboneBase
from pmaf.internal._constants import (
AVAIL_TAXONOMY_NOTATIONS,
jRegexGG,
jRegexQIIME,
BIOM_TAXONOMY_NAMES,
VALID_RANKS,
)
from pmaf.internal._shared import (
generate_lineages_from_taxa,
get_rank_upto,
indentify_taxon_notation,
validate_ranks,
extract_valid_ranks,
cols2ranks,
)
from collections import defaultdict
from os import path
import pandas as pd
import numpy as np
import biom
from typing import Union, Sequence, Tuple, Any, Optional
from pmaf.internal._typing import AnyGenericIdentifier, Mapper
[docs]class RepTaxonomy(EssentialBackboneBase, EssentialFeatureMetabase):
"""An `essential` class for handling taxonomy data."""
def __init__(
self,
taxonomy: Union[pd.DataFrame, pd.Series, str],
taxonomy_columns: Union[str, int, Sequence[Union[int, str]]] = None,
**kwargs: Any
) -> None:
"""Constructor for :class:`.RepTaxonomy`
Parameters
----------
taxonomy
Data containing feature taxonomy
taxonomy_columns
Column(s) containing taxonomy data
kwargs
Passed to :func:`~pandas.read_csv` or :mod:`biome` loader.
"""
tmp_metadata = kwargs.pop("metadata", {})
self.__avail_ranks = []
self.__internal_taxonomy = None
if isinstance(taxonomy, pd.DataFrame):
if taxonomy.shape[0] > 0:
if taxonomy.shape[1] > 1:
if validate_ranks(list(taxonomy.columns.values), VALID_RANKS):
tmp_taxonomy = taxonomy
else:
raise ValueError(
"Provided `taxonomy` Datafame has invalid ranks."
)
else:
tmp_taxonomy = taxonomy.iloc[:, 0]
else:
raise ValueError("Provided `taxonomy` Datafame is invalid.")
elif isinstance(taxonomy, pd.Series):
if taxonomy.shape[0] > 0:
tmp_taxonomy = taxonomy
else:
raise ValueError("Provided `taxonomy` Series is invalid.")
elif isinstance(taxonomy, str):
if path.isfile(taxonomy):
file_extension = path.splitext(taxonomy)[-1].lower()
if file_extension in [".csv", ".tsv"]:
if taxonomy_columns is None:
tmp_taxonomy = pd.read_csv(
taxonomy,
sep=kwargs.pop("sep", ","),
header=kwargs.pop("header", "infer"),
index_col=kwargs.pop("index_col", None),
)
else:
if isinstance(taxonomy_columns, int):
tmp_taxonomy = pd.read_csv(
taxonomy,
sep=kwargs.pop("sep", ","),
header=kwargs.pop("header", "infer"),
index_col=kwargs.pop("index_col", None),
).iloc[:, taxonomy_columns]
else:
tmp_taxonomy = pd.read_csv(
taxonomy,
sep=kwargs.pop("sep", ","),
header=kwargs.pop("header", "infer"),
index_col=kwargs.pop("index_col", None),
).loc[:, taxonomy_columns]
elif file_extension in [".biom", ".biome"]:
tmp_taxonomy, new_metadata = self.__load_biom(taxonomy, **kwargs)
tmp_metadata.update({"biom": new_metadata})
else:
raise NotImplementedError("File type is not supported.")
else:
raise FileNotFoundError("Provided `taxonomy` file path is invalid.")
else:
raise TypeError("Provided `taxonomy` has invalid type.")
self.__init_internal_taxonomy(tmp_taxonomy, **kwargs)
super().__init__(metadata=tmp_metadata, **kwargs)
[docs] @classmethod
def from_csv(
cls,
filepath: str,
taxonomy_columns: Union[str, int, Sequence[Union[int, str]]] = None,
**kwargs: Any
) -> "RepTaxonomy":
"""Factory method to construct a :class:`.RepTaxonomy` from CSV file.
Parameters
----------
filepath
Path to .csv File
taxonomy_columns
Column(s) containing taxonomy data
kwargs
Passed to the constructor.
filepath:
Returns
-------
Instance of
class:`.RepTaxonomy`
"""
if taxonomy_columns is None:
tmp_taxonomy = pd.read_csv(filepath, **kwargs)
else:
if isinstance(taxonomy_columns, int):
tmp_taxonomy = pd.read_csv(filepath, **kwargs).iloc[:, taxonomy_columns]
else:
tmp_taxonomy = pd.read_csv(filepath, **kwargs).loc[:, taxonomy_columns]
tmp_metadata = kwargs.pop("metadata", {})
tmp_metadata.update({"filepath": path.abspath(filepath)})
return cls(taxonomy=tmp_taxonomy, metadata=tmp_metadata, **kwargs)
[docs] @classmethod
def from_biom(cls, filepath: str, **kwargs: Any) -> "RepTaxonomy":
"""Factory method to construct a :class:`.RepTaxonomy` from :mod:`biom`
file.
Parameters
----------
filepath
:mod:`biom` file path.
kwargs
Passed to the constructor.
Returns
-------
Instance of
class:`.RepTaxonomy`
"""
taxonomy_frame, new_metadata = cls.__load_biom(filepath, **kwargs)
tmp_metadata = kwargs.pop("metadata", {})
tmp_metadata.update({"biom": new_metadata})
return cls(taxonomy=taxonomy_frame, metadata=tmp_metadata, **kwargs)
@classmethod
def __load_biom(cls, filepath: str, **kwargs: Any) -> Tuple[pd.DataFrame, dict]:
"""Actual private method to process :mod:`biom` file.
Parameters
----------
filepath
:mod:`biom` file path.
kwargs
Compatibility
"""
biom_file = biom.load_table(filepath)
if biom_file.metadata(axis="observation") is not None:
obs_data = biom_file.metadata_to_dataframe("observation")
col_names = list(obs_data.columns.values)
col_names_low = [col.lower() for col in col_names]
avail_col_names = [
colname
for tax_name in BIOM_TAXONOMY_NAMES
for colname in col_names_low
if colname[::-1].find(tax_name[::-1]) < 3
and colname[::-1].find(tax_name[::-1]) > -1
]
metadata_cols = [
col for col in col_names if col.lower() not in avail_col_names
]
if len(avail_col_names) == 1:
tmp_col_index = col_names_low.index(avail_col_names[0])
taxonomy_frame = obs_data[col_names[tmp_col_index]]
else:
taxonomy_frame = obs_data
tmp_metadata = obs_data.loc[:, metadata_cols].to_dict()
return taxonomy_frame, tmp_metadata
else:
raise ValueError("Biom file does not contain observation metadata.")
def _remove_features_by_id(
self, ids: AnyGenericIdentifier, **kwargs: Any
) -> Optional[AnyGenericIdentifier]:
"""Remove features by features ids and ratify action.
Parameters
----------
ids
Feature identifiers
kwargs
Compatibility
"""
tmp_ids = np.asarray(ids, dtype=self.__internal_taxonomy.index.dtype)
if len(tmp_ids) > 0:
self.__internal_taxonomy.drop(tmp_ids, inplace=True)
return self._ratify_action("_remove_features_by_id", ids, **kwargs)
def _merge_features_by_map(
self, map_dict: Mapper, done: bool = False, **kwargs: Any
) -> Optional[Mapper]:
"""Merge features and ratify action.
Parameters
----------
map_dict
Map to use for merging
done
Whether merging was completed or not. Compatibility.
kwargs
Compatibility
"""
if not done:
raise NotImplementedError
if map_dict:
return self._ratify_action(
"_merge_features_by_map",
map_dict,
_annotations=self.__internal_taxonomy.loc[:, "lineage"].to_dict(),
**kwargs
)
[docs] def drop_feature_by_id(
self, ids: AnyGenericIdentifier, **kwargs: Any
) -> Optional[AnyGenericIdentifier]:
"""Remove features by feature `ids`.
Parameters
----------
ids
Feature identifiers
kwargs
Compatibility
"""
target_ids = np.asarray(ids)
if self.xrid.isin(target_ids).sum() == len(target_ids):
return self._remove_features_by_id(target_ids, **kwargs)
else:
raise ValueError("Invalid feature ids are provided.")
[docs] def get_taxonomy_by_id(
self, ids: Optional[AnyGenericIdentifier] = None
) -> pd.DataFrame:
"""Get taxonomy :class:`~pandas.DataFrame` by feature `ids`.
Parameters
----------
ids
Either feature indices or None for all.
Returns
-------
class:`pandas.DataFrame` with taxonomy data
"""
if ids is None:
target_ids = self.xrid
else:
target_ids = np.asarray(ids)
if self.xrid.isin(target_ids).sum() <= len(target_ids):
return self.__internal_taxonomy.loc[target_ids, self.__avail_ranks]
else:
raise ValueError("Invalid feature ids are provided.")
[docs] def get_lineage_by_id(
self,
ids: Optional[AnyGenericIdentifier] = None,
missing_rank: bool = False,
desired_ranks: Union[bool, Sequence[str]] = False,
drop_ranks: Union[bool, Sequence[str]] = False,
**kwargs: Any
) -> pd.Series:
"""Get taxonomy lineages by feature `ids`.
Parameters
----------
ids
Either feature indices or None for all.
missing_rank
If True will generate prefix like `s__` or `d__`
desired_ranks
List of desired ranks to generate.
If False then will generate all main ranks
drop_ranks
List of ranks to drop from desired ranks.
This parameter only useful if `missing_rank` is True
kwargs
Compatibility.
Returns
-------
class:`pandas.Series` with consensus lineages and corresponding IDs
"""
if ids is None:
target_ids = self.xrid
else:
target_ids = np.asarray(ids)
tmp_desired_ranks = VALID_RANKS if desired_ranks is False else desired_ranks
total_valid_rids = self.xrid.isin(target_ids).sum()
if total_valid_rids == len(target_ids):
return generate_lineages_from_taxa(
self.__internal_taxonomy.loc[target_ids],
missing_rank,
tmp_desired_ranks,
drop_ranks,
)
elif total_valid_rids < len(target_ids):
return generate_lineages_from_taxa(
self.__internal_taxonomy.loc[np.unique(target_ids)],
missing_rank,
tmp_desired_ranks,
drop_ranks,
)
else:
raise ValueError("Invalid feature ids are provided.")
[docs] def find_features_by_pattern(
self, pattern_str: str, case_sensitive: bool = False, regex: bool = False
) -> np.ndarray:
"""Searches for features with taxa that matches `pattern_str`
Parameters
----------
pattern_str
Pattern to search for
case_sensitive
Case sensitive mode
regex
Use regular expressions
Returns
-------
class:`~numpy.ndarray` with indices
"""
return self.__internal_taxonomy[
self.__internal_taxonomy.loc[:, "lineage"].str.contains(
pattern_str, case=case_sensitive, regex=regex
)
].index.values
[docs] def drop_features_without_taxa(
self, **kwargs: Any
) -> Optional[AnyGenericIdentifier]:
"""Remove features that do not contain taxonomy.
Parameters
----------
kwargs
Compatibility
"""
ids_to_drop = self.find_features_without_taxa()
return self._remove_features_by_id(ids_to_drop, **kwargs)
[docs] def drop_features_without_ranks(
self, ranks: Sequence[str], any: bool = False, **kwargs: Any
) -> Optional[AnyGenericIdentifier]: # Done
"""Remove features that do not contain `ranks`
Parameters
----------
ranks
Ranks to look for
any
If True removes feature with single occurrence of missing rank.
If False all `ranks` must be missing.
kwargs
Compatibility
"""
target_ranks = np.asarray(ranks)
if self.__internal_taxonomy.columns.isin(target_ranks).sum() == len(
target_ranks
):
no_rank_mask = self.__internal_taxonomy.loc[:, ranks].isna()
no_rank_mask_adjusted = (
no_rank_mask.any(axis=1) if any else no_rank_mask.all(axis=1)
)
ids_to_drop = self.__internal_taxonomy.loc[no_rank_mask_adjusted].index
return self._remove_features_by_id(ids_to_drop, **kwargs)
else:
raise ValueError("Invalid ranks are provided.")
[docs] def merge_duplicated_features(self, **kwargs: Any) -> Optional[Mapper]:
"""Merge features with duplicated taxonomy.
Parameters
----------
kwargs
Compatibility
"""
ret = {}
groupby = self.__internal_taxonomy.groupby("lineage")
if any([len(group) > 1 for group in groupby.groups.values()]):
tmp_feature_lineage = []
tmp_groups = []
group_indices = list(range(len(groupby.groups)))
for lineage, feature_ids in groupby.groups.items():
tmp_feature_lineage.append(lineage)
tmp_groups.append(list(feature_ids))
self.__init_internal_taxonomy(
pd.Series(data=tmp_feature_lineage, index=group_indices)
)
ret = dict(zip(group_indices, tmp_groups))
return self._merge_features_by_map(ret, True, **kwargs)
[docs] def merge_features_by_rank(self, level: str, **kwargs: Any) -> Optional[Mapper]:
"""Merge features by taxonomic rank/level.
Parameters
----------
level
Taxonomic rank/level to use for merging.
kwargs
Compatibility
"""
ret = {}
if not isinstance(level, str):
raise TypeError("`rank` must have str type.")
if level in self.__avail_ranks:
target_ranks = get_rank_upto(self.avail_ranks, level, True)
if target_ranks:
tmp_lineages = generate_lineages_from_taxa(
self.__internal_taxonomy, False, target_ranks, False
)
groups = tmp_lineages.groupby(tmp_lineages)
if len(groups.groups) > 1:
tmp_feature_lineage = []
tmp_groups = []
group_indices = list(range(len(groups.groups)))
for lineage, feature_ids in groups.groups.items():
tmp_feature_lineage.append(lineage)
tmp_groups.append(list(feature_ids))
self.__init_internal_taxonomy(
pd.Series(data=tmp_feature_lineage, index=group_indices)
)
ret = dict(zip(group_indices, tmp_groups))
else:
raise ValueError("Invalid rank are provided.")
return self._merge_features_by_map(ret, True, **kwargs)
[docs] def find_features_without_taxa(self) -> np.ndarray:
"""Find features without taxa.
Returns
-------
class:`~numpy.ndarray` with feature indices.
"""
return self.__internal_taxonomy.loc[
self.__internal_taxonomy.loc[:, VALID_RANKS].agg(
lambda rank: len("".join(map(lambda x: (str(x or "")), rank))), axis=1
)
< 1
].index.values
[docs] def get_subset(
self, rids: Optional[AnyGenericIdentifier] = None, *args, **kwargs: Any
) -> "RepTaxonomy":
"""Get subset of the :class:`.RepTaxonomy`.
Parameters
----------
rids
Feature identifiers.
args
Compatibility
kwargs
Compatibility
Returns
-------
class:`.RepTaxonomy`
"""
if rids is None:
target_rids = self.xrid
else:
target_rids = np.asarray(rids).astype(self.__internal_taxonomy.index.dtype)
if not self.xrid.isin(target_rids).sum() == len(target_rids):
raise ValueError("Invalid feature ids are provided.")
return type(self)(
taxonomy=self.__internal_taxonomy.loc[target_rids, "lineage"],
metadata=self.metadata,
name=self.name,
)
def _export(
self, taxlike: str = "lineage", ascending: bool = True, **kwargs: Any
) -> Tuple[pd.Series, dict]:
"""Creates taxonomy for export.
Parameters
----------
taxlike
Generate taxonomy in format(currently only `lineage` is supported.)
ascending
Sorting
kwargs
Compatibility
"""
if taxlike == "lineage":
return (
self.get_lineage_by_id(**kwargs).sort_values(ascending=ascending),
kwargs,
)
else:
raise NotImplemented
[docs] def export(
self,
output_fp: str,
*args,
_add_ext: bool = False,
sep: str = ",",
**kwargs: Any
) -> None:
"""Exports the taxonomy into the specified file.
Parameters
----------
output_fp
Export filepath
args
Compatibility
_add_ext
Add file extension or not.
sep
Delimiter
kwargs
Compatibility
"""
tmp_export, rkwarg = self._export(*args, **kwargs)
if _add_ext:
tmp_export.to_csv("{}.csv".format(output_fp), sep=sep)
else:
tmp_export.to_csv(output_fp, sep=sep)
[docs] def copy(self) -> "RepTaxonomy":
"""Copy of the instance."""
return type(self)(
taxonomy=self.__internal_taxonomy.loc[:, "lineage"],
metadata=self.metadata,
name=self.name,
)
def __fix_taxon_names(self) -> None:
"""Fix invalid taxon names."""
def taxon_fixer(taxon):
if taxon is not None and pd.notna(taxon):
tmp_taxon_trimmed = taxon.lower().strip()
if len(tmp_taxon_trimmed) > 0:
if tmp_taxon_trimmed[0] == "[":
tmp_taxon_trimmed = tmp_taxon_trimmed[1:]
if tmp_taxon_trimmed[-1] == "]":
tmp_taxon_trimmed = tmp_taxon_trimmed[:-1]
return tmp_taxon_trimmed.capitalize()
else:
return None
else:
return None
self.__internal_taxonomy.loc[:, VALID_RANKS] = self.__internal_taxonomy.loc[
:, VALID_RANKS
].applymap(taxon_fixer)
def __reconstruct_internal_lineages(self) -> None:
"""Reconstruct the internal lineages."""
self.__internal_taxonomy.loc[:, "lineage"] = generate_lineages_from_taxa(
self.__internal_taxonomy, True, self.__avail_ranks, False
)
def __init_internal_taxonomy(
self,
taxonomy_data: Union[pd.Series, pd.DataFrame],
taxonomy_notation: Optional[str] = "greengenes",
order_ranks: Optional[Sequence[str]] = None,
**kwargs: Any
) -> None:
"""Main method to initialize taxonomy.
Parameters
----------
taxonomy_data
Incoming parsed taxonomy data
taxonomy_notation
Taxonomy lineage notation style. Can be one of
:const:`pmaf.internals._constants.AVAIL_TAXONOMY_NOTATIONS`
order_ranks
List with the target rank order. Default is set to None.
The 'silva' notation require `order_ranks`.
kwargs
Compatibility
"""
if isinstance(taxonomy_data, pd.Series):
new_taxonomy = self.__init_taxonomy_from_lineages(
taxonomy_data, taxonomy_notation, order_ranks
)
elif isinstance(taxonomy_data, pd.DataFrame):
if taxonomy_data.shape[1] == 1:
taxonomy_data_series = pd.Series(
data=taxonomy_data.iloc[:, 0], index=taxonomy_data.index
)
new_taxonomy = self.__init_taxonomy_from_lineages(
taxonomy_data_series, taxonomy_notation, order_ranks
)
else:
new_taxonomy = self.__init_taxonomy_from_frame(
taxonomy_data, taxonomy_notation, order_ranks
)
else:
raise RuntimeError(
"`taxonomy_data` must be either pd.Series or pd.Dataframe"
)
if new_taxonomy is None:
raise ValueError("Provided taxonomy is invalid.")
# Assign newly constructed taxonomy to the self.__internal_taxonomy
self.__internal_taxonomy = new_taxonomy
self.__fix_taxon_names() # Fix incorrect taxa
tmp_avail_ranks = [rank for rank in VALID_RANKS if rank in new_taxonomy.columns]
self.__avail_ranks = [
rank for rank in tmp_avail_ranks if new_taxonomy.loc[:, rank].notna().any()
]
# Reconstruct internal lineages for default greengenes notation
self.__reconstruct_internal_lineages()
self._init_state = True
def __init_taxonomy_from_lineages(
self,
taxonomy_series: pd.Series,
taxonomy_notation: Optional[str],
order_ranks: Optional[Sequence[str]],
) -> pd.DataFrame: # Done
"""Main method that produces taxonomy dataframe from lineages.
Parameters
----------
taxonomy_series
:class:`pandas.Series` with taxonomy lineages
taxonomy_notation
Taxonomy lineage notation style. Can be one of :const:`pmaf.internals._constants.AVAIL_TAXONOMY_NOTATIONS`
order_ranks
List with the target rank order. Default is set to None. The 'silva' notation require `order_ranks`.
"""
# Check if taxonomy is known and is available for parsing. Otherwise indentify_taxon_notation() will try to identify notation
if taxonomy_notation in AVAIL_TAXONOMY_NOTATIONS:
notation = taxonomy_notation
else:
# Get first lineage _sample for notation testing assuming the rest have the the same notations
sample_taxon = taxonomy_series.iloc[0]
# Identify notation of the lineage string
notation = indentify_taxon_notation(sample_taxon)
if order_ranks is not None:
if all([rank in VALID_RANKS for rank in order_ranks]):
target_order_ranks = order_ranks
else:
raise NotImplementedError
else:
target_order_ranks = VALID_RANKS
if notation == "greengenes":
lineages = taxonomy_series.reset_index().values.tolist()
ordered_taxa_list = []
ordered_indices_list = [elem[0] for elem in lineages]
for lineage in lineages:
tmp_lineage = jRegexGG.findall(lineage[1])
tmp_taxa_dict = {
elem[0]: elem[1] for elem in tmp_lineage if elem[0] in VALID_RANKS
}
for rank in VALID_RANKS:
if rank not in tmp_taxa_dict.keys():
tmp_taxa_dict.update({rank: None})
tmp_taxa_ordered = [tmp_taxa_dict[rank] for rank in VALID_RANKS]
ordered_taxa_list.append([None] + tmp_taxa_ordered)
taxonomy = pd.DataFrame(
index=ordered_indices_list,
data=ordered_taxa_list,
columns=["lineage"] + VALID_RANKS,
)
return taxonomy
elif notation == "qiime":
lineages = taxonomy_series.reset_index().values.tolist()
tmp_taxa_dict_list = []
tmp_ranks = set()
for lineage in lineages:
tmp_lineage = jRegexQIIME.findall(lineage[1])
tmp_lineage.sort(key=lambda x: x[0])
tmp_taxa_dict = defaultdict(None)
tmp_taxa_dict[None] = lineage[0]
for rank, taxon in tmp_lineage:
tmp_taxa_dict[rank] = taxon
tmp_ranks.add(rank)
tmp_taxa_dict_list.append(dict(tmp_taxa_dict))
tmp_taxonomy_df = pd.DataFrame.from_records(tmp_taxa_dict_list)
tmp_taxonomy_df.set_index(None, inplace=True)
tmp_taxonomy_df = tmp_taxonomy_df.loc[:, sorted(list(tmp_ranks))]
tmp_taxonomy_df.columns = [
rank for rank in target_order_ranks[::-1][: len(tmp_ranks)]
][::-1]
for rank in VALID_RANKS:
if rank not in tmp_taxonomy_df.columns:
tmp_taxonomy_df.loc[:, rank] = None
return tmp_taxonomy_df
elif notation == "silva":
lineages = taxonomy_series.reset_index().values.tolist()
tmp_taxa_dict_list = []
tmp_ranks = set()
for lineage in lineages:
tmp_lineage = lineage[1].split(";")
tmp_taxa_dict = defaultdict(None)
tmp_taxa_dict[None] = lineage[0]
for rank_i, taxon in enumerate(tmp_lineage):
rank = target_order_ranks[rank_i]
tmp_taxa_dict[rank] = taxon
tmp_ranks.add(rank)
tmp_taxa_dict_list.append(dict(tmp_taxa_dict))
tmp_taxonomy_df = pd.DataFrame.from_records(tmp_taxa_dict_list)
tmp_taxonomy_df.set_index(None, inplace=True)
tmp_rank_ordered = [
rank for rank in target_order_ranks if rank in VALID_RANKS
]
tmp_taxonomy_df = tmp_taxonomy_df.loc[:, tmp_rank_ordered]
tmp_taxonomy_df.columns = [
rank for rank in target_order_ranks[::-1][: len(tmp_ranks)]
][::-1]
for rank in VALID_RANKS:
if rank not in tmp_taxonomy_df.columns:
tmp_taxonomy_df.loc[:, rank] = None
return tmp_taxonomy_df
else:
raise NotImplementedError
def __init_taxonomy_from_frame(
self,
taxonomy_dataframe: pd.DataFrame,
taxonomy_notation: Optional[str],
order_ranks: Optional[Sequence[str]],
) -> pd.DataFrame: # Done # For now only pass to _init_taxonomy_from_series
"""Main method that produces taxonomy sheet from dataframe.
Parameters
----------
taxonomy_dataframe
:class:`~pandas.DataFrame` with taxa split by ranks.
taxonomy_notation
Taxonomy lineage notation style. Can be one of :const:`pmaf.internals._constants.AVAIL_TAXONOMY_NOTATIONS`
order_ranks
List with the target rank order. Default is set to None. The 'silva' notation require `order_ranks`.
Returns
-------
:class:`~pandas.DataFrame`
"""
valid_ranks = extract_valid_ranks(taxonomy_dataframe.columns, VALID_RANKS)
if valid_ranks is not None:
if len(valid_ranks) > 0:
return pd.concat(
[
taxonomy_dataframe,
pd.DataFrame(
data="",
index=taxonomy_dataframe.index,
columns=[
rank for rank in VALID_RANKS if rank not in valid_ranks
],
),
],
axis=1,
)
else:
taxonomy_series = taxonomy_dataframe.apply(
lambda taxa: ";".join(taxa.values.tolist()), axis=1
)
return self.__init_taxonomy_from_lineages(
taxonomy_series, taxonomy_notation, order_ranks
)
else:
valid_ranks = cols2ranks(taxonomy_dataframe.columns)
taxonomy_dataframe.columns = valid_ranks
taxonomy_series = taxonomy_dataframe.apply(
lambda taxa: ";".join([(t if isinstance(t,str) else '') for t in taxa.values]), axis=1
)
return self.__init_taxonomy_from_lineages(
taxonomy_series, taxonomy_notation, order_ranks
)
@property
def avail_ranks(self) -> Sequence[str]:
"""List of available taxonomic ranks."""
return self.__avail_ranks
@property
def duplicated(self) -> pd.Index:
"""List of duplicated feature indices."""
return self.__internal_taxonomy.index[
self.__internal_taxonomy["lineage"].duplicated(keep=False)
]
@property
def data(self) -> pd.DataFrame:
"""Actual data representation as pd.DataFrame."""
return self.__internal_taxonomy
@property
def xrid(self) -> pd.Index:
"""Feature indices as pd.Index."""
return self.__internal_taxonomy.index