import warnings
warnings.simplefilter("ignore", category=FutureWarning)
from pmaf.biome.essentials._metakit import EssentialFeatureMetabase
import pandas as pd
import numpy as np
from pmaf.internal.io._seq import SequenceIO
from pmaf.internal._extensions._cpython._pmafc_extension._helper import (
make_sequence_record_tuple,
)
from pmaf.internal._shared import get_stats_for_sequence_record_df
from pmaf.biome.essentials._base import EssentialBackboneBase
from pmaf.sequence._sequence._nucleotide import Nucleotide
from pmaf.sequence._multiple._multiple import MultiSequence
from typing import Union, Optional, Tuple, Any
from pmaf.internal._typing import AnyGenericIdentifier, Mapper
class RepSequence(EssentialBackboneBase, EssentialFeatureMetabase):
    """An `essential` class for handling feature sequence data."""

    def __init__(
        self,
        sequences: Union[str, MultiSequence, pd.DataFrame, pd.Series],
        **kwargs: Any
    ) -> None:
        """Constructor for :class:`.RepSequence`

        Parameters
        ----------
        sequences
            Sequence data. Supported inputs:

            - :class:`str` -- handed to ``SequenceIO`` and parsed for
              identifier, description and sequence (presumably a path to a
              sequence file; confirm against ``SequenceIO``).
            - :class:`~pmaf.sequence._multiple._multiple.MultiSequence` --
              the ``name`` of every contained sequence becomes its identifier.
            - :class:`pandas.DataFrame` -- must have a ``"sequence"`` column;
              the index provides the identifiers.
            - :class:`pandas.Series` -- values are the sequences and the index
              provides the identifiers.

            Only the :class:`str` input carries descriptions; every other
            input gets an empty description.
        kwargs
            Compatibility. Forwarded to the parent constructor.

        Raises
        ------
        TypeError
            If `sequences` is not one of the supported types.
        """
        super().__init__(**kwargs)
        if isinstance(sequences, str):
            seqio = SequenceIO(sequences, upper=True)
            seqio_gen = seqio.pull_parser(id=True, description=True, sequence=True)
            tmp_sequences = [
                make_sequence_record_tuple(rid, seq) + (desc,)
                for rid, desc, seq in seqio_gen
            ]
        elif isinstance(sequences, MultiSequence):
            tmp_sequences = [
                make_sequence_record_tuple(seq.name, str(seq.text)) + ("",)
                for seq in sequences.sequences
            ]
        elif isinstance(sequences, (pd.DataFrame, pd.Series)):
            # A DataFrame contributes only its "sequence" column, after which
            # both pandas inputs are handled identically.
            if isinstance(sequences, pd.DataFrame):
                seq_series = sequences.loc[:, "sequence"]
            else:
                seq_series = sequences
            # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
            tmp_sequences = [
                make_sequence_record_tuple(rid, seq) + ("",)
                for rid, seq in seq_series.items()
            ]
        else:
            raise TypeError("`sequences` has unsupported type.")
        # Every record is the helper's tuple extended by one description field,
        # matching the five column names below.
        seq_record_df = pd.DataFrame.from_records(
            tmp_sequences,
            columns=["rid", "sequence", "length", "tab", "description"],
            index=["rid"],
        )
        # Stored frame: raw sequence and description next to the per-sequence
        # statistics computed by the shared helper.
        self.__sequence_df = pd.concat(
            [
                seq_record_df[["sequence", "description"]],
                get_stats_for_sequence_record_df(seq_record_df),
            ],
            axis=1,
        )

    def _remove_features_by_id(
        self, ids: AnyGenericIdentifier, **kwargs: Any
    ) -> Optional[AnyGenericIdentifier]:
        """Remove features by `ids` and ratify action.

        Parameters
        ----------
        ids
            Feature identifiers
        kwargs
            Compatibility. Forwarded to ``_ratify_action``.

        Returns
        -------
            Whatever ``_ratify_action`` returns for this action.
        """
        # Cast to the index dtype so the labels compare equal to the stored ones.
        tmp_ids = np.asarray(ids, dtype=self.__sequence_df.index.dtype)
        if len(tmp_ids) > 0:
            self.__sequence_df.drop(tmp_ids, inplace=True)
        return self._ratify_action("_remove_features_by_id", ids, **kwargs)

    def _merge_features_by_map(
        self, map_dict: Mapper, **kwargs: Any
    ) -> Optional[Mapper]:
        """Merge features and ratify action. THIS METHOD IS INCOMPLETE.

        No sequence data is modified; the method only prints a notice and
        ratifies the action.

        Parameters
        ----------
        map_dict
            Map to use for merging
        kwargs
            Compatibility. Forwarded to ``_ratify_action``.

        Returns
        -------
            Whatever ``_ratify_action`` returns for this action.
        """
        print(
            "ASSUME ALIGNED SEQUENCES! :))"
        )  # TODO: This method must align sequences.
        return self._ratify_action("_merge_features_by_map", map_dict, **kwargs)

    def copy(self) -> "RepSequence":
        """Copy of the instance.

        The copy is rebuilt from the ``sequence`` column only, so it is
        constructed through the :class:`pandas.Series` input path and its
        descriptions are empty.

        Returns
        -------
        :class:`.RepSequence`
        """
        return type(self)(
            sequences=self.__sequence_df.loc[:, "sequence"],
            metadata=self.metadata,
            name=self.name,
        )

    def get_subset(
        self, rids: Optional[AnyGenericIdentifier] = None, *args: Any, **kwargs: Any
    ) -> "RepSequence":
        """Get subset of the :class:`.RepSequence`.

        Parameters
        ----------
        rids
            Feature identifiers. When omitted, all features are selected.
        args
            Compatibility
        kwargs
            Compatibility

        Returns
        -------
        :class:`.RepSequence`

        Raises
        ------
        ValueError
            If the number of stored identifiers found in `rids` differs from
            the number of requested identifiers (unknown or repeated ids).
        """
        if rids is None:
            target_rids = self.xrid
        else:
            target_rids = np.asarray(rids).astype(self.__sequence_df.index.dtype)
        if self.xrid.isin(target_rids).sum() != len(target_rids):
            raise ValueError("Invalid feature ids are provided.")
        return type(self)(
            sequences=self.__sequence_df.loc[target_rids, "sequence"],
            metadata=self.metadata,
            name=self.name,
        )

    def to_multiseq(self) -> MultiSequence:
        """Creates an instance of
        :class:`~pmaf.sequence._multiple._multiple.MultiSequence` containing
        sequences.

        Returns
        -------
        :class:`~pmaf.sequence._multiple._multiple.MultiSequence`
        """
        # Label-based selection requires `.loc`, and the stored column is
        # spelled "description".
        records = self.__sequence_df.loc[:, ["sequence", "description"]]
        # NOTE(review): sequences are created with `name=None`, so the feature
        # identifiers held in the index are not passed on -- confirm whether
        # they should be used as the sequence names.
        tmp_sequences = [
            Nucleotide(seq, name=None, metadata={"description": desc})
            for seq, desc in records.itertuples(index=False, name=None)
        ]
        return MultiSequence(
            tmp_sequences, name=self.name, metadata=self.metadata, internal_id="taxid"
        )

    def _export(self, *args: Any, **kwargs: Any) -> Tuple[MultiSequence, dict]:
        """Present only for backward compatibility with other `essentials`.

        Returns
        -------
            The result of :meth:`to_multiseq` paired with the received
            keyword arguments; positional arguments are ignored.
        """
        return self.to_multiseq(), kwargs

    def export(
        self, output_fp: str, *args: Any, _add_ext: bool = False, **kwargs: Any
    ) -> None:
        """Exports the FASTA sequences into the specified file.

        Parameters
        ----------
        output_fp
            Export filepath
        args
            Compatibility
        _add_ext
            Add file extension or not. When true, ``.fasta`` is appended to
            `output_fp`.
        kwargs
            Compatibility. Forwarded to the ``write`` method of the exported
            object.
        """
        tmp_export, rkwarg = self._export(*args, **kwargs)
        target_fp = "{}.fasta".format(output_fp) if _add_ext else output_fp
        tmp_export.write(target_fp, **rkwarg)

    @property
    def data(self) -> pd.DataFrame:
        """:class:`pandas.DataFrame` with sequence data"""
        return self.__sequence_df

    @property
    def xrid(self) -> pd.Index:
        """Feature identifiers."""
        return self.__sequence_df.index