import warnings
warnings.simplefilter("ignore", category=FutureWarning)
from pmaf.internal.io._seq import SequenceIO
from pmaf.sequence._shared import (
validate_seq_mode,
mode_as_str,
mode_as_skbio,
sniff_mode,
)
from pmaf.sequence._metakit import NucleotideMetabase
from skbio.sequence import GrammaredSequence, DNA, RNA, Protein
from io import StringIO, IOBase
import copy
from numpy import isscalar
from typing import Union, Optional, Sequence, Any
[docs]class Nucleotide(NucleotideMetabase):
"""Class that represent single nucleotide sequence."""
def __init__(
self,
sequence: Union[GrammaredSequence, str],
name: Optional[str] = None,
metadata: Optional[dict] = None,
mode: str = "DNA",
**kwargs: Any
):
"""Constructor for the :class:`.Nucleotide`.
Parameters
----------
sequence
Sequence data
name
Name of the sequence instance
metadata
Metadata of the sequence instance
mode
Sequence type/mode of the new instance # TODO: Validation currently passes "protein" fix it.
kwargs
Compatibility
"""
if name is None or isscalar(name):
tmp_name = name
else:
raise TypeError("`name` can be any scalar or None")
if isinstance(metadata, dict):
tmp_metadata = metadata
elif metadata is None:
tmp_metadata = {}
else:
raise TypeError("`metadata` can be dict or None")
if mode is not None:
if validate_seq_mode(mode):
tmp_mode = mode.lower()
else:
raise ValueError("`mode` is invalid.")
else:
if isinstance(sequence, GrammaredSequence):
tmp_mode = mode_as_str(type(sequence))
else:
tmp_mode = None
if isinstance(sequence, GrammaredSequence):
tmp_sequence_str = str(sequence).upper()
tmp_metadata = {**sequence.metadata, **tmp_metadata}
if tmp_name is None:
tmp_name = sequence.metadata.get("id", None)
elif isinstance(sequence, str):
tmp_sequence_str = sequence.upper()
else:
raise TypeError("`sequence` has invalid type.")
if tmp_mode is None:
tmp_skbio_type = sniff_mode(tmp_sequence_str)
else:
tmp_skbio_type = mode_as_skbio(tmp_mode)
self.__sequence = tmp_skbio_type(tmp_sequence_str)
self.__mode = mode_as_str(tmp_skbio_type)
self.__skbio_mode = tmp_skbio_type
if tmp_name is not None:
self.__sequence.metadata["id"] = tmp_name
self.__metadata = tmp_metadata
self.__name = tmp_name
self.__buckled = bool(kwargs.get("buckled", None))
def __repr__(self):
class_name = self.__class__.__name__
name = self.__name if self.__name is not None else "N/A"
length = len(self.__sequence)
metadata_state = "Present" if len(self.__metadata) > 0 else "N/A"
mode = self.__mode.upper() if self.__mode is not None else "N/A"
repr_str = "<{}:[{}], Name:[{}], Mode:[{}], Metadata:[{}]>".format(
class_name, length, name, mode, metadata_state
)
return repr_str
[docs] def buckle_by_uid(self, uid: str) -> dict:
"""Buckle sequences based on unique identifier `uid`. Sequence are
usually buckled prior to alignment in order to not loose sequence
specific metadata during alignment process.
Parameters
----------
uid
Unique identifier string.
Returns
-------
Packed metadata of current nucleotide instance for backup.
"""
if not self.__buckled:
packed_metadata = {
"master-metadata": self.__metadata,
"__name": self.__name,
}
self.__name = uid
self.__sequence.metadata["id"] = uid
self.__buckled = True
return packed_metadata
else:
raise RuntimeError("Nucleotide instance is already buckled.")
[docs] def unbuckle_uid(self) -> str:
"""Retrieve unique identifier assigned during buckling.
Returns
-------
If instance is buckled the return the `uid`. Otherwise raise error.
"""
if self.__buckled:
return self.__sequence.metadata["id"]
else:
raise RuntimeError("Nucleotide instance is not buckled.")
[docs] def restore_buckle(self, buckled_pack: dict) -> None:
"""Restore the buckle using packed metadata `buckle_pack`
Parameters
----------
buckled_pack
Packed metadata backed up during bucking process
Returns
-------
None if success otherwise error.
"""
if self.__buckled:
if isinstance(buckled_pack, dict):
if len(buckled_pack) > 0:
self.__name = buckled_pack["__name"]
self.__sequence.metadata["id"] = buckled_pack["__name"]
self.__metadata.update(buckled_pack["master-metadata"])
else:
ValueError("`buckled_pack` is empty.")
else:
TypeError("`buckled_pack` has invalid type.")
else:
raise RuntimeError("Nucleotide instance is not buckled.")
[docs] def get_string_as(self, format: str = "fasta", **kwargs: Any) -> str:
"""Get string of the sequence.
Parameters
----------
format
Format of the string to retrieve
kwargs
Compatibility
Returns
-------
Formatted sequence data as string
"""
with StringIO() as tmp_buffer_io:
self.__write_by_handle(tmp_buffer_io, format=format, **kwargs)
return tmp_buffer_io.getvalue()
[docs] def write(
self, file: Union[str, IOBase], format: str = "fasta", **kwargs: Any
) -> None:
"""Write sequence data to the file.
Parameters
----------
file
File path or IO stream to write into
format
Format of the output file
kwargs
Compatibility
"""
self.__write_by_handle(file, format=format, **kwargs)
def __write_by_handle(self, file, format, mode="w", **kwargs):
"""Write into the IO handler."""
self.__sequence.metadata = self.__metadata
self.__sequence.metadata["id"] = self.__name
if isinstance(file, IOBase):
if file.writable():
if mode[0] == "a":
file.seek(0, 2)
elif mode[0] == "w":
file.seek(0, 0)
else:
raise ValueError("`mode` has invalid value.")
else:
raise ValueError("`file` must be writable.")
tmp_file = file
elif isinstance(file, str):
tmp_file = open(file, mode)
else:
raise ValueError("`file` is invalid.")
with StringIO() as tmp_io: # This is done to compensate skbio bug. Skbio writer does not recogine mode kwarg properly.
self.__sequence.write(tmp_io, format=format, **kwargs)
tmp_io.seek(0, 0)
tmp_file.write(tmp_io.read())
self.__sequence.metadata = {"id": self.__name}
[docs] def complement(self):
"""Return the sequence complement as new instance."""
seq_complement = str(self.__sequence.complement())
return type(self)(
seq_complement, name=self.__name, metadata=self.__metadata, mode=self.__mode
)
[docs] def copy(self):
"""Copy of the current instance."""
return copy.deepcopy(self)
[docs] @classmethod
def read(
cls,
file: Any,
name: Optional[str] = None,
metadata: Optional[dict] = None,
mode: str = "DNA",
**kwargs: any
) -> "Nucleotide":
"""Factory class that reads the sequence data.
Parameters
----------
file
Unspecified data for sequence.
Can be file path, IO stream, string, etc.
name
Name of the sequence instance
metadata
Metadata for the sequence instance
mode
Sequence type/mode can be 'DNA' or 'RNA'
kwargs
Compatibility
Returns
-------
Return new instance of :class:`.Nucleotide`
"""
if isinstance(name, (str, int, type(None))):
tmp_name = name
else:
raise TypeError("`name` can be str, int or None")
if isinstance(metadata, dict):
tmp_metadata = metadata
elif metadata is None:
tmp_metadata = {}
else:
raise TypeError("`metadata` can be dict or None")
seq_gen = SequenceIO(file, upper=True).pull_parser(
parser="simple", id=True, description=True, sequence=True
)
tmp_sequence_str = ""
for sid, desc, seq_str in seq_gen:
if len(tmp_sequence_str) == 0:
tmp_sequence_str = seq_str
if not "description" in tmp_metadata.keys():
tmp_metadata.update({"description": desc})
if tmp_name is None:
tmp_name = sid
else:
raise ValueError(
"`sequence` must contain only one sequence. For _multiple sequence reads use MultiSequence."
)
return cls(
tmp_sequence_str, name=tmp_name, metadata=tmp_metadata, mode=mode, **kwargs
)
@property
def skbio(self) -> GrammaredSequence:
"""The :mod:`skbio` representation of the sequence as
:class:`skbio.sequence.GrammaredSequence`"""
return self.__sequence
@property
def text(self) -> str:
"""Sequence as string."""
return str(self.__sequence)
@property
def metadata(self) -> dict:
"""Sequence instance metadata."""
return self.__metadata
@property
def mode(self) -> str:
"""Sequence instance mode/type."""
return self.__mode
@property
def skbio_mode(self) -> Union[DNA, RNA, Protein]:
"""The :mod:`skbio` mode of the sequence."""
return self.__skbio_mode
@property
def length(self) -> int:
"""Length of the sequence."""
return len(self.__sequence)
@property
def name(self) -> str:
"""Name of the sequence instance."""
return self.__name
@property
def is_buckled(self) -> bool:
"""Is sequence instance is buckled or not."""
return self.__buckled