Source code for valerius.sequences

"""Classes for the various kinds of Biological sequence."""

from collections import Counter
import numpy as np
from .sets import DotPlot

class Sequence:
    """A string sequence of some kind.

    The raw string is upper-cased on construction. Subclasses customise the
    sequence by overriding ``_CODES`` (single character to multi-letter code)
    and ``_type`` (a short name for the kind of sequence).

    Because ``__eq__`` is defined without ``__hash__``, instances are
    unhashable, which is appropriate as they can be mutated via item
    assignment.

    :param str string: the raw string.
    :param str label: an optional descriptive label for the sequence."""

    # Maps each single-character symbol to its multi-letter code.
    _CODES = {}
    # Short name for the kind of sequence, exposed through ``type``.
    _type = "unknown"

    def __init__(self, string, label=""):
        self._string = string.upper()
        self._label = label

    def __repr__(self):
        return "<{} (length: {})>".format(
            self.__class__.__name__, len(self._string)
        )

    def __str__(self):
        # Short sequences are shown in full; longer ones are abbreviated to
        # their first and last ten characters.
        if len(self._string) < 25:
            return self._string
        return "{}...({} omitted)...{}".format(
            self._string[:10], len(self._string) - 20, self._string[-10:]
        )

    def __len__(self):
        return len(self._string)

    def __eq__(self, other):
        # Compare against another sequence's raw string if it has one,
        # otherwise against the object itself (e.g. a plain ``str``).
        try:
            return self._string == other._string
        except AttributeError:
            return self._string == other

    def __iter__(self):
        return iter(self._string)

    def __contains__(self, substring):
        return substring in self._string

    def __getitem__(self, index):
        # Indexing and slicing both produce a new sequence of the same class.
        # NOTE(review): the label is not carried over to the new object.
        return self.__class__(self._string[index])

    def __setitem__(self, index, char):
        """Replaces the character at a position with another character.

        :param int index: the position to replace (negatives count from\
        the end).
        :param str char: the single replacement character.
        :raises ValueError: if ``char`` is not exactly one character long.
        :raises IndexError: if ``index`` is outside the sequence."""

        if len(char) != 1:
            raise ValueError("Can only substitute one character at a time")
        length = len(self._string)
        # Without this check an out-of-range index would silently append or
        # prepend the character instead of substituting one.
        if not -length <= index < length:
            raise IndexError("Sequence index out of range")
        position = index % length
        self._string = "{}{}{}".format(
            self._string[:position], char, self._string[position + 1:]
        )

    @property
    def type(self):
        """The type of sequence.

        :rtype: ``str``"""

        return self._type

    @property
    def length(self):
        """The length of the string.

        :rtype: ``int``"""

        return len(self)

    @property
    def string(self):
        """The sequence's raw string.

        :rtype: ``str``"""

        return self._string

    @property
    def label(self):
        """The sequence's descriptive label.

        :rtype: ``str``"""

        return self._label

    @property
    def frequencies(self):
        """Returns the frequency of each character in the sequence.

        :rtype: ``Counter``"""

        return Counter(self._string)

    @property
    def codes(self):
        """Returns the list of multi-letter codes corresponding to this
        sequence. Characters with no known code are given as ``"XXX"``.

        :rtype: ``list``"""

        return [self._CODES.get(char, "XXX") for char in self._string]

    def dot_matrix(self, other):
        """Creates a ``DotPlot`` from this sequence and another.

        :param other: the sequence to compare with.
        :rtype: ``DotPlot``"""

        return DotPlot(self, other)

    def dynamic(self, other):
        """Builds the dynamic programming score matrix for a global
        (Needleman-Wunsch style) alignment of this sequence against another.

        Rows correspond to this sequence and columns to ``other``, each with
        one extra leading row/column for the empty prefix. A match scores 1,
        and a mismatch or gap scores -1. The matrix is printed to stdout when
        first created, after the borders are set, and once fully populated.

        :param Sequence other: the sequence to align against.
        :returns: the filled score matrix, of shape\
        ``(len(self) + 1, len(other) + 1)``.
        :rtype: ``numpy.ndarray``"""

        INDEL = -1
        MISMATCH = -1
        MATCH = 1

        def print_matrix(m, s1, s2):
            print(" " + " ".join(s2))
            print(" ", end="")
            for x in m[0]:
                print(str(x).rjust(3), end="")
            print()
            for i, char in enumerate(s1, start=1):
                print(" " + char, end="")
                for x in m[i]:
                    print(str(x).rjust(3), end="")
                print()
            print()

        row_chars, col_chars = self._string, other._string
        row_count, col_count = len(row_chars) + 1, len(col_chars) + 1

        # Make matrix
        matrix = np.zeros((row_count, col_count), dtype=int)
        print_matrix(matrix, row_chars, col_chars)

        # Top and side: the two borders have different lengths whenever the
        # sequences do, so each is initialised over its own range.
        for col in range(col_count):
            matrix[0][col] = INDEL * col
        for row in range(row_count):
            matrix[row][0] = INDEL * row
        print_matrix(matrix, row_chars, col_chars)

        # Fill out matrix: each cell takes the best of a diagonal move
        # (match/mismatch) or a gap coming from above or from the left.
        for row in range(1, row_count):
            for col in range(1, col_count):
                same = row_chars[row - 1] == col_chars[col - 1]
                matrix[row][col] = max(
                    matrix[row - 1][col - 1] + (MATCH if same else MISMATCH),
                    matrix[row - 1][col] + INDEL,
                    matrix[row][col - 1] + INDEL,
                )
        print_matrix(matrix, row_chars, col_chars)
        return matrix
class PeptideSequence(Sequence):
    """A sequence of protein residues, with one-letter residue symbols
    mapped to their three-letter codes.

    :param str string: the raw string."""

    # One-letter residue symbol -> three-letter residue code.
    _CODES = {
        "V": "VAL",
        "I": "ILE",
        "L": "LEU",
        "E": "GLU",
        "Q": "GLN",
        "D": "ASP",
        "N": "ASN",
        "H": "HIS",
        "W": "TRP",
        "F": "PHE",
        "Y": "TYR",
        "R": "ARG",
        "K": "LYS",
        "S": "SER",
        "T": "THR",
        "M": "MET",
        "A": "ALA",
        "G": "GLY",
        "P": "PRO",
        "C": "CYS",
    }
    _type = "peptide"
class NucleotideSequence(Sequence):
    """A sequence of nucleotide bases.

    :param str string: the raw string."""

    _type = "nucleotide"

    @property
    def gc_content(self):
        """Returns the proportion of G and C residues in the sequence. An
        empty sequence gives 0.

        :rtype: ``float``"""

        bases = self._string
        if not bases:
            # Nothing to count, and dividing by the length would fail.
            return 0
        return sum(bases.count(base) for base in "GC") / len(bases)
class DnaSequence(NucleotideSequence):
    """A sequence of DNA nucleotide bases.

    :param str string: the raw string."""

    # One-letter base symbol -> deoxyribonucleotide code.
    _CODES = {
        "A": "DA",
        "C": "DC",
        "G": "DG",
        "T": "DT",
    }
    _type = "DNA"
class RnaSequence(NucleotideSequence):
    """A sequence of RNA nucleotide bases.

    :param str string: the raw string."""

    # One-letter base symbol -> ribonucleotide code (identical for RNA).
    _CODES = {
        "A": "A",
        "C": "C",
        "G": "G",
        "U": "U",
    }
    _type = "RNA"