"""Classes for the various kinds of biological sequence."""
from collections import Counter
import numpy as np
from .sets import DotPlot
[docs]class Sequence:
"""A string sequence of some kind.
:param str string: the raw string."""
_CODES = {}
_type = "unknown"
def __init__(self, string, label=""):
self._string = string.upper()
self._label = label
def __repr__(self):
return "<{} (length: {})>".format(
self.__class__.__name__, len(self._string)
)
def __str__(self):
if len(self._string) < 25:
return self._string
else:
return "{}...({} omitted)...{}".format(
self._string[:10], len(self._string) - 20, self._string[-10:]
)
def __len__(self):
return len(self._string)
def __eq__(self, other):
try:
return self._string == other._string
except AttributeError:
return self._string == other
def __iter__(self):
return iter(self._string)
def __contains__(self, substring):
return substring in self._string
def __getitem__(self, index):
return self.__class__(self._string[index])
def __setitem__(self, index, char):
if len(char) != 1:
raise ValueError("Can only substitute one character at a time")
self._string = "{}{}{}".format(
self._string[:index], char, "" if index == -1 else self._string[index + 1:]
)
@property
def type(self):
"""The type of sequence.
:rtype: ``str``"""
return self._type
@property
def length(self):
"""The length of the string.
:rtype: ``int``"""
return len(self)
@property
def string(self):
"""The sequence's raw string.
:rtype: ``str``"""
return self._string
@property
def label(self):
"""The sequence's descriptive label.
:rtype: ``str``"""
return self._label
@property
def frequencies(self):
"""Returns the frequency of each character in the sequence.
:rtype: ``Counter``"""
return Counter(self._string)
@property
def codes(self):
"""Returns the list of multi-letter codes corresponding to this
sequence.
:rtype: ``list``"""
return [self._CODES.get(char, "XXX") for char in self._string]
def dot_matrix(self, other):
return DotPlot(self, other)
def dynamic(self, other):
def print_matrix(m, s1, s2):
print(" " + " ".join(s2))
print(" ", end="")
for x in m[0]: print(str(x).rjust(3), end="")
print()
for i, char in enumerate(s1, start=1):
print(" " + char, end="")
for x in m[i]: print(str(x).rjust(3), end="")
print()
print()
# Make matrix
matrix = np.zeros((len(self) + 1, len(other) + 1), dtype=int)
print_matrix(matrix, self._string, other._string)
# Scores
INDEL = -1
MISMATCH = -1
MATCH = 1
# Top and side
for i in range(len(matrix[0])):
matrix[0][i] = 0 - i
matrix[i][0] = 0 - i
print_matrix(matrix, self._string, other._string)
# Fill out matrix
for row in range(1, len(self) + 1):
for col in range(1, len(other) + 1):
scores = []
left = matrix[row][col - 1]
diagonal = matrix[row - 1][col - 1]
top = matrix[row - 1][col]
scores.append(diagonal + (MATCH if self[row - 1] == other[col - 1] else MISMATCH))
scores.append(top + MISMATCH)
scores.append(left + MISMATCH)
matrix[row][col] = max(scores)
print_matrix(matrix, self._string, other._string)
'''# First row
for n in range(len(other)):
matrix[0][n] = 1 if other[n] == self[0] else 0
print(matrix, end="\n\n")
# Other rows
for row_num in range(1, len(self)):
for col_num in range(len(other)):
if col_num == 0:
matrix[row_num][col_num] = 1 if other[col_num] == self[row_num] else 0
else:
scores = [
matrix[row_num][col_num - 1],
matrix[row_num - 1][col_num],
matrix[row_num - 1][col_num - 1]
]
score = 1 if other[col_num] == self[row_num] else 0
matrix[row_num][col_num] = max(scores) + score
print(matrix, end="\n\n")'''
class PeptideSequence(Sequence):
    """A sequence of protein residues.

    :param str string: the raw string.
    :param str label: an optional descriptive label for the sequence."""

    # One-letter residue code -> three-letter residue name, looked up by the
    # inherited ``codes`` property.
    _CODES = {
        "V": "VAL", "I": "ILE", "L": "LEU", "E": "GLU", "Q": "GLN",
        "D": "ASP", "N": "ASN", "H": "HIS", "W": "TRP", "F": "PHE",
        "Y": "TYR", "R": "ARG", "K": "LYS", "S": "SER", "T": "THR",
        "M": "MET", "A": "ALA", "G": "GLY", "P": "PRO", "C": "CYS"
    }
    _type = "peptide"
class NucleotideSequence(Sequence):
    """A sequence of nucleotide bases.

    :param str string: the raw string.
    :param str label: an optional descriptive label for the sequence."""

    _type = "nucleotide"

    @property
    def gc_content(self):
        """Returns the proportion of G and C residues in the sequence. An
        empty sequence has a GC content of ``0.0``.

        :rtype: ``float``"""

        # Guard against dividing by zero for an empty sequence; return a
        # float so the type matches the non-empty case.
        if not self._string:
            return 0.0
        gc_bases = self._string.count("G") + self._string.count("C")
        return gc_bases / len(self._string)
class DnaSequence(NucleotideSequence):
    """A sequence of DNA nucleotide bases.

    :param str string: the raw string.
    :param str label: an optional descriptive label for the sequence."""

    # One-letter base -> multi-letter code, looked up by the inherited
    # ``codes`` property.
    _CODES = {
        "A": "DA", "C": "DC", "G": "DG", "T": "DT"
    }
    _type = "DNA"
class RnaSequence(NucleotideSequence):
    """A sequence of RNA nucleotide bases.

    :param str string: the raw string.
    :param str label: an optional descriptive label for the sequence."""

    # One-letter base -> code, looked up by the inherited ``codes``
    # property; for RNA each base maps to itself.
    _CODES = {
        "A": "A", "C": "C", "G": "G", "U": "U"
    }
    _type = "RNA"