"""
Tools to manipulate matrices as returned by
:meth:`commonnexus.blocks.characters.Characters.get_matrix`.
"""
import string
import textwrap
import collections
import typing
from commonnexus.blocks.characters import GAP, State, StateMatrix
# Hashable counterpart of a matrix cell, as collected by `CharacterMatrix.distinct_states`:
# `None` (missing), a `str` (a state symbol or the gap marker), a `frozenset` of symbols
# (uncertain state) or a tuple of symbols of arbitrary length (polymorphic state).
HashableState = typing.Union[None, str, typing.FrozenSet[str], typing.Tuple[str, ...]]
class CharacterMatrix(collections.OrderedDict):
    """
    A wrapper for the nested ordered dicts returned by
    :meth:`commonnexus.blocks.characters.Characters.get_matrix`, providing
    simpler access to some properties of the data and some conversion functionality.

    The matrix maps taxon labels to ordered dicts, which in turn map character labels to
    states. A state is one of

    - `None` for missing data,
    - `GAP` for a gap,
    - a `str` state symbol,
    - a `set` of symbols for an uncertain state,
    - a `tuple` of symbols for a polymorphic state.
    """

    def iter_rows(self) -> typing.Generator[typing.List[State], None, None]:
        """Iterate lists of states per taxon."""
        for row in self.values():
            yield list(row.values())

    def iter_columns(self) -> typing.Generator[typing.List[State], None, None]:
        """Iterate lists of states per character."""
        # The taxa list is the same for every column, so it is built only once.
        taxa = self.taxa
        for char in self.characters:
            yield [self[taxon][char] for taxon in taxa]

    @property
    def taxa(self) -> typing.List[str]:
        """The list of taxa (labels or numbers) in a matrix."""
        return list(self)

    @property
    def characters(self) -> typing.List[str]:
        """
        The list of characters (labels or numbers) in a matrix.

        The characters are read from the first row; a matrix without any taxa has no
        characters, i.e. the empty list is returned.
        """
        for row in self.values():
            return list(row)
        # No rows at all: return an empty list rather than falling through to `None`, so that
        # callers can iterate over or take the length of the result unconditionally.
        return []

    @property
    def distinct_states(self) -> typing.Set[HashableState]:
        """
        The set of distinct states in a matrix (including missing and gap, if found).

        Uncertain states (sets) are converted to frozensets to make them hashable.
        """
        return {
            frozenset(state) if isinstance(state, set) else state
            for row in self.values()
            for state in row.values()}

    @property
    def has_missing(self) -> bool:
        """Whether any cell of the matrix is missing (`None`)."""
        return None in self.distinct_states

    @property
    def has_gaps(self) -> bool:
        """Whether any cell of the matrix is a gap."""
        return GAP in self.distinct_states

    @property
    def has_uncertain(self) -> bool:
        """Whether any cell of the matrix holds an uncertain state (a set of symbols)."""
        return any(isinstance(s, frozenset) for s in self.distinct_states)

    @property
    def has_polymorphic(self) -> bool:
        """Whether any cell of the matrix holds a polymorphic state (a tuple of symbols)."""
        return any(isinstance(s, tuple) for s in self.distinct_states)

    @property
    def symbols(self) -> typing.Set[str]:
        """
        The set of state symbols, excluding missing and gapped.

        Uncertain and polymorphic states contribute each of their member symbols.
        """
        res = set()
        for s in self.distinct_states:
            if (s is not None) and (s != GAP):
                res |= set(s)
        return res

    @property
    def is_binary(self) -> bool:
        """
        Whether the only state symbols are "0" and "1" and the matrix contains no gaps.

        Missing data does not prevent a matrix from being binary.
        """
        return self.symbols.issubset({'0', '1'}) and not self.has_gaps

    @classmethod
    def binarised(cls,
                  matrix: StateMatrix,
                  statelabels: typing.Optional[typing.Dict[str, typing.Dict[str, str]]] = None) \
            -> 'CharacterMatrix':
        """
        Recode a matrix such that each state of each character becomes a character of its own.

        For each character the states found in its column are sorted (by their string
        representation), and one new character labelled `<character>_<state label>` is added
        per state. A cell of a new character is

        - `None` or `GAP` if the original cell was missing or a gap,
        - "1" if the state is part of the original cell's value (`{"1"}` if that value was an
          uncertain state, i.e. a set),
        - "0" otherwise.

        Characters for which no state other than missing or gap is found do not contribute
        any character to the result.

        :param matrix: The matrix to recode.
        :param statelabels: Optional `dict` mapping character labels to `dict`s mapping state \
        symbols to labels; where no (non-empty) label is given, the state symbol is used.
        :return: A **new** matrix.
        """
        statelabels = statelabels or {}
        matrix = cls(matrix)
        characters = matrix.characters
        # Seed with every character, so that characters with only missing or gapped cells are
        # known below (with an empty list of states) rather than raising a `KeyError`.
        charstates = collections.OrderedDict((char, set()) for char in characters)
        for char, col in zip(characters, matrix.iter_columns()):
            for v in col:
                if v is not None and v != GAP:
                    charstates[char] |= set(v)
        charstates = {k: sorted(v, key=str) for k, v in charstates.items()}
        new = collections.OrderedDict()
        for taxon, row in matrix.items():
            new[taxon] = collections.OrderedDict()
            for char, value in row.items():
                #
                # FIXME: don't binarise what's already binary!
                #
                for state in charstates[char]:
                    if value is None:
                        v = None
                    elif value == GAP:
                        v = GAP
                    else:
                        v = ({'1'} if isinstance(value, set) else '1') if state in value else '0'
                    statelabel = statelabels.get(char, {}).get(state) or state
                    new[taxon]['{}_{}'.format(char, statelabel)] = v
        return cls(new)

    @classmethod
    def multistatised(cls, matrix: StateMatrix, multicharlabel: typing.Optional[str] = None)\
            -> 'CharacterMatrix':
        """
        Convert character data of the form 0010000 to a single multi-state character.

        This kind of data may be obtained from coding wordlist data as "word belongs to cognate set"
        vectors.

        The n-th character is represented by the n-th symbol of A-Za-z, so at most 52 characters
        can be converted, and with more than 26 characters the symbols are case-sensitive
        (i.e. a NEXUS FORMAT command describing the result needs RESPECTCASE).

        A taxon with state "1" for exactly one character gets the corresponding symbol as state,
        a taxon with state "1" for several characters gets the tuple of the corresponding symbols
        (a polymorphic state), and a taxon without any "1" gets `None` (missing).

        :param matrix: A binary matrix, see :meth:`CharacterMatrix.is_binary`.
        :param multicharlabel: Label of the single character of the new matrix, defaults to "1".
        :return: A **new** matrix with one multi-state character.
        :raises AssertionError: If the matrix is not binary or has more than 52 characters.
        """
        matrix = cls(matrix)
        available_states = list(string.ascii_uppercase + string.ascii_lowercase)
        assert matrix.is_binary, "All state symbols must be 0, 1 or None (missing)"
        characters = matrix.characters
        assert len(characters) <= len(available_states), \
            "Too many characters to multistatise"
        multicharlabel = multicharlabel or '1'
        # Seed the resulting matrix with `None`, i.e. "missing" values.
        multistate_matrix = collections.OrderedDict(
            [(t, collections.OrderedDict([(multicharlabel, None)])) for t in matrix])
        for symbol, charlabel in zip(available_states, characters):
            for taxon, row in matrix.items():
                if row[charlabel] == '1':
                    current = multistate_matrix[taxon][multicharlabel]
                    if current:
                        # A second (or later) "1" for this taxon: turn the state into (or
                        # extend) a tuple, i.e. a polymorphic state.
                        multistate_matrix[taxon][multicharlabel] = tuple(current) + (symbol,)
                    else:
                        multistate_matrix[taxon][multicharlabel] = symbol
        return cls(multistate_matrix)

    @classmethod
    def from_characters(cls,
                        matrix: StateMatrix,
                        drop_chars: typing.Optional[typing.Iterable[str]] = None,
                        inverse: bool = False,
                        drop_uncertain: bool = False,
                        drop_polymorphic: bool = False,
                        drop_missing: bool = False,
                        drop_gapped: bool = False,
                        drop_constant: bool = False) -> 'CharacterMatrix':
        """
        Create a matrix from a subset of the characters of `matrix`.

        :param matrix: The matrix to copy characters from.
        :param drop_chars: Labels of the characters to omit.
        :param inverse: If `True` (and `drop_chars` is not empty), omit the characters **not** \
        listed in `drop_chars` instead.
        :param drop_uncertain: Omit characters with an uncertain state for any taxon.
        :param drop_polymorphic: Omit characters with a polymorphic state for any taxon.
        :param drop_missing: Omit characters with missing data for any taxon.
        :param drop_gapped: Omit characters with a gap for any taxon.
        :param drop_constant: Omit characters with the same state (including missing and gap) \
        for all taxa.
        :return: A **new** matrix constructed as copy, omitting specified characters.
        """
        if isinstance(drop_chars, typing.Iterator):
            # A one-shot iterator would be exhausted by the first membership test below.
            drop_chars = set(drop_chars)
        drop_chars = drop_chars or set()
        matrix = cls(matrix)
        taxa, characters = matrix.taxa, matrix.characters
        res = collections.OrderedDict([(t, collections.OrderedDict()) for t in matrix])
        for char, col in zip(characters, matrix.iter_columns()):
            if drop_chars and not inverse and (char in drop_chars):
                continue
            if drop_chars and inverse and (char not in drop_chars):
                continue
            if drop_uncertain and any(isinstance(v, set) for v in col):
                continue
            if drop_polymorphic and any(isinstance(v, tuple) for v in col):
                continue
            if drop_missing and any(v is None for v in col):
                continue
            if drop_gapped and any(v == GAP for v in col):
                continue
            if drop_constant and \
                    len(set(frozenset(v) if isinstance(v, set) else v for v in col)) == 1:
                continue
            for taxon, v in zip(taxa, col):
                res[taxon][char] = v
        return cls(res)

    def to_phylip(self) -> str:
        """
        Serialize the matrix in PHYLIP format.

        The first line gives the number of taxa and the number of characters; each following
        line consists of a taxon name, cut or padded to exactly 10 characters, immediately
        followed by the states of the taxon. Missing data is written as "?", gaps as "-".

        :return: The character matrix serialized in PHYLIP format.
        :raises ValueError: If the matrix contains uncertain or polymorphic states, or if "?" \
        or "-" are used as state symbols in a matrix with missing data or gaps, respectively.
        """
        def phylip_name(s):
            # Keep only printable characters which are not special in Newick/PHYLIP, then cut
            # or pad to the fixed name width of 10.
            allowed = [c for c in s if c in string.printable and (c not in '()[]:;,')]
            return ''.join(allowed[:10]).ljust(10)

        if self.has_uncertain or self.has_polymorphic:
            raise ValueError('Cannot convert matrix with uncertain or polymorphic states.')
        if self.has_missing and '?' in self.symbols:
            raise ValueError('Missing symbol ? used as state symbol')  # pragma: no cover
        if self.has_gaps and '-' in self.symbols:
            raise ValueError('Gap symbol - used as state symbol')  # pragma: no cover
        res = [" {} {}".format(len(self.taxa), len(self.characters))]
        for taxon, states in self.items():
            seq = ''
            for state in states.values():
                assert isinstance(state, str) or state is None
                seq += '?' if state is None else ('-' if state == GAP else state)
            res.append('{}{}'.format(phylip_name(taxon), seq))
        return '\n'.join(res)

    def to_fasta(self) -> str:
        """
        Serialize the matrix in FASTA format.

        Missing data **and** gaps are written as "-". State symbols which are digits are
        replaced with uppercase letters not otherwise used as state symbols. Sequences are
        wrapped to lines of at most 70 characters.

        :return: The character matrix serialized in the \
        `FASTA format <https://en.wikipedia.org/wiki/FASTA_format>`_
        :raises ValueError: If the matrix contains uncertain or polymorphic states, or if it \
        has too many symbols to replace digits with letters.
        """
        if self.has_uncertain or self.has_polymorphic:
            raise ValueError('Cannot convert matrix with uncertain or polymorphic states.')
        # Computing the symbols requires a pass over the whole matrix, so do it only once.
        symbols = self.symbols
        digits = {s: None for s in symbols if s in string.digits}
        if digits:
            if len(symbols) > len(string.ascii_uppercase):  # pragma: no cover
                raise ValueError('Too many symbols in matrix to replace digits with letters')
            # Assign the unused uppercase letters to the digits, both in sorted order. The
            # check above guarantees that there are enough unused letters.
            unused = (c for c in string.ascii_uppercase if c not in symbols)
            for digit in sorted(digits):
                digits[digit] = next(unused)
        res = []
        for taxon, states in self.items():
            seq = ''.join(
                '-' if state is None or state == GAP else digits.get(state, state)
                for state in states.values())
            res.append('> {}'.format(taxon))
            res.extend(textwrap.wrap(seq, 70))
        return '\n'.join(res)

    @classmethod
    def from_fasta(cls, fasta: str) -> 'CharacterMatrix':
        """
        Create a matrix from aligned sequences in FASTA format.

        Each line starting with ">" starts the sequence of a new taxon, named by the rest of
        the line. Whitespace and digits in sequence lines are ignored. Characters are labelled
        with their 1-based position in the sequence. Taxa without any sequence data are
        skipped.

        .. code-block:: python

            >>> from commonnexus import Nexus
            >>> from commonnexus.blocks import Data
            >>> from commonnexus.tools.matrix import CharacterMatrix
            >>> print(Nexus.from_blocks(Data.from_data(CharacterMatrix.from_fasta(
            ...     '> t1\\nABA BAA\\n> t2\\nBAB ABA'))))
            #NEXUS
            BEGIN DATA;
            DIMENSIONS NCHAR=6;
            FORMAT DATATYPE=STANDARD MISSING=? GAP=- SYMBOLS="AB";
            MATRIX
            t1 ABABAA
            t2 BABABA;
            END;

        :param fasta: The FASTA formatted data.
        :return: A **new** matrix.
        :raises AssertionError: If the sequences are of different length or sequence data \
        appears before the first taxon.
        """
        def get_row(nchar, seq, taxon):
            # `nchar` is the length of the previously read sequences (or `None` for the first).
            if nchar is not None:
                assert len(seq) == nchar, "Only aligned sequences can be converted."
            assert taxon is not None
            return len(seq), collections.OrderedDict([(str(i + 1), c) for i, c in enumerate(seq)])

        res = collections.OrderedDict()
        taxon, seq, nchar = None, [], None
        for line in fasta.split('\n'):
            if line.startswith('>'):
                # A new header: store the sequence collected for the previous taxon, if any.
                if seq:
                    nchar, res[taxon] = get_row(nchar, seq, taxon)
                taxon, seq = line[1:].strip(), []
                continue
            for chunk in line.strip().split():
                for c in chunk:
                    if c not in string.digits:
                        seq.append(c)
        # Store the sequence of the last taxon.
        if seq:
            _, res[taxon] = get_row(nchar, seq, taxon)
        return cls(res)