Source code for commonnexus.blocks.distances

import typing
import decimal
import warnings
import itertools
import collections

from commonnexus.tokenizer import iter_words_and_punctuation, iter_lines, Word
from .base import Block, Payload
from . import characters
from . import taxa

if typing.TYPE_CHECKING:  # pragma: no cover
    from commonnexus import Nexus

ODict = typing.OrderedDict


class Dimensions(characters.Dimensions):
    """
    The DIMENSIONS command of a DISTANCES block.

    The NTAX subcommand is needed to process the matrix when some defined taxa are omitted
    from the distance matrix. The NCHAR subcommand is optional; it can be used to state how
    many characters (if any) were used to calculate the distances, for analyses that need
    this number. NCHAR is not required for successful reading of the matrix.

    As for the CHARACTERS and UNALIGNED blocks, taxa can be defined in a DISTANCES block if
    NEWTAXA precedes the NTAX subcommand in the DIMENSIONS command. It is advised that new
    taxa not be defined in a DISTANCES block, for the reasons discussed in the description of
    the DATA block. NEWTAXA, if present, must appear before the NTAX subcommand.

    :ivar bool newtaxa:
    :ivar typing.Optional[int] nchar:
    :ivar int ntax:
    """
    def check(self):
        # Declaring new taxa is only acceptable together with a (truthy) NTAX value.
        if self.newtaxa:
            assert self.ntax
class Format(Payload):
    """
    The FORMAT command describes how the MATRIX of a DISTANCES block is laid out.

    The [NO]LABELS and MISSING subcommands are as described in the CHARACTERS block.

    1. TRIANGLE = {LOWER | UPPER | BOTH}. States whether only the lower left half of the
       matrix is present, only the upper right half, or both halves. Below is one example of
       an upper triangular matrix and one of a matrix with both halves included.

       .. code-block::

            BEGIN DISTANCES;
                FORMAT TRIANGLE=UPPER;
                MATRIX
                    taxon_1 0.0  1.0  2.0  4.0  7.0
                    taxon_2      0.0  3.0  5.0  8.0
                    taxon_3           0.0  6.0  9.0
                    taxon_4                0.0  10.0
                    taxon_5                     0.0;
            END;

       .. code-block::

            BEGIN DISTANCES;
                FORMAT TRIANGLE = BOTH;
                MATRIX
                    taxon_1 0    1.0  2.0  4.0  7.0
                    taxon_2 1.0  0    3.0  5.0  8.0
                    taxon_3 2.0  3.0  0    6.0  9.0
                    taxon_4 4.0  5.0  6.0  0    10.0
                    taxon_5 7.0  8.0  9.0  10.0 0;
            END;

    2. DIAGONAL. If DIAGONAL is turned off, the diagonal elements are not included:

       .. code-block::

            FORMAT NODIAGONAL;
            MATRIX
                taxon_1
                taxon_2 1.0
                taxon_3 2.0  3.0
                taxon_4 4.0  5.0  6.0
                taxon_5 7.0  8.0  9.0  10.0;

       If TRIANGLE is not BOTH and DIAGONAL is turned off, then there will be one row that
       contains only the name of a taxon. This row is required. If TRIANGLE=BOTH, then the
       diagonal must be included.

    3. INTERLEAVE. As in the CHARACTERS block, this subcommand indicates sections in the
       matrix, although interleaved matrices take a slightly different form for distance
       matrices:

       .. code-block::

            taxon_1 0
            taxon_2 1  0
            taxon_3 2  3  0
            taxon_4 4  5  6
            taxon_5 7  8  9
            taxon_6 11 12 13

            taxon_4 0
            taxon_5 10 0
            taxon_6 14 15 0;

       As in the CHARACTERS block, newline characters in interleaved matrices are
       significant, in that they indicate a switch to a new taxon.
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        # Settings in effect when a subcommand is absent (or when there is no FORMAT command
        # at all, i.e. `tokens` is None).
        self.missing = '?'
        self.labels = True
        self.interleave = False
        self.diagonal = True
        self.triangle = 'LOWER'

        if tokens is None:
            return

        stream = iter_words_and_punctuation(self._tokens, nexus=nexus)

        def value_after_equals():
            # Consume "= <value>" from the stream and return the value as text.
            equals = next(stream)
            assert equals.text == '='
            value = next(stream)
            if isinstance(value, str):
                return value
            return value.text

        with_value = ('TRIANGLE', 'MISSING')
        switches = ('NOLABELS', 'LABELS', 'NODIAGONAL', 'DIAGONAL')

        for item in stream:
            if not isinstance(item, str):
                # Only words can name a subcommand.
                continue
            keyword = item.upper()
            if keyword in with_value:
                attribute = keyword.lower()
                try:
                    value = value_after_equals()
                except StopIteration:
                    # Command ended before the value: stop parsing, keep what we have.
                    break
                setattr(self, attribute, value)
            elif keyword == 'INTERLEAVE':
                setattr(self, keyword.lower(), True)
            elif keyword in switches:
                # "NOxxx" switches attribute xxx off, plain "xxx" switches it on.
                negated = 'NO' in keyword
                setattr(self, keyword.replace('NO', '').lower(), not negated)

        self.triangle = self.triangle.upper()
class Taxlabels(taxa.Taxlabels):
    """
    The TAXLABELS command specifies the names and the ordering of the taxa.

    It serves to define taxa and is allowed only if the NEWTAXA token is included in the
    DIMENSIONS statement.
    """
class Matrix(Payload):
    """
    The MATRIX command holds the distance data.

    .. note::

        Reading the matrix data only makes sense if information from other commands - in
        particular :class:`FORMAT <Format>` - is taken into account. Thus, the ``Matrix``
        object does not have any attributes for data access. Instead, the matrix data can
        be read via :meth:`Distances.get_matrix`.
    """
class Distances(Block):
    """
    This block contains distance matrices.

    Taxa are usually not defined in a DISTANCES block; if they are not, this block must be
    preceded by a block that defines taxon labels and ordering (e.g., TAXA).

    The syntax of the block is as follows:

    .. rst-class:: nexus

        | BEGIN DISTANCES;
        |   [:class:`DIMENSIONS <Dimensions>` [NEWTAXA] NTAX=num-taxa NCHAR=num-characters;]
        |   [:class:`FORMAT <Format>`
        |       [TRIANGLE={LOWER | UPPER | BOTH}]
        |       [[NO]DIAGONAL]
        |       [[NO]LABELS]
        |       [MISSING=SYMBOL]
        |       [INTERLEAVE]
        |   ;]
        |   [:class:`TAXLABELS <Taxlabels>` taxon-name [taxon-name...];]
        |   :class:`MATRIX <Matrix>` distance-matrix;
        | END;

    Commands must appear in the order listed. Only one of each command is allowed per block.
    """
    __commands__ = [Dimensions, Format, Taxlabels, Matrix]

    def get_matrix(self) -> ODict[str, ODict[str, typing.Union[None, decimal.Decimal]]]:
        """
        Read the MATRIX command, taking FORMAT, DIMENSIONS and the taxon labels into account.

        Entries equal to the MISSING symbol are returned as ``None``, all other entries as
        ``decimal.Decimal``. If the matrix was written with NODIAGONAL, the diagonal of the
        returned matrix is filled with ``0``. Rows for taxa that are not among the declared
        taxon labels are dropped with a warning.

        :return: A full distance matrix encoded as nested ordered dictionaries.

        .. code-block:: python

            >>> from commonnexus import Nexus
            >>> nex = Nexus('''#NEXUS
            ... BEGIN DISTANCES;
            ...     DIMENSIONS NEWTAXA NTAX=5;
            ...     TAXLABELS taxon_1 taxon_2 taxon_3 taxon_4 taxon_5;
            ...     FORMAT TRIANGLE=UPPER;
            ...     MATRIX
            ...         taxon_1 0.0  1.0  2.0  4.0  7.0
            ...         taxon_2      0.0  3.0  5.0  8.0
            ...         taxon_3           0.0  6.0  9.0
            ...         taxon_4                0.0  10.0
            ...         taxon_5                     0.0;
            ... END;''')
            >>> nex.DISTANCES.get_matrix()['taxon_3']['taxon_1']
            Decimal('2.0')
        """
        # Fall back to the default format settings if the block has no FORMAT command.
        fmt = self.FORMAT or Format(None)
        ntax, taxlabels = None, {}
        if self.DIMENSIONS:
            ntax = self.DIMENSIONS.ntax
        if self.TAXLABELS:
            taxlabels = self.TAXLABELS.labels
            ntax = self.DIMENSIONS.ntax
        elif self.nexus.TAXA:
            taxlabels = self.nexus.TAXA.TAXLABELS.labels
            ntax = self.nexus.TAXA.DIMENSIONS.ntax
        # The label mapping is pruned in place further down. Work on a copy, so that the
        # object handed out by the TAXLABELS command is never modified by reading the matrix.
        taxlabels = collections.OrderedDict(taxlabels)

        # Maps row key (taxon label, or 1-based row number for NOLABELS) to the row's entries.
        res = collections.OrderedDict()
        label, entries = None, []

        def required_cols(row_index=None):
            # The number of required entries for a distance matrix row depends on TRIANGLE,
            # DIAGONAL and the row index.
            if fmt.triangle == 'BOTH':
                # Each row has entries for all taxa.
                ncols = ntax
            elif fmt.triangle == 'LOWER':
                # Each row has one more entry than the previous one.
                if row_index:
                    ncols = row_index
                else:
                    ncols = 1 if not res else \
                        len(list(res.values())[-1]) + (2 if fmt.diagonal is False else 1)
            else:
                # Each row has one entry less than the previous row.
                if row_index:
                    ncols = ntax - row_index + 1
                else:
                    ncols = ntax - len(list(res.values()))
            if not fmt.diagonal:
                # And if the diagonal is missing, we expect one entry less in all cases.
                ncols -= 1
            return ncols

        # Now read the matrix data. Only for interleaved matrices are line breaks significant;
        # otherwise all tokens are processed as one chunk.
        if fmt.interleave:
            lines = list(iter_lines(self.MATRIX._tokens))
        else:
            lines = [self.MATRIX._tokens]

        for line in lines:
            words = iter_words_and_punctuation(line, nexus=self.nexus)
            if (not fmt.labels) and required_cols() == 0 and 1 not in res:
                # NODIAGONAL NOLABELS TRIANGLE=LOWER
                res[1] = []

            while 1:
                try:
                    t = next(words)
                    if (fmt.labels is not False) and label is None:
                        # With labels, the first word of a row is the taxon label.
                        assert isinstance(t, str)
                        label = t
                        if not fmt.diagonal and not fmt.interleave and \
                                fmt.triangle == 'LOWER' and not res:
                            # We're done with this row after the label.
                            res[label] = []
                            label = None
                        continue
                    entries.append(None if t == fmt.missing else decimal.Decimal(t))
                    if not fmt.interleave and (len(entries) == required_cols()):
                        # Row is complete; unlabeled rows are keyed by 1-based row number.
                        res[label or (len(res) + 1)] = entries
                        label, entries = None, []
                except StopIteration:
                    break

            if fmt.interleave:
                # We collected a row of entries, now append them to the correct taxon.
                # If we have a label, that's easy.
                if not label:
                    for ri in taxlabels:
                        if ri not in res:  # First pass must go through all taxa!
                            label = ri
                            break
                if not label:
                    # The next label to append entries to is the next one still in need of
                    # entries!
                    for row_index in res:
                        if len(res[row_index]) < required_cols(row_index):
                            label = row_index
                            break
                if label not in res:
                    res[label] = []
                res[label].extend(entries)
                label, entries = None, []

        if fmt.labels is False:
            # Without labels in the matrix, the labels must have been declared elsewhere.
            assert taxlabels
        elif not taxlabels:
            # No declared taxa: number the labels in the order they appear in the matrix.
            taxlabels = collections.OrderedDict((i, label) for i, label in enumerate(res, start=1))

        # We pad the result rows with None columns on the left as necessary, to make lookup by
        # column index work.
        if fmt.triangle == 'UPPER':
            for i, key in enumerate(res):
                res[key] = [None] * (i if fmt.diagonal else i + 1) + res[key]
            if fmt.diagonal is False and fmt.labels is False:
                assert (ntax) not in res, str(res.keys())
                res[ntax] = [None]
        elif fmt.triangle == 'BOTH' and not fmt.diagonal:
            for i, key in enumerate(res):
                res[key].insert(i, None)

        # Match matrix rows to expected taxa (rows may be keyed by label or by number):
        validtaxa = set(str(k) for k in taxlabels).union(taxlabels.values())
        restaxa = {str(k): k for k in res}
        if not set(restaxa).issubset(validtaxa):
            warnings.warn('Pruning undeclared taxa from DISTANCES matrix.')
            for taxon in set(restaxa) - validtaxa:
                del res[restaxa[taxon]]

        if len(res) < len(taxlabels):
            # Not all taxa appear in the matrix. Prune expected taxa to make lookup work.
            for k in list(taxlabels.keys()):
                if (str(k) not in restaxa) and (taxlabels[k] not in restaxa):
                    del taxlabels[k]
            # Renumber the remaining labels consecutively, starting at 1.
            taxlabels = collections.OrderedDict(
                (i, label) for i, label in enumerate(taxlabels.values(), start=1))

        # Re-key rows by taxon label (numeric row keys are translated, labels are kept).
        res = {taxlabels.get(k, k): v for k, v in res.items()}
        assert set(res.keys()).issubset(taxlabels.values()), "Unmatched taxa in DISTANCES matrix."

        # Now populate a complete matrix with the data read from the tokens:
        matrix = collections.OrderedDict([
            (label, collections.OrderedDict([(ll, None) for ll in taxlabels.values()]))
            for label in taxlabels.values()])
        for (na, la), (nb, lb) in itertools.combinations_with_replacement(taxlabels.items(), r=2):
            if na == nb and fmt.diagonal is False:
                # The diagonal was not part of the input; fill it with zero distance.
                matrix[la][lb] = 0
                continue
            if fmt.triangle == 'BOTH':
                matrix[la][lb] = res[la][nb - 1]
                matrix[lb][la] = res[lb][na - 1]
            else:
                # Only one half was given, so mirror each value across the diagonal.
                val = res[la][nb - 1] if fmt.triangle == 'UPPER' else res[lb][na - 1]
                if nb != na:
                    matrix[la][lb] = matrix[lb][la] = val
                else:
                    matrix[la][lb] = val
        return matrix

    @classmethod
    def from_data(cls,
                  matrix: typing.OrderedDict[str, typing.OrderedDict[
                      str, typing.Union[None, float, int, decimal.Decimal]]],
                  taxlabels: bool = False,
                  comment: typing.Optional[str] = None,
                  nexus: typing.Optional["Nexus"] = None,
                  TITLE: typing.Optional[str] = None,
                  ID: typing.Optional[str] = None,
                  LINK: typing.Optional[typing.Union[str, typing.Tuple[str, str]]] = None) \
            -> 'Block':
        """
        Create a DISTANCES block from the distance matrix `matrix`.

        The block is written with ``FORMAT TRIANGLE=BOTH MISSING=?``; ``None`` values in
        `matrix` are written as ``?``, all other values via ``str()``.

        :param matrix: The distance matrix as dict mapping taxon labels to dicts mapping \
        taxon labels to numbers. A "full" matrix is expected here, just like it is returned \
        from :meth:`Distances.get_matrix`.
        :param taxlabels: Whether to include a TAXLABELS command (and thus NEWTAXA in the \
        DIMENSIONS command).
        :param comment: Passed on to `from_commands`.
        :param nexus: Passed on to `from_commands`.
        :param TITLE: Passed on to `from_commands`.
        :param ID: Passed on to `from_commands`.
        :param LINK: Passed on to `from_commands`.
        :return: The block created by `from_commands`.
        """
        dimensions = 'NTAX={}'.format(len(matrix))
        if taxlabels:
            # NEWTAXA must precede NTAX; NTAX itself must be written only once.
            dimensions = 'NEWTAXA ' + dimensions
        cmds = [
            # DIMENSIONS: necessary if not all taxa are present in the matrix!
            ('DIMENSIONS', dimensions),
            ('FORMAT', 'TRIANGLE=BOTH MISSING=?'),
        ]
        # Labels as they must be written in NEXUS; the longest one sets the column width used
        # for pretty printing.
        tlabels = {taxon: Word(taxon).as_nexus_string() for taxon in matrix}
        maxlen = max((len(tlabel) for tlabel in tlabels.values()), default=0)

        if taxlabels:
            cmds.append(('TAXLABELS', ' '.join(tlabels.values())))

        mrows = []
        for taxon, dists in matrix.items():
            row = [tlabels[taxon].ljust(maxlen)]
            row.extend('?' if v is None else str(v) for v in dists.values())
            mrows.append(' '.join(row))
        cmds.append(('MATRIX', ''.join('\n' + row for row in mrows) + '\n'))
        return cls.from_commands(cmds, nexus=nexus, TITLE=TITLE, LINK=LINK, ID=ID, comment=comment)