"""Source code for commonnexus.blocks.trees: the NEXUS TREES block and its TRANSLATE and TREE commands."""

import typing
import warnings
import functools
import collections

import newick

from .base import Payload, Block
from commonnexus.util import log_or_raise
from commonnexus.tokenizer import TokenType, iter_words_and_punctuation, Word, Token

if typing.TYPE_CHECKING:  # pragma: no cover
    from commonnexus.nexus import Nexus

# A tree specification as accepted by `Trees.from_data`: a triple `(label, newick, rooted)`, where
# `newick` is either a Newick-formatted string or a `newick.Node`, and `rooted` is `True`, `False`
# or `None` (no rooting information).
TreeSpec = typing.Tuple[str, typing.Union[str, newick.Node], typing.Union[None, bool]]


class Translate(Payload):
    """
    The tree description requires references to the taxa defined in a TAXA, DATA,
    CHARACTERS, UNALIGNED, or DISTANCES block. These references can be made using the label assigned
    to them in the TAXA or DATA blocks, their numbers, or a token specified in the TRANSLATE
    command. The TRANSLATE statement maps arbitrary labels in the tree specification to valid taxon
    names. If the arbitrary labels are integers, they are mapped onto the valid taxon names as
    dictated by the TRANSLATE command without any consideration of the order of the taxa in the
    matrix. Thus, if an integer is encountered in the tree description, a program first checks to
    see if it matches one of the arbitrary labels defined in the TRANSLATE command; only if
    no matching label is found will the integer be presumed to refer to the taxon in that
    position in the matrix (e.g., if the label in the description is 15, but this is not a label
    defined in the TRANSLATE command, a program should take this to refer to the 15th taxon).

    In the following example,

    .. code-block::

        BEGIN TAXA;
            TAXLABELS Scarabaeus Drosophila Aranaeus;
        END;
        BEGIN TREES;
            TRANSLATE beetle Scarabaeus, fly Drosophila, spider Aranaeus;
            TREE tree1 = ((1,2),3);
            TREE tree2 = ((beetle,fly),spider);
            TREE tree3= ((Scarabaeus,Drosophila),Aranaeus);
        END;

    the TRANSLATE command specifies that the label "beetle" can be used in the tree description to
    refer to Scarabaeus, "fly" to Drosophila, and "spider" to Aranaeus. This means that Scarabaeus
    can be referred to in a tree description as 1, Scarabaeus, or beetle. Thus, the three trees are
    identical.

    :ivar typing.Dict[str, str] mapping: The mapping of tokens used in the tree description to \
    valid taxon names.

    .. note::

        The ``TRANSLATE`` data is typically not accessed directly, but just used implicitly when
        calling :meth:`Trees.translate`.
    """
    def __init__(self, tokens, nexus=None):
        """
        Parse the payload into `self.mapping`, an ordered dict of `label -> taxon name`.

        The payload is read as a comma-separated sequence of word pairs. An `AssertionError` is
        raised when a word is expected but something else is found, or when a complete pair is
        followed by anything but a comma.
        """
        super().__init__(tokens, nexus=nexus)
        self.mapping = collections.OrderedDict()
        # State machine over the token stream: label, then taxon name, then a separating comma.
        label, taxon = None, None
        for item in iter_words_and_punctuation(self._tokens, nexus=nexus):
            if label and taxon:
                # A complete pair must be followed by the pair separator.
                assert item.is_punctuation and item.text == ',', item.text
                self.mapping[label] = taxon
                label, taxon = None, None
            elif label:
                assert isinstance(item, str)
                taxon = item
            else:
                assert isinstance(item, str)
                label = item
        # The last pair is not terminated by a comma, so it is still pending here.
        if label and taxon:
            self.mapping[label] = taxon


class Tree(Payload):
    """
    This command describes a tree. Tree descriptions are standard object definition commands.
    They use the familiar parenthesis notation, with node names, branch lengths, and comments
    following the established Newick tree standard (see Felsenstein, 1993).

    The label of the node is a NEXUS token that is a taxon's defined name, a taxon's number, a
    taxon's label from the translation table, or a clade's defined name. The label is optional for
    internal nodes that are not observed taxa; it is not optional for terminal nodes. Internal
    nodes that have no label are represented implicitly by the parentheses containing the list of
    subclades. If the name of a TAXSET is used, it is interpreted as a list of the terminal taxa
    defined to be in the TAXSET (with commas implicitly inserted between the taxa). The length of
    the branch below the node is a number, positive or negative. Rooted and unrooted trees can be
    specified using the [&R] and [&U] comments at the start of the tree description. For example,

    .. code-block::

        TREE mytree = [&R] ((1,2),(3,4));

    is a rooted tree, whereas

    .. code-block::

        TREE mytree = [&U] ((1,2),(3,4));

    is an unrooted tree. The NEXUS standard does not specify whether rooted or unrooted is default.

    An example tree with branch lengths is

    .. code-block::

        TREE tree4 = ((beetle:4.3,fly:1.1):1.8,spider:2.5);

    If a file (and its data matrix) has four defined taxa, Crocodile, Bluebird, Archaeopteryx, and
    Rattlesnake, the following tree,

    .. code-block::

        TREE tree4= (((Bluebird)Archaeopteryx,Crocodile)Archosauria,Rattlesnake);

    would indicate that the taxon Archaeopteryx is ancestral to Bluebird and that Crocodile
    is their sister. Archosauria, because it does not refer to a taxon that has been defined
    in a TAXA or DATA block, is interpreted as the name of the clade including Archaeopteryx,
    Bluebird, and Crocodile. Any additional information about a clade, its ancestral node, or the
    branch below it is to be placed in NEXUS comment commands associated with the node. Although
    different programs may choose their own conventions for how to embed
    information in comments, the comments that begin with &N are reserved for future
    NEXUS comment commands. The NEXUS standard places no restrictions on the number of taxa
    contained in each tree.

    :ivar str name: The name of the tree.
    :ivar typing.Union[bool, None] rooted: Flag indicating whether the tree is rooted (or `None` \
    if no information is given)
    :ivar newick.Node newick: The tree description as `newick.Node`.

    .. code-block:: python

        >>> tree = Tree('tree4= (((Bluebird)Archaeopteryx,Crocodile)Archosauria,Rattlesnake)')
        >>> tree.name
        'tree4'
        >>> print(tree.newick.ascii_art())
                                        ┌─Archaeopteryx ──Bluebird
                        ┌─Archosauria───┤
        ────────────────┤               └─Crocodile
                        └─Rattlesnake
    """
    __multivalued__ = True

    def __init__(self, tokens, nexus=None):
        """
        Parse tree name and rooting information eagerly; keep the Newick tokens for lazy parsing.

        :raises AssertionError: If words or punctuation other than ``=`` follow a quoted tree name.
        :raises StopIteration: If the tokens run out before a name, the ``=`` and the opening \
        ``(`` of the tree description have been found.
        """
        super().__init__(tokens, nexus=nexus)
        # We parse tree name and rooting information right away.
        self.name, self._rooted = None, None
        # Set once the name is known to be complete, which is the case for quoted names.
        ncomplete = False

        tokens = iter(self._tokens)
        # 1. Skip ahead to the first (quoted) word: the start of the tree name.
        while not self.name:
            t = next(tokens)
            if t.type in {TokenType.WORD, TokenType.QWORD}:
                self.name = t.text
                if t.type == TokenType.QWORD:
                    ncomplete = True

        # 2. Everything up to the "=" is appended to an unquoted name.
        while True:
            t = next(tokens)
            if t.type == TokenType.WORD:
                assert not ncomplete, 'Stuff between tree name and ='
                self.name += t.text
            elif t.type == TokenType.PUNCTUATION:
                if t.text == '=':
                    break
                assert not ncomplete, 'Stuff between tree name and ='
                self.name += t.text  # FIXME: Should we append punctuation to tree names?

        # 3. Between "=" and the opening "(" only the rooting comments [&R] and [&U] are of
        # interest. Other "&"-comments (e.g. tree weights such as [&W 0.5]) carry no rooting
        # information and must neither be read as "unrooted" nor overwrite a rooting flag.
        while True:
            t = next(tokens)
            if t.type == TokenType.COMMENT:
                flag = t.text.strip().upper()
                if flag in ('&R', '&U'):
                    self._rooted = flag
            elif t.type == TokenType.PUNCTUATION and t.text == '(':
                break

        # Since Newick node construction is somewhat expensive, we defer it to lazy properties.
        # The opening "(" was consumed above, so it is re-inserted as first token.
        self.newick_tokens = [Token(text='(', type=TokenType.PUNCTUATION)] + list(tokens)
        self._nn = None

    @staticmethod
    def format(name: str,
               newick_node: typing.Union[str, newick.Node],
               rooted: typing.Optional[bool] = None) -> str:
        """
        Returns a representation of a tree as NEXUS string, suitable as payload of a ``TREE``
        command.

        :param name: The name of the tree; quoted as NEXUS word if necessary.
        :param newick_node: The tree description, either as `newick.Node` or as Newick-formatted \
        string. A trailing ``;`` of a string is dropped, because the payload of a command does \
        not include the terminating semicolon.
        :param rooted: `True` adds a ``[&R]`` comment, `False` adds ``[&U]``, `None` adds no \
        rooting information.
        """
        if isinstance(newick_node, str):
            nwk = newick_node.strip().rstrip(';').rstrip()
        else:
            nwk = newick_node.newick
        return '{} = {}{}'.format(
            Word(name).as_nexus_string(),
            '' if rooted is None else '[&{}] '.format('R' if rooted else 'U'),
            nwk)

    @property
    def rooted(self) -> typing.Union[None, bool]:
        """
        Whether the tree is rooted (`True`) or not (`False`) or no information is given (`None`).

        Only the comments ``[&R]`` and ``[&U]`` (matched case-insensitively) count as rooting
        information.
        """
        flag = (self._rooted or '').strip().upper()
        if flag == '&R':
            return True
        if flag == '&U':
            return False
        return None

    @functools.cached_property
    def newick_string(self) -> str:
        """
        The Newick-formatted string representation of the tree.

        .. note::

            This property is intended for cases where only the string representation is of interest
            and the somewhat expensive construction of a `newick.Node` object is not necessary.
            Accessing the :meth:`Tree.newick` property will trigger node construction.

        .. warning::

            Due to some normalization (e.g. of whitespace) done by the Newick parser,
            `newick_string` may differ from `newick.newick`.

            .. code-block:: python

                >>> from commonnexus import Nexus
                >>> nex = Nexus('#nexus begin trees; tree 1 = (a,b)\\nc; end;')
                >>> nex.TREES.TREE.newick_string
                '(a,b)\\nc;'
                >>> nex.TREES.TREE.newick.newick
                '(a,b)c'
        """
        return ''.join(str(t) for t in self.newick_tokens) + ';'

    @functools.cached_property
    def newick(self) -> newick.Node:
        """
        A `newick.Node` instance parsed from the Newick representation of the tree.

        If the tree belongs to a `Nexus` object configured with `validate_newick`, the Newick
        string is handed to the Newick parser; otherwise the Newick tokens are assembled directly
        from the NEXUS tokens.

        .. code-block:: python

            >>> from commonnexus import Nexus
            >>> nex = Nexus('#nexus begin trees; tree 1 = ((a,b)c,d)e; end;')
            >>> print(nex.TREES.TREE.newick.ascii_art())
                    ┌─a
                ┌─c─┤
            ──e─┤   └─b
                └─d
        """
        if self.nexus and self.nexus.cfg.validate_newick:
            # More correct, but slower: Let the newick parser validate the data.
            return newick.loads(self.newick_string)[0]
        # Quicker but by-passes some validation: Instantiate NewickString from pre-parsed Nexus
        # tokens!
        nt = [newick.Token('(', newick.TokenType.OBRACE, 0)]
        # `word` collects the pieces of a Newick word which may be spread over several NEXUS
        # tokens; `level` is the current nesting depth of parentheses.
        word, level = [], 1

        def flush_word():
            # Emit the pieces collected so far as one Newick WORD token at the current level.
            if word:
                nt.append(newick.Token(''.join(word), newick.TokenType.WORD, level))
                word.clear()

        for token in self.newick_tokens[1:]:
            if token.type == TokenType.WORD:
                word.append(token.text)
            elif token.type == TokenType.PUNCTUATION:
                if token.text in newick.RESERVED_PUNCTUATION:
                    # Punctuation which is reserved in Newick delimits words!
                    flush_word()
                    # A closing brace belongs to the outer level, an opening brace only
                    # increases the level for the tokens following it.
                    if token.text == ')':
                        level -= 1
                    nt.append(
                        newick.Token(token.text, newick.RESERVED_PUNCTUATION[token.text], level))
                    if token.text == '(':
                        level += 1
                else:
                    # NEXUS punctuation without special meaning in Newick is part of a word.
                    word.append(token.text)
            elif token.type == TokenType.COMMENT:
                flush_word()
                nt.append(newick.Token('[' + token.text + ']', newick.TokenType.COMMENT, level))
            elif token.type == TokenType.QWORD:
                assert not word  # As in NEXUS, a newick QWORD can not follow a WORD.
                nt.append(
                    newick.Token(Word(token.text).as_nexus_string(), newick.TokenType.QWORD, level))
        flush_word()
        return newick.NewickString(nt).to_node()


class Trees(Block):
    """
    This block stores information about trees. The syntax for the TREES block is

    .. rst-class:: nexus

        | BEGIN TREES;
        |     [:class:`TRANSLATE <Translate>` arbitrary-token-used-in-tree-description
        |       valid-taxon-name [, arbitrary-token-used-in-tree-description valid-taxon-name...];]
        |     [:class:`TREE <Tree>` [*] tree-name= tree-specification;]
        | END;

    A :class:`TRANSLATE <Translate>` command, if present, must precede any
    :class:`TREE <Tree>` command.
    """
    __commands__ = [Translate, Tree]

    @property
    def trees(self) -> typing.List[Tree]:
        """
        Since TREE is one of the few NEXUS commands which may appear multiple times per block, we
        provide a shortcut to this list.
        """
        return self.commands['TREE']

    def validate(self, log=None):
        """
        Check that the block contains only known commands and at most one TRANSLATE command,
        and that TRANSLATE precedes all TREE commands.

        Problems are reported via `log_or_raise`; a TRANSLATE command after a TREE command is
        only reported with level "warning".

        :return: The validation status; it is replaced by the result of `log_or_raise` for each \
        problem found.
        """
        super().validate(log=log)
        valid, with_translate, tree_seen = True, False, False
        for cmd in self[1:-1]:  # Skip the first and last command of the block.
            if cmd.name not in self.payload_map:  # pragma: no cover
                valid = log_or_raise(
                    'Invalid command for {} block: {}'.format(self.name, cmd.name), log=log)
            if cmd.name == 'TRANSLATE':
                if with_translate:  # pragma: no cover
                    valid = log_or_raise('Duplicate TRANSLATE command', log=log)
                elif tree_seen:  # pragma: no cover
                    valid = log_or_raise(
                        'TRANSLATE command **after** TREE command', log=log, level='warning')
                else:
                    with_translate = True
            elif cmd.name == 'TREE':
                # Only TREE commands count here: other commands (e.g. TITLE or LINK) may
                # precede TRANSLATE without violating the required order.
                tree_seen = True
        return valid

    @functools.cached_property
    def translate_mapping(self):
        """
        The mapping used to translate node labels in tree descriptions to taxon labels.

        It is seeded with the labels of the TAXA block (the linked one, if any, otherwise the
        TAXA block of the containing NEXUS object), keyed by the string form of the keys of
        `TAXLABELS.labels`. Entries of the TRANSLATE command, if present, take precedence.
        """
        mapping = {}
        if 'TAXA' in self.linked_blocks:
            mapping.update({
                str(k): v for k, v in self.linked_blocks['TAXA'].TAXLABELS.labels.items()})
        elif self.nexus.TAXA and self.nexus.TAXA.TAXLABELS:
            mapping.update({str(k): v for k, v in self.nexus.TAXA.TAXLABELS.labels.items()})
        if 'TRANSLATE' in self.commands:
            mapping.update(self.TRANSLATE.mapping)
        return mapping

    def translate(self, tree: typing.Union[Tree, newick.Node]) -> newick.Node:
        """
        Translate a tree according to the mapping TREES TRANSLATE.

        A warning is issued if named leaf nodes remain which are not valid taxon labels.

        :param tree: The tree to translate, either a `Tree` command or a `newick.Node`.
        :return: A Newick node where the node labels have been translated to valid taxon labels.

        .. note::

            Translating a tree does **not** change tree's representation in the containing
            ``Nexus`` instance. To replace un-translated trees in a NEXUS file with translated
            ones, the following code would work:

            .. code-block:: python

                >>> untranslated = Nexus.from_file(path)
                >>> trees = []
                >>> for tree in untranslated.TREES.trees:
                ...     trees.append(Tree.format(
                ...         tree.name,
                ...         untranslated.TREES.translate(tree),
                ...         rooted=tree.rooted))
                >>> untranslated.replace_block(
                ...     untranslated.TREES, [('TREE', tree) for tree in trees])
                >>> path.write_text(str(untranslated))
        """
        node = tree.newick if isinstance(tree, Tree) else tree
        # NOTE(review): `newick.Node.rename` presumably renames in place and returns the node,
        # i.e. the node passed in (or cached on `tree`) is modified as well - confirm.
        res = node.rename(auto_quote=True, **self.translate_mapping)
        leaf_names = {n.unquoted_name for n in res.walk() if n.name and n.is_leaf}
        if not leaf_names.issubset(self.translate_mapping.values()):
            warnings.warn('un-translatable leaf nodes!')
        return res

    @classmethod
    def from_data(cls,
                  *tree_specs: TreeSpec,
                  nexus: typing.Optional["Nexus"] = None,
                  comment: typing.Optional[str] = None,
                  lowercase_command: bool = False,
                  TITLE: typing.Optional[str] = None,
                  LINK: typing.Optional[str] = None,
                  ID: typing.Optional[str] = None,
                  **translate_labels: str) -> 'Trees':
        """
        Create a TREES block from a list of tree specifications.

        A tree specification is a triple (label, newick, rooted), e.g. `('t1', '(a,b)c;', False)`.

        If `translate_labels` are passed in, a corresponding TRANSLATE command will be added to the
        block and the trees will be "de-translated" accordingly.

        :param tree_specs: The tree specifications; Newick strings are parsed with `newick.loads`.
        :param lowercase_command: Whether to write the command name as ``tree`` rather than \
        ``TREE``.
        :param translate_labels: Pairs `label=taxon_name` for the TRANSLATE command, written \
        sorted by label.

        .. code-block:: python

            >>> print(Trees.from_data(('t1', '(a,b)c;', False), comment='A consensus tree'))
            [A consensus tree]
            BEGIN TREES;
            TREE t1 = [&U] (a,b)c;
            END;
        """
        cmds = []

        if translate_labels:
            cmds.append((
                'TRANSLATE',
                ',\n'.join('{} {}'.format(
                    Word(k).as_nexus_string(),
                    Word(v).as_nexus_string())
                    for k, v in sorted(translate_labels.items()))
            ))
        tree_command = 'tree' if lowercase_command else 'TREE'
        # "De-translation" replaces taxon names by the labels of the TRANSLATE command.
        detranslate = {v: k for k, v in translate_labels.items()}
        for name, nwk, rooted in tree_specs:
            if isinstance(nwk, str):
                nwk = newick.loads(nwk)[0]
            if translate_labels:
                nwk.rename(auto_quote=True, **detranslate)
            cmds.append((tree_command, Tree.format(name, nwk, rooted)))
        return cls.from_commands(cmds, nexus=nexus, TITLE=TITLE, LINK=LINK, ID=ID, comment=comment)