Source code for commonnexus.blocks.characters

import types
import typing
import warnings
import functools
import collections

from .base import Block, Payload
from commonnexus.util import log_or_raise
from commonnexus.tokenizer import (
    iter_words_and_punctuation, Token, iter_delimited, iter_lines, BOOLEAN, word_after_equals, Word,
)
from .taxa import Taxlabels

if typing.TYPE_CHECKING:  # pragma: no cover
    from commonnexus import Nexus

# A single entry (state) in a CHARACTERS matrix is one of:
# - ``None``: missing data,
# - ``str``: the special GAP marker or a single state symbol (or state label),
# - ``set`` of ``str``: uncertain states,
# - ``tuple`` of ``str`` (of any length): polymorphic states.
State = typing.Union[None, str, typing.Set[str], typing.Tuple[str, ...]]
# Maps row labels to ordered mappings of column labels to states.
StateMatrix = typing.OrderedDict[str, typing.OrderedDict[str, State]]

GAP = '\uFFFD'  # REPLACEMENT CHARACTER used to replace an [...] unrepresentable character
#: Some - but not all - punctuation is invalid as (special) state symbol.
INVALID_SYMBOLS = "()[]{}/\\,;:=*'\"*`<>^"


def duplicate_charlabel(label, cmd, nexus):
    """
    Report a duplicate character name.

    :param label: The character name that occurred more than once.
    :param cmd: Name of the command in which the duplicate was found (used in the message).
    :param nexus: The `Nexus` object providing the configuration, or `None`.
    :raises ValueError: If `nexus` is given and its config is in strict mode. Otherwise a \
    warning is emitted instead.
    """
    if nexus and nexus.cfg.strict:  # pragma: no cover
        # Name the offending label and command, just like the warning below does.
        raise ValueError(
            'character names must be unique! Duplicate character name "{}" in {} command'.format(
                label, cmd))
    warnings.warn(
        'Duplicate character name "{}" in {} command'.format(label, cmd))


class Eliminate(Payload):
    """
    This command allows specification of a list of characters that are to be excluded from
    consideration. Programs are expected to ignore ELIMINATEd characters completely during
    reading. In avoiding allocation of memory to store character information, the programs can
    save a considerable amount of computer memory. (This subcommand is similar to ZAP in version
    3.1.1 of PAUP.) For example,

    .. code-block::

        ELIMINATE 4-100;

    tells the program to skip over characters 4 through 100 in reading the matrix. Character-set
    names are not allowed in the character list. This command does not affect character numbers.

    .. warning::

        The ``ELIMINATE`` command is currently not supported in `commonnexus`.
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        # Nothing is rejected when there is no Nexus object (and thus no config) to consult,
        # or when the config asks for unsupported commands to be ignored.
        if nexus is None or nexus.cfg.ignore_unsupported:
            return
        raise NotImplementedError('The ELIMINATE command is not supported')
class Dimensions(Payload):
    """
    The DIMENSIONS command specifies the number of characters. The number following NCHAR is the
    number of characters in the data matrix. The NEXUS standard does not impose limits on the
    number of characters; a limit may be imposed by particular computer programs.

    It is strongly advised that new taxa not be defined in a CHARACTERS block, for the reasons
    discussed in the description of the DATA block. If new taxa are to be defined, this must be
    indicated by the NEWTAXA subcommand, specifying that new taxa are to be defined (this allows
    the computer program to prepare for creation of new taxa). NEWTAXA, if present, must appear
    before the NTAX subcommand.

    The NTAX subcommand, indicating the number of taxa in the MATRIX command in the block, is
    optional, unless NEWTAXA is specified, in which case it is required.

    :ivar bool newtaxa:
    :ivar typing.Optional[int] ntax:
    :ivar int nchar:
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        self.newtaxa = False
        self.ntax = None
        # Must exist even if the command lacks NCHAR, so that `check` fails with the intended
        # assertion rather than an AttributeError.
        self.nchar = None

        words = iter_words_and_punctuation(self._tokens, nexus=nexus)
        try:
            for word in words:
                if not isinstance(word, str):
                    continue
                subcommand = word.upper()
                if subcommand == 'NEWTAXA':
                    self.newtaxa = True
                elif subcommand == 'NTAX':
                    n = next(words)
                    assert n.text == '='
                    self.ntax = int(next(words))
                elif subcommand == 'NCHAR':
                    n = next(words)
                    assert n.text == '='
                    self.nchar = int(next(words))
        except StopIteration:
            # The command ended in the middle of a "NAME = value" triple; `check` decides
            # whether what was read so far is sufficient.
            pass
        self.check()

    def check(self):
        """
        Make sure NCHAR was specified and, if NEWTAXA was given, NTAX as well.

        :raises AssertionError: If the requirements are not met.
        """
        assert self.nchar and ((not self.newtaxa) or self.ntax)
class Format(Payload):
    """
    The FORMAT command specifies the format of the data MATRIX. This is a crucial command because
    misinterpretation of the format of the data matrix could lead to anything from incorrect
    results to spectacular crashes. The DATATYPE subcommand must appear first in the command. The
    RESPECTCASE subcommand must appear before the MISSING, GAP, SYMBOLS, and MATCHCHAR
    subcommands.

    The following are possible formatting subcommands.

    1. DATATYPE = {STANDARD | DNA | RNA | NUCLEOTIDE | PROTEIN | CONTINUOUS}. This subcommand
       specifies the class of data. If present, it must be the first subcommand in the FORMAT
       command. Standard data consist of any general sort of discrete character data, and this
       class is typically used for morphological data, restriction site data, and so on. DNA,
       RNA, NUCLEOTIDE, and PROTEIN designate molecular sequence data. Meristic morphometric data
       and other information with continuous values can be housed in matrices of
       DATATYPE=CONTINUOUS. These DATATYPES are described in detail, with examples, at the end of
       the description of the CHARACTERS block.

       .. warning::

           ``DATATYPE=CONTINUOUS`` is currently not supported in `commonnexus`.

       Some programs accept (or expect) datatypes beyond the ones defined in the NEXUS spec; e.g.
       MrBayes has ``DATATYPE=RESTRICTION`` and Beauti may create "NEXUS" files with
       ``DATATYPE=BINARY``. `commonnexus` does not accept these non-standard datatypes and raises
       an exception when trying to read the MATRIX. Thus, to make "NEXUS" files with non-standard
       datatypes readable for `commonnexus`, substituting ``DATATYPE=STANDARD`` is typically the
       right thing to do.
    2. RESPECTCASE. By default, information in a MATRIX may be entered in uppercase, lowercase,
       or a mixture of uppercase and lowercase. If RESPECTCASE is requested, case is considered
       significant in SYMBOLS, MISSING, GAP, and MATCHCHAR subcommands and in subsequent
       references to states. For example, if RESPECTCASE is invoked, then SYMBOLS="A a B b"
       designates four states whose symbols are A, a, B, and b, which can then each be used in
       the MATRIX command and elsewhere. If RESPECTCASE is not invoked, then A and a are
       considered homonymous state symbols. This subcommand must appear before the SYMBOLS
       subcommand. This subcommand is not applicable to DATATYPE = DNA, RNA, NUCLEOTIDE, PROTEIN,
       and CONTINUOUS.
    3. MISSING. This subcommand declares the symbol that designates missing data. The default is
       "?". For example, MISSING=X defines an X to represent missing data. Whitespace is illegal
       as a missing data symbol, as are the :data:`INVALID_SYMBOLS`
    4. GAP. This subcommand declares the symbol that designates a data gap (e.g., base absent in
       DNA sequence because of deletion or an inapplicable character in morphological data).
       There is no default gap symbol; a gap symbol must be defined by the GAP subcommand before
       any gaps can be entered into the matrix. For example, GAP=- defines a hyphen to represent
       a gap. Whitespace is illegal as a gap symbol, as are the :data:`INVALID_SYMBOLS`
    5. SYMBOLS. This subcommand specifies the symbols and their order for character states used
       in the file (including in the MATRIX command). For example, SYMBOLS="0 1 2 3 4 5 6 7"
       designates numbers 0 through 7 as acceptable symbols in a matrix. The SYMBOLS subcommand
       is not allowed for DATATYPE=CONTINUOUS. The default symbols list differs from one DATATYPE
       to another, as described under state symbol in the Appendix. Whitespace is not needed
       between elements: SYMBOLS="012" is equivalent to SYMBOLS="0 1 2". For STANDARD DATATYPES,
       a SYMBOLS subcommand will replace the default symbols list of "0 1". For DNA, RNA,
       NUCLEOTIDE, and PROTEIN DATATYPES, a SYMBOLS subcommand will not replace the default
       symbols list but will add character-state symbols to the SYMBOLS list. The NEXUS standard
       does not define the position of these additional symbols within the SYMBOLS list. (These
       additional symbols will be inserted at the beginning of the SYMBOLS list in PAUP and at
       the end in MacClade. MacClade will accept additional symbols for PROTEIN but not DNA, RNA,
       and NUCLEOTIDE matrices.)

       .. warning::

           While the specification requires the content of the SYMBOLS subcommand to be enclosed
           in doublequotes, `commonnexus` also allows unquoted content; i.e. `SYMBOLS=01` is
           treated as equivalent to `SYMBOLS="01"`.
    6. EQUATE. This subcommand allows one to define symbols to represent one matrix entry. For
       example, EQUATE="E=(012)" means that each occurrence of E in the MATRIX command will be
       interpreted as meaning states 0, 1, and 2. The equate symbols cannot be any of the
       :data:`INVALID_SYMBOLS` or any of the currently defined MISSING, GAP, MATCHCHAR, or state
       SYMBOLS. Case is significant in equate symbols. That is, MISSING=? EQUATE="E=(012)e=?"
       means that E will be interpreted as 0, 1, and 2 and e will be interpreted as missing data.
    7. MATCHCHAR. This subcommand defines a matching character symbol. If this subcommand is
       included, then a matching character symbol in the MATRIX indicates that the states are
       equivalent to the states possessed by the first taxon listed in the matrix for that
       character. In the following matrix, the sequence for taxon 2 is GACTTTC:

       .. code-block::

           BEGIN DATA;
               DIMENSIONS NCHAR = 7;
               FORMAT DATATYPE=DNA MATCHCHAR=.;
               MATRIX
                   taxon_1 GACCTTA
                   taxon_2 ...T..C
                   taxon_3 ..T.C..;
           END;

       Whitespace is illegal as a matching character symbol, as are the :data:`INVALID_SYMBOLS`

       .. warning::

           `commonnexus` uses `"."` as default MATCHCHAR. So if `"."` is used as a regular state
           symbol, the NEXUS must be read using the `no_default_matchchar` config option.
    8. [NO]LABELS. This subcommand declares whether taxon or character labels are to appear on
       the left side of the matrix. By default, they should appear. If NOLABELS is used, then no
       labels appear, but then all currently defined taxa must be included in the MATRIX in the
       order in which they were originally defined.
    9. TRANSPOSE. This subcommand indicates that the MATRIX is in transposed format, with each
       row of the matrix representing the information from one character and each column
       representing the information from one taxon. The following is an example of a TRANSPOSEd
       MATRIX:

       .. code-block::

           MATRIX
               character_1 101101
               character_2 011100
               character_3 011110;

    10. INTERLEAVE. This subcommand indicates that the MATRIX is in interleaved format, i.e., it
        is broken up into sections. If the data are not transposed, then each section contains
        the information for some of the characters for all taxa. For example, the first section
        might contain data for characters 1-50 for all taxa, the second section contains data for
        characters 51-100, etc. Taxa in each section must occur in the same order. This format is
        especially useful for molecular sequence data, where the number of characters can be
        large. A small interleaved matrix follows:

        .. code-block::

            MATRIX
                taxon_1 ACCTCGGC
                taxon_2 ACCTCGGC
                taxon_3 ACGTCGCT
                taxon_4 ACGTCGCT

                taxon_1 TTAACGA
                taxon_2 TTAACCA
                taxon_3 CTCACCA
                taxon_4 TTCACCA

        The interleaved sections need not all be of the same length. In an interleaved matrix,
        newline characters are significant: they indicate that the next character information
        encountered applies to a different taxon (for nontransposed matrices).
    11. ITEMS. Each entry in the matrix gives information about a character's condition in a
        taxon. The ITEMS subcommand indicates what items of information are listed at each entry
        of the matrix. With discrete character data, the entry typically consists of the states
        observed in the taxon (either the single state observed or several states if the taxon is
        polymorphic or of uncertain state). This can be specified by the statement ITEMS=STATES,
        but because it is the default and the only option allowed by most current programs for
        discrete data, an ITEMS statement is usually unnecessary. For continuous data, however,
        the wealth of alternatives (average, median, variance, minimum, maximum, sample size)
        often requires an explicit ITEMS statement to indicate what information is represented in
        each data matrix entry. Some ITEMS (such as VARIANCE) would be appropriate to only some
        DATATYPES; other ITEMS such as SAMPLESIZE and STATES would be appropriate to most or all
        DATATYPES. If more than one item is indicated, parentheses must be used to surround the
        list of items, e.g., ITEMS=(AVERAGE VARIANCE); otherwise the parentheses are unnecessary,
        e.g., ITEMS=AVERAGE. More information about ITEMS options can be found in the discussion
        of the different DATATYPES under MATRIX; information specifically about the STATES option
        is given under STATESFORMAT.

        .. warning::

            Settings other than ``ITEMS=STATES`` are currently not supported in `commonnexus`.
    12. STATESFORMAT. The entry in a matrix usually lists (for discrete data) or may list (for
        continuous data) the states observed in the taxon. The STATESFORMAT subcommand specifies
        what information is conveyed in that list of STATES. In most current programs for
        discrete data, when a taxon is polymorphic the entry of the matrix lists only what
        distinct states were observed, without any indication of the number or frequency of
        individuals sampled with each of the states. Thus, if all individuals sampled within the
        taxon have state A, the matrix entry would be A, whereas if some have state A and others
        have state B, the entry would be (AB), which corresponds to the option
        STATESFORMAT=STATESPRESENT. Because it is the default for discrete data, this statement
        is typically unnecessary with current programs. The other STATESFORMAT options can be
        illustrated with an example, in which two individuals of a taxon were observed to have
        state A and three were observed to have state B. When STATESFORMAT=INDIVIDUALS, the state
        of each of the individuals (or other appropriate sampling subunit) is listed
        exhaustively, (A A B B B); when STATESFORMAT=COUNT, the number of individuals with each
        observed state is indicated, e.g., (A:2 B:3); when STATESFORMAT=FREQUENCY, the
        frequencies of various observed states are indicated, e.g., (A:0.40 B:0.60). The
        STATESFORMAT command may also be used for continuous data, for which the default is
        STATESFORMAT=INDIVIDUALS.

        .. warning::

            Only the default setting ``STATESFORMAT=STATESPRESENT`` is currently supported in
            `commonnexus`.
    13. [NO]TOKENS. This subcommand specifies whether data matrix entries are single symbols or
        whether they can be tokens. If TOKENS, then the data values must be full NEXUS tokens,
        separated by whitespace or punctuation as appropriate, as in the following example:

        .. code-block::

            BEGIN CHARACTERS;
                DIMENSIONS NCHAR= 3 ;
                CHARSTATELABELS 1 hair/absent present, 2 color/red blue, 3 size/small big;
                FORMAT TOKENS;
                MATRIX
                    taxon_1 absent red big
                    taxon_2 absent blue small
                    taxon_3 present blue small ;
            END;

        TOKENS is the default (and the only allowed option) for DATATYPE=CONTINUOUS; NOTOKENS is
        the default for all other DATATYPES. TOKENS is not allowed for DATATYPES DNA, RNA, and
        NUCLEOTIDE. If TOKENS is invoked, the standard three-letter amino acid abbreviations can
        be used with DATATYPE=PROTEIN and defined state names can be used for DATATYPE=STANDARD.

        .. warning::

            ``TOKENS`` is currently not supported in `commonnexus`.

    :ivar str datatype:
    :ivar bool respectcase:
    :ivar str missing:
    :ivar typing.Optional[str] gap:
    :ivar typing.List[str] symbols:
    :ivar typing.Dict[str, str] equate:
    :ivar typing.Optional[str] matchchar:
    :ivar typing.Optional[bool] labels:
    :ivar bool transpose:
    :ivar bool interleave:
    :ivar typing.List[str] items:
    :ivar typing.Optional[str] statesformat:
    :ivar typing.Optional[bool] tokens:

    .. note::

        It's typically not necessary to access the attributes of a ``Format`` instance from user
        code. Instead, the information is accessed when reading the matrix data in
        :meth:`Characters.get_matrix`.
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        # Defaults, in effect when a subcommand is missing (or when there's no FORMAT at all).
        self.datatype = None
        self.respectcase = False
        self.missing = '?'
        self.gap = None
        self.symbols = ['0', '1']  # The default for DATATYPE=STANDARD
        self.equate = {}
        self.matchchar = None if nexus and nexus.cfg.no_default_matchchar else '.'
        self.labels = None
        self.transpose = False
        self.interleave = False
        self.items = []
        self.statesformat = None
        self.tokens = None
        self.explicit_symbols = False

        if tokens is None:
            # A pure "defaults" instance; nothing to parse or validate.
            return

        words = iter_words_and_punctuation(self._tokens, nexus=nexus)
        after_equals = functools.partial(word_after_equals, words)

        subcommand = None
        # Boolean subcommands which were given an explicit value via "=yes|no".
        subcommands_set = set()
        while 1:
            try:
                word = next(words)
                if isinstance(word, str):
                    subcommand = word.upper()
                elif isinstance(word, Token) and word.text == '=':
                    if subcommand in ['RESPECTCASE', 'TRANSPOSE', 'INTERLEAVE', 'LABELS', 'TOKENS']:
                        # Some NEXUS variants set boolean subcommands always with "=no|yes"
                        word = next(words).lower()
                        if subcommand == 'LABELS' and word == 'left':
                            word = 'yes'
                        setattr(self, subcommand.lower(), BOOLEAN[word])
                        subcommands_set.add(subcommand)
                    elif subcommand:  # pragma: no cover
                        raise ValueError(subcommand)
                if subcommand in ['DATATYPE', 'MISSING', 'MATCHCHAR', 'GAP', 'STATESFORMAT']:
                    setattr(self, subcommand.lower(), after_equals())
                    if subcommand == 'DATATYPE' and self.datatype.upper() != 'STANDARD':
                        # The "0 1" default only applies to STANDARD data.
                        self.symbols = []
                elif subcommand in ['RESPECTCASE', 'TRANSPOSE', 'INTERLEAVE']:
                    if subcommand not in subcommands_set:
                        setattr(self, subcommand.lower(), True)
                elif subcommand in ['NOLABELS', 'LABELS', 'NOTOKENS', 'TOKENS']:
                    # Skip if the value was given explicitly via "=yes|no", so that e.g.
                    # LABELS=no is not overwritten with True.
                    if subcommand not in subcommands_set:
                        setattr(
                            self, subcommand.replace('NO', '').lower(), 'NO' not in subcommand)
                elif subcommand == 'SYMBOLS':
                    self.explicit_symbols = True
                    self.symbols = []
                    next_token_text = after_equals()
                    if not next_token_text.startswith('"'):
                        # Unquoted content: each character is a symbol.
                        self.symbols = list(next_token_text)
                    else:
                        for w in iter_delimited(next_token_text, words):
                            if isinstance(w, str):
                                self.symbols.extend(list(w))
                            else:
                                assert w.text in '+-'
                                self.symbols.append(w.text)
                elif subcommand == 'EQUATE':
                    # key: equate symbol awaiting its value; e: whether "=" was seen for key;
                    # bracket: the last punctuation seen after "=", if any.
                    key, e, bracket = None, False, None
                    for t in iter_delimited(after_equals(), words):
                        if isinstance(t, Token):
                            if t.text == '=':
                                assert key
                                e = True
                                bracket = None
                            else:
                                bracket = t.text
                        elif isinstance(t, str):
                            if key:
                                assert e
                                if bracket is None:
                                    assert len(t) == 1
                                    self.equate[key] = t
                                elif bracket == '(':
                                    self.equate[key] = tuple(t)
                                elif bracket == '{':
                                    self.equate[key] = set(t)
                                else:  # pragma: no cover
                                    raise ValueError(
                                        'Invalid punctuation in EQUATE content: {}'.format(bracket))
                                key, e = None, False
                            else:
                                key = t
                elif subcommand == 'ITEMS':
                    for w in iter_delimited(
                            after_equals(), words, delimiter='()', allow_single_word=True):
                        assert isinstance(w, str)
                        self.items.append(w)
            except StopIteration:
                break

        if self.datatype:
            self.datatype = self.datatype.upper()
            assert self.datatype in {
                'STANDARD', 'DNA', 'RNA', 'NUCLEOTIDE', 'PROTEIN', 'CONTINUOUS'}
            # `nexus` is optional, so there may be no config to consult; in that case nothing
            # tells us to ignore unsupported features.
            ignore_unsupported = \
                self.nexus is not None and self.nexus.cfg.ignore_unsupported
            if self.datatype == 'CONTINUOUS' and not ignore_unsupported:
                raise NotImplementedError('DATATYPE=CONTINUOUS is not supported!')

        self.items = [i.upper() for i in self.items]
        assert all(
            i in 'MIN MAX MEDIAN AVERAGE VARIANCE STDERROR SAMPLESIZE STATES'.split()
            for i in self.items)
        if not self.items:
            self.items = ['STATES']
        if self.items != ['STATES']:
            raise NotImplementedError('Only ITEMS=STATES is supported!')

        if self.statesformat:
            self.statesformat = self.statesformat.upper()
            assert self.statesformat in {'STATESPRESENT', 'INDIVIDUALS', 'COUNT', 'FREQUENCY'}
        else:
            self.statesformat = 'STATESPRESENT'
        if self.statesformat != 'STATESPRESENT':
            raise NotImplementedError(
                'STATESFORMATs other than STATESPRESENT are not supported')

        for attr in ['missing', 'gap', 'matchchar']:
            c = getattr(self, attr)
            if c:
                assert len(c) == 1 and c not in INVALID_SYMBOLS

        if self.tokens:
            raise NotImplementedError('TOKENS is not supported')

        # Add the datatype specific symbols and ambiguity codes. Explicitly specified SYMBOLS
        # are kept and come first in the list.
        if self.datatype in {'DNA', 'RNA', 'NUCLEOTIDE'}:
            T = 'U' if self.datatype == 'RNA' else 'T'
            self.symbols.extend(list('ACG' + T))
            self.equate.update(
                R=set('AG'),
                Y=set('C' + T),
                M=set('AC'),
                K=set('G' + T),
                S=set('CG'),
                W=set('A' + T),
                H=set('AC' + T),
                B=set('CG' + T),
                V=set('ACG'),
                D=set('AG' + T),
                N=set('ACG' + T),
                X=set('ACG' + T),
            )
            if self.datatype == 'NUCLEOTIDE':
                self.equate.update(U='T')
        elif self.datatype == 'PROTEIN':
            self.symbols.extend(list('ACDEFGHIKLMNPQRSTVWY*'))
            self.equate.update(B=set('DN'), Z=set('EQ'))

        if not self.respectcase:
            self.equate = {k.upper(): v for k, v in self.equate.items()}

        # Equate symbols must not clash with any other kind of symbol.
        invalid_equate = \
            list(INVALID_SYMBOLS) + self.symbols + \
            [self.missing or '', self.gap or '', self.matchchar or '']
        assert not any(c in invalid_equate for c in self.equate)
class Charstatelabels(Payload):
    """
    This command allows specification of both the names of the characters and the names of the
    states. This command was developed as an alternative to the older commands CHARLABELS and
    STATELABELS. For example,

    .. code-block::

        CHARSTATELABELS
            1 eye_color/red blue green,
            3 head_shape/round square,
            5 pronotum_size/small medium large

    A forward slash (/) separates the character name and the state names, with a comma separating
    the information for different characters. If no state names are to be specified, the slash
    may be omitted; if no character names are to be specified, the slash must be included, but no
    token needs to be included between the character number and the slash. If state x is the last
    state to be named, then subsequent states need not be named, but states 1 through x must be.
    If no name is to be applied to a state, enter a single underscore for its name. Character and
    state names are single NEXUS words. Character names must not correspond to another character
    name or number; thus, 1 is not a valid name for the second character listed. State names
    cannot be applied if DATATYPE=CONTINUOUS.

    :ivar typing.List[types.SimpleNamespace] characters:

    .. code-block:: python

        >>> cmd = Charstatelabels('1 eye_color/red blue green, 3 head_shape/round square')
        >>> cmd.characters[0].name
        'eye_color'
        >>> cmd.characters[0].states
        ['red', 'blue', 'green']

    .. warning::

        In strict mode (see :class:`commonnexus.nexus.Config`) duplicate character names will
        raise a ``ValueError``, otherwise a ``UserWarning`` will be emitted. While a matrix with
        duplicate character names can still be read, it will typically **not** be as expected,
        because only the values for the last character for a given name will be present.
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        self.characters = []
        seen_names = set()

        # State of the character definition currently being read.
        number, label, state_names, past_slash = None, None, [], False
        saw_comma = False
        for word in iter_words_and_punctuation(self._tokens, nexus=nexus):
            if number is None:
                # A definition always starts with the character number.
                number = int(word)
            elif isinstance(word, Token) and word.text == ',':
                # A comma terminates the current definition.
                saw_comma = True  # We want to be able to detect trailing commas!
                if label and label in seen_names:
                    duplicate_charlabel(label, 'CHARSTATELABELS', nexus)
                seen_names.add(label)
                self.characters.append(
                    types.SimpleNamespace(number=number, name=label, states=state_names))
                number, label, state_names, past_slash = None, None, [], False
            elif past_slash:
                state_names.append(word)
            elif isinstance(word, Token) and word.text == '/':
                past_slash = True
            elif label:
                # A second word before the slash.
                raise ValueError(
                    'Illegal token in charstatelabel: "{}{}"'.format(label, word))
            else:
                label = word

        if number:
            # The last definition is not terminated by a comma.
            if label and label in seen_names:
                duplicate_charlabel(label, 'CHARSTATELABELS', nexus)
            self.characters.append(
                types.SimpleNamespace(number=number, name=label, states=state_names))
        elif saw_comma:  # There was a comma, but no new label.
            warnings.warn('Trailing comma in CHARSTATELABELS command')
class Charlabels(Payload):
    """
    This command allows specification of names of characters:

    .. code-block::

        CHARLABELS
            flange microsculpture
            body_length
            hind_angles #_spines
            spine_size _ _ head_size
            pubescent_intervals head_color
            clypeal_margin;

    Character labels are listed consecutively. If character x is the last character to be named,
    then subsequent characters need not be named, but characters 1 through x need to be. If no
    name is to be applied to a character, a single underscore can be used for its name. Character
    names are single NEXUS words. They must not correspond to another character name or number;
    thus, 1 is not a valid name for the second character listed. The command should be used only
    for nontransposed matrices (in transposed matrices, the character labels are defined in the
    MATRIX command). We recommend that programs abandon this command in place of the more
    flexible CHARSTATELABELS command when writing NEXUS files, although programs should continue
    to read CHARLABELS because many existing NEXUS files use CHARLABELS.

    :ivar typing.List[types.SimpleNamespace] characters:
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        self.characters = []
        seen = set()
        labels = iter_words_and_punctuation(self._tokens, nexus=nexus)
        # Labels are listed consecutively, so the (1-based) position is the character number.
        for number, label in enumerate(labels, start=1):
            assert isinstance(label, str)
            if label and label in seen:
                duplicate_charlabel(label, 'CHARLABELS', nexus)
            seen.add(label)
            self.characters.append(
                types.SimpleNamespace(number=number, name=label, states=[]))
class Statelabels(Payload):
    """
    This command allows specification of the names of states:

    .. code-block::

        STATELABELS
            1 absent present,
            2 isodiametric transverse,
            3 '4.5-6.2mm' '6.3-7.0mm' '7.7-11.0mm' '>12.0mm',
            4 rounded subangulate angulate,
            10 0 '1-4' '6-9' '7-9' '8-9' 7 8 9,
            11 black rufous metallic flavous,
            12 straight concave,

    State labels need not be specified for all characters. A comma must separate state labels for
    each character. State labels are listed consecutively within a character. If state x is the
    last state to be named, then subsequent states need not be named, but states 1 through x must
    be. If no name is to be applied to a state, enter a single underscore for its name. State
    names are single NEXUS words. This command is not valid for DATATYPE=CONTINUOUS. We recommend
    that programs abandon this command in place of the more flexible CHARSTATELABELS command when
    writing NEXUS files, although programs should continue to read STATELABELS because many
    existing NEXUS files use STATELABELS.

    :ivar typing.List[types.SimpleNamespace] characters:
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        self.characters = []

        number, labels = None, []
        for word in iter_words_and_punctuation(self._tokens, nexus=nexus):
            if number is None:
                # Each group of state labels starts with the character number.
                number = int(word)
            elif isinstance(word, Token) and word.text == ',':
                # A comma closes the group for the current character.
                self.characters.append(
                    types.SimpleNamespace(number=number, name=None, states=labels))
                number, labels = None, []
            else:
                assert isinstance(word, str)
                labels.append(word)

        # The last group need not be terminated by a comma.
        if number and labels:
            self.characters.append(
                types.SimpleNamespace(number=number, name=None, states=labels))
class Matrix(Payload):
    """
    In its standard format, the MATRIX command contains a sequence of taxon names and state
    information for that taxon. The MATRIX itself is of the form

    .. code-block::

        MATRIX
            taxon-name entry entry... entry
            taxon-name entry entry... entry
            taxon-name entry entry... entry;

    Each entry in the matrix is the information about a particular character for a particular
    taxon. For example, it might be the assignment of state 0 to taxon 1 for character 1. Thus,
    the entry would consist of one state symbol, 0. If the taxon were polymorphic, the entry
    would consist of multiple state symbols, e.g. (0 1), indicating the taxon has both states 0
    and 1. More details about the nature of each entry of the matrix are given under ITEMS and
    under each DATATYPE.

    Each entry needs to be enclosed in parentheses or braces whenever more than one state symbol
    is given, e.g. (01) with standard data and the default NOTOKENS option, or if the information
    is conveyed by more than one NEXUS token, e.g., (0:100) or (2.3 4.5 6.7). Otherwise, the
    parentheses or braces are optional. No whitespace is needed between entries in the matrix
    unless the TOKENS subcommand of the FORMAT command is invoked or implied and parentheses or
    braces do not surround an entry.

    Taxa need not be in the same order as in the TAXA block, and the matrix need not contain all
    taxa. For interleaved matrices, all sections must have the same taxa in the same order.

    Examples of matrices of different DATATYPES are described below.

    1. For STANDARD data, each entry of the matrix consists of a single state-set. Under the
       defaults (ITEMS=STATES and STATESFORMAT=STATESPRESENT), each entry of the matrix consists
       of a single state-set; if there are multiple states, then the entry must be enclosed in
       parentheses (indicating polymorphism) or braces (indicating uncertainty in state). For
       example, in the following matrix,

       .. code-block::

           BEGIN CHARACTERS;
               DIMENSIONS NCHAR=9;
               FORMAT SYMBOLS="-+x";
               MATRIX
                   taxon_1 (-+){-+}+---+--
                   taxon_2 +x-++--+x
                   taxon_3 -++++--+x;
           END;

       taxon_1 is polymorphic for the first character and has either state - or state + for the
       second character. If STATESFORMAT=COUNT or FREQUENCY, then each entry must be enclosed in
       parentheses because more than one token is required to convey information for even one
       state:

       .. code-block::

           BEGIN CHARACTERS;
               DIMENSIONS NCHAR=3;
               FORMAT STATESFORMAT=FREQUENCY SYMBOLS = "012";
               MATRIX
                   taxon_1 (0:0.25 1:0.75) (0:0.3 1:0.7) (0:0.5 1:0.3 2:0.2)
                   taxon_2 (0:0.4 1:0.6) (0:0.8 1:0.2) (1:0.15 2:0.85)
                   taxon_3 (0:0.0 1:1.0) (0:0.55 1:0.45) (0:0.1 1:0.9);
           END;

    2. For DNA, RNA, NUCLEOTIDE, and PROTEIN data, each entry of the matrix consists of one or
       more state symbols describing the state(s) at one site in a molecular sequence. If
       STATESFORMAT=STATESPRESENT and if an entry represents a single state, then it is
       represented as a single state symbol (or if DATATYPE=PROTEIN and TOKENS, as a three-letter
       amino acid name). If an entry represents multiple states, then it must be enclosed in
       parentheses (indicating polymorphism) or braces (indicating uncertainty in state).
       Following is a matrix of DATATYPE=DNA:

       .. code-block::

           BEGIN CHARACTERS;
               DIMENSIONS NCHAR=12;
               FORMAT DATATYPE = DNA;
               MATRIX
                   taxon_1 ACCATGGTACGT
                   taxon_2 TCCATGCTACCC
                   taxon_3 TCCATGGAACCC;
           END;

    3. For CONTINUOUS data, each entry in the matrix must be enclosed by parentheses if more than
       one item is specified in the ITEMS subcommand. Parentheses must also be used whenever
       multiple tokens are needed for an entry in the matrix. If an entry consists of a single
       token (e.g., 0.231), it may be written without parentheses but must then be separated from
       other entries by whitespace.

       .. code-block::

           MATRIX
               A 0.453 1.43 78.6
               B 0.34 1.02 55.7
               C 0.22 1.79 69.1;

       A matrix entry can include average, minimum, maximum, variance, standard error, sample
       size, and a listing of states observed in the taxon, as specified in the ITEMS subcommand.
       The sample size, if included, must be in the form of an integer; the other numbers can be
       either in English decimal (e.g., 0.00452) or in exponential form (e.g., 4.52E-3).

       The information listed for each taxon for a continuous character is specified in the ITEMS
       subcommand of the FORMAT command. For example, if the matrix contains only information
       about the minimum and maximum value for each taxon, the ITEMS subcommand would be
       ITEMS=(MIN MAX) and a small matrix might look something like this:

       .. code-block::

           MATRIX
               taxon_1 (0.21 0.45) (0.34 0.36)
               taxon_2 (0.13 0.22) (0.45 0.55);

       If the ITEMS include the raw measurements (states), e.g., to list a sample of measurements
       from individuals, then the other items must precede the listing of states. There is no
       restriction on the number of elements in the listing of states. This example has only one
       continuous character:

       .. code-block::

           FORMAT DATATYPE=CONTINUOUS ITEMS=(AVERAGE STATES) STATESFORMAT=INDIVIDUALS;
           MATRIX
               taxon_1 (1.2 2.1 1.6 0.8 1.8 0.3 0.6)
               taxon_2 (1.6 2.2 1.7 1.0 2.0 1.6 1.9 0.8);

       in which the first value is the sample average and the subsequent values comprise the
       sample of observed states.

       Possible ITEMS to be included are MIN (minimum), MAX (maximum), AVERAGE (sample average),
       VARIANCE (sample variance), STDERROR (standard error), MEDIAN (sample median), SAMPLESIZE,
       and STATES. The manner of presentations of states can be indicated using the STATESFORMAT
       command. The default ITEMS for continuous data is AVERAGE.

    .. note::

        Since reading the matrix data only makes sense if information from other commands - in
        particular :class:`FORMAT <Format>` - is considered, the ``Matrix`` object does not have
        any attributes for data access. Instead, the matrix data can be read via
        :meth:`Characters.get_matrix`.
    """
class Characters(Block):
    """
    A CHARACTERS block defines characters and includes character data.

    Taxa are usually not defined in a CHARACTERS block; if they are not, the CHARACTERS block must
    be preceded by a block that defines taxon labels and ordering (e.g., TAXA).

    Syntax of the CHARACTERS block is as follows:

    .. rst-class:: nexus

        | BEGIN CHARACTERS;
        |   :class:`DIMENSIONS <Dimensions>` [NEWTAXA NTAX=num-taxa] NCHAR=num-characters;
        |   [:class:`FORMAT <Format>`
        |     [DATATYPE = { STANDARD| DNA | RNA | NUCLEOTIDE | PROTEIN | CONTINUOUS} ]
        |     [RESPECTCASE]
        |     [MISSING=symbol]
        |     [GAP=symbol]
        |     [SYMBOLS="symbol [symbol...]"]
        |     [EQUATE="symbol = entry [symbol = entry... ] " ]
        |     [MATCHCHAR= symbol ]
        |     [[NO]LABELS]
        |     [TRANSPOSE]
        |     [INTERLEAVE]
        |     [ITEMS=([MIN] [MAX] [MEDIAN] [AVERAGE] [VARIANCE] [STDERROR] [SAMPLESIZE] [STATES])]
        |     [STATESFORMAT= {STATESPRESENT | INDIVIDUALS | COUNT | FREQUENCY}]
        |     [[NO]TOKENS]
        |   ;]
        |   [:class:`ELIMINATE <Eliminate>` character-set;]
        |   [:class:`TAXLABELS <commonnexus.blocks.taxa.Taxlabels>` taxon-name [taxon-name ...];]
        |   [:class:`CHARSTATELABELS <Charstatelabels>`
        |     character-number [character-name] [/state-name [state-name...]]
        |     [, character-number [character-name] [/state-name [state-name...]]...]
        |   ;]
        |   [:class:`CHARLABELS <Charlabels>` character-name [character-name...];]
        |   [:class:`STATELABELS <Statelabels>`
        |     character-number [state-name [state-name ...]]
        |     [, character-number [state-name [state-name...]]...]
        |   ;]
        |   :class:`MATRIX <Matrix>` data-matrix;
        | END;

    :class:`DIMENSIONS <Dimensions>`, :class:`FORMAT <Format>`, and
    :class:`ELIMINATE <Eliminate>` must all precede :class:`CHARLABELS <Charlabels>`,
    :class:`CHARSTATELABELS <Charstatelabels>`, :class:`STATELABELS <Statelabels>`, and
    :class:`MATRIX <Matrix>`. :class:`DIMENSIONS <Dimensions>` must precede
    :class:`ELIMINATE <Eliminate>`. Only one of each command is allowed per block.
    """
    __commands__ = [
        Dimensions, Format, Eliminate, Taxlabels, Charstatelabels, Charlabels, Statelabels, Matrix]

    def is_binary(self) -> bool:
        """
        :return: Whether the matrix in the block is binary, i.e. codes items as presence/absence
            using symbols "01". A block without a FORMAT command is considered binary.
        """
        return not bool(self.FORMAT) or (self.FORMAT.symbols == ['0', '1'])

    def get_matrix(self, labeled_states: bool = False) -> StateMatrix:
        """
        Read the MATRIX command of the block according to the block's FORMAT.

        :param labeled_states: Flag signaling whether state symbols should be translated to state
            labels (if available).
        :return: The values of the matrix, read according to FORMAT. The matrix is returned as
            ordered `dict`, mapping taxon labels (if available, else numbers) to ordered `dict`s
            mapping character labels (if available, else numbers) to state values. State values
            are either atomic values (of type `str`) or `tuple`s (indicating polymorphism) or
            `set`s (indicating uncertainty) of atomic values. Atomic values may be `None`
            (indicating missing data), the special string `GAP` (indicating gaps) or state symbols
            or labels (if available and explicitly requested via `labeled_states=True`). State
            symbols are returned using the case given in FORMAT SYMBOLS, i.e. if a RESPECTCASE
            directive is missing and FORMAT SYMBOLS="ABC", a value "a" in the matrix will be
            returned as "A".
        :raises ValueError: If a transposed, labeled, non-interleaved matrix is read without NTAX,
            if unexpected punctuation is found in the matrix, or - in strict mode - if the matrix
            contains undeclared taxa.
        :raises NotImplementedError: For matrices of datatype CONTINUOUS.
        """
        fmt = Format(None) if 'FORMAT' not in self.commands else self.FORMAT

        # Determine dimensions and labels:
        ntax, taxlabels = self.get_taxlabels(fmt)
        nchar = self.DIMENSIONS.nchar
        charlabels, statelabels = self.get_charstatelabels(nchar, fmt)

        if fmt.transpose and (not fmt.interleave) and (fmt.labels is not False) and (not ntax):
            raise ValueError("Can't read transposed matrix without NTAX.")  # pragma: no cover

        if fmt.datatype == 'CONTINUOUS':  # pragma: no cover
            raise NotImplementedError("Can't read a matrix of datatype CONTINUOUS")

        # We read the matrix data in an agnostic way, ignoring whether it's transposed or not, as
        # ordered dictionary mapping row labels (or numbers) to lists of entries.
        res = collections.OrderedDict()
        label, entries = None, []
        if fmt.transpose:
            ncols, nrows = ntax, nchar
        else:
            ncols, nrows = nchar, ntax
        # Interleaved matrices are read line by line; otherwise all tokens form one "line".
        lines = iter_lines(self.MATRIX._tokens) if fmt.interleave else [self.MATRIX._tokens]
        for i, line in enumerate(lines, start=1):
            words = iter_words_and_punctuation(
                line, allow_punctuation_in_word='+-', nexus=self.nexus)
            while 1:
                try:
                    t = next(words)
                    if (fmt.labels is not False) and label is None:
                        # The first word of a labeled row is the row label.
                        assert isinstance(t, str)
                        label = t
                        continue
                    if isinstance(t, Token):
                        if t.text == '(':
                            # Polymorphic state: collect symbols up to the closing parenthesis.
                            w = next(words)
                            symbols = ''
                            while isinstance(w, str) or (w.text in fmt.symbols) \
                                    or (w.text == ","):
                                if isinstance(w, str) or w.text != ',':
                                    symbols += getattr(w, 'text', w)
                                w = next(words)
                            assert w.text == ')', "Expected )"
                            entries.append(tuple(symbols))
                        elif t.text == '{':
                            # Uncertain state: collect symbols up to the closing brace.
                            w = next(words)
                            vals = set()
                            while isinstance(w, str) or (w.text in fmt.symbols) \
                                    or (w.text == fmt.gap) or (w.text == ","):
                                if isinstance(w, str) or w.text != ',':
                                    vals |= set(getattr(w, 'text', w))
                                w = next(words)
                            assert w.text == '}', "Expected }"
                            entries.append(vals)
                        elif t.text in fmt.symbols:  # pragma: no cover
                            entries.append(t.text)
                        else:  # pragma: no cover
                            raise ValueError('Unexpected punctuation in matrix')
                    else:
                        entries.extend(list(t))  # We split a word into a list of symbols.
                    if not fmt.interleave and (len(entries) == ncols):
                        # A full row has been read.
                        res[label or (len(res) + 1)] = entries
                        label, entries = None, []
                except StopIteration:
                    break
            if fmt.interleave:
                # Each line contributes a chunk to one row; unlabeled rows are identified by the
                # position of the line within the interleaved block.
                key = label or (i % nrows or nrows)
                if key not in res:
                    res[key] = []
                res[key].extend(entries)
                label, entries = None, []

        cols = ncols or len(list(res.values())[0])
        assert all(len(states) == cols for states in res.values()), "Incomplete matrix read!"

        if not taxlabels:
            assert not fmt.transpose
            taxlabels = {i + 1: key for i, key in enumerate(res)}

        def apply_to_state(func, state, *args, **kw):
            # We have to do a bit of mapping and renaming, which always needs to deal with the
            # three different types of state.
            if isinstance(state, str):
                return func(state, *args, **kw)
            if isinstance(state, tuple):
                return tuple(func(s, *args, **kw) for s in state)
            if isinstance(state, set):
                return set(func(s, *args, **kw) for s in state)
            raise ValueError(state)  # pragma: no cover

        lax_symbols = not fmt.explicit_symbols and fmt.datatype in {None, 'STANDARD'} \
            and not (self.nexus and self.nexus.cfg.strict)

        def replace_symbol(s, i, r):
            # Map a raw symbol `s` in column `i` to None (missing), GAP, the entry of the first
            # row `r` (MATCHCHAR) or the state symbol in the case given in FORMAT SYMBOLS.
            if (fmt.respectcase and s == fmt.missing) or \
                    (not fmt.respectcase and (s.upper() == fmt.missing.upper())):
                return None
            if fmt.gap:
                if (fmt.respectcase and s == fmt.gap) or \
                        (not fmt.respectcase and (s.upper() == fmt.gap.upper())):
                    return GAP
            if fmt.matchchar:  # match entries from first row!
                if (fmt.respectcase and s == fmt.matchchar) or \
                        (not fmt.respectcase and (s.upper() == fmt.matchchar.upper())):
                    assert r
                    return r[i]
            if s not in fmt.symbols:
                s = s.lower() if s.isupper() else s.upper()
            if not lax_symbols:
                assert s in fmt.symbols, '{} {}'.format(s, fmt.symbols)
            return s

        def resolve_symbols(s, i, r):
            def resolve(c, i, r):
                c = fmt.equate.get(c.upper(), c)  # May result in ambiguous or multiple states!
                return apply_to_state(replace_symbol, c, i, r)
            return apply_to_state(resolve, s, i, r)

        firstrow = None
        for rownum, rowkey in enumerate(res):
            res[rowkey] = [
                resolve_symbols(s, colnum, firstrow) for colnum, s in enumerate(res[rowkey])]
            if rownum == 0:
                # We need the fully resolved entries of the first row around to resolve MATCHCHARs.
                firstrow = res[rowkey]

        # Create the final result, an OrderedDict mapping taxa labels (or numbers) to OrderedDicts
        # mapping character labels or numbers to state symbols or labels.
        matrix = collections.OrderedDict()
        if not fmt.transpose:
            tlabels = {str(k) for k in res.keys()}
            valid_taxa = {str(k) for k in taxlabels}.union(taxlabels.values())
            if not tlabels.issubset(valid_taxa):
                if self.nexus and self.nexus.cfg.strict:  # pragma: no cover
                    raise ValueError('Found undeclared taxa in characters matrix')
                else:
                    warnings.warn('Dropping undeclared taxa from characters matrix.')

        for tnum, tlabel in sorted(taxlabels.items()):
            if fmt.transpose:
                # We have to pick the tnum column in each list in res.
                matrix[tlabel] = collections.OrderedDict()
                for cnum, clabel in sorted(charlabels.items()):
                    entries = res[clabel] if clabel in res else res[cnum]
                    matrix[tlabel][clabel] = entries[tnum - 1]
            else:
                key = tlabel if tlabel in res else (tnum if tnum in res else str(tnum))
                if key in res:  # Non-transposed matrices may not have data for each taxon!
                    entries = res[key]
                    matrix[tlabel] = collections.OrderedDict(
                        [(charlabels[i], s) for i, s in enumerate(entries, start=1)])
        if labeled_states:
            for entries in matrix.values():
                for char in entries:
                    if char in statelabels:
                        if entries[char] not in {None, GAP}:
                            # Bind the label mapping explicitly rather than closing over the
                            # loop variable.
                            labels = statelabels[char]
                            entries[char] = apply_to_state(
                                lambda s, labels=labels: labels.get(s) or s, entries[char])
        return matrix

    def get_taxlabels(self, format):
        """
        Determine the number of taxa and the taxon labels relevant for the block.

        Labels are looked up in the block's own TAXLABELS command, then in a linked TAXA block and
        finally in the TAXA block of the containing NEXUS (if the block is attached to one).

        :param format: The `Format` object controlling how the matrix is read.
        :return: A pair `(ntax, taxlabels)`, with `taxlabels` a `dict` mapping taxon numbers to
            labels. If no taxa information is found `ntax` is `None` and `taxlabels` is empty.
        """
        if self.TAXLABELS:
            taxlabels = self.TAXLABELS.labels
            ntax = self.DIMENSIONS.ntax
        elif 'TAXA' in self.linked_blocks:
            taxlabels = self.linked_blocks['TAXA'].TAXLABELS.labels
            ntax = self.linked_blocks['TAXA'].DIMENSIONS.ntax
        elif self.nexus and self.nexus.TAXA:
            # Blocks may exist without an associated Nexus object; only consult it if present.
            taxlabels = self.nexus.TAXA.TAXLABELS.labels
            ntax = self.nexus.TAXA.DIMENSIONS.ntax
        else:
            ntax, taxlabels = None, {}

        if format.interleave and format.labels is False and not format.transpose:
            # If the matrix has no row labels and is not transposed, we need the number of taxa to
            # compute the size of the interleaved blocks.
            assert ntax
            taxlabels = taxlabels or {i + 1: str(i + 1) for i in range(ntax)}
        return ntax, taxlabels

    def get_charstatelabels(self, nchar=None, format=None):
        """
        Collect character and state labels from CHARSTATELABELS, CHARLABELS and STATELABELS.

        :param nchar: Number of characters; defaults to NCHAR of the DIMENSIONS command.
        :param format: `Format` object supplying the state symbols; defaults to the block's FORMAT
            command or a default `Format`.
        :return: A pair `(charlabels, statelabels)`. `charlabels` maps character numbers to
            character labels (the number as `str` if no label is available). `statelabels` maps
            character labels to ordered `dict`s mapping state symbols to state labels; state
            labels given as `_` are skipped.
        """
        nchar = nchar or self.DIMENSIONS.nchar
        charlabels = {i + 1: str(i + 1) for i in range(nchar)}
        statelabels = {}
        if self.CHARSTATELABELS:
            charlabels = {
                int(c.number): c.name or str(c.number) for c in self.CHARSTATELABELS.characters}
            statelabels = {
                c.number: c.states for c in self.CHARSTATELABELS.characters}
        elif self.CHARLABELS:
            charlabels = {
                int(c.number): c.name or str(c.number) for c in self.CHARLABELS.characters}
        if self.STATELABELS:
            statelabels = {c.number: c.states for c in self.STATELABELS.characters}

        format = format or self.FORMAT or Format(None)
        if statelabels:
            # Re-key the state labels by character label and pair them with the state symbols.
            statelabels = {charlabels[cnum]: states for cnum, states in statelabels.items()}
            for clabel in statelabels:
                states = statelabels[clabel]
                labeled = collections.OrderedDict()
                if format:
                    for i, symbol in enumerate(format.symbols):
                        if i < len(states) and states[i] != '_':
                            labeled[symbol] = states[i]
                statelabels[clabel] = labeled
        else:
            statelabels = {}
        assert len(charlabels) == nchar
        return charlabels, statelabels

    def validate(self, log=None):
        """
        Validate the block, additionally checking that TAXLABELS is only used with NEWTAXA.

        :param log: Passed on to the base class validation and to `log_or_raise`.
        :return: The result of `log_or_raise` if TAXLABELS is present without NEWTAXA, otherwise
            the result of the base class validation.
        """
        res = super().validate(log)
        if 'TAXLABELS' in self.commands and not self.DIMENSIONS.newtaxa:
            return log_or_raise(
                'TAXLABELS may only be defined in {} block if NEWTAXA is specified.'.format(
                    self.name),
                log=log)
        return res

    @classmethod
    def from_data(cls,
                  matrix: StateMatrix,
                  taxlabels: bool = False,
                  statelabels: typing.Optional[typing.Dict[str, typing.Dict[str, str]]] = None,
                  datatype: str = 'STANDARD',
                  missing: str = '?',
                  gap: str = '-',
                  comment: typing.Optional[str] = None,
                  nexus: typing.Optional["Nexus"] = None,
                  TITLE: typing.Optional[str] = None,
                  ID: typing.Optional[str] = None,
                  LINK: typing.Optional[typing.Union[str, typing.Tuple[str, str]]] = None) \
            -> 'Characters':
        """
        Instantiate a CHARACTERS or DATA block from a matrix.

        This functionality can be used to normalize the NEXUS formatting of CHARACTERS matrices:

        .. code-block:: python

            >>> nex = Nexus('''#NEXUS
            ... BEGIN TAXA;
            ... DIMENSIONS NTAX=3;
            ... TAXLABELS t1 t2 t3;
            ... END;
            ... BEGIN CHARACTERS;
            ... DIMENSIONS NCHAR=3;
            ... FORMAT TRANSPOSE NOLABELS;
            ... MATRIX 100 010 001;
            ... END;''')
            >>> matrix = nex.CHARACTERS.get_matrix()
            >>> nex.replace_block(nex.CHARACTERS, Characters.from_data(matrix))
            >>> print(nex)
            #NEXUS
            BEGIN TAXA;
            DIMENSIONS NTAX=3;
            TAXLABELS t1 t2 t3;
            END;
            BEGIN CHARACTERS;
            DIMENSIONS NCHAR=3;
            FORMAT DATATYPE=STANDARD MISSING=? GAP=- SYMBOLS="01";
            MATRIX
            t1 100
            t2 010
            t3 001
            ;
            END;

        :param matrix: A matrix as returned by :meth:`Characters.get_matrix()`, with unlabeled
            states. I.e. `None` is used to mark missing values, and `GAP` to mark gapped values.
            These special states will be converted to the symbols passed as `missing` and `gap`
            upon writing.
        :param taxlabels: If `True`, include a TAXLABELS command rather than relying on a TAXA
            block being present.
        :param statelabels: Optional mapping of character labels to mappings whose values are
            written as state names in the CHARSTATELABELS command.
        :param datatype: The DATATYPE of the matrix; only `'STANDARD'` is supported.
        :param missing: Symbol written for missing values.
        :param gap: Symbol written for gapped values.
        :param comment: Passed on to `from_commands`.
        :param nexus: An optional Nexus instance to lookup global config options.
        :param TITLE: Passed on to `from_commands`.
        :param ID: Passed on to `from_commands`.
        :param LINK: Passed on to `from_commands`.
        :raises NotImplementedError: If `datatype` is not `'STANDARD'`.
        :raises ValueError: If `missing` or `gap` collide with a state symbol in the matrix.
        """
        if datatype != 'STANDARD':  # pragma: no cover
            raise NotImplementedError('Only DATATYPE=STANDARD is supported for writing CHARACTERS')
        symbols, rows, charlabels, maxlen, tlabels = set(), [], None, 0, {}
        for taxon in matrix:
            # We compute maximum taxon label length for pretty printing.
            tlabels[taxon] = Word(taxon).as_nexus_string()
            maxlen = max(maxlen, len(tlabels[taxon]))

        def symbol(c):
            # Translate the special states back into the MISSING and GAP symbols.
            return missing if c is None else (gap if c == GAP else c)

        for taxon, entries in matrix.items():
            if not charlabels:
                # Character numbers (as str) mapped to the character labels of the first row.
                charlabels = collections.OrderedDict(
                    [(str(i + 1), c) for i, c in enumerate(entries)])
            row = []
            for entry in entries.values():
                if entry:
                    symbols |= set(entry)
                if isinstance(entry, tuple):  # polymorphism -> ()
                    row.append('({})'.format(''.join(symbol(c) for c in entry)))
                elif isinstance(entry, set):  # uncertainty -> {}
                    row.append('{{{}}}'.format(''.join(sorted(symbol(c) for c in entry))))
                else:
                    row.append(symbol(entry))
            rows.append('\n{} {}'.format(tlabels[taxon].ljust(maxlen), ''.join(row)))

        symbols = ''.join(sorted([s for s in symbols if s not in [None, GAP]]))
        if missing in symbols or (gap in symbols):
            raise ValueError('MISSING or GAP markers must be distinct from "{}"'.format(symbols))
        # Case only matters if the same data uses both upper- and lowercase symbols.
        respectcase = any(c.isupper() for c in symbols) and any(c.islower() for c in symbols)
        dimensions = 'NCHAR={}'.format(len(list(matrix.values())[0]))
        if taxlabels:
            dimensions = 'NEWTAXA NTAX={} {}'.format(len(tlabels), dimensions)
        cmds = [
            ('DIMENSIONS', dimensions),
            ('FORMAT', 'DATATYPE=STANDARD {}MISSING={} GAP={} SYMBOLS="{}"'.format(
                'RESPECTCASE ' if respectcase else '', missing, gap, symbols)),
        ]
        statelabels = statelabels or {}
        if any(k != v for k, v in charlabels.items()):
            # Character labels differ from plain numbers, so they must be written explicitly.
            cmds.append((
                'CHARSTATELABELS',
                ', '.join('\n {} {}{}'.format(
                    n,
                    Word(l).as_nexus_string(),
                    '/' + ' '.join(Word(ll).as_nexus_string() for ll in statelabels[l].values())
                    if statelabels.get(l) else '',
                ) for n, l in charlabels.items())))
        if taxlabels:
            cmds.append(('TAXLABELS', ' '.join(tlabels.values())))
        cmds.append(('MATRIX', ''.join(rows) + '\n'))
        return cls.from_commands(cmds, nexus=nexus, TITLE=TITLE, ID=ID, LINK=LINK, comment=comment)
class Options(Payload):
    """
    OPTIONS command of a DATA block.

    The GAPMODE subcommand of the OPTIONS command of the ASSUMPTIONS block was originally housed
    in an OPTIONS command in the DATA block.

    :ivar typing.Optional[str] gapmode: `missing` or `newstate`.
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        self.gapmode = None
        stream = iter_words_and_punctuation(self._tokens, nexus=nexus)
        try:
            # Scan all words; running out of tokens - also while looking for the value after
            # GAPMODE - simply ends the scan.
            while True:
                item = next(stream)
                if isinstance(item, str) and item.upper() == 'GAPMODE':
                    self.gapmode = word_after_equals(stream).lower()
        except StopIteration:
            pass
        assert self.gapmode in {None, 'missing', 'newstate'}
class Data(Characters):
    """
    This block is equivalent to a CHARACTERS block in which the NEWTAXA subcommand is included in
    the DIMENSIONS command. That is, the DATA block is a CHARACTERS block that includes not only
    the definition of characters but also the definition of taxa.

    .. note::

        The GAPMODE subcommand of the OPTIONS command of the ASSUMPTIONS block was originally
        housed in an :class:`OPTIONS <Options>` command in the DATA block.
    """
    # Same commands as a CHARACTERS block, plus OPTIONS.
    __commands__ = [
        Dimensions,
        Format,
        Options,
        Eliminate,
        Taxlabels,
        Charstatelabels,
        Charlabels,
        Statelabels,
        Matrix,
    ]