import types
import typing
import warnings
import functools
import collections
from .base import Block, Payload
from commonnexus.util import log_or_raise
from commonnexus.tokenizer import (
iter_words_and_punctuation, Token, iter_delimited, iter_lines, BOOLEAN, word_after_equals, Word,
)
from .taxa import Taxlabels
if typing.TYPE_CHECKING: # pragma: no cover
from commonnexus import Nexus
# The value of a single cell of a CHARACTERS matrix. It is one of
#   - None:         missing data,
#   - str:          a single state symbol (or the special GAP marker),
#   - set of str:   uncertainty between several states,
#   - tuple of str: polymorphism, i.e. several states at once.
# A polymorphic cell may list any number of symbols, hence the variadic tuple type.
State = typing.Union[None, str, typing.Set[str], typing.Tuple[str, ...]]
# Ordered mapping of taxon labels to ordered mappings of character labels to cell values.
StateMatrix = typing.OrderedDict[str, typing.OrderedDict[str, State]]
GAP = '\uFFFD'  # REPLACEMENT CHARACTER used to replace an [...] unrepresentable character
#: Some - but not all - punctuation is invalid as (special) state symbol.
INVALID_SYMBOLS = "()[]{}/\\,;:=*'\"*`<>^"
def duplicate_charlabel(label, cmd, nexus):
    """
    Report a character name that occurs more than once in a command.

    :param label: The duplicated character name.
    :param cmd: Name of the command the duplicate was found in (only used in the warning text).
    :param nexus: Optional object exposing `cfg.strict`; may be `None`.
    :raises ValueError: If a truthy `nexus` with `cfg.strict` set is passed. Otherwise only a \
    warning is emitted.
    """
    strict = nexus and nexus.cfg.strict
    if not strict:
        message = 'Duplicate character name "{}" in {} command'.format(label, cmd)
        warnings.warn(message)
        return
    raise ValueError('character names must be unique!')  # pragma: no cover
class Eliminate(Payload):
    """
    This command allows specification of a list of characters that are to be excluded from
    consideration. Programs are expected to ignore ELIMINATEd characters completely during reading.
    In avoiding allocation of memory to store character information, the programs can save a
    considerable amount of computer memory. (This subcommand is similar to ZAP in version 3.1.1
    of PAUP.) For example,

    .. code-block::

        ELIMINATE 4-100;

    tells the program to skip over characters 4 through 100 in reading the matrix. Character-set
    names are not allowed in the character list. This command does not affect character numbers.

    .. warning:: The ``ELIMINATE`` command is currently not supported in `commonnexus`.
    """
    def __init__(self, tokens, nexus=None):
        """
        :raises NotImplementedError: If a `nexus` object is passed whose configuration does not \
        set `ignore_unsupported`.
        """
        super().__init__(tokens, nexus=nexus)
        # Without a NEXUS context, or when unsupported commands are to be ignored, the command
        # is accepted silently.
        if nexus is None or nexus.cfg.ignore_unsupported:
            return
        raise NotImplementedError('The ELIMINATE command is not supported')
class Dimensions(Payload):
    """
    The DIMENSIONS command specifies the number of characters. The number following NCHAR
    is the number of characters in the data matrix. The NEXUS standard does not impose limits on the
    number of characters; a limit may be imposed by particular computer programs.

    It is strongly advised that new taxa not be defined in a CHARACTERS block, for the reasons
    discussed in the description of the DATA block. If new taxa are to be defined, this must be
    indicated by the NEWTAXA subcommand, specifying that new taxa are to be defined (this allows
    the computer program to prepare for creation of new taxa). NEWTAXA, if present, must appear
    before the NTAX subcommand. The NTAX subcommand, indicating the number of taxa in the MATRIX
    command in the block, is optional, unless NEWTAXA is specified, in which case it is required.

    :ivar bool newtaxa:
    :ivar typing.Optional[int] ntax:
    :ivar int nchar:
    """
    def __init__(self, tokens, nexus=None):
        """
        Parse the NEWTAXA, NTAX and NCHAR subcommands and validate the result via :meth:`check`.
        """
        super().__init__(tokens, nexus=nexus)
        self.newtaxa = False
        self.ntax = None
        # ``nchar`` must exist even if the command lacks an NCHAR subcommand, so that check()
        # fails with its assertion rather than with an AttributeError.
        self.nchar = None
        # Historical misspelling of ``nchar``; kept so that code reading it keeps working.
        self.char = None
        words = iter_words_and_punctuation(self._tokens, nexus=nexus)
        while True:
            try:
                word = next(words)
                if not isinstance(word, str):
                    # Punctuation outside of a "KEY = value" triple carries no information.
                    continue
                subcommand = word.upper()
                if subcommand == 'NEWTAXA':
                    self.newtaxa = True
                elif subcommand == 'NTAX':
                    n = next(words)
                    assert n.text == '='
                    self.ntax = int(next(words))
                elif subcommand == 'NCHAR':
                    n = next(words)
                    assert n.text == '='
                    self.nchar = int(next(words))
            except StopIteration:
                # Also reached when the tokens run out in the middle of a "KEY = value" triple;
                # the incomplete subcommand is then simply not recorded.
                break
        self.check()

    def check(self):
        """
        Assert that NCHAR was given (and is non-zero) and that NTAX accompanies NEWTAXA.
        """
        assert self.nchar and ((not self.newtaxa) or self.ntax)
class Charstatelabels(Payload):
    """
    This command allows specification of both the names of the characters and the names of the
    states. This command was developed as an alternative to the older commands CHARLABELS and
    STATELABELS. For example,

    .. code-block::

        CHARSTATELABELS
            1 eye_color/red blue green,
            3 head_shape/round square,
            5 pronotum_size/small medium large

    A forward slash (/) separates the character name and the state names, with a comma separating
    the information for different characters. If no state names are to be specified, the slash may
    be omitted; if no character names are to be specified, the slash must be included, but no token
    needs to be included between the character number and the slash. If state x is the last state
    to be named, then subsequent states need not be named, but states 1 through x must be. If no
    name is to be applied to a state, enter a single underscore for its name. Character and state
    names are single NEXUS words. Character names must not correspond to another character name or
    number; thus, 1 is not a valid name for the second character listed. State names cannot be
    applied if DATATYPE=CONTINUOUS.

    :ivar typing.List[types.SimpleNamespace] characters:

    .. code-block:: python

        >>> cmd = Charstatelabels('1 eye_color/red blue green, 3 head_shape/round square')
        >>> cmd.characters[0].name
        'eye_color'
        >>> cmd.characters[0].states
        ['red', 'blue', 'green']

    .. warning::

        In strict mode (see :class:`commonnexus.nexus.Config`) duplicate character names will raise
        a ``ValueError``, otherwise a ``UserWarning`` will be emitted. While a matrix with duplicate
        character names can still be read, it will typically **not** be as expected, because only
        the values for the last character for a given name will be present.
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        self.characters = []
        seen = set()
        # Parser state for the character definition currently being read.
        number, label, state_names = None, None, []
        reading_states = saw_comma = False

        def is_punctuation(item, char):
            return isinstance(item, Token) and item.text == char

        for item in iter_words_and_punctuation(self._tokens, nexus=nexus):
            if number is None:
                # Each definition starts with the character number.
                number = int(item)
            elif is_punctuation(item, ','):
                # A comma closes the current definition; remembering that one was seen lets us
                # detect trailing commas after the loop.
                saw_comma = True
                if label and label in seen:
                    duplicate_charlabel(label, 'CHARSTATELABELS', nexus)
                seen.add(label)
                self.characters.append(
                    types.SimpleNamespace(number=number, name=label, states=state_names))
                number, label, state_names, reading_states = None, None, [], False
            elif reading_states:
                state_names.append(item)
            elif is_punctuation(item, '/'):
                reading_states = True
            elif label:
                # Only one token may sit between character number and slash.
                raise ValueError(
                    'Illegal token in charstatelabel: "{}{}"'.format(label, item))
            else:
                label = item

        if number:
            # The last definition is not terminated by a comma.
            if label and label in seen:
                duplicate_charlabel(label, 'CHARSTATELABELS', nexus)
            self.characters.append(
                types.SimpleNamespace(number=number, name=label, states=state_names))
        elif saw_comma:  # There was a comma, but no new label.
            warnings.warn('Trailing comma in CHARSTATELABELS command')
class Charlabels(Payload):
    """
    This command allows specification of names of characters:

    .. code-block::

        CHARLABELS
            flange microsculpture
            body_length
            hind_angles #_spines
            spine_size _ _ head_size
            pubescent_intervals head_color
            clypeal_margin;

    Character labels are listed consecutively. If character x is the last character to be named,
    then subsequent characters need not be named, but characters 1 through x need to be. If no name
    is to be applied to a character, a single underscore can be used for its name. Character names
    are single NEXUS words. They must not correspond to another character name or number; thus, 1
    is not a valid name for the second character listed. The command should be used only for
    nontransposed matrices (in transposed matrices, the character labels are defined in the MATRIX
    command). We recommend that programs abandon this command in place of the more flexible
    CHARSTATELABELS command when writing NEXUS files, although programs should continue to read
    CHARLABELS because many existing NEXUS files use CHARLABELS.

    :ivar typing.List[types.SimpleNamespace] characters:
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        self.characters = []
        seen = set()
        labels = iter_words_and_punctuation(self._tokens, nexus=nexus)
        # Labels are positional: the n-th word names character number n.
        for number, label in enumerate(labels, start=1):
            assert isinstance(label, str)
            if label and label in seen:
                duplicate_charlabel(label, 'CHARLABELS', nexus)
            seen.add(label)
            character = types.SimpleNamespace(number=number, name=label, states=[])
            self.characters.append(character)
class Statelabels(Payload):
    """
    This command allows specification of the names of states:

    .. code-block::

        STATELABELS
            1 absent present,
            2 isodiametric transverse,
            3 '4.5-6.2mm' '6.3-7.0mm' '7.7-11.0mm' '>12.0mm',
            4 rounded subangulate angulate,
            10 0 '1-4' '6-9' '7-9' '8-9' 7 8 9,
            11 black rufous metallic flavous,
            12 straight concave,

    State labels need not be specified for all characters. A comma must separate state labels for
    each character. State labels are listed consecutively within a character. If state x is the
    last state to be named, then subsequent states need not be named, but states 1 through x must
    be. If no name is to be applied to a state, enter a single underscore for its name. State
    names are single NEXUS words. This command is not valid for DATATYPE=CONTINUOUS.

    We recommend that programs abandon this command in place of the more flexible
    CHARSTATELABELS command when writing NEXUS files, although programs should continue to read
    STATELABELS because many existing NEXUS files use STATELABELS.

    :ivar typing.List[types.SimpleNamespace] characters:
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        self.characters = []
        number, state_names = None, []
        for item in iter_words_and_punctuation(self._tokens, nexus=nexus):
            if number is None:
                # Each comma-separated group starts with the character number.
                number = int(item)
            elif isinstance(item, Token) and item.text == ',':
                self.characters.append(
                    types.SimpleNamespace(number=number, name=None, states=state_names))
                number, state_names = None, []
            else:
                assert isinstance(item, str)
                state_names.append(item)
        # The last group is only recorded if it is not terminated by a comma and names at
        # least one state.
        if number and state_names:
            self.characters.append(
                types.SimpleNamespace(number=number, name=None, states=state_names))
class Matrix(Payload):
    """
    In its standard format, the MATRIX command contains a sequence of taxon names and state
    information for that taxon. The MATRIX itself is of the form

    .. code-block::

        MATRIX
            taxon-name entry entry... entry
            taxon-name entry entry... entry
            taxon-name entry entry... entry;

    Each entry in the matrix is the information about a particular character for a particular taxon.
    For example, it might be the assignment of state 0 to taxon 1 for character 1. Thus, the entry
    would consist of one state symbol, 0. If the taxon were polymorphic, the entry would consist
    of multiple state symbols, e.g. (0 1), indicating the taxon has both states 0 and 1. More
    details about the nature of each entry of the matrix are given under ITEMS and under each
    DATATYPE. Each entry needs to be enclosed in parentheses or braces whenever more than one state
    symbol is given, e.g. (01) with standard data and the default NOTOKENS option, or if the
    information is conveyed by more than one NEXUS token, e.g., (0:100) or (2.3 4.5 6.7). Otherwise,
    the parentheses or braces are optional. No whitespace is needed between entries in the matrix
    unless the TOKENS subcommand of the FORMAT command is invoked or implied and parentheses or
    braces do not surround an entry.

    Taxa need not be in the same order as in the TAXA block, and the matrix need not contain all
    taxa. For interleaved matrices, all sections must have the same taxa in the same order.

    Examples of matrices of different DATATYPES are described below.

    1. For STANDARD data, each entry of the matrix consists of a single state-set. Under the
       defaults (ITEMS=STATES and STATESFORMAT=STATESPRESENT), each entry of the matrix consists of
       a single state-set; if there are multiple states, then the entry must be enclosed in
       parentheses (indicating polymorphism) or braces (indicating uncertainty in state). For
       example, in the following matrix,

       .. code-block::

            BEGIN CHARACTERS;
                DIMENSIONS NCHAR=9;
                FORMAT SYMBOLS="-+x";
                MATRIX
                    taxon_1 (-+){-+}+---+--
                    taxon_2 +x-++--+x
                    taxon_3 -++++--+x;
            END;

       taxon_1 is polymorphic for the first character and has either state - or state + for the
       second character. If STATESFORMAT=COUNT or FREQUENCY, then each entry must be enclosed in
       parentheses because more than one token is required to convey information for even one state:

       .. code-block::

            BEGIN CHARACTERS;
                DIMENSIONS NCHAR=3;
                FORMAT STATESFORMAT=FREQUENCY SYMBOLS = "012";
                MATRIX
                    taxon_1 (0:0.25 1:0.75) (0:0.3 1:0.7) (0:0.5 1:0.3 2:0.2)
                    taxon_2 (0:0.4 1:0.6) (0:0.8 1:0.2) (1:0.15 2:0.85)
                    taxon_3 (0:0.0 1:1.0) (0:0.55 1:0.45) (0:0.1 1:0.9);
            END;

    2. For DNA, RNA, NUCLEOTIDE, and PROTEIN data, each entry of the matrix consists of one or more
       state symbols describing the state(s) at one site in a molecular sequence. If
       STATESFORMAT=STATESPRESENT and if an entry represents a single state, then it is represented
       as a single state symbol (or if DATATYPE=PROTEIN and TOKENS, as a three-letter amino acid
       name). If an entry represents multiple states, then it must be enclosed in parentheses
       (indicating polymorphism) or braces (indicating uncertainty in state). Following is a matrix
       of DATATYPE=DNA:

       .. code-block::

            BEGIN CHARACTERS;
                DIMENSIONS NCHAR=12;
                FORMAT DATATYPE = DNA;
                MATRIX
                    taxon_1 ACCATGGTACGT
                    taxon_2 TCCATGCTACCC
                    taxon_3 TCCATGGAACCC;
            END;

    3. For CONTINUOUS data, each entry in the matrix must be enclosed by parentheses if more than
       one item is specified in the ITEMS subcommand. Parentheses must also be used whenever
       multiple tokens are needed for an entry in the matrix. If an entry consists of a single
       token (e.g., 0.231), it may be written without parentheses but must then be separated from
       other entries by whitespace.

       .. code-block::

            MATRIX
                A 0.453 1.43 78.6
                B 0.34 1.02 55.7
                C 0.22 1.79 69.1;

       A matrix entry can include average, minimum, maximum, variance, standard error, sample size,
       and a listing of states observed in the taxon, as specified in the ITEMS subcommand. The
       sample size, if included, must be in the form of an integer; the other numbers can be either
       in English decimal (e.g., 0.00452) or in exponential form (e.g., 4.52E-3). The information
       listed for each taxon for a continuous character is specified in the ITEMS subcommand of the
       FORMAT command. For example, if the matrix contains only information about the minimum and
       maximum value for each taxon, the ITEMS subcommand would be ITEMS=(MIN MAX) and a small
       matrix might look something like this:

       .. code-block::

            MATRIX
                taxon_1 (0.21 0.45) (0.34 0.36)
                taxon_2 (0.13 0.22) (0.45 0.55);

       If the ITEMS include the raw measurements (states), e.g., to list a sample of measurements
       from individuals, then the other items must precede the listing of states. There is no
       restriction on the number of elements in the listing of states. This example has only one
       continuous character:

       .. code-block::

            FORMAT DATATYPE=CONTINUOUS ITEMS=(AVERAGE STATES) STATESFORMAT=INDIVIDUALS;
            MATRIX
                taxon_1 (1.2 2.1 1.6 0.8 1.8 0.3 0.6)
                taxon_2 (1.6 2.2 1.7 1.0 2.0 1.6 1.9 0.8);

       in which the first value is the sample average and the subsequent values comprise the sample
       of observed states. Possible ITEMS to be included are MIN (minimum), MAX (maximum), AVERAGE
       (sample average), VARIANCE (sample variance), STDERROR (standard error), MEDIAN
       (sample median), SAMPLESIZE, and STATES. The manner of presentations of states can be
       indicated using the STATESFORMAT command. The default ITEMS for continuous data is AVERAGE.

    .. note::

        Since reading the matrix data only makes sense if information from other commands - in
        particular :class:`FORMAT <Format>` - is considered, the ``Matrix`` object does not have
        any attributes for data access. Instead, the matrix data can be read via
        :meth:`Characters.get_matrix`.
    """
class Characters(Block):
    """
    A CHARACTERS block defines characters and includes character data.

    Taxa are usually not defined in a CHARACTERS block; if they are not, the CHARACTERS block must
    be preceded by a block that defines taxon labels and ordering (e.g., TAXA).

    Syntax of the CHARACTERS block is as follows:

    .. rst-class:: nexus

        | BEGIN CHARACTERS;
        |   :class:`DIMENSIONS <Dimensions>` [NEWTAXA NTAX=num-taxa] NCHAR=num-characters;
        |   [:class:`FORMAT <Format>`
        |       [DATATYPE = { STANDARD| DNA | RNA | NUCLEOTIDE | PROTEIN | CONTINUOUS} ]
        |       [RESPECTCASE]
        |       [MISSING=symbol]
        |       [GAP=symbol]
        |       [SYMBOLS="symbol [symbol...]"]
        |       [EQUATE="symbol = entry [symbol = entry... ] " ]
        |       [MATCHCHAR= symbol ]
        |       [[NO]LABELS]
        |       [TRANSPOSE]
        |       [INTERLEAVE]
        |       [ITEMS=([MIN] [MAX] [MEDIAN] [AVERAGE] [VARIANCE] [STDERROR] [SAMPLESIZE] [STATES])]
        |       [STATESFORMAT= {STATESPRESENT | INDIVIDUALS | COUNT | FREQUENCY}]
        |       [[NO]TOKENS]
        |   ;]
        |   [:class:`ELIMINATE <Eliminate>` character-set;]
        |   [:class:`TAXLABELS <commonnexus.blocks.taxa.Taxlabels>` taxon-name [taxon-name ...];]
        |   [:class:`CHARSTATELABELS <Charstatelabels>`
        |       character-number [character-name] [/state-name [state-name...]]
        |       [, character-number [character-name] [/state-name [state-name...]]...]
        |   ;]
        |   [:class:`CHARLABELS <Charlabels>` character-name [character-name...];]
        |   [:class:`STATELABELS <Statelabels>`
        |       character-number [state-name [state-name ...]]
        |       [, character-number [state-name [state-name...]]...]
        |   ;]
        |   :class:`MATRIX <Matrix>` data-matrix;
        | END;

    :class:`DIMENSIONS <Dimensions>`, :class:`FORMAT <Format>`, and :class:`ELIMINATE <Eliminate>`
    must all precede :class:`CHARLABELS <Charlabels>`, :class:`CHARSTATELABELS <Charstatelabels>`,
    :class:`STATELABELS <Statelabels>`, and :class:`MATRIX <Matrix>`.
    :class:`DIMENSIONS <Dimensions>` must precede :class:`ELIMINATE <Eliminate>`.
    Only one of each command is allowed per block.
    """
    __commands__ = [
        Dimensions, Format, Eliminate, Taxlabels, Charstatelabels, Charlabels, Statelabels, Matrix]

    def is_binary(self) -> bool:
        """
        :return: Whether the matrix in the block is binary, i.e. codes items as presence/absence \
        using symbols "01".
        """
        # A block without FORMAT command is considered binary as well.
        return not bool(self.FORMAT) or (self.FORMAT.symbols == ['0', '1'])

    def get_matrix(self, labeled_states: bool = False) -> StateMatrix:
        """
        :param labeled_states: Flag signaling whether state symbols should be translated to state \
        labels (if available).
        :return: The values of the matrix, read according to FORMAT. The matrix is returned as \
        ordered `dict`, mapping taxon labels (if available, else numbers) to ordered `dict`s \
        mapping character labels (if available, else numbers) to state values. State values are \
        either atomic values (of type `str`) or `tuple`s (indicating polymorphism) or `set`s \
        (indicating uncertainty) of atomic values. Atomic values may be `None` (indicating missing \
        data), the special string `GAP` (indicating gaps) or state symbols or labels (if available \
        and explicitly requested via `labeled_states=True`). State symbols are returned using the \
        case given in FORMAT SYMBOLS, i.e. if a RESPECTCASE directive is missing and \
        FORMAT SYMBOLS="ABC", a value "a" in the matrix will be returned as "A".
        :raises ValueError: For a labeled, non-interleaved transposed matrix without NTAX, for \
        unexpected punctuation in the matrix and - in strict mode - for undeclared taxa.
        :raises NotImplementedError: For DATATYPE=CONTINUOUS.
        """
        format = Format(None) if 'FORMAT' not in self.commands else self.FORMAT

        # Determine dimensions and labels:
        ntax, taxlabels = self.get_taxlabels(format)
        nchar = self.DIMENSIONS.nchar
        charlabels, statelabels = self.get_charstatelabels(nchar, format)

        if format.transpose and (not format.interleave) and (format.labels is not False) \
                and (not ntax):
            raise ValueError("Can't read transposed matrix without NTAX.")  # pragma: no cover

        if format.datatype == 'CONTINUOUS':  # pragma: no cover
            raise NotImplementedError("Can't read a matrix of datatype CONTINUOUS")

        # We read the matrix data in an agnostic way, ignoring whether it's transposed or not, as
        # ordered dictionary mapping row labels (or numbers) to lists of entries.
        res = collections.OrderedDict()
        label, entries = None, []
        ncols, nrows = ntax if format.transpose else nchar, nchar if format.transpose else ntax
        # Interleaved matrices are read line by line, all others as one stream of tokens.
        for i, line in enumerate(
                list(iter_lines(self.MATRIX._tokens)) if format.interleave else
                [self.MATRIX._tokens],
                start=1):
            words = iter_words_and_punctuation(
                line, allow_punctuation_in_word='+-', nexus=self.nexus)
            while 1:
                try:
                    t = next(words)
                    if (format.labels is not False) and label is None:
                        # The first word of a labeled row is the row label.
                        assert isinstance(t, str)
                        label = t
                        continue
                    if isinstance(t, Token):
                        if t.text == '(':
                            # Polymorphic entry: collect symbols up to the closing parenthesis.
                            w = next(words)
                            symbols = ''
                            while isinstance(w, str) or (w.text in format.symbols) \
                                    or (w.text == ","):
                                if isinstance(w, str) or w.text != ',':
                                    symbols += getattr(w, 'text', w)
                                w = next(words)
                            assert w.text == ')', "Expected )"
                            entries.append(tuple(symbols))
                        elif t.text == '{':
                            # Uncertain entry: collect symbols up to the closing brace.
                            w = next(words)
                            vals = set()
                            while isinstance(w, str) or (w.text in format.symbols) \
                                    or (w.text == format.gap) or (w.text == ","):
                                if isinstance(w, str) or w.text != ',':
                                    vals |= set(getattr(w, 'text', w))
                                w = next(words)
                            assert w.text == '}', "Expected }"
                            entries.append(vals)
                        elif t.text in format.symbols:  # pragma: no cover
                            entries.append(t.text)
                        else:  # pragma: no cover
                            raise ValueError('Unexpected punctuation in matrix')
                    else:
                        entries.extend(list(t))  # We split a word into a list of symbols.

                    if not format.interleave and (len(entries) == ncols):
                        # In a non-interleaved matrix a row is complete once ncols entries
                        # have been read.
                        res[label or (len(res) + 1)] = entries
                        label, entries = None, []
                except StopIteration:
                    break

            if format.interleave:
                # Unlabeled interleaved rows are numbered cyclically within each section.
                key = label or (i % nrows or nrows)
                if key not in res:
                    res[key] = []
                res[key].extend(entries)
                label, entries = None, []

        cols = ncols or len(list(res.values())[0])
        assert all(len(states) == cols for states in res.values()), "Incomplete matrix read!"

        if not taxlabels:
            assert not format.transpose
            taxlabels = {i + 1: key for i, key in enumerate(res)}

        def apply_to_state(func, state, *args, **kw):
            # We have to do a bit of mapping and renaming, which always needs to deal with the
            # three different types of state.
            if isinstance(state, str):
                return func(state, *args, **kw)
            if isinstance(state, tuple):
                return tuple(func(s, *args, **kw) for s in state)
            if isinstance(state, set):
                return set(func(s, *args, **kw) for s in state)
            raise ValueError(state)  # pragma: no cover

        lax_symbols = not format.explicit_symbols and format.datatype in {None, 'STANDARD'} \
            and not (self.nexus and self.nexus.cfg.strict)

        def replace_symbol(s, i, r):
            # Translate one symbol of column i; r is the resolved first row (or None).
            if (format.respectcase and s == format.missing) or \
                    (not format.respectcase and (s.upper() == format.missing.upper())):
                return None
            if format.gap:
                if (format.respectcase and s == format.gap) or \
                        (not format.respectcase and (s.upper() == format.gap.upper())):
                    return GAP
            if format.matchchar:  # match entries from first row!
                if (format.respectcase and s == format.matchchar) or \
                        (not format.respectcase and (s.upper() == format.matchchar.upper())):
                    assert r
                    return r[i]
            if s not in format.symbols:
                # Try the symbol in the other case.
                s = s.lower() if s.isupper() else s.upper()
                if not lax_symbols:
                    assert s in format.symbols, '{} {}'.format(s, format.symbols)
            return s

        def resolve_symbols(s, i, r):
            def resolve(c, i, r):
                c = format.equate.get(c.upper(), c)  # May result in ambiguous or multiple states!
                return apply_to_state(replace_symbol, c, i, r)
            return apply_to_state(resolve, s, i, r)

        firstrow = None
        for i, l in enumerate(res):
            res[l] = [resolve_symbols(s, i, firstrow) for i, s in enumerate(res[l])]
            if i == 0:
                # We need the fully resolved entries of the first row around to resolve MATCHCHARs.
                firstrow = res[l]

        # Create the final result, an OrderedDict mapping taxa labels (or numbers) to OrderedDicts
        # mapping character labels or numbers to state symbols or labels.
        matrix = collections.OrderedDict()
        if not format.transpose:
            tlabels = {str(k) for k in res.keys()}
            valid_taxa = {str(k) for k in taxlabels}.union(taxlabels.values())
            if not tlabels.issubset(valid_taxa):
                if self.nexus and self.nexus.cfg.strict:  # pragma: no cover
                    raise ValueError('Found undeclared taxa in characters matrix')
                else:
                    warnings.warn('Dropping undeclared taxa from characters matrix.')
        for tnum, tlabel in sorted(taxlabels.items()):
            if format.transpose:
                # We have to pick the tnum column in each list in res.
                matrix[tlabel] = collections.OrderedDict()
                for cnum, clabel in sorted(charlabels.items()):
                    entries = res[clabel] if clabel in res else res[cnum]
                    matrix[tlabel][clabel] = entries[tnum - 1]
            else:
                key = tlabel if tlabel in res else (tnum if tnum in res else str(tnum))
                if key in res:
                    # Non-transposed matrices may not have data for each taxon!
                    entries = res[key]
                    matrix[tlabel] = collections.OrderedDict(
                        [(charlabels[i], s) for i, s in enumerate(entries, start=1)])

        if labeled_states:
            for entries in matrix.values():
                for char in entries:
                    if char in statelabels:
                        if entries[char] not in {None, GAP}:
                            # Symbols without a label are kept as they are.
                            entries[char] = apply_to_state(
                                lambda s: statelabels[char].get(s) or s, entries[char])
        return matrix

    def get_taxlabels(self, format):
        """
        Look up the number of taxa and the taxon labels relevant for this block.

        Sources are tried in this order: the block's own TAXLABELS command, a linked TAXA block,
        the TAXA block of the containing NEXUS object.

        :param format: The FORMAT command governing how the matrix is read.
        :return: Pair `(ntax, taxlabels)`; `ntax` may be `None` and `taxlabels` empty if no \
        source is available.
        """
        if self.TAXLABELS:
            taxlabels = self.TAXLABELS.labels
            ntax = self.DIMENSIONS.ntax
        elif 'TAXA' in self.linked_blocks:
            taxlabels = self.linked_blocks['TAXA'].TAXLABELS.labels
            ntax = self.linked_blocks['TAXA'].DIMENSIONS.ntax
        elif self.nexus is not None and self.nexus.TAXA:
            # The block may exist without an enclosing NEXUS object, thus the explicit guard.
            taxlabels = self.nexus.TAXA.TAXLABELS.labels
            ntax = self.nexus.TAXA.DIMENSIONS.ntax
        else:
            ntax, taxlabels = None, {}
        if format.interleave and format.labels is False and not format.transpose:
            # If the matrix has no row labels and is not transposed, we need the number of taxa to
            # compute the size of the interleaved blocks.
            assert ntax
            taxlabels = taxlabels or {i + 1: str(i + 1) for i in range(ntax)}
        return ntax, taxlabels

    def get_charstatelabels(self, nchar=None, format=None):
        """
        Collect character labels and state labels from the label commands of the block.

        :param nchar: Number of characters; defaults to NCHAR of the DIMENSIONS command.
        :param format: FORMAT command providing the state symbols; defaults to the block's \
        FORMAT command or a default one.
        :return: Pair `(charlabels, statelabels)`. `charlabels` maps character numbers to labels, \
        with unlabeled characters being labeled with their number. `statelabels` maps character \
        labels to ordered `dict`s mapping state symbols to state labels.
        """
        nchar = nchar or self.DIMENSIONS.nchar
        # Start out with numeric labels for all characters. Label commands may name only some
        # of the characters, so their labels are merged into - rather than replacing - these.
        charlabels = {i + 1: str(i + 1) for i in range(nchar)}
        statelabels = {}
        if self.CHARSTATELABELS:
            charlabels.update(
                (int(c.number), c.name or str(c.number))
                for c in self.CHARSTATELABELS.characters)
            statelabels = {
                c.number: c.states for c in self.CHARSTATELABELS.characters}
        elif self.CHARLABELS:
            charlabels.update(
                (int(c.number), c.name or str(c.number)) for c in self.CHARLABELS.characters)
        if self.STATELABELS:
            statelabels = {c.number: c.states for c in self.STATELABELS.characters}

        format = format or self.FORMAT or Format(None)
        if statelabels:
            # Re-key by character label and pair state names with state symbols by position.
            statelabels = {charlabels[cnum]: states for cnum, states in statelabels.items()}
            for clabel in statelabels:
                states = statelabels[clabel]
                labeled = collections.OrderedDict()
                if format:
                    for i, symbol in enumerate(format.symbols):
                        # A single underscore marks a state without name.
                        if i < len(states) and states[i] != '_':
                            labeled[symbol] = states[i]
                statelabels[clabel] = labeled
        else:
            statelabels = {}
        # Fails if labels were given for character numbers outside of 1..nchar.
        assert len(charlabels) == nchar
        return charlabels, statelabels

    def validate(self, log=None):
        """
        Extend the inherited validation with a check that TAXLABELS is only used together with
        NEWTAXA.
        """
        res = super().validate(log)
        if 'TAXLABELS' in self.commands and not self.DIMENSIONS.newtaxa:
            return log_or_raise(
                'TAXLABELS may only be defined in {} block if NEWTAXA is specified.'.format(
                    self.name),
                log=log)
        return res

    @classmethod
    def from_data(cls,
                  matrix: StateMatrix,
                  taxlabels: bool = False,
                  statelabels: typing.Optional[typing.Dict[str, typing.Dict[str, str]]] = None,
                  datatype: str = 'STANDARD',
                  missing: str = '?',
                  gap: str = '-',
                  comment: typing.Optional[str] = None,
                  nexus: typing.Optional["Nexus"] = None,
                  TITLE: typing.Optional[str] = None,
                  ID: typing.Optional[str] = None,
                  LINK: typing.Optional[typing.Union[str, typing.Tuple[str, str]]] = None) \
            -> 'Characters':
        """
        Instantiate a CHARACTERS or DATA block from a matrix.

        This functionality can be used to normalize the NEXUS formatting of CHARACTERS matrices:

        .. code-block:: python

            >>> nex = Nexus('''#NEXUS
            ... BEGIN TAXA;
            ... DIMENSIONS NTAX=3;
            ... TAXLABELS t1 t2 t3;
            ... END;
            ... BEGIN CHARACTERS;
            ... DIMENSIONS NCHAR=3;
            ... FORMAT TRANSPOSE NOLABELS;
            ... MATRIX 100 010 001;
            ... END;''')
            >>> matrix = nex.CHARACTERS.get_matrix()
            >>> nex.replace_block(nex.CHARACTERS, Characters.from_data(matrix))
            >>> print(nex)
            #NEXUS
            BEGIN TAXA;
            DIMENSIONS NTAX=3;
            TAXLABELS t1 t2 t3;
            END;
            BEGIN CHARACTERS;
            DIMENSIONS NCHAR=3;
            FORMAT DATATYPE=STANDARD MISSING=? GAP=- SYMBOLS="01";
            MATRIX
            t1 100
            t2 010
            t3 001
            ;
            END;

        :param matrix: A matrix as returned by :meth:`Characters.get_matrix()`, with unlabeled \
        states. I.e. `None` is used to mark missing values, and `GAP` to mark gapped values. These \
        special states will be converted to the symbols passed as `missing` and `gap` upon writing.
        :param taxlabels: If `True`, include a TAXLABELS command rather than relying on a TAXA \
        block being present.
        :param statelabels: Optional mapping of character labels to mappings whose values are \
        the state names to be written in the CHARSTATELABELS command.
        :param datatype: Only `'STANDARD'` is supported.
        :param missing: Symbol to write for missing values.
        :param gap: Symbol to write for gapped values.
        :param comment: Passed on to `from_commands`.
        :param nexus: An optional Nexus instance to lookup global config options.
        :param TITLE: Passed on to `from_commands`.
        :param ID: Passed on to `from_commands`.
        :param LINK: Passed on to `from_commands`.
        :raises NotImplementedError: If `datatype` is not `'STANDARD'`.
        :raises ValueError: If the matrix is empty or if `missing` or `gap` collide with a state \
        symbol used in the matrix.
        """
        if datatype != 'STANDARD':  # pragma: no cover
            raise NotImplementedError('Only DATATYPE=STANDARD is supported for writing CHARACTERS')
        if not matrix:
            # NCHAR is derived from the first row, so there must be at least one.
            raise ValueError('Cannot create a CHARACTERS block from an empty matrix')
        symbols, rows, charlabels, maxlen, tlabels = set(), [], None, 0, {}
        for taxon in matrix:  # We compute maximum taxon label length for pretty printing.
            tlabels[taxon] = Word(taxon).as_nexus_string()
            maxlen = max([maxlen, len(tlabels[taxon])])

        symbol = lambda c: missing if c is None else (gap if c == GAP else c)  # noqa: E731

        for taxon, entries in matrix.items():
            if not charlabels:
                # Character labels are taken from the first row, keyed by character number.
                charlabels = collections.OrderedDict(
                    [(str(i + 1), c) for i, c in enumerate(entries)])
            row = []
            for entry in entries.values():
                if entry:
                    symbols |= set(entry)
                if isinstance(entry, tuple):  # polymorphism -> ()
                    row.append('({})'.format(''.join(symbol(c) for c in entry)))
                elif isinstance(entry, set):  # uncertainty -> {}
                    row.append('{{{}}}'.format(''.join(sorted(symbol(c) for c in entry))))
                else:
                    row.append(symbol(entry))
            rows.append('\n{} {}'.format(tlabels[taxon].ljust(maxlen), ''.join(row)))

        # Missing and gap markers are not state symbols.
        symbols = ''.join(sorted([s for s in symbols if s not in [None, GAP]]))
        if missing in symbols or (gap in symbols):
            raise ValueError('MISSING or GAP markers must be distinct from "{}"'.format(symbols))
        # Case only matters if symbols of both cases are in use.
        respectcase = any(c.isupper() for c in symbols) and any(c.islower() for c in symbols)
        dimensions = 'NCHAR={}'.format(len(list(matrix.values())[0]))
        if taxlabels:
            dimensions = 'NEWTAXA NTAX={} {}'.format(len(tlabels), dimensions)
        cmds = [
            ('DIMENSIONS', dimensions),
            ('FORMAT', 'DATATYPE=STANDARD {}MISSING={} GAP={} SYMBOLS="{}"'.format(
                'RESPECTCASE ' if respectcase else '', missing, gap, symbols)),
        ]
        statelabels = statelabels or {}
        # Character labels are only written if any of them differs from the character number.
        if any(k != v for k, v in charlabels.items()):
            cmds.append((
                'CHARSTATELABELS',
                ', '.join('\n {} {}{}'.format(
                    n,
                    Word(l).as_nexus_string(),
                    '/' + ' '.join(Word(ll).as_nexus_string() for ll in statelabels[l].values())
                    if statelabels.get(l) else '',
                ) for n, l in charlabels.items())))
        if taxlabels:
            cmds.append(('TAXLABELS', ' '.join(tlabels.values())))
        cmds.append(('MATRIX', ''.join(rows) + '\n'))
        return cls.from_commands(cmds, nexus=nexus, TITLE=TITLE, ID=ID, LINK=LINK, comment=comment)
class Options(Payload):
    """
    The GAPMODE subcommand of the OPTIONS command of the ASSUMPTIONS block was originally
    housed in an OPTIONS command in the DATA block.

    :ivar typing.Optional[str] gapmode: `missing` or `newstate`.
    """
    def __init__(self, tokens, nexus=None):
        super().__init__(tokens, nexus=nexus)
        self.gapmode = None
        words = iter_words_and_punctuation(self._tokens, nexus=nexus)
        for item in words:
            if not (isinstance(item, str) and item.upper() == 'GAPMODE'):
                continue
            try:
                # The helper consumes further items from the same iterator.
                self.gapmode = word_after_equals(words).lower()
            except StopIteration:
                # Tokens ran out while reading the value: stop parsing.
                break
        assert self.gapmode in {None, 'missing', 'newstate'}
class Data(Characters):
    """
    This block is equivalent to a CHARACTERS block in which the NEWTAXA subcommand is included in
    the DIMENSIONS command. That is, the DATA block is a CHARACTERS block that includes not only
    the definition of characters but also the definition of taxa.

    .. note::

        The GAPMODE subcommand of the OPTIONS command of the ASSUMPTIONS block was originally
        housed in an :class:`OPTIONS <Options>` command in the DATA block.
    """
    # Same commands as for a CHARACTERS block, plus OPTIONS.
    __commands__ = [
        Dimensions,
        Format,
        Options,
        Eliminate,
        Taxlabels,
        Charstatelabels,
        Charlabels,
        Statelabels,
        Matrix,
    ]