"""
Provides a class representing a NEXUS file as list of tokens.
"""
from typing import Union, Optional, Any, Type
import logging
import pathlib
import itertools
import collections
import dataclasses
from collections.abc import Iterable, Generator, Container
from commonnexus.blocks import Block
from commonnexus.command import Command
from .tokenizer import TokenType, iter_tokens, get_name, TokenList, TokenGenerator
from .util import log_or_raise
__all__ = ['Config', 'Nexus']
NEXUS = '#NEXUS'
@dataclasses.dataclass
class Config:
    """
    Options controlling the global behaviour of a :class:`Nexus` instance.

    All options are plain attributes with defaults; create an instance, optionally overriding
    individual options by keyword, and read the settings back from the same object.
    """

    #: Whether "-" (ASCII hyphen-minus) is treated as text instead of punctuation.
    hyphenminus_is_text: bool = True

    #: Whether "*" (asterisk) is treated as text instead of punctuation.
    asterisk_is_text: bool = True

    #: Whether Newick nodes for TREEs are built by parsing the Newick string (`True`) or from the
    #: Nexus tokens (`False`). Building from tokens is slightly faster but bypasses some input
    #: validation.
    validate_newick: bool = False

    #: Whether unsupported NEXUS commands/options are ignored (`True`) or raise an error. Note
    #: that this option may only take effect once a block or command is accessed.
    ignore_unsupported: bool = True

    #: Text encoding of the NEXUS file.
    encoding: str = 'utf8'

    #: The NEXUS spec does not explicitly state a default for the MATCHCHAR directive in the
    #: FORMAT command of a CHARACTERS block. `commonnexus` - in agreement with many NEXUS files
    #: encountered "in the wild" - assumes "." as default. Set `no_default_matchchar` to `True`
    #: to force no default, e.g. because matrix data uses "." as regular state symbol.
    no_default_matchchar: bool = False

    #: Files found in the wild do not always follow the NEXUS spec entirely. Where a somewhat
    #: lax interpretation does not lead to ambiguities, that is what commonnexus does. Set
    #: `strict` to `True` to force stricter adherence to the spec.
    strict: bool = False
[docs]class Nexus(list):
"""
A NEXUS object implemented as list of commands with methods to read and write blocks.
From the spec:
The tokens in a NEXUS file are organized into commands, which are in turn organized into
blocks.
This is reflected in the ``Nexus`` object. The ``Nexus`` object is just a ``list`` of
:class:`Commands <Command>`, and has a property :meth:`Nexus.blocks` giving access to
commands grouped by block:
.. code-block::
>>> nex = Nexus('#NEXUS BEGIN myblock; mycmd a b c; END;')
>>> nex[0].__class__
<class 'commonnexus.nexus.Command'>
>>> len(nex.blocks['MYBLOCK'])
1
.. note::
NEXUS is for the most part case-insensitive. `commonnexus` reflects this by giving all
blocks and commands uppercase names. Thus, even if a command or block has a lowercase or
mixed-case name in the file, the corresponding ``Command`` or ``Block`` object must be
addressed using the uppercase name.
"""
def __init__(
        self,
        s: Optional[Union[Iterable, list[Command]]] = None,
        block_implementations: Optional[dict[str, Block]] = None,
        config: Optional[Config] = None,
        **kw) -> None:
    """
    Initialize the object from NEXUS content or from a ready-made list of commands.

    :param s: The NEXUS content. A `list` is taken as the list of commands as is; a `str` or \
    any other iterable is tokenized. If nothing (or something falsy) is passed, the content \
    defaults to a bare ``#NEXUS`` marker.
    :param block_implementations: Custom implementations for non-public blocks. These take \
    precedence over the implementations discovered among the subclasses of `Block`.
    :param config: Configuration.
    :param kw: If no :class:`Config` object is passed as `config`, keyword parameters will be \
    interpreted as configuration options. Thus,

    .. code-block:: python

        >>> nex = Nexus(encoding='latin')

    is a shortcut for

    .. code-block:: python

        >>> nex = Nexus(config=Config(encoding='latin'))
    """
    self.cfg: Config = config or Config(**kw)
    self.trailing_whitespace: TokenList = []
    self.leading: TokenList = []

    # Register block implementations by uppercase class name: direct subclasses of Block always
    # win, their subclasses only fill names which are still free, explicitly passed
    # implementations override everything.
    registry: dict[str, Type[Block]] = {}
    for impl in Block.__subclasses__():
        registry[impl.__name__.upper()] = impl
        for sub_impl in impl.__subclasses__():
            registry.setdefault(sub_impl.__name__.upper(), sub_impl)
    registry.update(block_implementations or {})
    self.block_implementations = registry

    content = s or NEXUS
    if isinstance(content, str):
        content = iter(content)

    if not isinstance(content, list):
        seen_marker = False
        commands, pending = [], []
        # Whitespace before the first real token is skipped.
        stream = itertools.dropwhile(
            lambda t: t.type == TokenType.WHITESPACE, iter_tokens(content))
        for token in stream:
            if not seen_marker:
                # The first non-whitespace token must be the #NEXUS marker.
                assert token.type == TokenType.WORD and token.text.upper() == NEXUS, \
                    "No #NEXUS token found."
                seen_marker = True
                continue
            pending.append(token)
            if token.is_semicolon:
                # A semicolon closes the command collected so far.
                commands.append(Command(tuple(pending)))
                pending = []
        # Tokens left over after the last semicolon: a trailer if there were commands, the
        # whole content after the marker otherwise.
        if commands:
            self.trailing_whitespace = pending
        else:
            self.leading = pending
        content = commands

    list.__init__(self, content)
@classmethod
def from_file(
        cls,
        p: Union[str, pathlib.Path],
        config: Optional[Config] = None,
        **kw,
) -> Optional['Nexus']:
    """
    Instantiate a `Nexus` object from the contents of a NEXUS file.

    The file is read with the encoding given in the configuration. If that encoding is UTF-8
    (in any spelling, e.g. "utf8" or "UTF-8") and the content cannot be decoded, the file is
    read once more as "latin1". In that case the returned instance carries a copy of the
    configuration with `encoding` set to "latin1"; the `config` object passed in is left
    untouched, so it can safely be reused for other files.

    :param p: Path of the file.
    :param config: An optional configuration object.
    :param kw: Configuration options, if no `Config` object is passed in.
    :return: A `Nexus` instance.
    :raises UnicodeDecodeError: If the content cannot be decoded with a configured encoding \
    other than UTF-8.
    """
    import codecs  # Only needed here, to normalize the spelling of the encoding name.

    config = config or Config(**kw)
    p = pathlib.Path(p)

    def read(cfg: Config) -> 'Nexus':
        """Read the file lazily, character by character, with the encoding of `cfg`."""
        with p.open(encoding=cfg.encoding) as f:
            return cls(itertools.chain.from_iterable(f), config=cfg)

    try:
        return read(config)
    except UnicodeDecodeError:
        # We don't want to do a lot of guessing: only a failed attempt with the default
        # encoding triggers a retry. Failures with an explicitly chosen other encoding are
        # reported to the caller rather than swallowed.
        if codecs.lookup(config.encoding).name != 'utf-8':
            raise
    # The default encoding was tried and didn't work, so we try with the all-time favourite
    # "latin1" - on a copy of the configuration, to not change the caller's object.
    return read(dataclasses.replace(config, encoding='latin1'))
@classmethod
def from_blocks(cls, *blocks: Block) -> 'Nexus':
    """
    Create a new instance and append the given blocks to it, one after the other.

    :param blocks: The blocks, in the order in which they are to appear.
    :return: The new instance.
    """
    nexus = cls()
    for blk in blocks:
        nexus.append_block(blk)
    return nexus
@property
def blocks(self) -> dict[str, list[Block]]:
    """
    The blocks of the NEXUS content, grouped by name.

    The result maps block names to the list of blocks with that name, in the order in which
    :meth:`Nexus.iter_blocks` yields them. It is a `defaultdict`, i.e. looking up a name for
    which there is no block gives an empty list.

    For a shortcut to access blocks which are known to appear just once in the NEXUS content,
    see :meth:`Nexus.__getattribute__`.
    """
    grouped = collections.defaultdict(list)
    for blk in self.iter_blocks():
        grouped[blk.name].append(blk)
    return grouped
def __getattribute__(self, name: str) -> Any:
    """
    Attribute access with a shortcut for blocks.

    NEXUS does not make any prescriptions regarding how many blocks with the same name may
    exist in a file. Thus, the primary way to access blocks is by looking up the list of blocks
    for a given name in :meth:`Nexus.blocks`. If it can be assumed that just one block for a
    name exists, or only the first block with that name is of interest, this block can also be
    accessed as `Nexus.<BLOCK_NAME>`, i.e. using the uppercase block name as attribute of the
    `Nexus` instance. If there is no block with that name, the attribute is `None`.

    .. code-block:: python

        >>> nex = Nexus('#NEXUS begin block; cmd; end;')
        >>> nex.BLOCK.name
        'BLOCK'
        >>> len(nex.BLOCK.commands)
        1
    """
    # Anything that is not an all-uppercase name is a regular attribute of the list.
    if not name.isupper():
        return list.__getattribute__(self, name)
    try:
        first = self.blocks[name][0]
    except IndexError:
        # No block with this name.
        first = None
    return first
def __str__(self) -> str:
    """
    The string representation of a `Nexus` object is just its NEXUS content.

    The content is put together from the ``#NEXUS`` marker, the leading tokens, the tokens of
    all commands and the trailing tokens - in this order.

    .. code-block:: python

        >>> nex = Nexus()
        >>> nex.append_block(Block.from_commands([]))
        >>> print(nex)
        #NEXUS
        BEGIN BLOCK;
        END;
    """
    pieces = [NEXUS]
    pieces.extend(str(token) for token in self.leading)
    for command in self:
        pieces.extend(str(token) for token in command)
    pieces.extend(str(token) for token in self.trailing_whitespace)
    return ''.join(pieces)
def to_file(self, p: Union[str, pathlib.Path]) -> None:
    """
    Write the NEXUS content of a `Nexus` object to a file.

    The content is written using the encoding specified in the configuration and will always
    end with a newline.

    :param p: Path of the file.
    """
    content = str(self)
    if not content.endswith('\n'):
        content += '\n'
    pathlib.Path(p).write_text(content, encoding=self.cfg.encoding)
@property
def comments(self) -> list[str]:
    """
    The text of the comments in the NEXUS content.

    Comments may appear anywhere in a NEXUS file. Thus, they are the only kind of tokens not
    really grouped into a command.
    While comments **in** commands can also be accessed from the command, comments preceding
    any command (and all others) can be accessed via this property.

    .. code-block:: python

        >>> nex = Nexus("#nexus [created by commonnexus] begin block; cmd [does nothing]; end;")
        >>> nex.BLOCK.CMD.comments
        ['does nothing']
        >>> nex.comments[0]
        'created by commonnexus'
    """
    # NOTE(review): `iter_comments` is defined outside of the code visible here; it is expected
    # to yield the comment tokens.
    texts = []
    for token in self.iter_comments():
        texts.append(token.text)
    return texts
def iter_blocks(self) -> Generator[Block, None, None]:
    """
    A generator for the blocks in the NEXUS file.

    Commands before the first command starting a block are skipped. From there on, commands
    are collected until a command ending a block is encountered; then a block object is
    instantiated from the collected commands, using the implementation registered for the
    block name or `Block` if there is none.
    """
    pending: list[Command] = []
    commands = itertools.dropwhile(lambda c: not c.is_beginblock, self)
    for cmd in commands:
        if cmd.is_endblock:
            assert pending
            pending.append(cmd)
            # Look up a suitable Block implementation by the name given in the first command.
            name = get_name(pending[0].iter_payload_tokens())
            impl = self.block_implementations.get(name, Block)
            yield impl(self, pending)
            pending = []
            continue
        if cmd.is_beginblock:
            # Start collecting afresh.
            pending = [cmd]
            continue
        if pending:
            # Commands outside of a block are dropped.
            pending.append(cmd)
def validate(self, log: Optional[logging.Logger] = None) -> bool:
    """
    Validates the NEXUS file.

    The content is valid, if only whitespace and comments appear before the first and after
    the last command, and if each block validates.

    :param log: Passed on to `log_or_raise` and to the `validate` method of each block. \
    NOTE(review): presumably problems are logged if a logger is given and raised otherwise - \
    confirm against `log_or_raise`.
    :return: `True` if no problem was found, `False` otherwise.
    """
    filler = {TokenType.WHITESPACE, TokenType.COMMENT}
    valid = True
    if any(t.type not in filler for t in self.leading):
        log_or_raise('Invalid token in preamble', log=log)
        # If we get here, the problem has been reported without raising - the result must
        # still reflect it.
        valid = False
    for block in self.iter_blocks():
        #
        # FIXME: pylint: disable=fixme
        # we can do a lot of validation here! If block.__commands__ is a list, there is
        # some fixed order between commands.
        # If Payload.__multivalued__ == False, only one command instance is allowed, ...
        #
        # The block is validated first, so that problems in later blocks are reported even if
        # an earlier block was already found to be invalid.
        valid = block.validate(log=log) and valid
    if any(t.type not in filler for t in self.trailing_whitespace):
        log_or_raise('Invalid token in text after the last command', log=log)
        valid = False
    return valid
def get_numbers(self, object_name: str, items: Container[str]) -> list[str]:
    """
    Determine object numbers suitable for inclusion in a set spec.

    :param object_name: The kind of object, one of "TAXON", "CHARACTER" or "TREE".
    :param items: Labels or numbers (as `str`) of the objects to be selected.
    :return: The numbers (as `str`) of the objects which are in `items` either by label or \
    by number.
    """
    # Pair up number and label for all objects of the requested kind.
    if object_name == 'TAXON':
        numbered = ((str(i), tax) for i, tax in enumerate(self.taxa, start=1))
    elif object_name == 'CHARACTER':
        charlabels, _ = self.characters.get_charstatelabels()
        numbered = ((str(i), label) for i, label in charlabels.items())
    elif object_name == 'TREE':
        numbered = ((str(i), tree.name) for i, tree in enumerate(self.TREES.trees, start=1))
    else:
        raise NotImplementedError(object_name)  # pragma: no cover
    return [number for number, label in numbered if (label in items) or (number in items)]
def resolve_set_spec(
        self, object_name: str, spec: list[str], chars: Optional[list[str]] = None
) -> Optional[Union[list[str], dict[str, list[str]]]]:
    """
    Resolve a set spec to a list of included items.

    For taxa, characters and trees, the spec is a list of tokens made up of item numbers and
    ranges of the form ``<start> - <end>``, where "." as end of a range stands for the last
    item.

    :param object_name: The kind of object, one of "TAXON", "CHARACTER", "STATE" or "TREE" \
    (case-insensitive).
    :param spec: The tokens of the set spec. For "STATE", the state symbols to be included.
    :param chars: For "STATE" only: The characters for which to resolve the states.
    :return: The list of labels of the included items - or the numbers as `str`, if no labels \
    are available. For "STATE" a `dict` mapping each character in `chars` to the list of \
    labels of the included states - or the state symbol, for states without label.
    """
    def numbers(maxn: Optional[int]) -> list[int]:
        """Turns a set spec into a list of item numbers."""
        res = []
        start, in_range = None, False
        for token in spec:
            if not start:
                start = token
                continue
            if token == '-':
                in_range = True
                continue
            if in_range:  # Figure out the end of the range and append items.
                res.extend(range(int(start), int(token if token != '.' else maxn) + 1))
                in_range, start = False, None
                continue
            # No range specification detected. Just append `start` as single number.
            res.append(int(start))
            start = token
        if start:
            res.append(int(start))
        return res

    object_name = object_name.upper()
    assert object_name in ['TAXON', 'CHARACTER', 'STATE', 'TREE']
    # Now we determine the available labels for the object type.
    n, labels = None, None
    if object_name == 'TAXON':
        if self.TAXA:
            # FIXME: What if there's more than one TAXA block?  pylint: disable=fixme
            n = self.TAXA.DIMENSIONS.ntax
            labels = self.TAXA.TAXLABELS.labels
        elif self.CHARACTERS and self.CHARACTERS.DIMENSIONS.newtaxa:
            n = self.CHARACTERS.DIMENSIONS.ntax
            if self.CHARACTERS.TAXLABELS:
                labels = self.CHARACTERS.TAXLABELS.labels
    elif object_name == 'CHARACTER':
        block = self.CHARACTERS or self.DATA
        if block:
            labels, _ = block.get_charstatelabels()
            n = len(labels)
    elif object_name == 'STATE':
        chars = chars or []  # Labeled states make only sense in relation to characters.
        block = self.CHARACTERS or self.DATA
        if block:
            _, slabels = block.get_charstatelabels()
            res: dict[str, list[str]] = collections.defaultdict(list)
            # need to transpose
            for char, states in slabels.items():
                if char in chars:
                    res[char] = [
                        label or state for state, label in states.items() if state in spec]
            return res
        return None  # pragma: no cover
    elif object_name == 'TREE':
        labels = {i + 1: tree.name for i, tree in enumerate(self.TREES.trees)}
        # The number of trees is needed to resolve "." as end of an open range.
        n = len(labels)

    if not labels:
        # Without labels, items are identified by their number.
        assert n
        labels = {i + 1: str(i + 1) for i in range(n)}
    return [labels[number] for number in numbers(n)]
def remove_block(self, block: Block) -> None:
    """
    Removes a block from the NEXUS file.

    The commands of the block are located by identity rather than equality. Otherwise, a
    command of an earlier block which merely compares equal - e.g. another ``END;`` - could
    be removed instead of the command of `block`.

    :param block: The block to remove.
    :raises ValueError: If a command of the block is not in the list.
    """
    for cmd in block:
        for i, candidate in enumerate(self):
            if candidate is cmd:
                del self[i]
                break
        else:
            # The very object is not in the list; fall back to removing an equal command,
            # which raises ValueError if there is none.
            self.remove(cmd)
def append_block(self, block: Block) -> None:
    """
    Adds a new block at the end of the NEXUS file.

    :param block: The block; its commands are added to the end of the list of commands.
    """
    self.extend(block)
def prepend_block(self, block: Block) -> None:
    """
    Adds a new block at the beginning of the NEXUS file.

    :param block: The block; its commands are put - in their order - in front of all other \
    commands.
    """
    self[:0] = list(block)
def replace_block(self, old: Block, new: Union[Block, list[tuple[str, str]]]) -> None:
    """
    Replace a block.

    The new block can be specified either as a `Block` instance or as list of (name, payload)
    pairs which will be taken to construct the commands within the new block. In the latter
    case the new block gets the name of the old one.

    The commands of the old block are located by identity rather than equality. Otherwise,
    commands of an earlier block which merely compare equal - e.g. another ``END;`` - could be
    removed instead, which would also shift the position at which the new block is inserted.

    :param old: The block to be replaced.
    :param new: The replacement.
    :raises ValueError: If the first command of `old` is not in the list.
    """
    bname = old.name
    first = old[0]
    # The new block is to be inserted where the old one starts.
    for i, cmd in enumerate(self):
        if cmd is first:
            break
    else:
        raise ValueError('Block not found')  # pragma: no cover

    for cmd in old:
        for j, candidate in enumerate(self):
            if candidate is cmd:
                del self[j]
                break
        else:
            # The very object is not in the list; fall back to removing an equal command.
            self.remove(cmd)

    # Since we always insert at the same position, commands are inserted in reverse order.
    if isinstance(new, Block):
        new.nexus = self
        for cmd in reversed(new):
            self.insert(i, cmd)
    else:
        self.insert(i, Command.from_name_and_payload('END'))
        for n, payload in reversed(new):
            self.insert(i, Command.from_name_and_payload(n, payload))
        self.insert(i, Command.from_name_and_payload('BEGIN', bname))
def append_command(self, block: Block, name: str, payload: Optional[str] = None) -> None:
    """
    Append a command at the end of the block (before the block's END command).

    The last command of the block is located by identity rather than equality. Otherwise, the
    new command could end up before a command of an earlier block which merely compares equal,
    e.g. another ``END;``.

    :param block: The block to which to add the command.
    :param name: The name of the new command.
    :param payload: The payload of the new command.
    :raises ValueError: If the last command of the block is not in the list.
    """
    last = block[-1]
    for position, cmd in enumerate(self):
        if cmd is last:
            break
    else:
        # The very object is not in the list; fall back to looking for an equal command,
        # which raises ValueError if there is none.
        position = self.index(last)
    self.insert(position, Command.from_name_and_payload(name, payload))
# Shortcuts:
@property
def characters(self) -> Union[Block, None]:
    """
    Shortcut to get around the DATA/CHARACTERS ambiguity.

    I.e. if one is interested in the characters matrix of a NEXUS file no matter whether this
    is included in a DATA or CHARACTERS block, ``Nexus.characters.get_matrix()`` can be used
    rather than ``(Nexus.DATA or Nexus.CHARACTERS).get_matrix()``.

    It is asserted that the file does not contain both a DATA and a CHARACTERS block.

    :return: The first DATA or CHARACTERS block.
    """
    data, characters = self.DATA, self.CHARACTERS
    assert not (data and characters)
    return data or characters
@property
def taxa(self) -> Optional[list[str]]:
    """
    Shortcut to retrieve the list of taxa a NEXUS file provides data on.

    :return: The list of taxa labels used in a NEXUS file or `None`, if none of the places \
    listed below is available.

    .. note::

        There are various ways to encode taxa labels in a NEXUS file. This method looks up
        different places ordered by explicitness, i.e.

        1. A TAXLABELS command in a TAXA block.
        2. A TAXLABELS command in a DATA or CHARACTERS block.
        3. Taxa labels given implicitly as labels in a MATRIX command.
        4. A TAXLABELS command in a DISTANCES block.
        5. Taxa labels given implicitly as labels in a DISTANCES.MATRIX command.
        6. Taxa labels given as mappings in the TRANSLATE command of a TREES block.
        7. Taxa labels given implicitly as node names in the Newick representation of a tree in
           a TREE command in a TREES block.

    .. warning::

        Taxa descriptions in NEXUS may be inconsistent, e.g. a NEXUS file might contain a
        TAXA block, but introduce new taxa via NEWTAXA/TAXLABELS in a CHARACTERS block.
        `commonnexus` does not make an effort to check for consistency.
    """
    # A TAXA block is only consulted if it is the only one.
    taxa_block = self.TAXA
    if taxa_block and len(self.blocks['TAXA']) == 1:
        return list(taxa_block.TAXLABELS.labels.values())

    matrix_block = self.characters
    if matrix_block:
        return list(matrix_block.get_matrix())

    distances = self.DISTANCES
    if distances:
        return list(distances.get_matrix())

    trees = self.TREES
    if not trees:
        return None
    translate = trees.TRANSLATE
    if translate:
        return list(translate.mapping.values())
    # Last resort: the names of the named nodes of the first tree.
    return [node.name for node in trees.TREE.newick.walk() if node.name]