"""Functions for generating random barcodes."""
from __future__ import annotations
from typing import Union
from collections.abc import Generator, Iterable, Mapping, Sequence
from collections import Counter
from functools import reduce
from itertools import groupby, product
import operator
from random import choices, sample
import streq as sq
from .utils import _CODONS
def _tmat(x: Iterable[str]) -> list[Mapping]:
    """Tabulate the letter transitions between two adjacent positions.

    Parameters
    ----------
    x : Iterable
        A pair ``(preceding, following)``: the letters found at one position
        and the letters found at the next position, in matching order.

    Returns
    -------
    list
        A one-element list holding a dict which maps each preceding letter
        to a 2-tuple ``(next_letters, counts)``. The dict is wrapped in a
        list so that per-position results can be concatenated by the caller.

    """
    first = operator.itemgetter(0)
    # groupby only merges adjacent items, so the pairs must be sorted by the
    # preceding letter. The sort is stable, so within a group the following
    # letters keep the order in which they were first seen.
    pairs = sorted(zip(*x), key=first)
    table = {}
    for preceding, run in groupby(pairs, key=first):
        counts = Counter(following for _, following in run)
        table[preceding] = (tuple(counts), tuple(counts.values()))
    return [table]


def transition_matrix(x: Sequence[str]) -> Sequence[Mapping]:
    """Generate transition frequencies from one item to the next in a sequence.

    Counts the occurrence of the next letter conditioned on the preceding
    letter, separately for every position.

    Parameters
    ----------
    x : Sequence[str]
        List of strings to take transition frequencies from.

    Returns
    -------
    tuple
        A length-n tuple, where n is the length of the shortest string in
        `x`. Each item is a dict mapping a preceding letter to a 2-tuple
        containing the next possible letters and their frequencies. The
        first item has the single key `None`, since nothing precedes the
        first letter. An empty `x` gives a single item with no letters.

    Raises
    ------
    IndexError
        If any string in `x` is empty.

    Examples
    --------
    >>> transition_matrix(['ATC', 'ATG'])  # doctest: +NORMALIZE_WHITESPACE
    ({None: (('A',), (2,))}, {'A': (('T',), (2,))}, {'T': (('C', 'G'), (1, 1))})
    >>> transition_matrix(['ATC', 'CTG'])  # doctest: +NORMALIZE_WHITESPACE
    ({None: (('A', 'C'), (1, 1))}, {'A': (('T',), (1,)), 'C': (('T',), (1,))}, {'T': (('C', 'G'), (1, 1))})
    >>> transition_matrix(['ATC', 'CAG'])  # doctest: +NORMALIZE_WHITESPACE
    ({None: (('A', 'C'), (1, 1))}, {'A': (('T',), (1,)), 'C': (('A',), (1,))}, {'A': (('G',), (1,)), 'T': (('C',), (1,))})

    """
    first_letters = Counter(seq[0] for seq in x)
    matrix = [{None: (tuple(first_letters), tuple(first_letters.values()))}]
    # Column i of `preceding` lines up with column i + 1 of the input, so
    # zipping the two walks over every adjacent pair of positions. zip stops
    # at the shortest input, which is what truncates the result.
    preceding = zip(*x)
    following = zip(*(seq[1:] for seq in x))
    for adjacent_columns in zip(preceding, following):
        # Extending in place keeps this linear in the number of positions.
        matrix.extend(_tmat(adjacent_columns))
    return tuple(matrix)
def codon_barcodes(seq: str,
                   ordered: bool = False) -> Generator[str]:
    """Generate a stream of barcodes encoding an amino acid sequence.

    Makes no consideration of codon usage preferences. Every possible
    encoding is produced exactly once. If `ordered` is `True`, it is
    ignored if the number of possible combinations is 100,000 or more.

    Parameters
    ----------
    seq : str
        Amino acid sequence to encode, in one-letter code. An empty
        sequence has exactly one encoding, the empty string.
    ordered : bool
        Whether to produce barcodes in sorted order. Default: False.

    Yields
    ------
    sequence : str
        DNA sequence encoding amino acid sequence.

    Examples
    --------
    >>> list(codon_barcodes("L", ordered=True))  # doctest: +NORMALIZE_WHITESPACE
    ['CTT', 'CTC', 'CTA', 'CTG', 'TTA', 'TTG']
    >>> list(codon_barcodes("L"))  # doctest: +SKIP
    ['TTA', 'CTT', 'CTA', 'CTG', 'CTC', 'TTG']

    """
    # A residue missing from _CODONS propagates the lookup error.
    codons = [_CODONS[aa] for aa in seq]
    # The initial value 1 makes the product well defined for an empty
    # sequence; without it reduce raises TypeError.
    n_combos = reduce(operator.mul, map(len, codons), 1)

    if ordered and n_combos < 1e5:
        for combo in product(*codons):
            yield ''.join(combo)
        return

    # Rejection sampling: draw one codon per residue and keep the result
    # only if it has not been produced before, until all are exhausted.
    combos_tried = set()
    while len(combos_tried) < n_combos:
        this_sample = ''.join(sample(codon, k=1)[0] for codon in codons)
        if this_sample not in combos_tried:
            combos_tried.add(this_sample)
            yield this_sample
def _count_reachable(positions: Sequence[Mapping]) -> int:
    """Count the distinct letter paths that can be sampled from `positions`."""
    # paths[letter] is the number of distinct prefixes ending in `letter`.
    paths = {None: 1}
    for position in positions:
        extended = {}
        for previous, n_paths in paths.items():
            letters, weights = position[previous]
            # Repeated letters and letters with no weight add no outcomes.
            for letter in {l for l, w in zip(letters, weights) if w > 0}:
                extended[letter] = extended.get(letter, 0) + n_paths
        paths = extended
    return sum(paths.values())


def infinite_barcodes(length: int = 12,
                      alphabet: Union[Iterable[str], Iterable[Mapping]] = sq.sequences.DNA,
                      check_used: bool = True) -> Generator[str]:
    """Generate a stream of random barcodes by
    randomly sampling from an alphabet.

    Not actually infinite by default: every barcode is produced once and
    the stream then ends. Set `check_used = False` to produce barcodes
    forever, and make sure you have some end condition in your loop.

    Parameters
    ----------
    length : int
        Length of barcode to generate.
    alphabet : Iterable, optional
        Set of letters from which to sample, or per-position transition
        frequencies as returned by `transition_matrix`. In the latter case
        barcodes are no longer than the number of positions provided.
    check_used : bool
        Only produce unique sequences. Default: True.

    Yields
    ------
    sequence : str
        Sequence with desired length.

    Examples
    --------
    >>> sorted(infinite_barcodes(2))  # doctest: +NORMALIZE_WHITESPACE
    ['AA', 'AC', 'AG', 'AT', 'CA', 'CC', 'CG', 'CT', 'GA', 'GC', 'GG', 'GT', 'TA', 'TC', 'TG', 'TT']
    >>> sorted(infinite_barcodes(2, alphabet='cats'))  # doctest: +NORMALIZE_WHITESPACE
    ['aa', 'ac', 'as', 'at', 'ca', 'cc', 'cs', 'ct', 'sa', 'sc', 'ss', 'st', 'ta', 'tc', 'ts', 'tt']
    >>> sorted(infinite_barcodes(2, alphabet=transition_matrix(['ATCG', 'ATTT'])))  # doctest: +NORMALIZE_WHITESPACE
    ['AT']
    >>> for bc in infinite_barcodes(20, check_used=False):  # doctest: +SKIP
    ...     print(bc)
    ...     break
    ...
    ATCAGTCGTCACACTAGTTA

    """
    # Materialise once so that any iterable can be indexed and sliced.
    alphabet = tuple(alphabet)
    if isinstance(alphabet[0], str):
        # Plain letters: express them as a uniform transition matrix so that
        # both kinds of alphabet share the sampling loop below.
        uniform = (1., ) * len(alphabet)
        any_to_any = {letter: (alphabet, uniform) for letter in alphabet}
        alphabet = ({None: (alphabet, uniform)},) + (any_to_any,) * (length - 1)
    positions = alphabet[:length]

    # The product of the per-position alphabet sizes only bounds the number
    # of barcodes from above. With a sparse transition matrix that bound is
    # never reached and the loop would spin forever, so count exactly.
    n_unique = _count_reachable(positions) if check_used else None
    combos_tried = set()

    while not check_used or len(combos_tried) < n_unique:
        drawn = []
        letter = None  # key of the first position: nothing precedes it
        for position in positions:
            letters, weights = position[letter]
            letter = choices(letters, weights=weights, k=1)[0]
            drawn.append(letter)
        this_sample = ''.join(drawn)
        if check_used:
            if this_sample in combos_tried:
                continue
            combos_tried.add(this_sample)
        yield this_sample