yanosim-0.1/.gitignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/


yanosim-0.1/LICENSE

MIT License

Copyright (c) 2020 Matthew Parker

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
yanosim-0.1/README.md

# Yanosim (yet another nanopore simulator)

Read simulator for nanopore DRS datasets.

## Installation:

`pip install git+git://github.com/bartongroup/yanosim`

## Usage:

### `model`:

```
$ yanosim model --help
Usage: yanosim model [OPTIONS]

  Creates a model of mismatches, insertions and deletions based on an
  alignment of nanopore DRS reads to a reference. Reads should be aligned to
  a transcriptome i.e. without spliced alignment, using minimap2. They should
  have the cs tag.

Options:
  -b, --bam-fn TEXT        [required]
  -o, --output-fn TEXT     [required]
  -p, --processes INTEGER
  --help                   Show this message and exit.
```

### `quantify`:

```
$ yanosim quantify --help
Usage: yanosim quantify [OPTIONS]

  Quantify the number of reads mapping to each transcript in a reference, so
  that the right number of reads can be simulated.

Options:
  -b, --bam-fn TEXT                  [required]
  -o, --output-fn TEXT               [required]
  -g, --gtf-fn TEXT
  -f, --filtered-gtf-output-fn TEXT
  -r, --remove-ensembl-version
  --help                             Show this message and exit.
```

### `simulate`:

```
$ yanosim simulate --help
Usage: yanosim simulate [OPTIONS]

  Given a model created using yanosim model, and per-transcript read counts
  created using yanosim quantify, simulate error-prone long-reads from the
  given fasta file.

Options:
  -f, --fasta-fn TEXT             [required]
  -m, --model-fn TEXT             [required]
  -q, --quantification-fn TEXT    [required]
  -o, --output-fn TEXT            [required]
  --model-frag / --no-model-frag
  -p, --processes INTEGER
  -s, --seed INTEGER
  --help                          Show this message and exit.
```


yanosim-0.1/setup.py

from setuptools import setup

setup(
    name='yanosim',
    version='0.1',
    description=(
        'nanopore DRS read simulator'
    ),
    author='Matthew Parker',
    entry_points={
        'console_scripts': [
            'yanosim = yanosim.main:yanosim'
        ]
    },
    packages=[
        'yanosim',
    ],
    install_requires=[
        'numpy',
        'scipy',
        'click',
        'pysam',
    ],
)


yanosim-0.1/yanosim/__init__.py


yanosim-0.1/yanosim/main.py

import re
import pysam
import click
import numpy as np

from .model import parallel_build_prob_tree
from .simulate import parallel_simulate_read
from .utils import write_model, load_model


@click.group()
def yanosim():
    pass


@yanosim.command()
@click.option('-b', '--bam-fn', required=True)
@click.option('-o', '--output-fn', required=True)
@click.option('-p', '--processes', default=4)
def model(bam_fn, output_fn, processes):
    '''
    Creates a model of mismatches, insertions and deletions based on an
    alignment of nanopore DRS reads to a reference.

    Reads should be aligned to a transcriptome i.e. without spliced
    alignment, using minimap2. They should have the cs tag.
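
    A typical way to produce a suitable BAM might look something like this
    (a sketch only; file names are hypothetical):

    \b
        minimap2 -ax map-ont --cs=long transcripts.fa reads.fastq |
            samtools sort -o aligned.bam -
        samtools index aligned.bam
        yanosim model -b aligned.bam -o model.json.gz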
    '''
    basecall_model, homopolymer_model, frag_model = parallel_build_prob_tree(
        bam_fn, processes=processes
    )
    write_model(output_fn, basecall_model, homopolymer_model, frag_model)


@yanosim.command()
@click.option('-b', '--bam-fn', required=True)
@click.option('-o', '--output-fn', required=True)
@click.option('-g', '--gtf-fn', required=False)
@click.option('-f', '--filtered-gtf-output-fn', required=False)
@click.option('-r', '--remove-ensembl-version', default=False, is_flag=True)
def quantify(bam_fn, output_fn, gtf_fn, filtered_gtf_output_fn, remove_ensembl_version):
    '''
    Quantify the number of reads mapping to each transcript in a reference,
    so that the right number of reads can be simulated.
    '''
    if gtf_fn and not filtered_gtf_output_fn:
        raise click.BadParameter('If -g is specified, must also provide -f')
    expressed = set()
    with open(output_fn, 'w') as f, pysam.AlignmentFile(bam_fn) as bam:
        for ref in bam.references:
            i = 0
            for aln in bam.fetch(ref):
                if not aln.is_secondary:
                    i += 1
            f.write(f'{ref}\t{i}\n')
            if i:
                if remove_ensembl_version:
                    ref = ref.split('.')[0]
                expressed.add(ref)
    if gtf_fn:
        with open(gtf_fn) as g, open(filtered_gtf_output_fn, 'w') as f:
            for record in g:
                try:
                    transcript_id = re.search('transcript_id "(.*?)";', record).group(1)
                except AttributeError:
                    continue
                if transcript_id in expressed:
                    f.write(record)


@yanosim.command()
@click.option('-f', '--fasta-fn', required=True)
@click.option('-m', '--model-fn', required=True)
@click.option('-q', '--quantification-fn', required=True)
@click.option('-o', '--output-fn', required=True)
@click.option('--model-frag/--no-model-frag', default=True)
@click.option('-p', '--processes', default=4)
@click.option('-s', '--seed', default=None, type=int)
def simulate(fasta_fn, model_fn, quantification_fn, output_fn, model_frag, processes, seed):
    '''
    Given a model created using yanosim model, and per-transcript read counts
    created using yanosim quantify, simulate error-prone long-reads from the
    given fasta file.
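
    For example, continuing from hypothetical files produced by the model and
    quantify subcommands (a sketch only):

    \b
        yanosim quantify -b aligned.bam -o counts.tsv
        yanosim simulate -f transcripts.fa -m model.json.gz -q counts.tsv -o simulated.fa

    The output is a fasta file of simulated reads named <transcript>_sim<n>.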
    '''
    basecall_model, homopolymer_model, fragmentation_model = load_model(model_fn)
    if seed is not None:
        np.random.seed(seed)
    with open(output_fn, 'w') as f:
        sim_reads = parallel_simulate_read(
            fasta_fn, quantification_fn, processes,
            basecall_ptree=basecall_model,
            hp_ptree=homopolymer_model,
            frag_model=fragmentation_model,
            model_frag=model_frag,
            polya_len=10,
            five_prime_loss=False,
            chunk_size=1000,
        )
        for read_id, seq in sim_reads:
            f.write(f'>{read_id}\n{seq}\n')
            f.flush()


yanosim-0.1/yanosim/model.py

import re
from bisect import bisect_left
import itertools as it
from collections import defaultdict, Counter
from functools import lru_cache, partial
from multiprocessing import Pool

import numpy as np
from scipy import stats
import pysam
import click

from .utils import write_model

RC = str.maketrans('ACGT', 'TGCA')


@lru_cache(maxsize=128)
def revcomp(seq):
    return seq.translate(RC)[::-1]


CS_SPLITTER = '([-+*~=:])'


def parse_cs_tag_to_alignment(cs_tag, strand):
    '''
    generalisable function for parsing minimap2 cs tag (long form only) into
    pw alignment
    '''
    cs_tag = re.split(CS_SPLITTER, cs_tag)[1:]
    cs_ops = cs_tag[::2]
    cs_info = cs_tag[1::2]
    if strand == '-':
        cs_ops = cs_ops[::-1]
        cs_info = cs_info[::-1]
    qur_seq = []
    ref_seq = []
    i = 0
    for op, info in zip(cs_ops, cs_info):
        if op == '=':
            # long form match
            if strand == '-':
                info = revcomp(info)
            qur_seq.append(info)
            ref_seq.append(info)
            i += len(info)
        elif op == ':':
            # short form match
            raise ValueError('Need long form CS')
        elif op == '*':
            # mismatch
            info = info.upper()
            if strand == '-':
                info = info.translate(RC)
            ref = info[0]
            alt = info[1]
            qur_seq.append(alt)
            ref_seq.append(ref)
            i += 1
        elif op == '+':
            # insertion in the query
            if strand == '-':
                info = revcomp(info)
            qur_seq.append(info.upper())
            ref_seq.append('-' * len(info))
        elif op == '-':
            # deletion from the reference
            if strand == '-':
                info = revcomp(info)
            qur_seq.append('-' * len(info))
            ref_seq.append(info.upper())
            i += len(info)
        elif op == '~':
            # ignore intron/splice ops
            pass
    return ''.join(qur_seq), ''.join(ref_seq)


class ref_ignore_ins:

    def __init__(self):
        self.prev = None

    def __call__(self, aln_col):
        qur_base, ref_base = aln_col
        if self.prev is None:
            # the first base shouldn't be an insertion
            assert ref_base != '-'
            self.prev = ref_base
        if ref_base == '-':
            return self.prev
        else:
            self.prev = ref_base
            return ref_base


@lru_cache(maxsize=128)
def identify_state(ref_base, basecall):
    if len(basecall) > 1:
        return '+'
    elif basecall == '-':
        return '-'
    elif ref_base != basecall:
        return '*'
    else:
        return '='


class PairwiseAlignment:

    def __init__(self, cs_tag=None, strand=None, qur_seq=None, ref_seq=None):
        if cs_tag is not None:
            if strand is None:
                strand = '+'
            self.qur_seq, self.ref_seq = parse_cs_tag_to_alignment(cs_tag, strand)
        else:
            if qur_seq is None or ref_seq is None:
                raise ValueError()
            self.validate_seqs(qur_seq, ref_seq)
            # RNA is sequenced 3' -> 5' so reverse it to build the model
            self.qur_seq = qur_seq[::-1]
            self.ref_seq = ref_seq[::-1]

    def validate_seqs(self, qur_seq, ref_seq):
        assert len(qur_seq) == len(ref_seq), 'Seq lengths not equal'
        assert len(qur_seq) == len(re.findall('[ACGTN-]', qur_seq)), 'Query contains non ACGT- chars'
        assert len(ref_seq) == len(re.findall('[ACGTN-]', ref_seq)), 'Ref contains non ACGT- chars'

    def ref_rle(self):
        for base, hp_aln in it.groupby(zip(self.qur_seq, self.ref_seq), key=ref_ignore_ins()):
            qur_hp, ref_hp = zip(*hp_aln)
            qur_hp = ''.join(qur_hp)
            ref_hp = ''.join(ref_hp)
            yield base, qur_hp, ref_hp
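
    # Illustrative sketch (made-up sequences, not from the original code) of
    # what ref_rle() above yields: for qur_seq='AAAAGTGGG' aligned to
    # ref_seq='AAAA-TGGG', it yields ('A', 'AAAAG', 'AAAA-'), then
    # ('T', 'T', 'T'), then ('G', 'GGG', 'GGG'), i.e. reference homopolymer
    # blocks with query insertions kept attached to the preceding reference
    # base.
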
    def iter_ref_cols(self, pad_size=0, pad_val='A'):
        qur_padded = pad_val * pad_size + self.qur_seq
        ref_padded = pad_val * pad_size + self.ref_seq
        q_with_ins = []
        for q, r in zip(qur_padded, ref_padded):
            q_with_ins.append(q)
            if r != '-':
                q = ''.join(q_with_ins)
                state = identify_state(r, q)
                yield q, r, state
                q_with_ins = []

    def iter_ref_kmers_and_bc(self, k=5):
        qur_cols, ref_cols, states = zip(*self.iter_ref_cols(pad_size=k - 1))
        for i in range(k, len(ref_cols) + 1):
            ref_kmer = ''.join(ref_cols[i - k: i])
            prev_states = ''.join(states[i - k: i - 1])
            next_state = states[i - 1]
            bc = qur_cols[i - 1]
            yield ref_kmer, prev_states, bc, next_state


def count_and_compress_homopolymers(p_aln, min_hp_length=3):
    qur_comp = []
    ref_comp = []
    homopolymer_counts = defaultdict(Counter)
    for n, qur_hp, ref_hp in p_aln.ref_rle():
        ref_hp_no_ins = ref_hp.replace('-', '')
        hp_ln = len(ref_hp_no_ins)
        if hp_ln >= min_hp_length:
            qur_hp_no_ins = qur_hp.replace('-', '')
            homopolymer_counts[ref_hp_no_ins][qur_hp_no_ins] += 1
            qur_comp.append(qur_hp_no_ins)
            ref_comp.append(qur_hp_no_ins)
        else:
            qur_comp.append(qur_hp)
            ref_comp.append(ref_hp)
    p_aln_comp = PairwiseAlignment(
        qur_seq=''.join(qur_comp),
        ref_seq=''.join(ref_comp)
    )
    return p_aln_comp, homopolymer_counts


def build_prob_tree(tag_generator, *, kmer_size, max_ins, min_hp_ln):
    basecall_counts = defaultdict(partial(defaultdict, Counter))
    homopolymer_counts = defaultdict(Counter)
    aln_lengths = defaultdict(list)
    for i, (ref_name, aln_len, cs_tag, strand) in enumerate(tag_generator):
        aln_lengths[ref_name].append(aln_len)
        p_aln = PairwiseAlignment(cs_tag, strand)
        p_aln, hp = count_and_compress_homopolymers(p_aln, min_hp_ln)
        nested_dd_of_counters_update(homopolymer_counts, hp)
        for ref_kmer, prev_states, bc, next_state in p_aln.iter_ref_kmers_and_bc(k=kmer_size):
            if len(bc) > (max_ins + 1):
                # if there is a huge insertion we make the assumption (perhaps wrongly)
                # that it is an alignment failure not a basecall failure
                bc = bc[-1]
                next_state = identify_state(ref_kmer[-1], bc)
            basecall_counts[ref_kmer][prev_states][(bc, next_state)] += 1
    return basecall_counts, homopolymer_counts, aln_lengths


def chunk_bam_tags(bam_fn, chunk_size=10_000):
    with pysam.AlignmentFile(bam_fn) as bam:
        tags = []
        i = 0
        mapped = bam.mapped
        with click.progressbar(bam.fetch(), length=mapped) as fetch_iter:
            for aln in fetch_iter:
                if aln.is_secondary:
                    continue
                try:
                    cs_tag = aln.get_tag('cs')
                except KeyError:
                    continue
                ref_name = aln.reference_name
                aln_len = aln.query_alignment_length
                strand = '+-'[aln.is_reverse]
                tags.append((ref_name, aln_len, cs_tag, strand))
                i += 1
                if i == chunk_size:
                    yield tags
                    i = 0
                    tags = []
            else:
                if tags:
                    yield tags


def nested_dd_of_counters_update(d1, d2):
    for k in d2:
        if isinstance(d2[k], Counter):
            assert isinstance(d1[k], Counter)
            d1[k] += d2[k]
        elif isinstance(d2[k], list):
            assert isinstance(d1[k], list)
            d1[k] += d2[k]
        elif isinstance(d2[k], defaultdict):
            assert isinstance(d1[k], defaultdict)
            nested_dd_of_counters_update(d1[k], d2[k])
        else:
            raise ValueError()


def cumsum_counts(dd):
    cum_counts = defaultdict(dict)
    for k in dd:
        if isinstance(dd[k], Counter):
            ns, cs = zip(*dd[k].most_common())
            cs = np.cumsum(cs) / sum(cs)
            cs = np.insert(cs, 0, 0)
            cum_counts[k] = ns, cs
        elif isinstance(dd[k], defaultdict):
            cum_counts[k] = cumsum_counts(dd[k])
        else:
            raise ValueError()
    return cum_counts
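
# Illustrative sketch (made-up counts, not from the original code):
# cumsum_counts above turns a leaf Counter({('A', '='): 8, ('-', '-'): 2}) into
# ((('A', '='), ('-', '-')), array([0. , 0.8, 1. ])), i.e. the possible
# outcomes sorted by frequency alongside a cumulative probability vector with
# a leading zero, which simulate.random_choice later samples from with
# np.searchsorted.
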
def estimate_p_fragmentation(aln_lens, perc=95, tol=500, min_reads=100):
    max_lengths = []
    frag_frac = []
    for ref, a in aln_lens.items():
        if len(a) >= min_reads:
            a = np.array(a)
            mlen = np.percentile(a, perc)
            n_frag = sum(a < (mlen - tol))
            max_lengths.append(mlen)
            frag_frac.append(n_frag / len(a))
    model = stats.linregress(max_lengths, frag_frac)._asdict()
    return model


def parallel_build_prob_tree(bam_fn, processes=12, chunk_size=1000, kmer_size=5,
                             max_ins=15, min_hp_len=5, frag_full_length_perc=95,
                             frag_full_length_tol=500, frag_min_reads=100):
    basecall_counts = defaultdict(lambda: defaultdict(Counter))
    homopolymer_counts = defaultdict(Counter)
    aln_lengths = defaultdict(list)
    with Pool(processes=processes) as pool:
        chunk_iter = chunk_bam_tags(bam_fn, chunk_size)
        build_prob_tree_ = partial(
            build_prob_tree,
            kmer_size=kmer_size,
            max_ins=max_ins,
            min_hp_ln=min_hp_len
        )
        for i, (b, h, a) in enumerate(pool.imap_unordered(build_prob_tree_, chunk_iter)):
            nested_dd_of_counters_update(basecall_counts, b)
            nested_dd_of_counters_update(homopolymer_counts, h)
            nested_dd_of_counters_update(aln_lengths, a)
    frag_model = estimate_p_fragmentation(
        aln_lengths, frag_full_length_perc, frag_full_length_tol, frag_min_reads
    )
    return cumsum_counts(basecall_counts), cumsum_counts(homopolymer_counts), frag_model


yanosim-0.1/yanosim/simulate.py

import itertools as it
from functools import partial
from multiprocessing import Pool

import numpy as np
import click

from .utils import read_fasta, load_quantification


def random_choice(bases, probs):
    p = np.random.random()
    idx = np.searchsorted(probs, p) - 1
    return bases[idx]


def mutate_basecalls(ref_seq, basecall_ptree, k=5):
    state = '=' * (k - 1)
    m = []
    for i in range(k, len(ref_seq) + 1):
        ref_kmer = ref_seq[i - k: i]
        try:
            possible_basecalls, probs = basecall_ptree[ref_kmer][state]
            bc, new_state = random_choice(*basecall_ptree[ref_kmer][state])
        except KeyError:
            # just use ref seq as basecall
            bc = ref_kmer[-1]
            new_state = '='
        m.append(bc)
        state = state[1:] + new_state
    return ''.join(m).replace('-', '')


def mutate_homopolymers(ref_seq, hp_ptree, min_hp_len=5):
    m = []
    for a, g in it.groupby(ref_seq):
        hp = ''.join(list(g))
        if len(hp) >= min_hp_len:
            while True:
                if hp in hp_ptree:
                    sim_hp = random_choice(*hp_ptree[hp])
                    break
                else:
                    # if homopolymer is so long that it hasn't been seen before,
                    # shorten by one and try again
                    hp = hp[:-1]
        else:
            sim_hp = hp
        m.append(sim_hp)
    return ''.join(m)


def random_fragment(ref_seq, frag_model):
    slen = len(ref_seq)
    p_frag = max(frag_model['slope'] * slen + frag_model['intercept'], 0)
    is_frag = np.random.random() <= p_frag
    if is_frag:
        frag_point = np.random.randint(0, slen)
        ref_seq = ref_seq[frag_point:]
    return ref_seq


def simulate_read(ref_seq, *, basecall_ptree, hp_ptree, frag_model,
                  model_frag=True, polya_len=10, five_prime_loss=False):
    if polya_len:
        ref_seq = ref_seq + 'A' * polya_len
    # sometimes the read is fragmented at a random point
    if model_frag:
        ref_seq = random_fragment(ref_seq, frag_model)
    # reverse sequence for simulation as RNA is sequenced 3' -> 5'
    ref_seq = ref_seq[::-1]
    # mutate the read to match the errors in the model
    sim_read = mutate_basecalls(ref_seq, basecall_ptree)
    # mutate the read to match the homopolymer errors in the model
    sim_read = mutate_homopolymers(sim_read, hp_ptree)
    # re-reverse sequence to 5' -> 3' direction
    sim_read = sim_read[::-1]
    # Direct RNA reads are known to generally be missing 11nt from the 5' end
    if five_prime_loss:
        sim_read = sim_read[11:]
    return sim_read


def get_seqs_to_sim(fasta_fn, quantification):
    for ref_name, seq in read_fasta(fasta_fn):
        for i in range(1, quantification[ref_name] + 1):
            read_id = f'{ref_name}_sim{i}'
            yield read_id, seq
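
# The quantification file loaded by get_reads below is the two-column,
# tab-separated output of `yanosim quantify` (one transcript name and read
# count per line). The values shown here are purely illustrative:
#
#   transcript_1    42
#   transcript_2     7
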
def get_reads(fasta_fn, quantification_fn, chunk_size=10_000):
    quantification = load_quantification(quantification_fn)
    nsim = sum(list(quantification.values()))
    chunk = []
    seqs = get_seqs_to_sim(fasta_fn, quantification)
    with click.progressbar(seqs, length=nsim) as seqs:
        for read_id, seq in seqs:
            chunk.append((read_id, seq))
            if len(chunk) == chunk_size:
                yield chunk
                chunk = []
        else:
            if len(chunk):
                yield chunk


def _imap_simulate(chunk, **sim_kwargs):
    sim = []
    for read_id, seq in chunk:
        sim.append((read_id, simulate_read(seq, **sim_kwargs)))
    return sim


def parallel_simulate_read(fasta_fn, quantification_fn, processes,
                           chunk_size=10_000, **sim_kwargs):
    with Pool(processes=processes) as pool:
        for res in pool.imap_unordered(
                partial(_imap_simulate, **sim_kwargs),
                get_reads(fasta_fn, quantification_fn, chunk_size)):
            yield from res


yanosim-0.1/yanosim/utils.py

import numpy as np
import gzip
import json
import itertools as it
import re


def serialise(obj):
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj


def write_model(output_fn, basecall_ptree, homopolymer_ptree, frag_model):
    model = {
        'basecalls': basecall_ptree,
        'homopolymers': homopolymer_ptree,
        'fragmentation': frag_model,
    }
    with gzip.open(output_fn, 'wt', encoding="ascii") as f:
        json.dump(model, f, default=serialise)


def load_model(input_fn):
    with gzip.open(input_fn, 'rt', encoding='ascii') as f:
        model = json.load(f)
    return model['basecalls'], model['homopolymers'], model['fragmentation']


def load_quantification(quant_fn):
    quantification = {}
    with open(quant_fn) as q:
        for record in q:
            ref_name, n_reads = record.split()
            n_reads = int(n_reads)
            quantification[ref_name] = n_reads
    return quantification


def read_fasta(fasta_fn):
    with open(fasta_fn) as f:
        header_grouped = it.groupby(f, key=lambda line: line.startswith('>'))
        for _, read_id in header_grouped:
            read_id = list(read_id)
            assert len(read_id) == 1
            read_id = read_id[0][1:].strip()
            read_id = re.split(r'[\s|]', read_id)[0]
            _, seq = next(header_grouped)
            seq = ''.join([line.strip() for line in seq])
            yield read_id, seq
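
# For orientation, a rough sketch of the gzipped-JSON model layout that
# write_model produces and load_model returns (key names follow the code
# above; the example values are illustrative only):
#
#   {
#     "basecalls": {ref_kmer: {prev_states: [[[bc, state], ...], [0.0, ..., 1.0]]}},
#     "homopolymers": {ref_hp: [[observed_hp, ...], [0.0, ..., 1.0]]},
#     "fragmentation": {"slope": ..., "intercept": ..., ...}
#   }
#
# The second element of each leaf is a cumulative probability vector, and the
# fragmentation entry holds the fields returned by scipy.stats.linregress, of
# which simulate.random_fragment only uses "slope" and "intercept".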