pax_global_header00006660000000000000000000000064131672664560014532gustar00rootroot0000000000000052 comment=cd750ae45ad9e5d4d791974619fb32e6bbc1db5e Yanagiba-1.0.0/000077500000000000000000000000001316726645600132435ustar00rootroot00000000000000Yanagiba-1.0.0/.gitignore000066400000000000000000000022251316726645600152340ustar00rootroot00000000000000#Testing data/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ Yanagiba-1.0.0/LICENSE.txt000066400000000000000000000020551316726645600150700ustar00rootroot00000000000000MIT License Copyright (c) 2017 Adam Taranto Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Yanagiba-1.0.0/MANIFEST.in000066400000000000000000000000451316726645600150000ustar00rootroot00000000000000include README.md include LICENSE.txtYanagiba-1.0.0/README.md000066400000000000000000000061211316726645600145220ustar00rootroot00000000000000# Yanagiba *A yanagiba is a japanese blade used for cutting sashimi slices. Albacore is a species of tuna. Puns.* Yanagiba is used to filter short or low quality Oxford Nanopore reads which have been basecalled with Albacore. Takes fastq.gz and an Albacore summary file as input. If no Albacore summary file is provided attempt to calculate mean qscore from directly from fastq file using [NanoMath](https://github.com/wdecoster/nanomath). *Note:* Calculated quality scores appear to be lower for reads called with Metrichor, you may need to lower your minqual setting in this case. # Table of contents * [Getting started](#getting-started) * [Installing Yanagiba](#installing-yanagiba) * [Example usage](#example-usage) * [Standard options](#standard-options) * [License](#license) ## Getting started ### Installing Yanagiba Install from PyPi: ``` pip install yanagiba ``` Clone and install from this repository: ``` git clone git@github.com:Adamtaranto/Yanagiba.git && cd Yanagiba && pip install -e . 
``` ### Example usage **Input requirements:** Albacore summary file must be tab delimited and have the header columns: - "read_id" - "sequence_length_template" - "mean_qscore_template" Unfiltered reads must be provided as fastq.gz file. Extract reads from "albacoreReads.fastq.gz" and retain those with a quality score > 10 and length >= 1000bp. Finally, clip 50 bp from either end of the retained reads and write to "trimmedreads.fastq.bgz" ``` yanagiba --minlen 1000 --headtrim 50 --tailtrim 50 --minqual 10 \ --summaryfile summary.txt \ --infile albacoreReads.fastq.gz \ --outfile trimmedreads.fastq.bgz ``` *Note:* Output files are in fastq.bgz compressed format. Unzip with: ``` gunzip -c trimmedreads.fastq.bgz > trimmedreads.fastq ``` ## Standard options Run `yanagiba --help` to view the program's most commonly used options: ``` Usage: yanagiba [-h] [-l MINLEN] [-q MINQUAL] [-s SUMMARYFILE] -i INFILE [-o OUTFILE] [--headtrim HEADTRIM] [--tailtrim TAILTRIM] [-u] Filter and slice Nanopore reads which have been basecalled with Albacore. Takes fastq.gz and an Albacore summary file. Help: -h, --help Show this help message and exit Input options: -i, --infile Input fastq.gz file. -s, --summaryfile Albacore summary file with header row. Output options: -o, --outfile Write filtered reads to this file in .bgz format. Settings: -l, --minlen Exclude reads shorter than this length. (Default: 0) -q, --minqual Minimum quality score to retain a read. (Default: 10) --headtrim Trim x bases from begining of each read. (Default: 0) --tailtrim Trim x bases from end of each read. (Default: None) -u, --forceunique Enforce unique reads. Only store first instance of a read from fastq input where readID occurs multiple times. (Default: False) ``` ## License Software provided under MIT license. 
Yanagiba-1.0.0/setup.cfg000066400000000000000000000000471316726645600150650ustar00rootroot00000000000000[metadata] description-file = README.mdYanagiba-1.0.0/setup.py000066400000000000000000000023761316726645600147650ustar00rootroot00000000000000from setuptools import setup pypi_classifiers = [ 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', "Development Status :: 4 - Beta", "Environment :: Console", "Operating System :: OS Independent", 'Intended Audience :: Science/Research', 'Natural Language :: English', 'Topic :: Scientific/Engineering :: Bio-Informatics', "Topic :: Software Development :: Libraries :: Python Modules", 'License :: OSI Approved :: MIT License', ] install_requires = [ "nanomath>=0.13.0", "pandas>=0.20.3", 'biopython>=1.70', ] desc = """Filter short or low quality Oxford Nanopore reads which have been basecalled with Albacore.""" setup(name='yanagiba', version='1.0.0', description=desc, url='https://github.com/Adamtaranto/Yanagiba', author='Adam Taranto', author_email='adam.taranto@anu.edu.au', license='MIT', packages=['yanagiba'], classifiers=pypi_classifiers, keywords=["Albacore","Nanopore","basecalling","genome","DNA","sequencing"], install_requires=install_requires, include_package_data=True, zip_safe=False, entry_points={ 'console_scripts': [ 'yanagiba=yanagiba.cmd_line:main', ], }, )Yanagiba-1.0.0/yanagiba/000077500000000000000000000000001316726645600150165ustar00rootroot00000000000000Yanagiba-1.0.0/yanagiba/__init__.py000077500000000000000000000041371316726645600171370ustar00rootroot00000000000000#!/usr/bin/env python #yanagiba #Version 1. Adam Taranto, August 2017 #Contact, Adam Taranto, adam.taranto@anu.edu.au ############################################################################# # Filter and slice Nanopore reads which have been basecalled with Albacore. # # Takes fastq.gz and an Albacore summary file. 
# #############################################################################

import gzip
import pandas
from Bio import SeqIO, bgzf
from nanomath import ave_qual


def getTargets(summaryfile, minlen, minqual):
    """Return the IDs of reads in an Albacore summary file that pass both filters.

    Args:
        summaryfile: Path (or file-like object) of a tab-delimited summary
            with a header row containing the columns "read_id",
            "sequence_length_template" and "mean_qscore_template".
        minlen: Minimum template length to keep (inclusive).
        minqual: Minimum mean qscore to keep (inclusive).

    Returns:
        A list of the "read_id" values of all rows meeting both thresholds.
    """
    df = pandas.read_csv(summaryfile, sep='\t', header=0)
    # DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
    # a boolean mask with .loc selects the same rows on every version.
    passing = (
        (df['sequence_length_template'] >= minlen)
        & (df['mean_qscore_template'] >= minqual)
    )
    return df.loc[passing, 'read_id'].tolist()


def directFilter(infile, outfile, minqual=0, minlen=0, headtrim=None, tailtrim=None, forceuniq=False):
    """Filter and trim reads using quality scores computed from the fastq itself.

    Reads `infile` (gzip-compressed fastq), keeps every record whose length
    is at least `minlen` and whose nanomath `ave_qual` is at least `minqual`,
    slices it as `record[headtrim:tailtrim]` and writes it to `outfile` in
    BGZF format. A one-line summary is printed when done.

    Args:
        infile: Path to the input fastq.gz file.
        outfile: Path of the BGZF-compressed fastq file to write.
        minqual: Minimum average quality to keep a read (inclusive).
        minlen: Minimum untrimmed read length to keep (inclusive).
        headtrim: Slice start applied to each kept record (None = no trim).
        tailtrim: Slice end applied to each kept record; pass a negative
            number to clip bases from the end (None = no trim).
        forceuniq: If True, only the first passing record for each read ID
            is written.
    """
    seen = set()  # IDs already written; only consulted when forceuniq is set
    total = 0
    saved = 0
    with gzip.open(infile, "rt") as handle, bgzf.BgzfWriter(outfile, "wb") as output_handle:
        for record in SeqIO.parse(handle, "fastq"):
            total += 1
            # Thresholds are inclusive so this path agrees with getTargets()
            # and with the CLI help ("Exclude reads shorter than ...",
            # "Minimum quality score ...").
            if len(record) < minlen:
                continue
            if ave_qual(record.letter_annotations["phred_quality"]) < minqual:
                continue
            if forceuniq:
                if record.id in seen:
                    continue
                seen.add(record.id)
            SeqIO.write(record[headtrim:tailtrim], handle=output_handle, format="fastq")
            saved += 1
    print("Saved %s records out of %s records seen." % (saved, total))


def filterReads(infile, outfile, keeplist=None, headtrim=None, tailtrim=None, forceuniq=False):
    """Write the reads named in `keeplist` from `infile` to `outfile`, trimmed.

    Reads `infile` (gzip-compressed fastq), and for every record whose ID is
    in `keeplist` writes `record[headtrim:tailtrim]` to `outfile` in BGZF
    format. When `forceuniq` is set a one-line summary is printed.

    Args:
        infile: Path to the input fastq.gz file.
        outfile: Path of the BGZF-compressed fastq file to write.
        keeplist: Collection of read IDs to retain. None is treated as an
            empty selection (nothing is written).
        headtrim: Slice start applied to each kept record (None = no trim).
        tailtrim: Slice end applied to each kept record; pass a negative
            number to clip bases from the end (None = no trim).
        forceuniq: If True, only the first record for each read ID is
            written.
    """
    if keeplist is None:
        keeplist = []
    # Set membership is O(1); testing against the raw list made the loop
    # quadratic in the number of reads.
    wanted = set(keeplist)
    seen = set()
    with gzip.open(infile, "rt") as handle, bgzf.BgzfWriter(outfile, "wb") as output_handle:
        for record in SeqIO.parse(handle, "fastq"):
            if record.id not in wanted:
                continue
            if forceuniq:
                if record.id in seen:
                    continue
                seen.add(record.id)
            SeqIO.write(record[headtrim:tailtrim], handle=output_handle, format="fastq")
    if forceuniq:
        print("Saved %s unique records from %s filtered records." % (len(seen), len(keeplist)))

# Archive member boundary (kept verbatim from the flattened tar dump):
# Yanagiba-1.0.0/yanagiba/cmd_line.py000066400000000000000000000054571316726645600171550ustar00rootroot00000000000000
#!/usr/bin/env python
#yanagiba
#Version 1.
# Adam Taranto, August 2017
#Contact, Adam Taranto, adam.taranto@anu.edu.au

#############################################################################
# Filter and slice Nanopore reads which have been basecalled with Albacore. #
# Takes fastq.gz and an Albacore summary file.                              #
#############################################################################

import argparse
import yanagiba as yb


def mainArgs():
    """Build the yanagiba command-line parser and parse sys.argv.

    Returns:
        The argparse.Namespace with attributes infile, summaryfile, outfile,
        minlen, minqual, headtrim, tailtrim and forceunique.
    """
    parser = argparse.ArgumentParser(
        description='Filter and slice Nanopore reads which have been basecalled with Albacore. Takes fastq.gz and an Albacore summary file.',
        prog='yanagiba')
    parser.add_argument('-i', '--infile', type=str, required=True, default=None,
                        help='Input fastq.gz file.')
    parser.add_argument('-s', '--summaryfile', type=str, default=None,
                        help='Albacore summary file with header row.')
    parser.add_argument('-o', '--outfile', type=str, default="filtered.fastq.bgz",
                        help='Write filtered reads to this file in .bgz format.')
    parser.add_argument('-l', '--minlen', type=int, default=0,
                        help='Exclude reads shorter than this length. Default: 0')
    parser.add_argument('-q', '--minqual', type=int, default=10,
                        help='Minimum quality score to retain a read. Default: 10')
    parser.add_argument('--headtrim', type=int, default=0,
                        help='Trim x bases from begining of each read. Default: 0')
    parser.add_argument('--tailtrim', type=int, default=None,
                        help='Trim x bases from end of each read. Default: None')
    parser.add_argument('-u', '--forceunique', action='store_true', default=False,
                        help='Enforce unique reads. Only store first instance of a read from fastq input where readID occurs multiple times.')
    args = parser.parse_args()
    return args


def main():
    """Entry point for the `yanagiba` console script.

    Parses the command line, then filters and trims the input fastq.gz:
    via the Albacore summary file when one is given, otherwise via quality
    scores computed directly from the fastq records.
    """
    # Get args
    args = mainArgs()
    # The trim length is used as a slice end, so it must be negative to clip
    # from the end of a read. When --tailtrim is omitted (or 0) use None so
    # the slice keeps the whole tail; previously the name was left unbound in
    # that case and the calls below raised UnboundLocalError.
    tailtrim = -args.tailtrim if args.tailtrim else None
    # Preferentially source read information from summary file.
    if args.summaryfile:
        # Get names of reads which pass filter.
        keeplist = yb.getTargets(args.summaryfile, args.minlen, args.minqual)
        # Read in fastq.gz, keep records which passed filter and trim if required.
        yb.filterReads(args.infile, args.outfile, keeplist=keeplist,
                       headtrim=args.headtrim, tailtrim=tailtrim,
                       forceuniq=args.forceunique)
    else:
        # If no summary file provided, attempt to calculate quality scores
        # directly from fastq using nanomath.
        yb.directFilter(args.infile, args.outfile, minqual=args.minqual,
                        minlen=args.minlen, headtrim=args.headtrim,
                        tailtrim=tailtrim, forceuniq=args.forceunique)