QCumber-2.3.0/
QCumber-2.3.0/.conda_cache/this_file_only_exists_in_order_to_track_this_directory
QCumber-2.3.0/.gitignore
.Rhistory
.idea
.svn
__pycache__
QCumber-2.3.0/.gitlab-ci.yml
image: continuumio/miniconda3:latest
cache:
  paths:
    - $CI_PROJECT_DIR/.conda_cache
before_script:
  - conda update -y -n base conda
  # - "export _PYTHON_SYSCONFIGDATA_NAME='_sysconfigdata_m_linux_x86_64-linux-gnu'"
  # - conda update -y pip
  # - "grep allow_softlinks $CONDA_PREFIX/.condarc || echo allow_softlinks: False >> $CONDA_PREFIX/.condarc"
  # - conda update -y gxx_linux-64 || conda install gxx_linux-64
  # - unset _PYTHON_SYSCONFIGDATA_NAME
  - conda update -y gcc_linux-64 || conda install gcc_linux-64
  - export _PYTHON_SYSCONFIGDATA_NAME='_sysconfigdata_m_linux_x86_64-linux-gnu'
  # - source activate $CXX
  # - gcc --version
  - bash gitlab-ci.sh
  - source activate $CI_PROJECT_DIR/.conda_cache/qcumber
  - bash build.sh
stages:
  - test
test:
  only:
    - development_unstable
  stage: test
  script:
    - cd test
    # - grep MemTotal /proc/meminfo
    - ./test_qcumber2.py
    # - ./test_qcumber2.py > /dev/null 2>&1 &
    # - cat /sys/fs/cgroup/memory/memory.usage_in_bytes
    # - while true; do cat /sys/fs/cgroup/memory/memory.usage_in_bytes; sleep 3; done
QCumber-2.3.0/CHANGELOG
## [Unreleased]
Activation of the new file input coming in 2.4.
A fix for kraken will be in 2.3.1.
## [2.3.0] - 2018-05-16
### Added
- Support for continuous integration
- Sample sheet generation (not used)
#### Test script:
- Verbosity Option
- Introduced different run levels for tests (low to high spec) (CI node friendly)
- remote data support
### Fixes
- SE mode aborted because of regex issue #19
- TrimBetter regex issue caused it to trim everything
## [2.2.1] - 2018-03-20
### Added
#### Test script:
- bash completion
- regex testing
- mapping
- local real data tests (no validation yet)
### Changes
- touch ups
- zipped reference input works consistently
- reexecution of rules is now based on their parameters
#### Test script:
- utility functions for manipulation of goldstandard
### Fixes
- matplotlib issue with long filenames (tight_layout() does not like them)
## [2.2.0] - 2018-01-24
### Added
- An extended test suite is introduced. For more information, please see the README in the test folder (soon: see #15). #4
- Insert size estimation: Distribution plots in mapping folder for each sample and text files with average, min and max fragment length. In batch report: boxplot with fragment length distribution
### Changed
- When using the option --save for the Illumina Sequence Analysis Viewer, all standard generated .xml files are now required, as well as the InterOp folder. #14
- “Couldn’t rename sample files” warnings are not displayed anymore.
- In single end data the plots in the batch report are now colored.
- In the kraken plots the top 10 taxonomies are displayed, instead of all taxonomies above 5%. The taxonomy “root” is silenced. #7
## [2.1.1]
## [2.1.0] - 2017-12-31
### Added
- unclassified out option for kraken
### Changed
- report generation and format of output. Fixes #6 and a concurrency issue
- Permission Updates
## [2.0.4] - 2017-11-17
QCumber-2.3.0/LICENSE
GNU LESSER GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
This version of the GNU Lesser General Public License incorporates
the terms and conditions of version 3 of the GNU General Public
License, supplemented by the additional permissions listed below.
0. Additional Definitions.
As used herein, "this License" refers to version 3 of the GNU Lesser
General Public License, and the "GNU GPL" refers to version 3 of the GNU
General Public License.
"The Library" refers to a covered work governed by this License,
other than an Application or a Combined Work as defined below.
An "Application" is any work that makes use of an interface provided
by the Library, but which is not otherwise based on the Library.
Defining a subclass of a class defined by the Library is deemed a mode
of using an interface provided by the Library.
A "Combined Work" is a work produced by combining or linking an
Application with the Library. The particular version of the Library
with which the Combined Work was made is also called the "Linked
Version".
The "Minimal Corresponding Source" for a Combined Work means the
Corresponding Source for the Combined Work, excluding any source code
for portions of the Combined Work that, considered in isolation, are
based on the Application, and not on the Linked Version.
The "Corresponding Application Code" for a Combined Work means the
object code and/or source code for the Application, including any data
and utility programs needed for reproducing the Combined Work from the
Application, but excluding the System Libraries of the Combined Work.
1. Exception to Section 3 of the GNU GPL.
You may convey a covered work under sections 3 and 4 of this License
without being bound by section 3 of the GNU GPL.
2. Conveying Modified Versions.
If you modify a copy of the Library, and, in your modifications, a
facility refers to a function or data to be supplied by an Application
that uses the facility (other than as an argument passed when the
facility is invoked), then you may convey a copy of the modified
version:
a) under this License, provided that you make a good faith effort to
ensure that, in the event an Application does not supply the
function or data, the facility still operates, and performs
whatever part of its purpose remains meaningful, or
b) under the GNU GPL, with none of the additional permissions of
this License applicable to that copy.
3. Object Code Incorporating Material from Library Header Files.
The object code form of an Application may incorporate material from
a header file that is part of the Library. You may convey such object
code under terms of your choice, provided that, if the incorporated
material is not limited to numerical parameters, data structure
layouts and accessors, or small macros, inline functions and templates
(ten or fewer lines in length), you do both of the following:
a) Give prominent notice with each copy of the object code that the
Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the object code with a copy of the GNU GPL and this license
document.
4. Combined Works.
You may convey a Combined Work under terms of your choice that,
taken together, effectively do not restrict modification of the
portions of the Library contained in the Combined Work and reverse
engineering for debugging such modifications, if you also do each of
the following:
a) Give prominent notice with each copy of the Combined Work that
the Library is used in it and that the Library and its use are
covered by this License.
b) Accompany the Combined Work with a copy of the GNU GPL and this license
document.
c) For a Combined Work that displays copyright notices during
execution, include the copyright notice for the Library among
these notices, as well as a reference directing the user to the
copies of the GNU GPL and this license document.
d) Do one of the following:
0) Convey the Minimal Corresponding Source under the terms of this
License, and the Corresponding Application Code in a form
suitable for, and under terms that permit, the user to
recombine or relink the Application with a modified version of
the Linked Version to produce a modified Combined Work, in the
manner specified by section 6 of the GNU GPL for conveying
Corresponding Source.
1) Use a suitable shared library mechanism for linking with the
Library. A suitable mechanism is one that (a) uses at run time
a copy of the Library already present on the user's computer
system, and (b) will operate properly with a modified version
of the Library that is interface-compatible with the Linked
Version.
e) Provide Installation Information, but only if you would otherwise
be required to provide such information under section 6 of the
GNU GPL, and only to the extent that such information is
necessary to install and execute a modified version of the
Combined Work produced by recombining or relinking the
Application with a modified version of the Linked Version. (If
you use option 4d0, the Installation Information must accompany
the Minimal Corresponding Source and Corresponding Application
Code. If you use option 4d1, you must provide the Installation
Information in the manner specified by section 6 of the GNU GPL
for conveying Corresponding Source.)
5. Combined Libraries.
You may place library facilities that are a work based on the
Library side by side in a single library together with other library
facilities that are not Applications and are not covered by this
License, and convey such a combined library under terms of your
choice, if you do both of the following:
a) Accompany the combined library with a copy of the same work based
on the Library, uncombined with any other library facilities,
conveyed under the terms of this License.
b) Give prominent notice with the combined library that part of it
is a work based on the Library, and explaining where to find the
accompanying uncombined form of the same work.
6. Revised Versions of the GNU Lesser General Public License.
The Free Software Foundation may publish revised and/or new versions
of the GNU Lesser General Public License from time to time. Such new
versions will be similar in spirit to the present version, but may
differ in detail to address new problems or concerns.
Each version is given a distinguishing version number. If the
Library as you received it specifies that a certain numbered version
of the GNU Lesser General Public License "or any later version"
applies to it, you have the option of following the terms and
conditions either of that published version or of any later version
published by the Free Software Foundation. If the Library as you
received it does not specify a version number of the GNU Lesser
General Public License, you may choose any version of the GNU Lesser
General Public License ever published by the Free Software Foundation.
If the Library as you received it specifies that a proxy can decide
whether future versions of the GNU Lesser General Public License shall
apply, that proxy's public statement of acceptance of any version is
permanent authorization for you to choose that version for the
Library.
QCumber-2.3.0/QCumber-2
#!/usr/bin/env python3
__author__ = 'LieuV'
__version__ = "2.1.1"
import argparse
import re
import getpass
import warnings
import os
import json
import sys
from itertools import groupby
from collections import OrderedDict
import subprocess
from pandas import read_csv
import snakemake
import datetime
import yaml
import input_utils
# Set paths
ADAPTER_PATH = "" # Adapter path from trimmomatic.
# Should be set during installation
KRAKEN_DB = subprocess.check_output("echo $KRAKEN_DB_PATH",
shell=True).decode("utf-8").strip()
if len(KRAKEN_DB) == 0:
KRAKEN_DB = " " # insert space such that snakemake can handle empty value
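# Illustrative note (not from the original source): the default Kraken
# database can be supplied through the environment instead of --kraken_db,
# e.g. in the shell before calling QCumber-2:
#   export KRAKEN_DB_PATH=/path/to/kraken_db
# A minimal equivalent sketch using only the standard library (assumption,
# not the author's code) would be:
#   KRAKEN_DB = os.environ.get("KRAKEN_DB_PATH", "").strip() or " "
# The single-space sentinel keeps the snakemake config entry defined even
# when no database is configured.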
# Pattern for Illumina readnames
base_pattern = (
r"(?P.*)_(?PL\d{3})_(?P(R1|R2))_(?P\d{3}).*")
# --------------------------------------< Functions >----------------------------------------------------------
def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter, description="""\
--------------------------------------------------------------------
< QCumber >
< Quality control and read trimming of NGS data >
https://gitlab.com/RKIBioinformaticsPipelines/QCumber/
--------------------------------------------------------------------""",
epilog=("Example usage: QCumber-2 --input fastq_folder"
" --reference reference.fasta"))
# ------------------------------------------------< INPUT >---------------------------------------------------------#
group_input = parser.add_argument_group("Input")
group_input.add_argument(
'--input', '-i', dest='input',
help=("input sample folder. Illumina filenames should end with"
"___number, e.g. Sample_12_345_R1_001.fastq,"
" to find the right paired set."),
required=False, nargs="+")
group_input.add_argument(
'--read1', '-1', dest='r1', help="Read 1 file", required=False)
group_input.add_argument(
'--read2', '-2', dest='r2', help="Read 2 file", required=False)
group_input.add_argument(
'--technology', '-T', dest='technology', choices=["Illumina",
"IonTorrent",
"PacBio"],
required=False,
help=("If not set, automatically determine technology and "
"search for fastq and bam files. "
"Set technology to IonTorrent if all files are bam-files,"
" else set technology to Illumina."))
group_input.add_argument(
'--adapter', '-a', dest='adapter',
choices=['TruSeq2-PE', 'TruSeq2-SE', 'TruSeq3-PE',
'TruSeq3-SE', 'TruSeq3-PE-2', 'NexteraPE-PE'],
help="Adapter name for trimming. Default: all")
mapping_exclusion = group_input.add_mutually_exclusive_group()
mapping_exclusion.add_argument(
'--reference', '-r', dest='reference', required=False,
help=("Map reads against reference."
+ " Reference needs to be in fasta-format."))
mapping_exclusion.add_argument(
'--index', '-I', dest='index', required=False,
help="Bowtie2 index if available.")
group_input.add_argument(
'--kraken_db', '-d', dest='kraken_db',
help=("Custom Kraken database. Default value is taken from"
" environment variable KRAKEN_DB_PATH. "
"Default: %(default)s."),
required=False, default=KRAKEN_DB)
group_input.add_argument(
'--kraken_classified_out', dest='kraken_classified_out',
help=("Kraken (un)classified-out option."
" If set, both the --classified-out"
" and --unclassified-out option are set. "
"Default: %(default)s."),
required=False, default=False, action='store_true')
group_optional = parser.add_argument_group("Optional steps")
group_optional.add_argument(
'--sav', '-w', dest='sav', required=False,
help=("Illumina folder for SAV. Requires RunInfo.xml, RunParameter.xml"
"and Interop folder."))
group_optional.add_argument(
'--trimBetter', choices=["assembly", "mapping", "default"],
help=("Optimize trimming parameter using 'Per sequence base content'"
+ " from fastqc. Not recommended for amplicons."))
group_optional.add_argument('--nokraken', '-K', action="store_true")
group_optional.add_argument('--notrimming', '-Q', action="store_true")
group_params = parser.add_argument_group("Parameter settings")
group_params.add_argument(
'--illuminaclip', '-L', dest='illuminaclip', default="2:30:10",
help=('Illuminaclip option: '
'<seed mismatches>:<palindrome clip threshold>:<simple clip threshold>. '
'Default: %(default)s'))
group_params.add_argument(
'--only_trim_adapters', '-A', action='store_true',
help='If this option is selected, only adapters will be clipped')
group_params.add_argument(
'--minlen', '-m', default=50, dest='minlen',
help=('Minlen parameter for Trimmomatic. Drops reads shorter than minlen.'
' Default: %(default)s'),
type=int)
group_params.add_argument(
'--trimOption', '-O', dest="trimOption",
help=('Additional Trimmomatic input.'
' Default (if trimBetter is not set): SLIDINGWINDOW:4:20'),
type=str)
group_params.add_argument(
'--trimBetter_threshold', '-b', dest='trimBetter_threshold',
help=("Set -trimBetter to use this option.Default setting"
" for Illumina: 0.15 and for IonTorrent: 0.25."),
required=False, type=float)
group_output = parser.add_argument_group("Output")
group_output.add_argument('--output', '-o', dest='output', default="")
group_output.add_argument(
'--rename', '-R', dest="rename", required=False,
help="TSV File with two columns: ")
group_output.add_argument(
'--save_mapping', '-S', action="store_true", default=False)
parser.add_argument('--threads', '-t', dest='threads', default=4, type=int,
help="Number of threads. Default: %(default)s")
parser.add_argument(
'--config', '-c', dest='config',
help=("Configfile to run pipeline. "
"Additional parameters in the commandline "
"will override arguments in configfile."
"If not given and config/config.txt exists in"
"the directory of the QCumber-2 executable,"
"that file will be loaded by default."))
parser.add_argument('--version', '-v',
action='version', version='%(prog)s v' + __version__)
arguments, unknown_args = parser.parse_known_args()
arguments = vars(arguments)
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
configfile = arguments["config"]
default_configfile = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"config",
"config.txt")
if not arguments["config"] and os.path.isfile(default_configfile):
configfile = default_configfile
if configfile:
config_args = yaml.load(open(configfile, "r"))
keep_args = dict()
for arg in config_args.keys():
print(arg)
if (arguments[arg] is None) or arguments[arg] == " ":
arguments[arg] = config_args[arg]
arguments["output"] = os.path.abspath(arguments["output"])
if arguments["only_trim_adapters"]:
arguments["trimBetter"] = None
if not os.path.isdir(arguments["output"]):
os.mkdir(arguments["output"])
if arguments["reference"]:
arguments["reference"] = os.path.abspath(arguments["reference"])
if arguments["sav"]:
arguments["sav"] = os.path.abspath(arguments["sav"])
if arguments["rename"]:
arguments["rename"] = os.path.abspath(arguments["rename"])
parameter = yaml.load(
open(os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"config",
"parameter.txt"),
"r"))
check_input_validity(arguments)
# Load adaptive filetypes
qcumber_path = os.path.dirname(os.path.realpath(__file__))
sample_file_name = os.path.join(arguments["output"], "samples.yaml")
with open(os.path.join(qcumber_path,
'filenames.yaml'),
'r') as filetype_h:
filename_types = yaml.load(filetype_h)
all_files = []
for file_or_dir in arguments["input"]:
if os.path.isdir(file_or_dir):
for root, dirs, files in os.walk(file_or_dir):
for file in files:
if os.path.getsize(os.path.join(root, file)) != 0:
all_files.append(os.path.join(root, file))
elif os.path.isfile(file_or_dir):
if os.path.getsize(file_or_dir):
all_files.append(file_or_dir)
# Get parsed samples from the input_utils module
formats_found, discarded = input_utils.parse_sample_info(
all_files, filename_types, ['pacbio', 'illumina_fastq'])
try:
illumina_data = formats_found['illumina_fastq']
# print(repr(format_known.mfrs).replace('>,', '>,\n'))
except KeyError:
exit('No samples found or none met criteria!!\n'
'These files were discarded:\n'
'%s' % '\n'.join(discarded))
try:
pacbio_data = formats_found['pacbio']
print('looking for pacbio data...')
pac_samples = pacbio_data.get_samples()
with open(sample_file_name.replace('.yaml',
'_pacbio.yaml'),
'w') as sample_file:
yaml.dump(pac_samples, sample_file, default_flow_style=False)
print('looking for illumina data...')
except KeyError:
pass
sample_dict = illumina_data.flatten_naive()
# flatten_naive just drops the read info from the sample name
# if it is paired-end data.
len_known = len(sample_dict)
if False:
try:
salvaged_dict = illumina_data.leftovers.process_leftovers(
rename=True,
rename_start_index=len_known+1)
sample_dict.update(salvaged_dict)
except input_utils.AmbigiousPairedReadsError as err:
eprint('Failed parsing files with unrecognized'
' naming convention\n',
'Reason:\n', err)
# Write samples to working directory
with open(sample_file_name, 'w') as sample_file:
yaml.dump(sample_dict, sample_file, default_flow_style=False)
type, samples, joined_samples, name_dict, join_reads = (
get_input(arguments, parameter))
get_defaults(arguments, parameter)
force_run_list = []
os.makedirs(arguments["output"], exist_ok=True)
config_file_path = os.path.join(arguments["output"], "config.yaml")
#if os.path.isfile(config_file_path):
# pass
#else:
with open(config_file_path, 'w') as config_fh:
yaml.dump(arguments, config_fh,
default_flow_style=False)
# additional infos
general_information = OrderedDict()
general_information["User"] = getpass.getuser()
general_information["QCumber"] = __version__
general_information["QCumber_path"] = os.path.dirname(
os.path.realpath(__file__))
general_information["Execution time"] = datetime.datetime.now().ctime()
system_info = os.uname()
general_information["Operating system"] = OrderedDict()
general_information["Operating system"]["System"] = system_info.sysname
general_information["Operating system"]["Server"] = system_info.nodename
general_information["Operating system"]["Operating version"] = (
system_info.version)
general_information["Operating system"]["Release"] = system_info.release
general_information["Operating system"]["Machine"] = system_info.machine
general_information["Tool versions"] = OrderedDict()
general_information["Tool versions"]["Python"] = re.sub("\n",
"", sys.version)
general_information["Tool versions"]["Snakemake"] = snakemake.__version__
general_information["Tool versions"]["FastQC"] = (
get_version("fastqc --version"))
if not arguments["notrimming"]:
general_information["Tool versions"]["Trimmomatic"] = (
get_version("trimmomatic -version", "trimmomatic"))
if arguments["technology"] == "Illumina" and not arguments["adapter"]:
general_information["adapter"] = (
os.path.join(os.path.dirname(os.path.realpath(__file__)),
"config", "adapters.fa"))
elif arguments["technology"] == "Illumina" and arguments["adapter"]:
general_information["adapter"] = os.path.join(ADAPTER_PATH,
(arguments["adapter"]
+ ".fa"))
if arguments["reference"] or arguments["index"]:
general_information["Tool versions"]["Bowtie2"] = (
get_version("bowtie2 --version"))
if not arguments["nokraken"]:
general_information["Tool versions"]["Kraken"] = (
get_version("kraken --version"))
general_information["Sample information"] = OrderedDict()
general_information["Sample information"]["type"] = type
general_information["Sample information"]["samples"] = samples
general_information["Sample information"]["join_reads"] = join_reads
general_information["Sample information"]["join_lanes"] = joined_samples
general_information["Sample information"]["rename"] = name_dict
os.makedirs(os.path.join(arguments["output"],
"QCResults", "_data"), exist_ok=True)
general_information_file = os.path.join(arguments["output"],
"QCResults", "_data",
"general_information.json")
json.dump(general_information, open(general_information_file, "w"))
# Fixed:
# if QCumber is run repeatedly, the rule bowtie_mapping
# will not be executed.
# hence it is necessary to force to run the rule everytime
# if (arguments["save_mapping"]):
# force_run = "--forcerun bowtie_mapping"
# else:
force_run = "--forcerun"
cmd_string = (
"snakemake "
"--configfile {workdir}/config.yaml "
"--snakefile {snakefile} {additional_commands} "
"--directory {workdir} "
"--cores {cores} {targets} {force_run} "
).format(
additional_commands=" ".join(unknown_args),
snakefile=os.path.join(os.path.dirname(os.path.realpath(__file__)),
"Snakefile"),
workdir=arguments["output"],
configfile=general_information_file,
cores=arguments["threads"],
targets='', # "QCResults/batch_report.html",
force_run=force_run)
print(cmd_string)
process = subprocess.Popen(
("snakemake "
"--configfile {workdir}/config.yaml "
"--snakefile {snakefile} {additional_commands} "
"--directory {workdir} "
"--cores {cores} {targets} {force_run} "
' -R $(snakemake --list-params-changes '
# '--list-input-changes --list-code-changes '
'--configfile {workdir}/config.yaml '
'--snakefile {snakefile} {additional_commands} '
'--directory {workdir} '
'--cores {cores} {targets} {force_run})'
).format(
additional_commands=" ".join(unknown_args),
snakefile=os.path.join(os.path.dirname(os.path.realpath(__file__)),
"Snakefile"),
workdir=arguments["output"],
configfile=general_information_file,
cores=arguments["threads"],
targets='', # "QCResults/batch_report.html",
force_run=force_run),
shell=True)
process.wait()
exit(process.returncode)
def get_basename(abs_name):
return os.path.basename(os.path.splitext(
os.path.splitext(os.path.basename(abs_name))[0])[0])
def getFilenameWithoutExtension(string, getBase=False):
if getBase:
string = os.path.basename(string)
string = os.path.splitext(string)[0]
i = 0
while os.path.splitext(string)[-1] in [
".gz", ".gzip", ".zip", ".bz", ".fasta", ".fastq", ".bam"]:
string = os.path.splitext(string)[0]
return string
def get_setname(filename, base=True, grouping=True):
""" Get Setname
Args:
filename (obj::`str`): filename;
Kwargs:
base (bool): transform filename to basename (default = True)
grouping (bool): if not grouping, only return basename of file
(default = True)
Returns:
"""
if not grouping:
return get_basename(filename)
try:
if base:
filename = get_basename(filename)
sep = iter([";", "\t"])
if arguments["rename"]:
rename_dict = read_csv(arguments["rename"],
index_col=0, header=None)
while (rename_dict.columns.__len__() == 0
or sep.__length_hint__() != 0):
rename_dict = read_csv(arguments["rename"], index_col=0,
header=None, sep=next(sep))
if rename_dict.columns.__len__() == 0:
warnings.warn(
("Problems reading rename-file %s. "
"Valid delimiters are"
" ';', ',' or '\\t'.") % arguments["rename"],
UserWarning, stacklevel=2)
new_name = [(x, rename_dict.ix[x][1]) for x
in rename_dict.index
if filename.startswith(x)]
if len(new_name) != 1:
warnings.warn(
("Could not find unique renames."
" Found %s for %s") % (new_name, filename),
UserWarning, stacklevel=2)
filename = getFilenameWithoutExtension(filename, True)
else:
filename = filename.replace(new_name[0][0], new_name[0][1])
except:
# print("Couldnt rename sample files.")
pass
try:
paired_reads_pattern = base_pattern
setname_pattern = re.search(paired_reads_pattern,
os.path.basename(filename))
if setname_pattern:
return setname_pattern.group("setname")
else:
return filename
except:
print("Problems getting samplenames: %s" % filename)
return filename
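# Illustrative sketch (assumption, not from the original source): with the
# default Illumina pattern above,
#   get_setname("Sample12_S1_L001_R1_001.fastq.gz")  ->  "Sample12_S1"
# get_basename() first strips up to two extensions, then the regex keeps
# only the part before the _<lane>_<read>_<number> suffix; names that do
# not match the pattern are returned unchanged.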
def get_defaults(arguments, parameter):
if arguments["only_trim_adapters"]:
return
if arguments["trimBetter"] == "assembly":
# if arguments["forAssembly"]:
if not arguments["trimBetter_threshold"]:
arguments["trimBetter_threshold"] = (
parameter["forAssembly." + arguments['technology']]
['trimBetter_threshold'])
if not arguments["trimOption"]:
arguments["trimOption"] = (
parameter["forAssembly." + arguments['technology']]
["trimOption"])
elif arguments["trimBetter"] == "mapping":
# elif arguments["forMapping"]:
if not arguments["trimBetter_threshold"]:
arguments["trimBetter_threshold"] = (
parameter["forMapping." + arguments['technology']]
["trimBetter_threshold"])
if not arguments["trimOption"]:
arguments["trimOption"] = (
parameter["forMapping." + arguments['technology']]
["trimOption"])
elif arguments["trimBetter"] == "default":
arguments["trimBetter_threshold"] = (parameter["Trimmomatic"]
["trimBetter_threshold"])
if not arguments["trimOption"]:
arguments["trimOption"] = parameter["Trimmomatic"]["trimOption"]
if arguments["trimBetter_threshold"]:
arguments["trimBetter_threshold"] = (parameter["Trimmomatic"]
["trimBetter_threshold"])
def get_version(cmd, jar=None):
try:
return re.match(r"(?P.*)\n",
subprocess.check_output(
cmd, shell=True, stderr=subprocess.PIPE
).decode("utf-8")).group("version")
except:
try:
try:
return subprocess.check_output(
cmd, shell=True,
stderr=subprocess.PIPE
).decode("utf-8")
except:
from debpackageinfo import get_upstream_version
return get_upstream_version(jar)
except:
return "NaN"
def get_input(arguments, parameter):
"""
Get Read Input
Checks whether input reads are from IonTorrent or Illumina by checking
for bam and/or fastq file extensions. Detects the presence of read pairs
in the input and returns the type of reads used (single end / paired end).
Args:
None
Returns:
(obj::`str`): type - PE or SE (Paired Ends / Single Ends)
(obj::`collections.OrderedDict`): sample_dict
(obj::`collections.OrderedDict`): join_lanes
(obj::`dict`): name_dict
(obj::`dict`): join_reads
"""
bam_ext = [x.strip(" ") for x in parameter["Fileextension"]["bam"]]
fastq_ext = [x.strip(" ") for x in parameter["Fileextension"]["fastq"]]
sample_dict = OrderedDict()
all_files = []
name_dict = {}
join_reads = {}
type = "PE"
join_lanes = OrderedDict()
if arguments["r1"]:
assert os.path.getsize(arguments["r1"]) != 0, (
"File %s is empty." % arguments["r1"])
if any([arguments["r1"].endswith(ext) for ext in bam_ext]):
arguments["technology"] = "IonTorrent"
else:
arguments["technology"] = "Illumina"
if arguments["r2"]:
assert os.path.getsize(arguments["r2"]) != 0, (
"File %s is empty." % arguments["r2"])
sample_dict[get_setname(arguments["r1"])] = (
[os.path.abspath(arguments["r1"]),
os.path.abspath(arguments["r2"])])
name_dict[get_basename(arguments["r1"])] = (
get_setname(arguments["r1"]) + "_R1")
name_dict[get_basename(arguments["r2"])] = (
get_setname(arguments["r2"]) + "_R2")
else:
type = "SE"
sample_dict[get_setname(arguments["r1"])] = (
[os.path.abspath(arguments["r1"])])
name_dict[get_basename(arguments["r1"])] = (
get_setname(arguments["r1"]))
else:
if os.path.isdir(arguments["input"][0]):
for root, dirs, files in os.walk(arguments["input"][0]):
for file in files:
if any([file.endswith(ext)
for ext in fastq_ext + bam_ext]):
if os.path.getsize(os.path.join(root, file)) != 0:
all_files.append(os.path.join(root, file))
else:
warnings.warn("Skip empty file %s" % file,
stacklevel=2)
else:
all_files = arguments["input"]
assert all_files, (
("Check input again. "
"No files found for pattern %s ") % arguments["input"])
if len([x for x
in all_files
if any([ext in x for ext in bam_ext])
]
) == len(all_files):
arguments["technology"] = "IonTorrent"
else:
arguments["technology"] = "Illumina"
if (len(all_files) == 0):
sys.exit(str(arguments["input"])
+ " does not contain fastq or bam files.")
# find read pairs
all_files = sorted(list(all_files))
if all([re.search(base_pattern, x) for x in all_files]):
for setname, files in groupby(all_files,
key=lambda x: re.search(
base_pattern,
x
).group("setname")):
read_pairs = dict()
setname = get_setname(setname)
for lane, lane_file in groupby(list(files),
key=lambda x: re.search(
base_pattern,
x
).group("lane")):
read_pairs[lane] = []
for readgroup, readfiles in groupby(
list(lane_file),
key=lambda x: re.search(base_pattern, x
).group("read")):
readfiles = list(readfiles)
if len(readfiles) != 0:
if len(readfiles) > 1:
concat_reads = (
"QCResults/tmp/join_reads/"
+ "_".join(
re.search(base_pattern,
os.path.basename(
readfiles[0])
).groups()[:-2])
+ "_000.fastq.gz")
join_reads[concat_reads] = [os.path.abspath(x)
for x in readfiles]
readfiles = concat_reads
else:
readfiles = os.path.abspath(readfiles[0])
read_pairs[lane].append(readfiles)
# Multiple lanes
if len(read_pairs) > 1:
join_lanes[setname] = []
for key in sorted(read_pairs.keys()):
samplename = setname + "_" + key
sample_dict[samplename] = read_pairs[key]
if len(read_pairs[key]) == 2: # type == "PE":
name_dict[get_basename(read_pairs[key][0])] = (
get_setname(samplename) + "_R1")
name_dict[get_basename(read_pairs[key][1])] = (
get_setname(samplename) + "_R2")
else:
type = "SE"
name_dict[get_basename(read_pairs[key][0])] = (
get_setname(samplename, grouping=False))
if len(read_pairs) > 1:
join_lanes[setname].append(samplename)
else: # treat each file as sample
print("Treat files as single end")
sample_dict = OrderedDict(
[[get_setname(x, grouping=False), [os.path.abspath(x)]]
for x in all_files])
name_dict = dict(
[get_basename(x), get_setname(x, grouping=False)]
for x in all_files)
type = "SE"
return type, sample_dict, join_lanes, name_dict, join_reads
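# Illustrative sketch (assumption, not from the original source): for a
# single paired-end pair
#   Sample1_S1_L001_R1_001.fastq.gz / Sample1_S1_L001_R2_001.fastq.gz
# get_input() returns roughly
#   type        -> "PE"
#   sample_dict -> {"Sample1_S1_L001": ["/abs/.../Sample1_S1_L001_R1_001.fastq.gz",
#                                       "/abs/.../Sample1_S1_L001_R2_001.fastq.gz"]}
#   join_lanes  -> {}   (only one lane, so nothing to join)
#   name_dict   -> {"Sample1_S1_L001_R1_001": "Sample1_S1_L001_R1",
#                   "Sample1_S1_L001_R2_001": "Sample1_S1_L001_R2"}
#   join_reads  -> {}   (no split files to concatenate)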
def check_input_validity(arguments):
if arguments["reference"]:
ref_file = arguments["reference"]
seq_record = ""
if not os.path.exists(arguments["reference"]):
sys.exit("Reference does not exist.")
try:
if ref_file[-2:] == "gz":
with subprocess.Popen(["gzip", "-cd", ref_file],
stdout=subprocess.PIPE) as gz_proc:
seq_record = gz_proc.stdout.readline().decode()
gz_proc.terminate()
else:
with open(arguments["reference"], "r") as ref_fh:
seq_record = ref_fh.readline()
assert seq_record.startswith(">"), ("Error: Reference"
"file is not valid.")
except AssertionError as e:
sys.exit(e)
except FileNotFoundError:
sys.exit("Error: Reference file not found")
#
# Check validity of Kraken DB
if not arguments["nokraken"]:
if not os.path.exists(arguments["kraken_db"]):
sys.exit("ERROR: %s does not exist.i"
" Enter a valid database"
" for kraken" % arguments["kraken_db"])
else:
if "database.kdb" not in os.listdir(arguments["kraken_db"]):
sys.exit("ERROR: database "
+ arguments["kraken_db"]
+ " does not contain necessary file database.kdb")
#
# Check input
if arguments["input"]:
if not all([os.path.exists(x[0]) for x in arguments["input"]]):
sys.exit(str(arguments["input"]) + " does not exist.")
else:
if not arguments["r1"]:
sys.exit("Pleaser enter an input file (--input or -1/-2)")
if not os.path.isfile(arguments["r1"]):
sys.exit(arguments["r1"]
+ " does not exist. Input file required."
+ " Use option -input or -1 / -2.")
if arguments["r2"]:
if not os.path.isfile(arguments["r2"]):
sys.exit(arguments["r2"] + " does not exist.")
if arguments["trimBetter_threshold"] and not arguments["trimBetter"]:
sys.exit("--trimBetter must be set to use --trimbetter_threshold."
" Add --trimBetter to your command "
"or remove --trimbetter_threshold.")
# --------------------------------------< main >-------------------------------
if __name__ == "__main__":
main()
QCumber-2.3.0/Rscripts/barplot.R
options(warn=-1)
require(jsonlite)
require(ggplot2)
args = commandArgs(trailingOnly=TRUE)
convert2filename<- function(string, ext=".png"){
string<- gsub("\\[%\\]", "percentage", string)
string<- gsub("\\[#\\]", "number", string)
string<- gsub(" ", "_", string)
return(paste(string, ext, sep=""))
}
summary_json<- jsonlite::fromJSON(args[1])$summary
tablenames<- names(summary_json)[!names(summary_json) %in% c("images", "Trim Parameter")]
summary_json<- summary_json[tablenames]
#summary<- as.data.frame(read.csv(args[1]))
for( i in tablenames[2:length(tablenames)]){
ggplot(summary_json, aes(x=summary_json[,"setname"], y = summary_json[,i]), environment = environment()) +
geom_bar(stat = "identity", fill="#4593C1") +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5), legend.position = "none") +
ggtitle(i) +
xlab("Sample")+
ylab(i)
ggsave(paste(args[2], convert2filename(i), sep="/"))
}
temp_json <- data.frame(rbind(cbind( setname = summary_json$setname, type = "Total reads [#]", value= summary_json$`Total reads [#]`),
cbind( setname = summary_json$setname, type = "Reads after trimming [#]", value= summary_json$`Reads after trimming [#]`)
))
temp_json$value <- as.numeric(as.character(temp_json$value))
ggplot(temp_json, aes(x=setname, y = value, by=type, fill=type))+
geom_bar(stat="identity",position = "identity", alpha= 0.9) +
theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5)) +
ggtitle("Number of Reads") +
xlab("Sample")+
ylab("Number of Reads")
ggsave(paste(args[2], "number_of_reads.png", sep="/"))
QCumber-2.3.0/Rscripts/boxplot.R
#!/usr/bin/env Rscript
options(warn=-1)
library(ggplot2)
args = commandArgs(trailingOnly=TRUE)
mytable<- read.csv(args[1], header = F)
colnames(mytable) <- c("Sample", "Trimmed", "Read", "Value", "Count")
mytable<- mytable[which(mytable$Count>0),]
if(!any(is.na(mytable$Read))){
gp<- ggplot(mytable, aes(fill=Read,group=interaction(Sample, Trimmed, Read),x=Sample, y=Value, weight=Count))
gp<- gp+ geom_boxplot(outlier.size = 0.5) +theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5),legend.position="none") +
scale_fill_manual(values=c("#E25845", "#4593C1")) +
ggtitle(args[3]) +
xlab(args[4])+
ylab(args[5])
}else{
gp<- ggplot(mytable, aes(fill="R1",group=Sample,x=Sample, y=Value, weight=Count))
gp<- gp+ geom_boxplot(outlier.size = 0.5) +theme(axis.text.x=element_text(angle=90, hjust=1, vjust = 0.5),legend.position="none") +
scale_fill_manual(values=c("#E25845")) +
ggtitle(args[3]) +
xlab(args[4])+
ylab(args[5])
}
if(length(unique(mytable$Sample))>15){
gp<-gp + facet_wrap(~ Trimmed, ncol=1)
height=20
}else{
gp<-gp + facet_wrap(~ Trimmed)
height=10
}
ggsave(args[2],plot=gp, unit="cm") QCumber-2.3.0/Rscripts/sav.R 0000775 0000000 0000000 00000014252 13301047114 0015600 0 ustar 00root root 0000000 0000000 library(savR)
library(reshape2)
args = commandArgs(trailingOnly=TRUE)
project <- savR(args[1])
################
## Indexing ##
################
#total reads
total_reads<- clusters(project, 1L)
pf_reads<- pfClusters(project, 1L)
################
## Plots ##
################
##
# Data By Cycle
##
extraction<- extractionMetrics((project))
pdf("QCResults/SAV.pdf")
# Data By Cycle, FWHM/All Lanes / Both surfaces / All Bases
reshaped_extraction <- melt(extraction, measure.vars= c("FWHM_A","FWHM_C", "FWHM_T","FWHM_G"))
FWHM<- (aggregate(reshaped_extraction$value, by=list(reshaped_extraction$cycle, reshaped_extraction$variable), FUN=mean))
colnames(FWHM) <- c("Cycles","FWHM", "Value")
FWHM$FWHM<- sub("FWHM_","",FWHM$FWHM)
ggplot(data=FWHM )+
geom_line( aes(x=Cycles , y =Value, color=FWHM)) +
ggtitle("Data by Cycle - FWHM") +
xlab("Cycle") +
ylab("All bases FWHM")
ggsave(paste(args[2], "/data_by_cycle_fwhm.png", sep=""))
# Data By Cycle,Intensity /All Lanes / Both surfaces / All Bases
reshaped_extraction <- melt(extraction, measure.vars= c("int_A","int_C", "int_T","int_G"))
intensity<- (aggregate(reshaped_extraction$value, by=list(reshaped_extraction$cycle, reshaped_extraction$variable), FUN=mean))
colnames(intensity) <- c("Cycles","Intensity", "Value")
intensity$Intensity<- sub("int_","", intensity$Intensity)
ggplot(data=intensity )+
geom_line( aes(x=Cycles , y =Value, color=Intensity))+
ggtitle("Data By Cycle - Intensity")+
xlab("Cycle")+ylab("All bases intensity")
ggsave(paste(args[2], "/data_by_cycle_intensity.png", sep=""))
# Data By Cycle, %Base /All Lanes / Both surfaces / All Bases
#
corr<- correctedIntensities(project)
corr[,seq(14,17)]<-round(corr[,seq(14,17)] / apply(corr[,seq(14,17)], 1, sum) *100,2)
corr<- melt(corr, measure.vars= c("num_A","num_C", "num_T","num_G"))
corr<-(aggregate(corr$value, by=list(corr$cycle, corr$variable), FUN=mean))
colnames(corr)<- c("Cycle", "Base", "Perc_Base")
corr$Base<- sub("num_","", corr$Base)
ggplot(corr) +
geom_line(aes(x=Cycle, y= Perc_Base, color=Base)) +
ylab("All Bases % Base") +
ggtitle("Data by Cycle - % Base")
ggsave(paste(args[2], "/data_by_cycle_base.png" , sep =""))
##
# Data By Lane
##
tiles<- tileMetrics(project)
# Density, Both Surfaces
#pfBoxplot(project) # Generate a boxplot of the numbers of clusters and the number of Illumina pass-filter clusters per tile and lane
dens <-(tiles[which(tiles$code==100 | tiles$code==101 ),])
dens[which(dens$code==100),]$code <- "Raw Clusters"
dens[which(dens$code==101),]$code<- "PF Clusters"
dens$value <- dens$value/1000
ggplot(data = dens , aes(x=lane, y=value, fill=code))+
geom_boxplot() +
ggtitle("Data By Lane - Cluster Density") +
xlab("Lane")+ylab("Cluster Density (K/mm2)")
ggsave(paste(args[2], "/data_by_lane_cluster.png", sep=""))
# Phasing, Both Surfaces, All Bases
phasing_code <- seq(200, (200 + (length(project@reads)-1)*2),2)
phasing <-(tiles[which(tiles$code %in% phasing_code) ,])
for(i in phasing_code){
cat(paste("Read ",((i-200)/2)+1))
phasing[which(phasing$code==i),]$code = paste("Read ",((i-200)/2)+1)
}
ggplot(data = phasing[which(phasing$value>0),] , aes(x=lane, y=value*100, fill=code))+
geom_boxplot() +
ggtitle("Data By Lane - Phasing")+
xlab("Lane")+
ylab("% Phasing")+
scale_x_continuous(breaks = unique(phasing$lane))
ggsave(paste(args[2], "/data_by_lane_phasing.png", sep=""))
# Pre-Phasing, Both Surfaces, All Bases
prephasing_code <- seq(201, (201 + (length(project@reads)-1)*2),2)
prephasing <-(tiles[which(tiles$code %in% prephasing_code) ,])
for(i in prephasing_code){
prephasing[which(prephasing$code==i),]$code = paste("Read ",((i-201)/2)+1)
}
ggplot(data = prephasing[which(prephasing$value>0),] , aes(x=lane, y=value*100, fill=code))+
geom_boxplot() +
ggtitle("Data By Lane - Prephasing")+
xlab("Lane")+
ylab("% Prephasing") +
scale_x_continuous(breaks = unique(prephasing$lane))
ggsave(paste(args[2], "/data_by_lane_prephasing.png", sep=""))
##
# QScore Heatmap
##
png(paste(args[2], "/qscore_heatmap.png", sep=""), height=1025, width = 2571, res = 200)
qualityHeatmap(project, lane=seq(1,project@layout@lanecount) ,read=c(1,2))+ theme(axis.title.y = element_blank())
dev.off()
qualityHeatmap(project, lane=seq(1,project@layout@lanecount) ,read=c(1,2))+ theme(axis.title.y = element_blank())
qualy<- qualityMetrics(project)
qualy<- data.frame(apply(qualy, 2, as.numeric))
qualy_all<- melt(qualy, measure.vars= colnames(qualy)[4:ncol(qualy)])
qualy_all<- aggregate(qualy_all$value, by=list(qualy_all$variable), FUN=sum)
colnames(qualy_all)<- c("QScore","Total")
qualy_all$Total <- qualy_all$Total/1000000
qualy_all$QScore <- as.numeric(qualy_all$QScore)
ggplot(qualy_all, aes(x=QScore, y = Total )) +
geom_bar(stat="identity", aes(fill=QScore>=30)) +
ylab("Total (million)") +
geom_vline(aes(xintercept=30), linetype="dashed") +
geom_text(aes(x=35, y=max(Total)-max(Total)*0.1 ,label=(paste("QScore >=30 \n",
round(sum(qualy_all[which(qualy_all$QScore>=30),]$Total)/1000,2),
"G \n",
round(sum(qualy_all[which(qualy_all$QScore>=30),]$Total)/ sum(qualy_all$Total)*100,2),
"%")
))) +
ggtitle("QScore Distribution") +
theme(legend.position="none")
ggsave(paste(args[2], "/qscore_distr.png", sep=""))
over_q30 <- which(colnames(qualy) =="Q30"):ncol(qualy)
qualy_q30 <- as.data.frame(cbind(qualy[which(qualy$cycle>=25),"cycle"], apply(qualy[which(qualy$cycle>=25),over_q30],1, sum)))
colnames(qualy_q30) <- c("cycle", "sum")
sum_per_cycle <- cbind(qualy[which(qualy$cycle>=25),"cycle"], apply(qualy[which(qualy$cycle>=25),],1, sum))
colnames(sum_per_cycle) <- c("cycle", "sum")
qualy_q30$sum <-100* qualy_q30$sum/ sum_per_cycle[,"sum"]
ggplot(qualy_q30, aes(x=cycle, y = as.numeric(sum) )) +
geom_point()+
ylab("% >=Q30") +
ggtitle("Data by Cycle - %>=Q30")
ggsave(paste(args[2], "/qscore_q30.png", sep=""))
dev.off()
QCumber-2.3.0/Snakefile
__version__ = "2.0.0"
include: "modules/init.snakefile"
include: "modules/sav.snakefile"
include: "modules/fastqc.snakefile"
include: "modules/trimming.snakefile"
include: "modules/mapping.snakefile"
include: "modules/classification.snakefile"
#-------------------< Helper functions >---------------------------------------------------------#
from modules.json_output import write_summary_json, write_summary_json_new, get_fastqc_results, combine_csv, get_plot_type_names
from modules.utils import which
def trimming_input(wildcards):
if not config["notrimming"]:
if geninfo_config["Sample information"]["type"] == "PE":
return expand("{path}/trimmed/{sample}_{read}_fastqc",
path = fastqc_path, read=["R1", "R2"],
sample=geninfo_config["Sample information"]["samples"])
else:
return expand("{path}/trimmed/{sample}_fastqc",
path = fastqc_path, sample=sample_dict.keys())
else:
return None
def get_input(wildcards, if_not, ext, samplelist =[], path = ""):
if path !="":
path += "/"
if not config[if_not]:
if samplelist:
return expand("{path}{sample}{ext}" ,
path = path, ext=ext, sample=samplelist)
return expand("{path}{sample}{ext}",
path = path, ext=ext, sample=wildcards.sample)
else:
return ""
def get_all_fastqc(wildcards, path = fastqc_path + "/raw"):
'''
Generate raw sample names
Note:
I (Rene) believe that this should also return the _fastqc_data.txt
files, because they are required by trimBetter and the summary.
'''
return ["%s/%s_fastqc%s" % (
path,
geninfo_config["Sample information"]["rename"][get_name(x)],
fastqc_stat)
for x in unique_samples[wildcards.sample]
for fastqc_stat in ["","/fastqc_data.txt"]]
def get_trimmomatic_fastqc(wildcards, ext, path = trimming_path):
'''
Generate list of filepaths ending with read identifying string and _fastqc
Returns:
obj::`list` of filenames
Example:
["Path/to/QCResults/FastQC/trimmed/Sample1_S1_L001_R1_fastqc",
"Path/to/QCResults/FastQC/trimmed/Sample1_S1_L001_R2_fastqc"]
'''
if config["notrimming"]:
return []
paired = []
if geninfo_config["Sample information"]["type"]=="PE" and ext =="_fastqc":
paired =["_R1","_R2"]
if wildcards.sample in geninfo_config["Sample information"]["samples"].keys():
if paired:
return expand("{path}/{sample}{paired}{ext}",
sample= wildcards.sample, ext = ext, path = path,
paired = paired)
else:
return expand("{path}/{sample}{ext}" , sample= wildcards.sample,
ext = ext, path = path)
else:
if paired:
return expand("{path}/{sample}{paired}{ext}",
sample=(geninfo_config["Sample information"]
["join_lanes"][wildcards.sample]),
ext = ext, path = path, paired = paired)
else:
return expand("{path}/{sample}{ext}",
sample=(geninfo_config["Sample information"]
["join_lanes"][wildcards.sample]),
ext = ext, path = path)
assert False, "Something went wrong"
def get_trimmomatic_pseudofile(wildcards):
'''
Provides locations for pseudofiles used to force trimmomatic to run
This used to be done with log files, which caused those to disappear in
case of an error.
These files used to be passed to get_trimmomatic_results(),
but since they do not contain any data, they produced bad values in the report.
'''
if wildcards.sample in geninfo_config["Sample information"]["samples"].keys():
return expand("{path}/{sample}.trimmomatic.log" ,
sample= wildcards.sample, path=log_path)
else:
return expand("{path}/{sample}.trimmomatic.log",
sample=(geninfo_config["Sample information"]
["join_lanes"][wildcards.sample]),
path = log_path)
def get_trimmomatic_params(wildcards):
if wildcards.sample in geninfo_config["Sample information"]["samples"].keys():
return expand("{path}/{sample}.trimmomatic.params",
sample = wildcards.sample, path=trimming_path)
else:
return expand("{path}/{sample}.trimmomatic.params",
sample=(geninfo_config["Sample information"]
["join_lanes"][wildcards.sample]),
path=trimming_path)
def get_batch_files(wildcards):
steps = {"summary_json": data_path + "/summary.json"}
# if pdflatex is not installed on the system, skip pdf output files
if which("pdflatex") is not None:
steps["sample_report"] = expand("{path}/{sample}.pdf",
sample=unique_samples.keys(),
path=main_path)
if config["sav"]:
steps["sav"] = sav_results
if not config["nokraken"]:
steps["kraken_html"] = main_path + "/kraken.html"
steps["kraken_png"] = classification_path + "/kraken_batch.png"
return steps
#--------------------------------------------< RULES >-----------------------------------------------------------------#
rule run_all:
input:
main_path + "/batch_report.html",
lambda wildcards: ((
"%s/%s.sam" % (mapping_path, samp)
for samp in unique_samples.keys()) if config["save_mapping"] else [])
params:
save_mapping = config["save_mapping"]
rule write_final_report:
input:
unpack(get_batch_files)
output:
main_path + "/batch_report.html"
run:
#shell("cp {source} {output}", source = join(geninfo_config["QCumber_path"], "batch_report.html"))
env = Environment(
trim_blocks=True,
variable_start_string='{{~', variable_end_string="~}}")
env.loader = FileSystemLoader(geninfo_config["QCumber_path"])
template = env.get_template("batch_report.html")
summary = json.load(open(str(input.summary_json), "r"))
general_information = json.load(
open( data_path + "/general_information.json", "r"))
if config["sav"]:
sav = json.load(open( str(input.sav), "r"))
sav_json = json.dumps(sav)
else:
sav_json = []
#sav = json.load(open(str(input.general_information), "r"), object_pairs_hook=OrderedDict)
geninfo_config["Commandline"] = cmd_input
html = template.render(
general_information= json.dumps(config),
summary = json.dumps(summary["Results"]),
summary_img = json.dumps(summary["summary_img"]),
sav = sav_json )
html_file = open(str(output), "w")
html_file.write(html)
html_file.close()
# Write PDF report for each sample
def get_steps_per_sample(wildcards):
'''
Get dictionary of steps required to write sample output
sets up filenames required by rule "get_sample_json"
These vary depending on the arguments provided by the user
Affected by:
notrimming | reference | nokraken | nomapping
Returns:
steps (obj::`dict`): dictionary of required steps
key is obj::`str` step
value is obj::`list`(obj::`str`) filenames
'''
steps = {"raw_fastqc" : get_all_fastqc(wildcards)}
if not config["notrimming"]:
steps["trimming"]= get_trimmomatic_pseudofile(wildcards)
steps["trimming_params"] = get_trimmomatic_params(wildcards)
steps["trimming_fastqc"] = get_trimmomatic_fastqc(
wildcards,
"_fastqc", path=fastqc_path + "/trimmed")
if config["reference"] or config["index"]:
steps["mapping"] = get_input(
wildcards, if_not="nomapping",
ext=".bowtie2.log", samplelist=[], path=log_path)
if not config["nokraken"]:
steps["kraken"] = get_input(
wildcards,if_not = "nokraken", ext=".csv", samplelist=[],
path = classification_path ) # "{path}/{wildcards.sample}.kraken.png".format(path = classification_path, wildcards=wildcards)
steps["kraken_log"] = get_input(
wildcards,if_not = "nokraken", ext=".kraken.log",
samplelist=[], path = log_path)
return steps
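# Illustrative sketch (assumption, not from the original source): with a
# reference given, kraken enabled and trimming enabled, get_steps_per_sample()
# produces a dict along the lines of
#   {"raw_fastqc":      [".../FastQC/raw/Sample1_R1_fastqc", ...],
#    "trimming":        [".../Sample1.trimmomatic.log"],
#    "trimming_params": [".../Sample1.trimmomatic.params"],
#    "trimming_fastqc": [".../FastQC/trimmed/Sample1_R1_fastqc", ...],
#    "mapping":         [".../Sample1.bowtie2.log"],
#    "kraken":          [".../Sample1.csv"],
#    "kraken_log":      [".../Sample1.kraken.log"]}
# which rule write_sample_json then unpacks as its named inputs.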
''' raw_fastqc = get_all_fastqc,
trimming =get_trimmomatic_log,
trimming_params = lambda wildcards: get_trimmomatic_params(wildcards),
trimming_fastqc = lambda wildcards: get_trimmomatic_fastqc(wildcards, "_fastqc", path = fastqc_path + "/trimmed"),
mapping = lambda wildcards: get_input(wildcards,if_not = "nomapping", ext=".bowtie2.log",samplelist=[], path = log_path),
kraken = lambda wildcards: get_input(wildcards,if_not = "nokraken", ext=".csv", samplelist=[], path = classification_path ),
kraken_log = lambda wildcards: get_input(wildcards,if_not = "nokraken", ext=".kraken.log", samplelist=[], path = log_path)
'''
def get_sample_json_output():
output = {
"json": data_path + "/{sample}.json",
"newjson" : data_path + "/{sample}_new.json",
}
for plot_type_name in get_plot_type_names():
output["samplecsv" + plot_type_name] = temp(data_path + "/{sample}_" + plot_type_name + ".csv")
if not config["nokraken"]:
output["kraken_plot"] = classification_path + "/{sample}.kraken.png"
return output
'''
##### Note: Most runtime bugs are somehow involved with this rule ######
It calls getter functions from submodule snakefiles found in "./modules/"
This rule has lots of side effects
'''
rule write_sample_json:
input:
unpack(get_steps_per_sample)
output:
**get_sample_json_output()
params:
notrimming=config["notrimming"],
nokraken=config["nokraken"],
nomapping=config["nomapping"]
message:
"Write {wildcards.sample}.json"
run:
summary_dict = OrderedDict()
summary_dict["Name"] = wildcards.sample
summary_dict["Files"] = unique_samples[wildcards.sample]
summary_dict["Date"] = datetime.date.today().isoformat()
paired_end = geninfo_config["Sample information"]["type"] == "PE"
fastqc_dict, total_seq ,overrepr_count, adapter_content = (
get_fastqc_results(
parameter,
(x for x in input.raw_fastqc if x[-4:] != ".txt" ),
data_path , "raw", to_base64,
paired_end=paired_end)) #"QCResults/Report/tmp"
summary_dict["Total sequences"] = total_seq
summary_dict["%Overrepr sequences"] = overrepr_count
summary_dict["%Adapter content"] = adapter_content
summary_dict["raw_fastqc_results"] = fastqc_dict
if not params.notrimming:
summary_dict.update(get_trimmomatic_result(
list(input.trimming),
list(input.trimming_params)))
print(input.trimming)
fastqc_dict, total_seq, overrepr_count, adapter_content = (
get_fastqc_results(parameter, input.trimming_fastqc, data_path,"trimmed", to_base64))
if fastqc_dict !=[]:
summary_dict["trimmed_fastqc_results"] = fastqc_dict
summary_dict["%Overrepr sequences (trimmed)"] = overrepr_count
summary_dict["%Adapter content (trimmed)"] = adapter_content
# sort dict order
new_order = ["Name", "Files", "Date", "Total sequences",
"#Remaining Reads","%Remaining Reads",
"%Adapter content","%Adapter content (trimmed)",
"%Overrepr sequences",
"%Overrepr sequences (trimmed)",
"raw_fastqc_results","trimmed_fastqc_results"]
new_order.extend(list(
set(summary_dict.keys()) - set(new_order)))
summary_dict = OrderedDict(
(key, summary_dict[key]) for key in new_order)
if not params.nomapping:
summary_dict.update(get_bowtie2_result(str(input.mapping)))
summary_dict["Reference"] = config["reference"]
if not params.nokraken:
kraken_results = get_kraken_result(
str(input.kraken), str(output.kraken_plot))
if kraken_results:
summary_dict.update(kraken_results)
kraken_log = ""
with open(str(input.kraken_log),"r") as kraken_reader:
for line in kraken_reader.readlines():
if "..." not in line:
kraken_log +=line
summary_dict["kraken_log"] = kraken_log
json.dump(summary_dict, open(str(output.json), "w"))
fastqc_dict, total_seq ,overrepr_perc, adapter_content = (
get_fastqc_results(parameter,
(x for x in input.raw_fastqc if x[-4:] != ".txt" ),
data_path , "raw", to_base64))
res = dict()
res["Sample"] = dict()
res["Sample"]["Name"] = wildcards.sample
res["Sample"]["TS"] = total_seq
res["Sample"]["PAC"] = adapter_content
res["Sample"]["PORS"] = overrepr_perc
res["Sample"]["POST"] = "N/A"
res["Sample"]["PACT"] = "N/A"
res["Sample"]["NRR"] = "N/A"
res["Sample"]["PRR"] = "N/A"
res["Sample"]["NAR"] = "N/A"
res["Sample"]["PAR"] = "N/A"
res["Sample"]["NC"] = "N/A"
res["Sample"]["PC"] = "N/A"
if not config["notrimming"]:
fastqc_dict, total_seq, overrepr_perc, adapter_content = (
get_fastqc_results(parameter, input.trimming_fastqc, data_path,"trimmed", to_base64))
trimmomatic_results = get_trimmomatic_result(list(input.trimming), list(input.trimming_params))
res["Sample"]["POST"] = overrepr_perc
res["Sample"]["PACT"] = adapter_content
res["Sample"]["NRR"] = trimmomatic_results["#Remaining Reads"]
res["Sample"]["PRR"] = trimmomatic_results["%Remaining Reads"]
if not config["nomapping"]:
mapping_result = get_bowtie2_result(str(input.mapping))
res["Sample"]["NAR"] = mapping_result["#AlignedReads"]
res["Sample"]["PAR"] = mapping_result["%AlignedReads"]
if not config["nokraken"]:
kraken_results = get_kraken_result(str(input.kraken), str(output.kraken_plot))
if kraken_results is None:
res["Sample"]["NC"] = "N/A"
res["Sample"]["PC"] = "N/A"
json.dump(res, open(str(output.newjson), "w"))
def get_report_info(wildcards):
steps = {
"sample_json" : "{path}/{sample}.json".format(
sample = wildcards.sample, path = data_path),
"raw_fastqc" : get_all_fastqc(wildcards)}
if not config["notrimming"]:
try:
trimmed_path = fastqc_path + "/trimmed"
# ((fastqc_path + "/trimmed") # not needed and
# missing parentheses
# if not True # config["trimBetter"]
# else (trimbetter_path + "/FastQC"))
except KeyError:
trimmed_path = fastqc_path + "/trimmed"
steps["trimming_fastqc"]= get_trimmomatic_fastqc(
wildcards, "_fastqc", path=trimmed_path)
#if not config["nokraken"]:
# steps["kraken"] = classification_path + "/{sample}.translated".format(sample = wildcards.sample)
return steps
rule write_sample_report:
input:
unpack(get_report_info) #sample_json = data_path + "/{sample}.json"
output:
temp(main_path + "/{sample}.aux"),
pdf=main_path + "/{sample}.pdf",
tex=temp(main_path + "/{sample}.tex")
log:
log_path + "/texreport.log"
message:
"Write {wildcards.sample}.pdf"
run:
env = Environment(trim_blocks = True, variable_start_string='{{~',
variable_end_string = "~}}")
env.loader = FileSystemLoader(geninfo_config["QCumber_path"])
template = env.get_template("report.tex")
sample = json.load(open(str(input.sample_json),"r"),
object_pairs_hook=OrderedDict )
if "Reference" in sample.keys():
sample["Reference"] = basename(sample["Reference"] )
sample["path"] = dirname(sample["Files"][0])
sample["Files"] = [basename(x) for x in sample["Files"]]
# import pprint; pprint.pprint(sample)
pdf_latex = template.render(
#general_information=json.load(open(str(input.general.json),"r")),
general_information=geninfo_config,
sample=sample)
latex = open(str(output.tex), "w")
latex.write(pdf_latex)
latex.close()
#shell( "pdflatex -interaction=nonstopmode -output-directory=$(dirname {output.pdf}) {output.tex} -shell-escape 1>&2> {log}" )
with open(log[0], 'a') as f_log:
with subprocess.Popen(
["pdflatex", "-interaction=nonstopmode",
"-output-directory=%s" % dirname(output.pdf), output.tex],
stdout=f_log, stderr=sys.stdout) as pdflatex_proc:
pdflatex_proc.wait()
# don't know how to get rid of this log
# shell("mv {log} {mv_log}", log = str(output.pdf).replace(".pdf",
# ".log"),
# mv_log = str(log).replace("texreport.",
# "." + wildcards.sample + "."))
rule write_kraken_report:
input:
kraken = lambda wildcards: get_input(
wildcards, if_not = "nokraken", ext = ".csv",
samplelist= unique_samples.keys() , path = classification_path)
output:
kraken_html = main_path + "/kraken.html"
shell:
"ktImportText {input.kraken} -o {output.kraken_html}"
def get_files_of_all_steps():
steps = {"raw_fastqc": expand(
"{path}/raw/{sample}_fastqc",
sample=sample_dict.keys(), path=fastqc_path)}
if not config["notrimming"]:
steps["trimming"] = trimming_input
if not config["nomapping"]:
steps["mapping"] = lambda wildcards: get_input(
wildcards, if_not="nomapping", ext=".sam",
samplelist=unique_samples.keys(),path=mapping_path)
if not config["nokraken"]:
steps["kraken_png"] = classification_path + "/kraken_batch.png",
steps["sample_json"] = expand(
"{path}/{sample}.json", sample=unique_samples.keys(), path=data_path)
return steps
def get_batch_output():
''' Creation of dictionary that stores the
output of steps required to finish one batch
summary_json: Path/2/_data/summary.json
fastqc_plots: GC_content | length distribution
| per sequence quality scores
'''
steps = {}
steps["summary_json"] = data_path + "/summary.json"
steps["summary_json_new"] = data_path + "/summary_new.json"
steps["fastqc_plots"] = list(
expand("{path}/{img}.png", path="QCResults/_data",
img=["Per_sequence_GC_content", "Per_sequence_quality_scores",
"Sequence_Length_Distribution"])
)
steps["n_read_plot"] = "QCResults/_data/reads_after_trimming.png"
if not config["nomapping"]:
steps["mapping_plot"] = "QCResults/_data/mapping.png"
steps["insertsize_plot"] = "QCResults/_data/insertsize.png"
return steps
def get_batch_report_input():
steps={}
steps["sample_json"] = expand("{path}/{sample}.json", sample=unique_samples.keys(), path=data_path)
steps["sample_json_new"] = expand("{path}/{sample}_new.json", sample=unique_samples.keys(), path=data_path)
steps["samplecsv"] = expand(data_path + "/{sample}_{plot_type}.csv", sample=unique_samples.keys(), plot_type=get_plot_type_names())
if not config["nokraken"]:
steps["kraken_batch"] = classification_path + "/kraken_batch.png"
if config["reference"] or config["index"]:
steps["insertsize"] = expand("{mapping_path}/{sample}_insertsizes.txt", sample=unique_samples.keys(), mapping_path=mapping_path)
return steps
# Write html report for all samples
rule write_batch_report:
input:
#sample_json = expand("{path}/{sample}.json", sample=unique_samples.keys(), path=data_path)
**get_batch_report_input()
output:
**get_batch_output()
params:
nokraken = config["nokraken"]
run:
combine_csv(input.samplecsv, data_path)
fastqc_csv = expand("{path}/{img}.csv", path="QCResults/_data",
img = ["Per_sequence_GC_content",
"Per_sequence_quality_scores",
"Sequence_Length_Distribution"])
write_summary_json(output, config, input, fastqc_csv, geninfo_config, boxplots, shell, get_name, to_base64)
write_summary_json_new(output, input.sample_json_new)
QCumber-2.3.0/__init__.py 0000775 0000000 0000000 00000000000 13301047114 0015146 0 ustar 00root root 0000000 0000000 QCumber-2.3.0/batch_report.html 0000775 0000000 0000000 00002751012 13301047114 0016421 0 ustar 00root root 0000000 0000000
QCumber Batch Report