==> wasabi-1.0.1/.Rbuildignore <==
^.*\.Rproj$
^\.Rproj\.user$

==> wasabi-1.0.1/.gitignore <==
.Rproj.user
.Rhistory
.RData

# History files
.Rhistory
.Rapp.history

# Example code in package build process
*-Ex.R

# RStudio files
.Rproj.user/

# produced vignettes
vignettes/*.html
vignettes/*.pdf

==> wasabi-1.0.1/DESCRIPTION <==
Package: wasabi
Type: Package
Title: Use Sailfish and Salmon with Sleuth, Easily
Version: 1.0.1
Date: 2019-06-05
Authors@R: c(person("Richard", "Smith-Unna", email = "rds45@cam.ac.uk", role = c("aut")),
    person("Rob", "Patro", email = "rob.patro@cs.stonybrook.edu", role = c("aut", "cre")))
Description: This package converts the output of the Sailfish and Salmon
    RNA-seq quantification tools so that it can be used with the sleuth
    differential expression analysis package.
License: BSD_3_clause + file LICENSE
LazyData: TRUE
Depends: methods
Imports: data.table, rjson, rhdf5

==> wasabi-1.0.1/LICENSE <==
YEAR: 2015
COPYRIGHT HOLDER: Richard Smith-Unna, Rob Patro
ORGANIZATION: None

==> wasabi-1.0.1/NAMESPACE <==
exportPattern("^[[:alpha:]]+")
import(data.table)
import(rhdf5)
import(rjson)

==> wasabi-1.0.1/R/wasabi.R <==
#' Convert sailfish/salmon results for one or more samples to kallisto HDF5
#'
#' @param fish_dirs a character vector where each element is the path to a
#'   sailfish/salmon output directory
#' @param force if TRUE, re-create the abundance.h5 file even if it exists
#' @param fallback_mu fragment length mean to use when the results do not
#'   record a fragment length distribution (old-format results only)
#' @param fallback_sd fragment length standard deviation to use when the
#'   results do not record a fragment length distribution (old-format
#'   results only)
#' @param fallback_num_reads number of input reads to record when the results
#'   do not reliably record it (e.g. salmon in alignment mode); -1 means use
#'   the recorded value. Each fallback argument may be a single value, applied
#'   to all samples, or a vector of the same length as fish_dirs.
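#'
#' @examples
#' # A minimal usage sketch; the directories below are hypothetical and are
#' # assumed to contain standard sailfish/salmon output (quant.sf,
#' # cmd_info.json, and the auxiliary directory).
#' \dontrun{
#' sfdirs <- file.path("data", c("samp1", "samp2"))
#' prepare_fish_for_sleuth(sfdirs)
#' }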
#' @export
prepare_fish_for_sleuth <- function(fish_dirs, force=FALSE, fallback_mu=200,
                                    fallback_sd=80, fallback_num_reads=-1) {
  len_mu = length(fallback_mu)
  len_sd = length(fallback_sd)
  len_nr = length(fallback_num_reads)
  len_dir = length(fish_dirs)

  # from: http://stackoverflow.com/questions/4752275/test-for-equality-among-all-elements-of-a-single-vector
  compare <- function(v) all(sapply(as.list(v[-1]), FUN=function(z) {identical(z, v[1])}))
  same_len = compare(c(1, len_mu, len_sd, len_nr))

  # We'll assume, for the time being, that all paths are from
  # the same version of the software
  testdir <- fish_dirs[1]

  # The default aux directory
  aux_dir = "aux"
  if (file.exists(file.path(testdir, "cmd_info.json"))) {
    cmd_info <- rjson::fromJSON(file=file.path(testdir, "cmd_info.json"))
    # If the user provided a different auxiliary directory, use that instead
    if (is.element("auxDir", names(cmd_info))) {
      aux_dir = cmd_info$auxDir
    }
  }

  # The path to the aux directory, if it exists
  auxPath <- file.path(testdir, aux_dir)

  ## If we're dealing with the new format files
  if (file.exists(file.path(auxPath, "meta_info.json"))) {
    if (!same_len) {
      same_len = compare(c(len_dir, len_nr))
    }
    if (same_len) {
      mapply(fish_to_hdf5, fish_dirs, force=force,
             fallback_num_reads=fallback_num_reads)
    }
  } else {
    ## We're dealing with the old format files
    same_len = compare(c(1, len_mu, len_sd, len_nr))
    if (!same_len) {
      same_len = compare(c(len_dir, len_mu, len_sd, len_nr))
    }
    if (same_len) {
      mapply(fish_to_hdf5_old, fish_dirs, force=force, fallback_mu=fallback_mu,
             fallback_sd=fallback_sd, fallback_num_reads=fallback_num_reads)
    }
  }

  if (!same_len) {
    msg <- paste("ERROR: If you are passing in fallback arguments (mu, sd, and num_reads) there must",
                 "be either a single value for each, applied to all results, or a",
                 "vector of the same length as fish_dirs")
    message(msg)
    FALSE
  } else {
    TRUE
  }
}
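# A sketch of the per-sample fallback behavior handled above: each fallback
# argument may be a scalar, applied to every sample, or a vector with one
# entry per sample. The directories and read counts below are made up for
# illustration.
#
#   prepare_fish_for_sleuth(c("data/samp1", "data/samp2", "data/samp3"),
#                           fallback_num_reads = c(3.0e7, 2.5e7, 2.8e7))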
#' Convert sailfish/salmon results in the new format
#' (sailfish >= 0.9.0, salmon >= 0.6.0) for one sample to kallisto HDF5
#'
#' @param fish_dir path to a sailfish/salmon output directory
#' @param force if TRUE, re-create the h5 file even if it exists
#' @param fallback_num_reads number of input reads to record instead of the
#'   value in meta_info.json; -1 means use the recorded value
fish_to_hdf5 <- function(fish_dir, force, fallback_num_reads) {
  h5file <- file.path(fish_dir, 'abundance.h5')
  if (!force && file.exists(h5file)) {
    print(paste("Skipping conversion: abundance.h5 already in", fish_dir))
    return()
  }

  # If we're forcing it, then we have to remove the file now,
  # or h5createFile will complain later
  if (file.exists(h5file)) {
    file.remove(h5file)
  }

  # load quantification data
  quant <- fread(file.path(fish_dir, 'quant.sf'))
  setnames(quant, c('target_id', 'length', 'eff_length', 'tpm', 'est_counts'))
  if (!is.character(quant$target_id)) {
    quant$target_id <- as.character(quant$target_id)
  }

  aux_dir = "aux"
  cmd_info <- rjson::fromJSON(file=file.path(fish_dir, "cmd_info.json"))
  # If the user provided a different auxiliary directory, use that instead
  if (is.element("auxDir", names(cmd_info))) {
    aux_dir = cmd_info$auxDir
  }
  auxPath <- file.path(fish_dir, aux_dir)

  # get all of the meta info
  minfo <- rjson::fromJSON(file=file.path(auxPath, "meta_info.json"))
  sampType <- NULL
  # check if we have explicitly recorded the type of posterior sample
  # (salmon >= 0.7.3)
  if (is.element("samp_type", names(minfo))) {
    sampType <- minfo$samp_type
  }

  # load bootstrap data if it exists
  knownSampleTypes <- c("gibbs", "bootstrap")
  numBoot <- minfo$num_bootstraps
  if (numBoot > 0) {
    bootCon <- gzcon(file(file.path(auxPath, 'bootstrap', 'bootstraps.gz'), "rb"))
    #knownSampleType <- ifelse(is.null(sampType), FALSE, sampType %in% knownSampleTypes)
    #if (knownSampleType) {
    #  boots <- readBin(bootCon, "double", n = num_targets * minfo$num_bootstraps)
    #}
    ##
    # Gibbs samples *used* to be integers, and bootstraps were doubles.
    # Now, however, both types of samples are doubles. The code below
    # tries to load doubles first, but falls back to integers if that fails.
    ##
    if ("num_targets" %in% names(minfo)) {
      num_targets = minfo$num_targets
    } else {
      num_targets = minfo$num_valid_targets
    }
    expected.n <- num_targets * minfo$num_bootstraps
    boots <- tryCatch({
      boots.in <- readBin(bootCon, "double", n = expected.n)
      stopifnot(length(boots.in) == expected.n)
      boots.in
    }, error=function(...) {
      # close and re-open the connection to reset the file
      close(bootCon)
      bootCon <- gzcon(file(file.path(auxPath, 'bootstrap', 'bootstraps.gz'), "rb"))
      readBin(bootCon, "integer", n = expected.n)
    })
    close(bootCon)
    boots <- as.double(boots)
    # rows are transcripts, columns are bootstraps
    dim(boots) <- c(num_targets, minfo$num_bootstraps)
  }

  # load stats
  numProcessed <- minfo$num_processed
  # If we're dealing with salmon, this field should be defined
  if (is.element("mapping_type", names(minfo))) {
    # In alignment mode (since unaligned reads may not be included in the BAM
    # file), we prefer to have the user supply the total number of reads in
    # the input library
    if (minfo$mapping_type == "alignment") {
      if (fallback_num_reads == -1) {
        msg <- paste("Since salmon was run in alignment mode, it is recommended you provide",
                     "the total number of input reads via the fallback_num_reads argument to",
                     "prepare_fish_for_sleuth()")
        message(msg)
      } else {
        msg <- paste("Fish dir =", fish_dir, ":: fallback # reads =", fallback_num_reads)
        message(msg)
        numProcessed <- fallback_num_reads
      }
    } else if (minfo$mapping_type == "mapping" && fallback_num_reads != -1) {
      msg <- paste("It is generally NOT recommended to provide a fallback # of reads",
                   "when salmon was run in mapping-based mode, as it should already",
                   "accurately record this information. Please make sure you really",
                   "mean to do this!")
      message(msg)
      msg <- paste("Fish dir =", fish_dir, ":: fallback # reads =", fallback_num_reads)
      message(msg)
      numProcessed <- fallback_num_reads
    }
  }
  # Otherwise, this is either sailfish or salmon's mapping-based mode, and the
  # numProcessed estimate should be OK.
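  # For reference, the fields consumed from meta_info.json above include
  # num_bootstraps, num_targets (or num_valid_targets in older output),
  # num_processed, samp_type, mapping_type, frag_dist_length, and
  # num_bias_bins. An illustrative example (the values are made up):
  #
  #   {
  #     "samp_type": "bootstrap",
  #     "num_bootstraps": 100,
  #     "num_targets": 180000,
  #     "num_processed": 25000000,
  #     "mapping_type": "mapping",
  #     "frag_dist_length": 1001,
  #     "num_bias_bins": 4096
  #   }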
  # build the hdf5
  rhdf5::h5createFile(h5file)

  # counts are at the root
  rhdf5::h5write(quant$est_counts, h5file, '/est_counts')

  # the aux group has metadata about the run and targets
  rhdf5::h5createGroup(h5file, 'aux')
  rhdf5::h5write(numProcessed, h5file, 'aux/num_processed')
  rhdf5::h5write(numBoot, h5file, 'aux/num_bootstrap')
  rhdf5::h5write(quant$length, h5file, 'aux/lengths')
  rhdf5::h5write(quant$eff_length, h5file, 'aux/eff_lengths')
  rhdf5::h5write(quant$target_id, h5file, 'aux/ids')
  rhdf5::h5write('10', h5file, 'aux/index_version')
  rhdf5::h5write('sailfish', h5file, 'aux/kallisto_version')
  rhdf5::h5write(timestamp(prefix="", suffix=""), h5file, "aux/start_time")

  # the bootstrap group has (.. wait for it ..) bootstrap data
  if (numBoot > 0) {
    rhdf5::h5createGroup(h5file, 'bootstrap')
    sapply(0:(numBoot-1), function(i) {
      bootid <- paste('bs', i, sep='')
      rhdf5::h5write(unlist(boots[,i+1]), h5file,
                     paste('bootstrap', bootid, sep='/'))
    })
  }

  fldCon <- gzcon(file(file.path(auxPath, 'fld.gz'), "rb"))
  fld <- readBin(fldCon, "int", n=minfo$frag_dist_length)
  close(fldCon)
  rhdf5::h5write(fld, h5file, 'aux/fld')

  bObsCon <- gzcon(file(file.path(auxPath, 'observed_bias.gz'), "rb"))
  bObs <- readBin(bObsCon, "int", n=minfo$num_bias_bins)
  close(bObsCon)
  rhdf5::h5write(bObs, h5file, 'aux/bias_observed')

  bExpCon <- gzcon(file(file.path(auxPath, 'expected_bias.gz'), "rb"))
  bExp <- readBin(bExpCon, "double", n=minfo$num_bias_bins)
  close(bExpCon)
  rhdf5::h5write(bExp, h5file, 'aux/bias_normalized')

  if (utils::packageVersion('rhdf5') < "2.23.0") {
    rhdf5::H5close()
  } else {
    rhdf5::h5closeAll()
  }
  print(paste("Successfully converted sailfish / salmon results in", fish_dir,
              "to kallisto HDF5 format"))
}
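# A quick way to sanity-check a converted file (a sketch; the path below is
# hypothetical). h5ls() and h5read() are standard rhdf5 functions.
#
#   rhdf5::h5ls("data/samp1/abundance.h5")
#   est_counts <- rhdf5::h5read("data/samp1/abundance.h5", "/est_counts")
#   ids <- rhdf5::h5read("data/samp1/abundance.h5", "aux/ids")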
#' Convert sailfish/salmon results in the older format
#' (sailfish <= 0.8.0, salmon <= 0.5.1) for one sample to kallisto HDF5
#'
#' @param fish_dir path to a sailfish output directory
#' @param force if TRUE, re-create the h5 file even if it exists
#' @param fallback_mu fragment length mean used to compute effective lengths
#'   when no stats.tsv file is present
#' @param fallback_sd fragment length standard deviation used to compute
#'   effective lengths when no stats.tsv file is present
#' @param fallback_num_reads number of input reads to record when no stats.tsv
#'   file is present; if negative, the sum of the estimated counts is used
fish_to_hdf5_old <- function(fish_dir, force, fallback_mu, fallback_sd,
                             fallback_num_reads) {
  h5file <- file.path(fish_dir, 'abundance.h5')
  if (!force && file.exists(h5file)) {
    print(paste("Skipping conversion: abundance.h5 already in", fish_dir))
    return()
  }

  # If we're forcing it, then we have to remove the file now,
  # or h5createFile will complain later
  if (file.exists(h5file)) {
    file.remove(h5file)
  }

  # load quantification data
  quant <- fread(file.path(fish_dir, 'quant.sf'))
  setnames(quant, c('target_id', 'length', 'tpm', 'est_counts'))
  # NOTE: this can re-order the rows!
  if (!is.character(quant$target_id)) {
    quant$target_id <- as.character(quant$target_id)
  }
  setkey(quant, 'target_id')

  # load bootstrap data if it exists
  bootspath <- file.path(fish_dir, 'quant_bootstraps.sf')
  numBoot <- 0
  if (file.exists(bootspath)) {
    boots <- fread(bootspath)
    target_ids <- as.character(names(boots))
    boots <- data.table(t(boots))
    setnames(boots, sapply(0:(ncol(boots)-1), function(i) paste('bs', i, sep='')))
    numBoot <- ncol(boots)
    boots[, target_id:=target_ids]
    setkey(boots, 'target_id')
    quant <- merge(quant, boots)
  }

  # load stats
  stats_file <- file.path(fish_dir, 'stats.tsv')
  ##
  # If the stats.tsv file exists, use it to get the
  # number of observed fragments and the effective lengths
  ##
  if (file.exists(stats_file)) {
    stats_tbl <- fread(stats_file)
    stats <- stats_tbl$V2
    names(stats) <- stats_tbl$V1
    stats_tbl <- stats_tbl[-1]
    setnames(stats_tbl, c('target_id', 'eff_length'))
    if (!is.character(stats_tbl$target_id)) {
      stats_tbl$target_id <- as.character(stats_tbl$target_id)
    }
    setkey(stats_tbl, 'target_id')
    quant <- merge(quant, stats_tbl)
    numProcessed <- stats[['numObservedFragments']]
  } else {
    ##
    # Otherwise, use the provided value for the number of observed
    # fragments, and compute the effective lengths given the provided
    # fragment length mean and standard deviation.
    ##
    max_len <- 1000
    print('Found no stats.tsv file when parsing old fish directory')
    print(sprintf('Generating fragment length distribution mu = %f, sd = %f, max len = %d',
                  fallback_mu, fallback_sd, max_len))
    norm_counts <- get_norm_fl_counts(fallback_mu, fallback_sd, max_len)
    correction_factors <- get_eff_length_correction_factors(norm_counts, max_len)
    quant$eff_length <- get_eff_lengths(quant$length, correction_factors, max_len)
    if (fallback_num_reads < 0) {
      numProcessed <- round(sum(quant$est_counts))
    } else {
      msg <- paste("Fish dir =", fish_dir, ":: fallback # reads =", fallback_num_reads)
      message(msg)
      numProcessed <- fallback_num_reads
    }
    print(sprintf('Setting number of processed reads to %d', numProcessed))
  }

  # build the hdf5
  rhdf5::h5createFile(h5file)

  # counts are at the root
  rhdf5::h5write(quant$est_counts, h5file, 'est_counts')

  # the aux group has metadata about the run and targets
  rhdf5::h5createGroup(h5file, 'aux')
  rhdf5::h5write(numProcessed, h5file, 'aux/num_processed')
  rhdf5::h5write(numBoot, h5file, 'aux/num_bootstrap')
  rhdf5::h5write(quant$length, h5file, 'aux/lengths')
  rhdf5::h5write(quant$eff_length, h5file, 'aux/eff_lengths')
  rhdf5::h5write(quant$target_id, h5file, 'aux/ids')
  rhdf5::h5write('10', h5file, 'aux/index_version')
  rhdf5::h5write('sailfish', h5file, 'aux/kallisto_version')
  rhdf5::h5write(timestamp(prefix="", suffix=""), h5file, "aux/start_time")

  # the bootstrap group has (.. wait for it ..) bootstrap data
  if (numBoot > 0) {
    rhdf5::h5createGroup(h5file, 'bootstrap')
    sapply(0:(numBoot-1), function(i) {
      bootid <- paste('bs', i, sep='')
      rhdf5::h5write(unlist(quant[, bootid, with=FALSE]), h5file,
                     paste('bootstrap', bootid, sep='/'))
    })
  }

  if (utils::packageVersion('rhdf5') < "2.23.0") {
    rhdf5::H5close()
  } else {
    rhdf5::h5closeAll()
  }
  print(paste("Successfully converted sailfish / salmon results in", fish_dir,
              "to kallisto HDF5 format"))
}

#' Produce a collection of counts consistent with a truncated normal
#' distribution from 1 to max_len with the given mean and standard deviation
get_norm_fl_counts <- function(mean, std, max_len) {
  dist <- vector(mode="numeric", max_len)
  totCount <- 10000
  # Evaluate the (unnormalized) normal density at the point p
  kernel <- function(p) {
    invStd = 1.0 / std
    x = invStd * (p - mean)
    exp(-0.5 * x * x) * invStd
  }
  totMass = sum(unlist(lapply(1:max_len, kernel)))
  currDensity = 0.0
  if (totMass > 0.0) {
    for (i in 1:max_len) {
      currDensity = kernel(i)
      dist[i] = round(currDensity * totCount / totMass)
    }
  }
  return(dist)
}

#' Given a collection of counts for each fragment length, produce
#' a list of correction factors that should be applied to obtain the
#' effective length of each transcript from its un-normalized length
get_eff_length_correction_factors <- function(counts, max_len) {
  correctionFactors <- vector(mode="numeric", length=max_len)
  vals <- vector(mode="numeric", length=max_len)
  multiplicities <- vector(mode="numeric", length=max_len)
  multiplicities[1] = counts[1]
  for (i in 2:max_len) {
    ci = i
    pi = i-1
    v = counts[ci]
    vals[ci] = (v * ci) + vals[pi]
    multiplicities[ci] = v + multiplicities[pi]
    if (multiplicities[ci] > 0) {
      correctionFactors[ci] = vals[ci] / multiplicities[ci]
    }
  }
  return(correctionFactors)
}
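# How the three helpers compose when no stats.tsv is present (a sketch using
# the package's own fallback defaults, mu = 200, sd = 80, max_len = 1000; the
# transcript lengths below are made up):
#
#   norm_counts <- get_norm_fl_counts(200, 80, 1000)
#   cf <- get_eff_length_correction_factors(norm_counts, 1000)
#   get_eff_lengths(c(150, 500, 2000), cf, 1000)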
#' Given a list of transcript lengths and correction factors, return
#' a list of effective lengths.
get_eff_lengths <- function(lengths, correction_factors, max_len) {
  eff_length <- function(orig_len) {
    correction_factor <- if (orig_len >= max_len) {
      correction_factors[max_len]
    } else {
      correction_factors[orig_len]
    }
    eff_len <- orig_len - correction_factor + 1.0
    if (eff_len < 1.0) {
      eff_len <- orig_len
    }
    return(eff_len)
  }
  unlist(lapply(lengths, eff_length))
}

==> wasabi-1.0.1/README.md <==
**Important note for Windows users**: If you are running wasabi under R or
RStudio on Windows, there is a known issue (a bug, I would say) that prevents
normal operation. Salmon and Sailfish write important extra information, like
bootstrap samples and the fragment length distribution, to a subdirectory of
the quantification directory named `aux`. Windows, unfortunately,
[forbids `aux` as a folder name](https://blog.onetechnical.com/2006/11/16/forbidden-file-and-folder-names-on-windows/).

If you are lucky enough to be reading this *before* running Sailfish/Salmon,
you can pass an alternative folder name to the `--auxDir` command line option
to avoid this issue. Otherwise, we suggest the following work-around. First,
rename the `aux` subdirectory of each Sailfish/Salmon quantification directory
to a folder name that is not forbidden under Windows, e.g. `aux2`. Then, open
the `cmd_info.json` file in the quantification directory and add the following
line:

```
"auxDir" : "aux2",
```

Now things should work normally under Windows. Salmon versions (>= 0.7.0) use
`aux_info` as the default folder to avoid this issue. Future releases of
Sailfish (> 0.10.1) will use a different default name for the auxiliary
directory to avoid this issue as well.
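If you have many quantification directories to patch, the rename and JSON edit
can be scripted. Below is a minimal sketch (the `fix_aux_dir` helper is
illustrative, not part of wasabi); it assumes the layout described above and
uses the `rjson` package, which wasabi already depends on:

```r
fix_aux_dir <- function(quant_dir, new_aux = "aux2") {
  # rename the forbidden `aux` folder
  file.rename(file.path(quant_dir, "aux"), file.path(quant_dir, new_aux))
  # record the new name in cmd_info.json so wasabi can find it
  info_file <- file.path(quant_dir, "cmd_info.json")
  cmd_info <- rjson::fromJSON(file = info_file)
  cmd_info$auxDir <- new_aux
  writeLines(rjson::toJSON(cmd_info), info_file)
}
```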
# What is wasabi?

[![Join the chat at https://gitter.im/COMBINE-lab/wasabi](https://badges.gitter.im/COMBINE-lab/wasabi.svg)](https://gitter.im/COMBINE-lab/wasabi?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)

Wasabi allows you to easily prepare
[Sailfish](https://github.com/kingsfordgroup/sailfish) and
[Salmon](https://github.com/COMBINE-lab/salmon) output for downstream
analysis. Currently, its main purpose is to prepare output for downstream
analysis with [sleuth](http://pachterlab.github.io/sleuth/).

# How to use wasabi

## Installation

First, you need to install the wasabi package. There are two main ways to
accomplish this:

#### Installation with devtools

With `devtools`, it's easy to install `wasabi`:

```r
source("http://bioconductor.org/biocLite.R")
biocLite("devtools")    # only if devtools is not yet installed
biocLite("COMBINE-lab/wasabi")
```

#### Installation with bioconda

Alternatively, you can use the [conda](http://conda.pydata.org/miniconda.html)
package manager, along with the [bioconda](https://bioconda.github.io/)
channel, to install `wasabi`:

```
conda create -n wasabi r-wasabi
```

## Loading and using wasabi

Once wasabi is installed, you can load the library with:

```r
library(wasabi)
```

Now that wasabi is installed, we can use it to convert Sailfish / Salmon
output into a sleuth-compatible format.

Imagine we have some samples (Sailfish quant directories) sitting in a data
directory:

```
data/samp1
data/samp2
data/samp3
data/samp4
```

First, we create a simple vector containing these directories (the `>` below
is an R prompt):

```r
> sfdirs <- file.path("data", c("samp1", "samp2", "samp3", "samp4"))
```

Now, we simply run the `prepare_fish_for_sleuth` function:

```r
> prepare_fish_for_sleuth(sfdirs)
```

The function will write some status messages to the console and, when it's
done, each directory will contain an `abundance.h5` file in a
sleuth-compatible format. From this point forward, you can simply run sleuth
as you normally would.

==> wasabi-1.0.1/wasabi.Rproj <==
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source