uniqtag/0000755000176200001440000000000012520012447011717 5ustar liggesusersuniqtag/tests/0000755000176200001440000000000012516500365013067 5ustar liggesusersuniqtag/tests/testthat.R0000644000176200001440000000007212516500365015051 0ustar liggesuserslibrary(testthat) library(uniqtag) test_check("uniqtag") uniqtag/tests/testthat/0000755000176200001440000000000012520012447014721 5ustar liggesusersuniqtag/tests/testthat/test-kmers-of.R0000644000176200001440000000034712516500365017556 0ustar liggesuserstest_that("kmers_of", expect_equal( kmers_of("hello", 3), c("hel", "ell", "llo"))) test_that("vkmers_of", expect_equal( vkmers_of(c("hello", "world"), 3), list(hello = c("hel", "ell", "llo"), world = c("wor", "orl", "rld")))) uniqtag/tests/testthat/test-make-unique.R0000644000176200001440000000110112516500365020241 0ustar liggesusersabc <- c("a", "b", "c") abcb <- c("a", "b", "c", "b") test_that("make_unique", expect_equal( make_unique(abcb), c("a", "b-1", "c", "b-2"))) test_that("make_unique_duplicates", expect_equal( make_unique_duplicates(abcb), c("a", "b", "c", "b-1"))) test_that("make_unique_all", expect_equal( make_unique_all(abcb), c("a-1", "b-1", "c-1", "b-2"))) test_that("make_unique_all_or_none 1", expect_equal( make_unique_all_or_none(abcb), c("a-1", "b-1", "c-1", "b-2"))) test_that("make_unique_all_or_none 2", expect_equal( make_unique_all_or_none(abc), c("a", "b", "c"))) uniqtag/tests/testthat/test-uniqtag.R0000644000176200001440000000255712516500365017510 0ustar liggesusersSys.setlocale("LC_COLLATE", "C") test_that("uniqtag aaaaaab k=3", expect_equal( unname(uniqtag(c("aaaaaab", "aaab"), k = 3)), c("aaa-1", "aaa-2"))) test_that("uniqtag aaaaaab k=4", expect_equal( unname(uniqtag(c("aaaaaab", "aaab"), k = 4)), c("aaaa", "aaab"))) states <- sub(" ", "", state.name) states3 <- setNames(c( "aba-1", "las-1", "Ari-1", "Ark-1", "Cal-1", "Col-1", "Con-1", "Del-1", "Flo-1", "Geo-1", "Haw-1", "Ida-1", "Ill-1", "Ind-1", "Iow-1", "Kan-1", "Ken-1", "Lou-1", "Mai-1", 
"Mar-1", "Mas-1", "Mic-1", "Min-1", "ipp-1", "our-1", "Mon-1", "Neb-1", "Nev-1", "Ham-1", "Jer-1", "Mex-1", "Yor-1", "Car-1", "Dak-1", "Ohi-1", "Okl-1", "Ore-1", "Pen-1", "Isl-1", "Car-2", "Dak-2", "Ten-1", "Tex-1", "Uta-1", "Ver-1", "Vir-1", "Was-1", "Wes-1", "Wis-1", "Wyo-1"), states) test_that("uniqtag states k=3", expect_equal(uniqtag(states, k = 3), states3)) states4 <- setNames(c( "Alab", "Alas", "Ariz", "Arka", "Cali", "Colo", "Conn", "Dela", "Flor", "Geor", "Hawa", "Idah", "Illi", "Indi", "Iowa", "Kans", "Kent", "Loui", "Main", "Mary", "Mass", "Mich", "Minn", "ippi", "isso", "Mont", "Nebr", "Neva", "Hamp", "Jers", "Mexi", "NewY", "rthC", "rthD", "Ohio", "Okla", "Oreg", "Penn", "Isla", "uthC", "uthD", "Tenn", "Texa", "Utah", "Verm", "Virg", "Wash", "West", "Wisc", "Wyom"), states) test_that("uniqtag states k=4", expect_equal(uniqtag(states, k = 4), states4)) uniqtag/NAMESPACE0000644000176200001440000000034512516503634013150 0ustar liggesusers# Generated by roxygen2 (4.1.1): do not edit by hand export(cumcount) export(kmers_of) export(make_unique) export(make_unique_all) export(make_unique_all_or_none) export(make_unique_duplicates) export(uniqtag) export(vkmers_of) uniqtag/R/0000755000176200001440000000000012516500365012126 5ustar liggesusersuniqtag/R/uniqtag.R0000644000176200001440000001203612516500365013723 0ustar liggesusers#' Abbreviate strings to short, unique identifiers. #' @docType package #' @name uniqtag-package #' @author Shaun Jackman \email{sjackman@@gmail.com} NULL #' Return the k-mers of a string. #' #' Return the k-mers (substrings of size \code{k}) of the string \code{x}, or #' return the string \code{x} itself if it is shorter than k. #' @describeIn kmers_of Return the k-mers of the string \code{x}. 
#' @param k the size of the substrings, an integer
#' @param x a character string
#' @return kmers_of: a character vector of the k-mers of \code{x}
#' @export
kmers_of <- function(x, k) {
  n <- nchar(x)
  # A string shorter than k has no k-mers; it stands in for itself.
  if (n < k) {
    return(x)
  }
  # One k-mer starts at each of the first n - k + 1 positions.
  starts <- seq_len(n - k + 1L)
  substring(x, starts, starts + k - 1L)
}

#' @describeIn kmers_of Return the k-mers of the strings \code{xs}.
#' @param xs a character vector
#' @return vkmers_of: a list of character vectors of the k-mers of \code{xs}
#' @export
vkmers_of <- function(xs, k) {
  # mapply (which Vectorize delegates to) signals an error when a zero-length
  # xs is combined with a non-empty k, so answer the empty case directly.
  if (length(xs) == 0L) {
    return(list())
  }
  Vectorize(kmers_of, SIMPLIFY = FALSE)(xs, k)
}

#' Cumulative count of strings.
#'
#' Return an integer vector counting the number of occurrences of each string up to that position in the vector.
#' @param xs a character vector
#' @return an integer vector of the cumulative string counts
#' @examples
#' cumcount(abbreviate(state.name, 3, strict = TRUE))
#' @export
cumcount <- function(xs) {
  # Group positions by the index of the first occurrence of their value.
  # Unlike using the strings as variable names in an environment, this also
  # works for the empty string.
  first_seen <- match(xs, xs)
  counts <- integer(length(xs))
  for (positions in split(seq_along(xs), first_seen)) {
    # Positions within a group are in increasing order, so numbering them
    # 1, 2, ... gives the running count of that value.
    counts[positions] <- seq_along(positions)
  }
  setNames(counts, xs)
}

#' Make character strings unique.
#'
#' Append sequence numbers to duplicate elements to make all elements of a character vector unique.
#' @param xs a character vector
#' @param sep a character string used to separate a duplicate string from its sequence number
#' @describeIn make_unique Append a sequence number to duplicated elements, including the first occurrence.
#' @seealso make.unique
#' @examples
#' abcb <- c("a", "b", "c", "b")
#' make_unique(abcb)
#' make_unique_duplicates(abcb)
#' make_unique_all(abcb)
#' make_unique_all_or_none(abcb)
#' make_unique_all_or_none(c("a", "b", "c"))
#' x <- make_unique(abbreviate(state.name, 3, strict = TRUE))
#' x[grep("-", x)]
#' @export
make_unique <- function(xs, sep = "-") {
  # Select every element whose value occurs more than once, including the
  # first occurrence (duplicated() alone would skip it).
  is_repeated <- xs %in% xs[duplicated(xs)]
  xs[is_repeated] <- make_unique_all(xs[is_repeated], sep)
  xs
}

#' @describeIn make_unique Append a sequence number to duplicated elements, except the first occurrence.
#'
#' This function behaves similarly to make.unique
#' @export
make_unique_duplicates <- function(xs, sep = "-") {
  # duplicated() is FALSE for the first occurrence of each value, so only the
  # later occurrences are renumbered.
  is_repeat <- duplicated(xs)
  xs[is_repeat] <- make_unique_all(xs[is_repeat], sep)
  xs
}

#' @describeIn make_unique Append a sequence number to every element.
#' @export
make_unique_all <- function(xs, sep = "-") {
  # Assigning through xs[] replaces the values while keeping the attributes
  # (such as names) of xs.
  xs[] <- paste(xs, cumcount(xs), sep = sep)
  xs
}

#' @describeIn make_unique Append a sequence number to every element or no elements.
#'
#' Return \code{xs} unchanged if the elements of the character vector \code{xs} are already unique.
#' Otherwise append a sequence number to every element.
#' @export
make_unique_all_or_none <- function(xs, sep = "-") {
  # anyDuplicated() returns the index of the first duplicate, or 0 if none.
  if (anyDuplicated(xs) > 0L) {
    make_unique_all(xs, sep)
  } else {
    xs
  }
}

#' Abbreviate strings to short, unique identifiers.
#'
#' Abbreviate strings to unique substrings of \code{k} characters.
#'
#' For each string in a set of strings, determine a unique tag that is a substring of fixed size \code{k} unique to that string, if it has one. If no such unique substring exists, the least frequent substring is used. If multiple unique substrings exist, the lexicographically smallest substring is used. This lexicographically smallest substring of size \code{k} is called the UniqTag of that string.
#'
#' The lexicographically smallest substring depends on the locale's sort order.
#' You may wish to first call \code{Sys.setlocale("LC_COLLATE", "C")} #' #' @examples #' Sys.setlocale("LC_COLLATE", "C") #' states <- sub(" ", "", state.name) #' uniqtags <- uniqtag(states) #' uniqtags4 <- uniqtag(states, k = 4) #' uniqtags3 <- uniqtag(states, k = 3) #' uniqtags3x <- uniqtag(states, k = 3, uniq = make_unique) #' table(nchar(states)) #' table(nchar(uniqtags)) #' table(nchar(uniqtags4)) #' table(nchar(uniqtags3)) #' table(nchar(uniqtags3x)) #' uniqtags3[grep("-", uniqtags3x)] #' @param xs a character vector #' @param k the size of the identifier, an integer #' @param uniq a function to make the abbreviations unique, such as make_unique, make_unique_duplicates, make_unique_all_or_none, make_unique_all, make.unique, or to disable this function, NULL (identity also works, but only together with \code{sep = NA}, because \code{uniq} is otherwise called as \code{uniq(tags, sep)}) #' @param sep a character string used to separate a duplicate string from its sequence number #' @return a character vector of the UniqTags of the strings \code{xs} #' @seealso abbreviate, locales, make.unique #' @export uniqtag <- function(xs, k = 9, uniq = make_unique_all_or_none, sep = '-') { if (is.null(uniq)) { uniq <- identity sep <- NA } counts <- table(unlist(lapply(vkmers_of(xs, k), unique))) counts_kmers <- setNames( paste0(format(counts, justify = "right"), names(counts)), names(counts)) tags <- vapply(xs, function(x) names(counts_kmers)[match(min(counts_kmers[kmers_of(x, k)]), counts_kmers)], character(1)) if (is.na(sep)) uniq(tags) else uniq(tags, sep) } uniqtag/README.md0000644000176200001440000000405012516500365013203 0ustar liggesusersUniqTag ======= Abbreviate strings to short unique identifiers For each string in a set of strings, determine a unique tag that is a substring of fixed size *k* unique to that string, if it has one. If no such unique substring exists, the least frequent substring is used. If multiple unique substrings exist, the lexicographically smallest substring is used. This lexicographically smallest substring of size *k* is called the uniqtag of that string.
Installation ================================================================================ Command line program ------------------------------------------------------------ ```sh curl -o ~/bin/uniqtag https://raw.githubusercontent.com/sjackman/uniqtag/master/uniqtag chmod +x ~/bin/uniqtag ``` or using [Homebrew][] or [Linuxbrew][] ```sh brew install uniqtag ``` [Homebrew]: http://brew.sh/ [Linuxbrew]: http://brew.sh/linuxbrew/ R package ------------------------------------------------------------ ```r # install.packages("devtools") devtools::install_github("sjackman/uniqtag") ``` Manuscript ========== [Shaun D. Jackman, Joerg Bohlmann, İnanç Birol (2014)][uniqtag-paper] UniqTag: Content-derived unique and stable identifiers for gene annotation. *bioRxiv*, [doi:10.1101/007583](http://dx.doi.org/10.1101/007583). [uniqtag-paper]: https://github.com/sjackman/uniqtag-paper Summary ======= When working on an ongoing genome sequencing and assembly project, it is rather inconvenient when gene identifiers change from one build of the assembly to the next. The gene labelling system described here, UniqTag, addresses this common challenge. UniqTag assigns a unique identifier to each gene that is a representative *k*-mer, a string of length *k*, selected from the sequence of that gene. Unlike serial numbers, these identifiers are stable between different assemblies and annotations of the same data without requiring that previous annotations be lifted over by sequence alignment. We assign UniqTag identifiers to nine builds of the Ensembl human genome spanning seven years to demonstrate this stability. 
uniqtag/MD50000644000176200001440000000133112520012447012225 0ustar liggesusers0cba23f3866b777f264d1d4b34f2d56c *DESCRIPTION fc63f2a88352f58f1f8e0524ca2dc14a *LICENSE 3c29aa1d148e7c18a91b2f4433df1278 *NAMESPACE f7279a6177c092e5e120270e1ad88bbc *R/uniqtag.R 5fc0318be7bcb7db242ce3e876ce3eb5 *README.md 1e2484c1e6cb1fcff799ce66b7c3ac77 *man/cumcount.Rd 8c177f03085b0e93392b036eb5873365 *man/kmers_of.Rd a5a8ad86c37ea4d5ce05575f607fe063 *man/make_unique.Rd 079eaa1788d8b593e5b69f91a00dff13 *man/uniqtag-package.Rd cc52b0113df5095470bb45ae08ba161e *man/uniqtag.Rd 4d957ae64a6c64be45db5dd5a38f1592 *tests/testthat.R 82c152aad01f744b1a862941afff5283 *tests/testthat/test-kmers-of.R a2bc0c4bbf15da4b795c06e8bbee30b8 *tests/testthat/test-make-unique.R 1867f837288914dd145a6c9377933b86 *tests/testthat/test-uniqtag.R uniqtag/DESCRIPTION0000644000176200001440000000161512520012447013430 0ustar liggesusersPackage: uniqtag Type: Package Version: 1.0 Title: Abbreviate Strings to Short, Unique Identifiers Description: For each string in a set of strings, determine a unique tag that is a substring of fixed size k unique to that string, if it has one. If no such unique substring exists, the least frequent substring is used. If multiple unique substrings exist, the lexicographically smallest substring is used. This lexicographically smallest substring of size k is called the "UniqTag" of that string. 
Authors@R: person("Shaun", "Jackman", , "sjackman@gmail.com", c("cre")) URL: https://github.com/sjackman/uniqtag BugReports: https://github.com/sjackman/uniqtag/issues Suggests: testthat License: MIT + file LICENSE Packaged: 2015-04-28 19:20:22 UTC; sjackman Author: Shaun Jackman [cre] Maintainer: Shaun Jackman NeedsCompilation: no Repository: CRAN Date/Publication: 2015-04-29 01:17:59 uniqtag/man/0000755000176200001440000000000012516500714012476 5ustar liggesusersuniqtag/man/kmers_of.Rd0000644000176200001440000000144412516500714014575 0ustar liggesusers% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/uniqtag.R \name{kmers_of} \alias{kmers_of} \alias{vkmers_of} \title{Return the k-mers of a string.} \usage{ kmers_of(x, k) vkmers_of(xs, k) } \arguments{ \item{x}{a character string} \item{k}{the size of the substrings, an integer} \item{xs}{a character vector} } \value{ kmers_of: a character vector of the k-mers of \code{x} vkmers_of: a list of character vectors of the k-mers of \code{xs} } \description{ Return the k-mers (substrings of size \code{k}) of the string \code{x}, or return the string \code{x} itself if it is shorter than k. } \section{Functions}{ \itemize{ \item \code{kmers_of}: Return the k-mers of the string \code{x}. \item \code{vkmers_of}: Return the k-mers of the strings \code{xs}. }} uniqtag/man/cumcount.Rd0000644000176200001440000000074412516500714014627 0ustar liggesusers% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/uniqtag.R \name{cumcount} \alias{cumcount} \title{Cumulative count of strings.} \usage{ cumcount(xs) } \arguments{ \item{xs}{a character vector} } \value{ an integer vector of the cumulative string counts } \description{ Return an integer vector counting the number of occurrences of each string up to that position in the vector. 
} \examples{ cumcount(abbreviate(state.name, 3, strict = TRUE)) } uniqtag/man/uniqtag-package.Rd0000644000176200001440000000052012516500714016023 0ustar liggesusers% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/uniqtag.R \docType{package} \name{uniqtag-package} \alias{uniqtag-package} \title{Abbreviate strings to short, unique identifiers.} \description{ Abbreviate strings to short, unique identifiers. } \author{ Shaun Jackman \email{sjackman@gmail.com} } uniqtag/man/uniqtag.Rd0000644000176200001440000000337212516500714014442 0ustar liggesusers% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/uniqtag.R \name{uniqtag} \alias{uniqtag} \title{Abbreviate strings to short, unique identifiers.} \usage{ uniqtag(xs, k = 9, uniq = make_unique_all_or_none, sep = "-") } \arguments{ \item{xs}{a character vector} \item{k}{the size of the identifier, an integer} \item{uniq}{a function to make the abbreviations unique, such as make_unique, make_unique_duplicates, make_unique_all_or_none, make_unique_all, make.unique, or to disable this function, identity or NULL} \item{sep}{a character string used to separate a duplicate string from its sequence number} } \value{ a character vector of the UniqTags of the strings \code{x} } \description{ Abbreviate strings to unique substrings of \code{k} characters. } \details{ For each string in a set of strings, determine a unique tag that is a substring of fixed size \code{k} unique to that string, if it has one. If no such unique substring exists, the least frequent substring is used. If multiple unique substrings exist, the lexicographically smallest substring is used. This lexicographically smallest substring of size \code{k} is called the UniqTag of that string. The lexicographically smallest substring depend on the locale's sort order. 
You may wish to first call \code{Sys.setlocale("LC_COLLATE", "C")} } \examples{ Sys.setlocale("LC_COLLATE", "C") states <- sub(" ", "", state.name) uniqtags <- uniqtag(states) uniqtags4 <- uniqtag(states, k = 4) uniqtags3 <- uniqtag(states, k = 3) uniqtags3x <- uniqtag(states, k = 3, uniq = make_unique) table(nchar(states)) table(nchar(uniqtags)) table(nchar(uniqtags4)) table(nchar(uniqtags3)) table(nchar(uniqtags3x)) uniqtags3[grep("-", uniqtags3x)] } \seealso{ abbreviate, locales, make.unique } uniqtag/man/make_unique.Rd0000644000176200001440000000304112516500714015266 0ustar liggesusers% Generated by roxygen2 (4.1.1): do not edit by hand % Please edit documentation in R/uniqtag.R \name{make_unique} \alias{make_unique} \alias{make_unique_all} \alias{make_unique_all_or_none} \alias{make_unique_duplicates} \title{Make character strings unique.} \usage{ make_unique(xs, sep = "-") make_unique_duplicates(xs, sep = "-") make_unique_all(xs, sep = "-") make_unique_all_or_none(xs, sep = "-") } \arguments{ \item{xs}{a character vector} \item{sep}{a character string used to separate a duplicate string from its sequence number} } \description{ Apppend sequence numbers to duplicate elements to make all elements of a character vector unique. } \section{Functions}{ \itemize{ \item \code{make_unique}: Append a sequence number to duplicated elements, including the first occurence. \item \code{make_unique_duplicates}: Append a sequence number to duplicated elements, except the first occurence. This function behaves similarly to make.unique \item \code{make_unique_all}: Append a sequence number to every element. \item \code{make_unique_all_or_none}: Append a sequence number to every element or no elements. Return \code{xs} unchanged if the elements of the character vector \code{xs} are already unique. Otherwise append a sequence number to every element. 
}} \examples{ abcb <- c("a", "b", "c", "b") make_unique(abcb) make_unique_duplicates(abcb) make_unique_all(abcb) make_unique_all_or_none(abcb) make_unique_all_or_none(c("a", "b", "c")) x <- make_unique(abbreviate(state.name, 3, strict = TRUE)) x[grep("-", x)] } \seealso{ make.unique } uniqtag/LICENSE0000644000176200001440000000005312516502452012727 0ustar liggesusersYEAR: 2015 COPYRIGHT HOLDER: Shaun Jackman