stringdist/ 0000755 0001762 0000144 00000000000 14740175532 012454 5 ustar ligges users stringdist/tests/ 0000755 0001762 0000144 00000000000 13471343110 013603 5 ustar ligges users stringdist/tests/tinytest.R 0000644 0001762 0000144 00000000136 13471343110 015611 0 ustar ligges users if ( requireNamespace("tinytest", quietly=TRUE) ){ tinytest::test_package("stringdist") } stringdist/MD5 0000644 0001762 0000144 00000007664 14740175532 013001 0 ustar ligges users aa21b8dccccee678e550d7de80db27a5 *DESCRIPTION cc3642555c30befb6ffe4e1186221167 *NAMESPACE df396d2c10fbf53ab861cd90db76a51c *NEWS 3919d5c7a272594e445aad05da1ea36e *R/afind.R 509a5780c5a8dd49e7a0d3b9dbe12630 *R/amatch.R 8b5dd455c5213cd167cc1cd34fc9b938 *R/doc_api.R 6cb655ba78d4baf1f3772ea1994beddd *R/doc_encoding.R 22271ee4a6af80b007f6628482fea93e *R/doc_metrics.R acf3f8fe998f209d0b01e90343b212d2 *R/doc_parallel.R 34a807d6e8319ed2337a5a008ccbf158 *R/phonetic.R da873c3cac3c228836534b74024f0a39 *R/qgrams.R fc7dea6e4bd38ae6530871054d9ce7a3 *R/seqdist.R c7eedfe26a439757078b477594433189 *R/stringdist.R 7d49b471bd1c44fe671d9e0b882b3c70 *R/stringsim.R 8e8417b8a98fea38db2704162b10a182 *R/utils.R e473e754d2208d96db0a981c5f12b994 *README.md 83c6f778289910867dea1a3e3058124a *build/vignette.rds 7d23a627daf862ea03ed7a03eb321b05 *inst/CITATION 977822cf7222d72a3cde2a6cd22bbb5a *inst/doc/RJournal_6_111-122-2014.Rnw c5536388374d1553c191a4cb9239ad10 *inst/doc/RJournal_6_111-122-2014.pdf 2da3ce2311eb264c1df440f6e639b386 *inst/doc/stringdist_C-Cpp_api.Rnw fd9eb7d9cdc9aa5208f383917d136b3f *inst/doc/stringdist_C-Cpp_api.pdf 25e4b2bb89435d45989313c68613d83a *inst/include/Doxyfile 18f26dcb8ed96f6f3774b0458b860d8a *inst/include/stringdist_api.h 92fab863854f148d021dc528f2f94c7e *inst/tinytest/test_afind.R 6361e4abb98071410d535d62ca32a162 *inst/tinytest/test_amatch.R 55413004afe98a57ce48b3bddb87bafa *inst/tinytest/test_gh_issue_59.R 2f8009ca5a9ed831d08138a43c9b9b8a *inst/tinytest/test_gh_issue_78.R 9f5226472e627d699ee2930ada08734f 
*inst/tinytest/test_gh_issue_88.R 73d347fd9cebda746458aa36a5491e10 *inst/tinytest/test_phonetic.R 92e0a1b89dee56c82fd3e3a5d27f7da5 *inst/tinytest/test_qgrams.R 6e34a35beb0fc716f9fd24640b923b65 *inst/tinytest/test_seq_dist.R 367d6543f89596193d64c5ca342f53fd *inst/tinytest/test_stringdist.R 5bae8490902bcb1cd365c481ca23122b *inst/tinytest/test_stringsim.R 04dc92413254fb174bbf6f6d01fd0003 *man/afind.Rd 7894da670a50d6b7570ff0014f58e454 *man/amatch.Rd 3e45e311af3d9533e9e8171cae31edfb *man/phonetic.Rd da59ff7e1771e0addd30cef6ad095584 *man/printable_ascii.Rd 50d192ee6fc473b07e5286309c950183 *man/qgrams.Rd 83c73d541e4a660986f6e48045de821f *man/seq_amatch.Rd 01281077c6ead9895ab3b3a9496abf12 *man/seq_dist.Rd ab1387ec1e3fa6b433f54c77f300de0c *man/seq_qgrams.Rd 754211d0d0389ecbb7b7796c4ff7cc8d *man/seq_sim.Rd 58293b7f098db8171bf6ec43139d6fef *man/stringdist-api.Rd d951d7f619777d2c9c98452ca31b4704 *man/stringdist-encoding.Rd 05869118f4c855c5c9fa2038c9fe2cae *man/stringdist-metrics.Rd 63f712730a28fdb7f7b29775510c89b3 *man/stringdist-package.Rd 8239284cb5064fb615134d1c5b7e2100 *man/stringdist-parallelization.Rd 46c3d86c6f37bf5268e7fadc928a6175 *man/stringdist.Rd 49f07272a0ab76d888f7e7a9948831b9 *man/stringsim.Rd c4340fd02e530432f1a46071ac9fe05d *src/Makevars c4340fd02e530432f1a46071ac9fe05d *src/Makevars.win f944fea36f8448948f2d340f6cb4bb04 *src/R_register_native.c 73360fdcebfb7a931e1151ec08387402 *src/Rstringdist.c 52a8474ebe8277ff3a9243c5b4f10d3c *src/dictionary.h ccab45efb43f7eb2deee70b80218e9df *src/dist.h 1c7491a78791ded634f49240cfe6f867 *src/dl.c 7ee727fc5df29fdeacca7d5831e23be7 *src/hamming.c 90138167c156487149efd846b4fce6c6 *src/jaro.c 237eb0a9d9f7191576fc31a93ec23ea5 *src/lcs.c b1c0a71907c59ae45b9a52b49f6f38a5 *src/lv.c 2e59616ad0a2e85b731fe459fab31d76 *src/osa.c f285d5ff7cb77bdfa42d00baf08e6005 *src/qgram.c 3295784960d90a9e03c0b8a3f187f52f *src/qtree.h c4fc61cf43521ce9aa57110178c67df4 *src/soundex.c c16ff90cd0a544170e768c950fc55770 *src/stringdist.c 
5c4e0e27c562c81b4467e4ebcac82ee3 *src/stringdist.h 6a4a8820f5910e75b6b209e2e5137d09 *src/utf8ToInt.c 0fb0e362e859260309fb69a4fec6c541 *src/utils.c 887b451f7bb5b531783853bce08ae3c5 *src/utils.h f7908ae0c1da9a3cf98cd65cc9897835 *tests/tinytest.R 977822cf7222d72a3cde2a6cd22bbb5a *vignettes/RJournal_6_111-122-2014.Rnw 03da72eaea04f0800ee6da2a0e227b59 *vignettes/loo2014stringdist.pdf 2da3ce2311eb264c1df440f6e639b386 *vignettes/stringdist_C-Cpp_api.Rnw eb6d14105392a6fc37b5dbd38764e425 *vignettes/stringdist_api.pdf stringdist/R/ 0000755 0001762 0000144 00000000000 14726262513 012655 5 ustar ligges users stringdist/R/amatch.R 0000644 0001762 0000144 00000024555 14726257414 014254 0 ustar ligges users #' Approximate string matching #' #' Approximate string matching equivalents of \code{R}'s native #' \code{\link[base]{match}} and \code{\%in\%}. #' #' \code{ain} is currently defined as #' #' \code{ain(x,table,...) <- function(x,table,...) amatch(x, table, nomatch=0,...) > 0} #' #' #' @section Note on \code{NA} handling: #' \code{R}'s native \code{\link[base]{match}} function matches \code{NA} with #' \code{NA}. This may feel inconsistent with \code{R}'s usual \code{NA} #' handling, since for example \code{NA==NA} yields #' \code{NA} rather than \code{TRUE}. In most cases, one may reason about the #' behaviour under \code{NA} along the lines of ``if one of the arguments is #' \code{NA}, the result shall be \code{NA}'', simply because not all #' information necessary to execute the function is available. One uses special #' functions such as \code{is.na}, \code{is.null} \emph{etc.} to handle special #' values. #' #' The \code{amatch} function mimics the behaviour of \code{\link[base]{match}} #' by default: \code{NA} is matched with \code{NA} and with nothing else. Note #' that this is inconsistent with the behaviour of \code{\link{stringdist}} #' since \code{stringdist} yields \code{NA} when at least one of the arguments #' is \code{NA}. 
The same inconsistency exists between \code{\link[base]{match}} #' and \code{\link[utils]{adist}}. In \code{amatch} this behaviour can be #' controlled by setting \code{matchNA=FALSE}. In that case, if any of the #' arguments in \code{x} is \code{NA}, the \code{nomatch} value is returned, #' regardless of whether \code{NA} is present in \code{table}. In #' \code{\link[base]{match}} the behaviour can be controlled by setting the #' \code{incomparables} option. #' #' #' @param x elements to be approximately matched: will be coerced to #' \code{character} unless it is a list consisting of \code{integer} vectors. #' @param table lookup table for matching. Will be coerced to \code{character} #' unless it is a list consting of \code{integer} vectors. #' @param nomatch The value to be returned when no match is found. This is #' coerced to integer. #' @param matchNA Should \code{NA}'s be matched? Default behaviour mimics the #' behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches #' \code{NA} (see also the note on \code{NA} handling below). #' @param method Matching algorithm to use. See \code{\link{stringdist-metrics}}. #' @param useBytes Perform byte-wise comparison. See \code{\link{stringdist-encoding}}. #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for #' deletion, insertion, substitution and transposition, in that order. When #' \code{method='lv'}, the penalty for transposition is ignored. When #' \code{method='jw'}, the weights associated with characters of \code{a}, #' characters from \code{b} and the transposition weight, in that order. #' Weights must be positive and not exceed 1. \code{weight} is ignored #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'}, #' \code{'Jaccard'}, \code{'lcs'}, or \code{'soundex'}. #' @param maxDist Elements in \code{x} will not be matched with elements of #' \code{table} if their distance is larger than \code{maxDist}. 
Note that the #' maximum distance between strings depends on the method: it should always be #' specified. #' @param nthread Number of threads used by the underlying C-code. A sensible #' default is chosen, see \code{\link{stringdist-parallelization}}. #' #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'}, #' or \code{'cosine'}. #' @param p Winkler's prefix parameter for Jaro-Winkler distance, with #' \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'} #' @param bt Winkler's boost threshold. Winkler's prefix factor is #' only applied when the Jaro distance is larger than \code{bt}. #' Applies only to \code{method='jw'} and \code{p>0}. #' #' @return \code{amatch} returns the position of the closest match of \code{x} #' in \code{table}. When multiple matches with the same smallest distance #' metric exist, the first one is returned. \code{ain} returns a #' \code{logical} vector of length \code{length(x)} indicating whether an #' element of \code{x} approximately matches an element in \code{table}. 
#'
#' @family matching
#'
#' @example ../examples/amatch.R
#' @export
amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE
  , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard", "jw", "soundex")
  , useBytes = FALSE
  , weight=c(d=1,i=1,s=1,t=1)
  , maxDist=0.1, q=1, p=0, bt=0
  , nthread = getOption("sd_num_thread")){

  x <- as.character(x)
  table <- as.character(table)
  if (!useBytes){
    x <- enc2utf8(x)
    table <- enc2utf8(table)
  }

  # 'method' is a single, fully spelled-out name from here on, so the
  # weight-length checks below can use plain scalar conditions.
  method <- match.arg(method)
  stopifnot(
      all(is.finite(weight))
      , all(weight > 0)
      , all(weight <=1)
      , q >= 0
      , p <= 0.25
      , p >= 0
      , matchNA %in% c(TRUE,FALSE)
      , maxDist > 0
      , is.logical(useBytes)
      , if (method %in% c('osa','dl')) length(weight) >= 4 else TRUE
      , if (method %in% c('lv','jw')) length(weight) >= 3 else TRUE
      , length(nthread) == 1
      , is.numeric(nthread)
      , nthread > 0
  )

  # For 'jw' the first two weights are swapped before they are handed to C.
  if (method == 'jw') weight <- weight[c(2,1,3)]

  # Keep the method *name* around: the lookup result is NA for an unknown
  # method, and the error message must report the name, not 'NA'.
  method_id <- METHODS[method]
  if ( is.na(method_id) ){
    stop(sprintf("method '%s' is not defined",method))
  }

  .Call("R_amatch", x, table, method_id
    , as.integer(nomatch), as.integer(matchNA)
    , as.double(weight), as.double(p), as.double(bt)
    , as.integer(q)
    , as.double(maxDist), as.integer(useBytes)
    , as.integer(nthread)
    , PACKAGE="stringdist"
  )
}

#' @param ... parameters to pass to \code{amatch} (except \code{nomatch})
#'
#'
#' @rdname amatch
#' @export
ain <- function(x,table,...){
  # nomatch=0 turns "no match" into 0, so a positive index means "matched".
  amatch(x, table, nomatch=0, ...) > 0
}

#' Approximate matching for integer sequences.
#'
#'
#' For a \code{list} of integer vectors \code{x}, find the closest matches in a
#' \code{list} of integer or numeric vectors in \code{table}.
#'
#' @section Notes:
#' \code{seq_ain} is currently defined as
#'
#' \code{seq_ain(x,table,...) <- function(x,table,...) seq_amatch(x, table, nomatch=0,...) > 0}
#'
#' All input vectors are converted with \code{as.integer}. This causes truncation for numeric
#' vectors (e.g. \code{pi} will be treated as \code{3L}).
#' #' #' @param x (\code{list} of) \code{integer} or \code{numeric} vector(s) to be #' approximately matched. Will be converted with \code{as.integer}. #' @param table (\code{list} of) \code{integer} or \code{numeric} vector(s) #' serving as lookup table for matching. Will be converted with #' \code{as.integer}. #' @param nomatch The value to be returned when no match is found. This is #' coerced to integer. #' @param matchNA Should \code{NA}'s be matched? Default behaviour mimics the #' behaviour of base \code{\link[base]{match}}, meaning that \code{NA} matches #' \code{NA}. With \code{NA}, we mean a missing entry in the \code{list}, represented as \code{NA_integer_}. #' If one of the integer sequences stored in the list has an \code{NA} entry, #' this is just treated as another integer (the representation of #' \code{NA_integer_}). #' @param method Matching algorithm to use. See \code{\link{stringdist-metrics}}. #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for #' deletion, insertion, substitution and transposition, in that order. When #' \code{method='lv'}, the penalty for transposition is ignored. When #' \code{method='jw'}, the weights associated with integers in elements of \code{a}, #' integers in elements of \code{b} and the transposition weight, in that order. #' Weights must be positive and not exceed 1. \code{weight} is ignored #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'}, #' \code{'Jaccard'}, or \code{'lcs'}. #' @param maxDist Elements in \code{x} will not be matched with elements of #' \code{table} if their distance is larger than \code{maxDist}. Note that the #' maximum distance between strings depends on the method: it should always be #' specified. #' @param nthread Number of threads used by the underlying C-code. A sensible #' default is chosen, see \code{\link{stringdist-parallelization}}. #' #' @param q q-gram size, only when method is \code{'qgram'}, \code{'jaccard'}, #' or \code{'cosine'}. 
#' @param p Winkler's prefix parameter for Jaro-Winkler distance, with
#' \eqn{0\leq p\leq0.25}. Only when method is \code{'jw'}
#' @param bt Winkler's boost threshold. Winkler's prefix factor is
#' only applied when the Jaro distance is larger than \code{bt}.
#' Applies only to \code{method='jw'} and \code{p>0}.
#' @return \code{seq_amatch} returns the position of the closest match of \code{x}
#' in \code{table}. When multiple matches with the same minimal distance
#' metric exist, the first one is returned. \code{seq_ain} returns a
#' \code{logical} vector of length \code{length(x)} indicating whether an
#' element of \code{x} approximately matches an element in \code{table}.
#'
#' @seealso \code{\link{seq_dist}}, \code{\link{seq_sim}}, \code{\link{seq_qgrams}}
#'
#' @example ../examples/seq_amatch.R
#' @export
seq_amatch <- function(x, table, nomatch=NA_integer_, matchNA=TRUE
  , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard", "jw")
  , weight=c(d=1,i=1,s=1,t=1)
  , maxDist=0.1, q=1, p=0, bt=0
  , nthread = getOption("sd_num_thread")){

  x <- ensure_int_list(x)
  table <- ensure_int_list(table)

  # 'method' is a single, fully spelled-out name from here on, so the
  # weight-length checks below can use plain scalar conditions.
  method <- match.arg(method)
  stopifnot(
      all(is.finite(weight))
      , all(weight > 0)
      , all(weight <=1)
      , q >= 0
      , p <= 0.25
      , p >= 0
      , matchNA %in% c(TRUE,FALSE)
      , if (method %in% c('osa','dl')) length(weight) >= 4 else TRUE
      , if (method %in% c('lv','jw')) length(weight) >= 3 else TRUE
      , length(nthread) == 1
      , is.numeric(nthread)
      , nthread > 0
  )

  # For 'jw' the first two weights are swapped before they are handed to C.
  if (method == 'jw') weight <- weight[c(2,1,3)]

  # Keep the method *name* around: the lookup result is NA for an unknown
  # method, and the error message must report the name, not 'NA'.
  method_id <- METHODS[method]
  if ( is.na(method_id) ){
    stop(sprintf("method '%s' is not defined",method))
  }

  # The literal 0L sits in the argument slot where amatch() passes
  # as.integer(useBytes).
  .Call("R_amatch", x, table, method_id
    , as.integer(nomatch), as.integer(matchNA)
    , as.double(weight), as.double(p), as.double(bt)
    , as.integer(q)
    , as.double(maxDist), 0L
    , as.integer(nthread)
    , PACKAGE="stringdist"
  )
}

#' @param ...
#' parameters to pass to \code{seq_amatch} (except \code{nomatch})
#'
#'
#' @rdname seq_amatch
#' @export
seq_ain <- function(x,table,...){
  # With nomatch=0 an unmatched element yields 0, so "> 0" means "matched".
  position <- seq_amatch(x, table, nomatch=0, ...)
  position > 0
}

# stringdist/R/phonetic.R 0000644 0001762 0000144 00000003666 14724546563 014634 0 ustar ligges users
#' Phonetic algorithms
#'
#' Translate strings to phonetic codes. Similar sounding strings should get
#' similar or equal codes.
#'
#' @param x a character vector whose elements are phonetically encoded.
#' @param method name of the algorithm used. The default is \code{"soundex"}.
#' @param useBytes Perform byte-wise comparison. \code{useBytes=TRUE} is faster
#' but may yield different results depending on character encoding. For more
#' information see the documentation of \code{\link{stringdist}}.
#'
#' @details
#' Currently, only the soundex algorithm is implemented. Note that soundex coding
#' is only meaningful for characters in the ranges a-z and A-Z. Soundex coding of strings
#' containing non-printable ascii or non-ascii characters may be system-dependent and should
#' not be trusted. If non-ascii or non-printable ascii characters are encountered, a warning
#' is emitted.
#'
#' @seealso \code{\link{printable_ascii}}
#'
#'
#' @return
#' The returned value depends on the method used. However, all currently
#' implemented methods return a character vector of the same length as the input
#' vector. Output characters are in the system's native encoding.
#'
#' @references
#' \itemize{
#' \item{The Soundex algorithm implemented is the algorithm used by the
#' \href{https://www.archives.gov/research/census/soundex}{National Archives}.
#' This algorithm differs slightly from the original algorithm patented by R.C. Russell
#' (US patents 1261167 (1918) and 1435663 (1922)).
#' }
#' }
#'
#' @example ../examples/phonetic.R
#'
#' @export
phonetic <- function(x, method = c("soundex"), useBytes = FALSE) {
  x <- as.character(x)
  method <- match.arg(method)
  stopifnot(is.logical(useBytes))

  if (!useBytes) x <- enc2utf8(x)
  if (method == "soundex") {
    r <- .Call("R_soundex", x, useBytes,PACKAGE="stringdist")
    # Without useBytes the C result is converted from integer code points
    # back to strings; with useBytes it is returned as is.
    if (!useBytes) int2char(r) else r
  }
}

# Convert each element of 'x' (a vector of integer code points) to a single
# string in the native encoding. vapply() guarantees a character vector even
# for empty input, where sapply() would return list() and make enc2native()
# fail.
int2char <- function(x) {
  enc2native(vapply(x, intToUtf8, character(1)))
}

# stringdist/R/doc_parallel.R 0000644 0001762 0000144 00000004670 13471343110 015415 0 ustar ligges users
#' @title
#' Multithreading and parallelization in \pkg{stringdist}
#'
#'
#' @description This page describes how \pkg{stringdist} uses parallel processing.
#'
#' @section Multithreading and parallelization in \pkg{stringdist}:
#' The core
#' functions of \pkg{stringdist} are implemented in C. On systems where
#' \code{openMP} is available, \pkg{stringdist} will automatically take
#' advantage of multiple cores. The
#' \href{https://cran.r-project.org/doc/manuals/r-release/R-exts.html#OpenMP-support}{section
#' on OpenMP} of the
#' \href{https://cran.r-project.org/doc/manuals/r-release/R-exts.html}{Writing
#' R Extensions} manual discusses on what systems OpenMP is available (at the time of writing more or
#' less anywhere except on OSX).
#'
#' By default, the number of threads to use is taken from \code{options('sd_num_thread')}.
#' When the package is loaded, the value for this option is determined as follows:
#' \itemize{
#' \item{If the environment variable \code{OMP_NUM_THREADS} is set, this value is taken.}
#' \item{Otherwise, the number of available cores is determined with \code{parallel::detectCores()}
#' If this fails, the number of threads is set to 1 (with a message).
If the nr of detected #' cores exceeds three, the number of used cores is set to \eqn{n-1}.} #' \item{If available, the environment variable \code{OMP_THREAD_LIMIT} is #' determined and The number of threads is set to the lesser of #' \code{OMP_THREAD_LIMIT} and the number of detected cores.} #' } #' #' The latter step makes sure that on machines with \eqn{n>3} cores, \eqn{n-1} #' cores are used. Some benchmarking showed that using all cores is often slower #' in such cases. This is probably because at least one of the threads will be #' shared with the operating system. #' #' Functions that use multithreading have an option named \code{nthread} that #' controls the maximum number of threads to use. If you need to do large #' calculations, it is probably a good idea to benchmark the performance on your #' machine(s) as a function of \code{'nthread'}, for example using the #' \href{https://cran.r-project.org/package=microbenchmark}{microbenchmark} #' package of Mersmann. #' #' #' #' #' @seealso #' \itemize{ #' \item{Functions running multithreaded: \code{\link{stringdist}}, \code{\link{stringdistmatrix}}, \code{\link{amatch}}, \code{\link{ain}} } #' } #' #' @name stringdist-parallelization #' @rdname stringdist-parallelization {} stringdist/R/afind.R 0000644 0001762 0000144 00000013666 14726260354 014076 0 ustar ligges users #' Stringdist-based fuzzy text search #' #' \code{afind} slides a window of fixed width over a string \code{x} and #' computes the distance between the each window and the sought-after #' \code{pattern}. The location, content, and distance corresponding to the #' window with the best match is returned. #' #' #' @param x strings to search in #' @param pattern strings to find (not a regular expression). For \code{grab}, #' \code{grabl}, and \code{extract} this must be a single string. #' @param window width of moving window. #' @param value toggle return matrix with matched strings. 
#' @inheritParams amatch #' #' @details #' Matching is case-sensitive. Both \code{x} and \code{pattern} are converted #' to \code{UTF-8} prior to search, unless \code{useBytes=TRUE}, in which case #' the distances are measured bytewise. #' #' Code is parallelized over the \code{x} variable: each value of \code{x} #' is scanned for every element in \code{pattern} using a separate thread (when \code{nthread} #' is larger than 1). #' #' The functions \code{grab} and \code{grabl} are approximate string matching #' functions that somewhat resemble base R's \code{\link[base]{grep}} and #' \code{\link[base:grep]{grepl}}. They are implemented as convenience wrappers #' of \code{afind}. #' #' @section Running cosine distance: #' This algorithm gains efficiency by using that two consecutive windows have #' a large overlap in their q-gram profiles. It gives the same result as #' the \code{"cosine"} distance, but much faster. #' #' #' @return #' For \code{afind}: a \code{list} of three matrices, each with #' \code{length(x)} rows and \code{length(pattern)} columns. In each matrix, #' element \eqn{(i,j)} corresponds to \code{x[i]} and \code{pattern[j]}. The #' names and description of each matrix is as follows. #' \itemize{ #' \item{\code{location}. \code{[integer]}, location of the start of best matching window. #' When \code{useBytes=FALSE}, this corresponds to the location of a \code{UTF} code point #' in \code{x}, possibly after conversion from its original encoding.} #' \item{\code{distance}. \code{[character]}, the string distance between pattern and #' the best matching window.} #' \item{\code{match}. 
#' \code{[character]}, the first, best matching window.}
#'
#' }
#'
#' @family matching
#'
#' @examples
#' texts = c("When I grow up, I want to be"
#'  , "one of the harvesters of the sea"
#'  , "I think before my days are gone"
#'  , "I want to be a fisherman")
#' patterns = c("fish", "gone","to be")
#'
#' afind(texts, patterns, method="running_cosine", q=3)
#'
#' grabl(texts,"grew", maxDist=1)
#' extract(texts, "harvested", maxDist=3)
#'
#'
#' @export
afind <- function(x, pattern, window=NULL
  , value=TRUE
  , method = c("osa","lv","dl","hamming","lcs", "qgram","cosine","running_cosine","jaccard","jw","soundex")
  , useBytes = FALSE
  , weight=c(d=1,i=1,s=1,t=1)
  , q = 1
  , p = 0
  , bt = 0
  , nthread = getOption("sd_num_thread")
  ){

  # Resolve the method before validating: the weight-length checks must see
  # the single, fully spelled-out method name (also when the caller relied on
  # partial matching), not the vector of all choices.
  method <- match.arg(method)

  stopifnot(
      all(is.finite(weight))
      , all(weight > 0)
      , all(weight <=1)
      # all(): 'window' may hold one width per pattern, and '||' requires a
      # length-one right-hand side.
      , is.null(window) || all(window >= 1)
      , q >= 0
      , p <= 0.25
      , p >= 0
      , is.logical(useBytes) && !is.na(useBytes)
      , is.logical(value) && !is.na(value)
      , if (method %in% c('osa','dl')) length(weight) >= 4 else TRUE
      , if (method %in% c('lv','jw')) length(weight) >= 3 else TRUE
      , length(nthread) == 1
      , is.numeric(nthread)
      , nthread > 0
  )

  x <- as.character(x)
  pattern <- as.character(pattern)

  if ( !useBytes ){
    x <- enc2utf8(x)
    pattern <- enc2utf8(pattern)
  }

  if (is.null(window)){
    # Default: one window per pattern, as wide as that pattern.
    window <- nchar(pattern, type = if (useBytes) "bytes" else "char")
  } else if (length(window) == 1L){
    # A single width applies to every pattern; 'window[i]' is indexed per
    # pattern below, so expand it to one entry per pattern.
    window <- rep(window, length(pattern))
  }

  # NOTE(review): for empty 'x' a bare numeric(0) is returned instead of the
  # documented list of matrices; kept unchanged for backward compatibility.
  if (length(x) == 0) return(numeric(0))

  # For 'jw' the first two weights are swapped before they are handed to C.
  if (method == 'jw') weight <- weight[c(2,1,3)]

  # Keep the method *name* around: the lookup result is NA for an unknown
  # method, and the error message must report the name, not 'NA'.
  method_id <- METHODS[method]
  if ( is.na(method_id) ){
    stop(sprintf("method '%s' is not defined",method))
  }

  L <- .Call("R_afind"
    , x
    , pattern
    , as.integer(window)
    , method_id
    , as.double(weight)
    , as.double(p)
    , as.double(bt)
    , as.integer(q)
    , as.integer(useBytes)
    , as.integer(nthread)
    , PACKAGE="stringdist")

  names(L) <- c("location", "distance")

  if (isTRUE(value)){
    # Cut the best-matching window for every (x, pattern) pair out of 'x'.
    matches <- vapply(seq_along(pattern), function(i){
      start <- L[[1]][,i]
      substr(x, start, start + window[i]-1)
    }, FUN.VALUE = character(length(x)))
    L$match <- matrix(matches, nrow=length(x))
  }
  L
}

#' @rdname afind
#' @param ...
#' passed to \code{afind}.
#' @param maxDist Only windows with distance \code{<= maxDist} are considered a match.
#' @return
#' For \code{grab}, an \code{integer} vector, indicating in which elements of
#' \code{x} a match was found with a distance \code{<= maxDist}. The matched
#' values when \code{value=TRUE} (equivalent to \code{\link[base]{grep}}).
#' @export
grab <- function(x, pattern, maxDist=Inf, value=FALSE, ...){
  stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1)
  L <- afind(x, pattern, value=value, ...)
  # afind() returns a bare numeric(0) instead of a list when 'x' is empty;
  # '$' would fail on that, so return an empty result of the proper type.
  if (!is.list(L)){
    return(if (value) character(0) else integer(0))
  }
  if (!value){
    which(L$distance <= maxDist)
  } else {
    L$match[L$distance <= maxDist ]
  }
}

#' @rdname afind
#' @param ... passed to \code{afind}.
#' @return
#' For \code{grabl}, a \code{logical} vector, indicating in which elements of
#' \code{x} a match was found with a distance \code{<= maxDist}. (equivalent
#' to \code{\link[base:grep]{grepl}}).
#' @export
grabl <- function(x, pattern, maxDist=Inf, ...){
  stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1)
  L <- afind(x, pattern, value=FALSE, ...)
  # Empty 'x': afind() yields numeric(0) rather than a list (see grab()).
  if (!is.list(L)) return(logical(0))
  as.logical(L$distance <= maxDist)
}

#' @rdname afind
#'
#' @return
#' For \code{extract}, a \code{character} matrix with \code{length(x)} rows and
#' \code{length(pattern)} columns. If match was found, element \eqn{(i,j)}
#' contains the match, otherwise it is set to \code{NA}.
#' @export
extract <- function(x, pattern, maxDist = Inf, ...){
  stopifnot(is.numeric(maxDist), maxDist >= 0, length(pattern) == 1)
  L <- afind(x, pattern, value=TRUE, ...)
  # Empty 'x': afind() yields numeric(0) rather than a list (see grab());
  # return the documented shape, i.e. zero rows and one column per pattern.
  if (!is.list(L)) return(matrix(character(0), nrow=0, ncol=length(pattern)))
  out <- L$match
  out[L$distance > maxDist] <- NA_character_
  out
}

# stringdist/R/seqdist.R 0000644 0001762 0000144 00000012327 14726257472 014471 0 ustar ligges users
#' Compute distance metrics between integer sequences
#'
#' \code{seq_dist} computes pairwise string distances between elements of
#' \code{a} and \code{b}, where the argument with less elements is recycled.
#' \code{seq_distmatrix} computes the distance matrix with rows according to
#' \code{a} and columns according to \code{b}.
#' #' #' @section Notes: #' Input vectors are converted with \code{as.integer}. This causes truncation for numeric #' vectors (e.g. \code{pi} will be treated as \code{3L}). #' #' @param a (\code{list} of) \code{integer} or \code{numeric} vector(s). Will be converted with \code{as.integer} (target) #' @param b (\code{list} of) \code{integer} or \code{numeric} vector(s). Will be converted with \code{as.integer} (source). #' Optional for \code{seq_distmatrix}. #' @param method Distance metric. See \code{\link{stringdist-metrics}} #' @param weight For \code{method='osa'} or \code{'dl'}, the penalty for #' deletion, insertion, substitution and transposition, in that order. When #' \code{method='lv'}, the penalty for transposition is ignored. When #' \code{method='jw'}, the weights associated with characters of \code{a}, #' characters from \code{b} and the transposition weight, in that order. #' Weights must be positive and not exceed 1. \code{weight} is ignored #' completely when \code{method='hamming'}, \code{'qgram'}, \code{'cosine'}, #' \code{'Jaccard'}, or \code{'lcs'} #' @param q Size of the \eqn{q}-gram; must be nonnegative. Only applies to #' \code{method='qgram'}, \code{'jaccard'} or \code{'cosine'}. #' @param p Prefix factor for Jaro-Winkler distance. The valid range for #' \code{p} is \code{0 <= p <= 0.25}. If \code{p=0} (default), the #' Jaro-distance is returned. Applies only to \code{method='jw'}. #' @param bt Winkler's boost threshold. Winkler's prefix factor is #' only applied when the Jaro distance is larger than \code{bt} #' Applies only to \code{method='jw'} and \code{p>0}. #' @param nthread Maximum number of threads to use. By default, a sensible #' number of threads is chosen, see \code{\link{stringdist-parallelization}}. #' #' @return #' #' \code{seq_dist} returns a numeric vector with pairwise distances between \code{a} #' and \code{b} of length \code{max(length(a),length(b)}. #' #' For \code{seq_distmatrix} there are two options. 
#' If \code{b} is missing, the
#' \code{\link[stats]{dist}} object corresponding to the \code{length(a) X
#' length(a)} distance matrix is returned. If \code{b} is specified, the
#' \code{length(a) X length(b)} distance matrix is returned.
#'
#' If any element of \code{a} or \code{b} is \code{NA_integer_}, the distance with
#' any matched integer vector will result in \code{NA}. Missing values in the sequences
#' themselves are treated as a number and not treated specially (Also see the examples).
#'
#' @seealso \code{\link{seq_sim}}, \code{\link{seq_amatch}}, \code{\link{seq_qgrams}}
#'
#' @example ../examples/seq_dist.R
#' @export
seq_dist <- function(a, b
  , method=c("osa","lv","dl","hamming","lcs", "qgram","cosine","jaccard","jw")
  , weight=c(d=1,i=1,s=1,t=1)
  , q=1, p=0, bt=0
  , nthread = getOption("sd_num_thread")
  ){

  a <- ensure_int_list(a)
  b <- ensure_int_list(b)

  # Resolve the method before validating: the weight-length checks must see
  # the single, fully spelled-out method name (also when the caller relied on
  # partial matching), not the vector of all choices.
  method <- match.arg(method)

  stopifnot(
      all(is.finite(weight))
      , all(weight > 0)
      , all(weight <=1)
      , q >= 0
      , p <= 0.25
      , p >= 0
      , if (method %in% c('osa','dl')) length(weight) >= 4 else TRUE
      , if (method %in% c('lv','jw')) length(weight) >= 3 else TRUE
      , length(nthread) == 1
      , is.numeric(nthread)
      , nthread > 0
  )

  if (length(a) == 0 || length(b) == 0){
    return(numeric(0))
  }
  # Warn when the longer argument is not a whole multiple of the shorter one.
  if ( max(length(a),length(b)) %% min(length(a),length(b)) != 0 ){
    warning(RECYCLEWARNING)
  }

  nthread <- as.integer(nthread)
  # For 'jw' the first two weights are swapped before the distance call.
  if (method == 'jw') weight <- weight[c(2,1,3)]

  # Note that the arguments are deliberately handed over in swapped order.
  do_dist(a=b, b=a
    , method=method
    , weight=weight
    , q=q
    , p=p
    , bt=bt
    , nthread=nthread)
}

#' @param useNames label the output matrix with \code{names(a)} and \code{names(b)}?
#' @rdname seq_dist
#' @export
seq_distmatrix <- function(a, b
  , method=c("osa","lv","dl","hamming","lcs","qgram","cosine","jaccard","jw")
  , weight=c(d=1,i=1,s=1,t=1), q=1, p=0, bt=0
  , useNames=c("names","none")
  , nthread = getOption("sd_num_thread")
  ){
  useNames <- match.arg(useNames)
  method <- match.arg(method)
  nthread <- as.integer(nthread)
  # For 'jw' the first two weights are swapped before the distance calls.
  if (method == 'jw') weight <- weight[c(2,1,3)]

  a <- ensure_int_list(a)

  # Without 'b', hand over to lower_tri(), which produces the 'dist' object.
  if (missing(b)){
    return(
      lower_tri(a
        , method=method
        , weight=weight
        , q=q
        , p=p
        , bt=bt
        , useNames=useNames
        , nthread=nthread)
    )
  }

  b <- ensure_int_list(b)
  if (length(a) == 0 || length(b) == 0){
    return(matrix(numeric(0)))
  }

  # One column per element of 'b': its distances to every element of 'a'.
  column_for <- function(src){
    do_dist(list(src), b=a, method=method, weight=weight, q=q, p=p,bt=bt, nthread=nthread)
  }
  distances <- vapply(b, column_for, FUN.VALUE=numeric(length(a)), USE.NAMES=FALSE)

  # Label rows/columns with the names of 'a' and 'b' only when requested.
  labels <- if (useNames == "names") list(names(a), names(b)) else NULL
  matrix(distances, nrow=length(a), ncol=length(b), dimnames=labels)
}

# stringdist/R/doc_encoding.R 0000644 0001762 0000144 00000006667 13452644402 015426 0 ustar ligges users
#' @title
#' String metrics in \pkg{stringdist}
#'
#' @description
#' This page gives an overview of encoding handling in \pkg{stringdist}.
#'
#'
#' @section Encoding in \pkg{stringdist}:
#'
#' All character strings are stored as a sequence of bytes. An encoding
#' system relates a byte, or a short sequence of bytes to a symbol. Over the years, many
#' encoding systems have been developed, and not all OS's and softwares use the same encoding
#' as default. Similarly, depending on the system R is running on, R may use a
#' different encoding for storing strings internally.
#'
#' The \pkg{stringdist} package is designed so users in principle need not
#' worry about this. Strings are converted to \code{UTF-32} (unsigned integer)
#' by default prior to any further computation.
This means that results are #' encoding-independent and that strings are interpreted as a sequence of #' symbols, not as a sequence of pure bytes. In functions where this is #' relevant, this may be switched by setting the \code{useBytes} option to #' \code{TRUE}. However, keep in mind that results will then likely depend on the #' system R is running on, except when your strings are pure ASCII. #' Also, for multi-byte encodings, results for byte-wise computations #' will usually differ from results using encoded computations. #' #' Prior to \pkg{stringdist} version 0.9, setting \code{useBytes=TRUE} could #' give a significant performance enhancement. Since version 0.9, translation #' to integer is done by C code internal to \pkg{stringdist} and the difference in #' performance is now negligible. #' #' @section Unicode normalisation: #' In \code{utf-8}, the same (accented) character may be represented as several byte sequences. For example, an u-umlaut #' can be represented with a single byte code or as a byte code representing \code{'u'} followed by a modifier byte code #' that adds the umlaut. The \href{https://cran.r-project.org/package=stringi}{stringi} package #' of Gagolevski and Tartanus offers unicode normalisation tools. #' #' @section Some tips on character encoding and transliteration: #' Some algorithms (like soundex) are defined only on the printable ASCII character set. This excludes any character #' with accents for example. Translating accented characters to the non-accented ones is a form of transliteration. On #' many systems running R (but not all!) you can achieve this with #' #' \code{iconv(x,to="ASCII//TRANSLIT")}, #' #' where \code{x} is your character vector. See the documentation of \code{\link[base]{iconv}} for details. #' #' The \code{stringi} package (Gagolewski and Tartanus) should work on any system. The command #' \code{stringi::stri_trans_general(x,"Latin-ASCII")} transliterates character vector \code{x} to ASCII. 
#' #' @references #' \itemize{ #' \item{The help page of \code{\link[base]{Encoding}} describes how R handles encoding.} #' \item{The help page of \code{\link[base]{iconv}} has a good overview of base R's #' encoding conversion options. The capabilities of \code{iconv} depend on the system R is running on. #' The \pkg{stringi} package offers platform-independent encoding and normalization tools.} #' } #' #' @seealso #' \itemize{ #' \item{Functions using re-encoding: \code{\link{stringdist}}, \code{\link{stringdistmatrix}}, \code{\link{amatch}}, \code{\link{ain}}, \code{\link{qgrams}}} #' \item{Encoding related: \code{\link{printable_ascii}}} #' } #' @name stringdist-encoding #' @rdname stringdist-encoding {} stringdist/R/doc_metrics.R 0000644 0001762 0000144 00000021125 14724546165 015302 0 ustar ligges users #' @title #' String metrics in \pkg{stringdist} #' #' @description #' This page gives an overview of the string dissimilarity measures offered by #' \pkg{stringdist}. #' #' @section String Metrics: #' String metrics are ways of quantifying the dissimilarity between two finite #' sequences, usually text strings. Over the years, many such measures have been #' developed. Some are based on a mathematical understanding of the set of all #' strings that can be composed from a finite alphabet, others are based on more #' heuristic principles, such as how a text string sounds when pronounced by a #' native English speaker. #' #' The terms 'string metrics' and 'string distance' are used more or less #' interchangeably in the literature. From a mathematical point of view, string #' metrics often do not obey the demands that are usually required from a #' distance function. For example, it is not true for all string metrics that a #' distance of 0 means that two strings are the same (e.g. in the \eqn{q}-gram #' distance). Nevertheless, string metrics are very useful in practice and have #' many applications. 
#' #' The metric you need to choose for an application strongly depends on both the #' nature of the string (what does the string represent?) and the cause of #' dissimilarities between the strings you are measuring. For example, if you #' are comparing human-typed names that may contain typos, the Jaro-Winkler #' distance may be of use. If you are comparing names that were written down #' after hearing them, a phonetic distance may be a better choice. #' #' Currently, the following distance metrics are supported by \pkg{stringdist}. #' \tabular{ll}{ #' \bold{Method name} \tab \bold{Description}\cr #' \code{osa} \tab Optimal string alignment (restricted Damerau-Levenshtein distance).\cr #' \code{lv} \tab Levenshtein distance (as in R's native \code{\link[utils]{adist}}).\cr #' \code{dl} \tab Full Damerau-Levenshtein distance.\cr #' \code{hamming} \tab Hamming distance (\code{a} and \code{b} must have the same number of characters).\cr #' \code{lcs} \tab Longest common substring distance.\cr #' \code{qgram} \tab \eqn{q}-gram distance. \cr #' \code{cosine} \tab cosine distance between \eqn{q}-gram profiles \cr #' \code{jaccard} \tab Jaccard distance between \eqn{q}-gram profiles \cr #' \code{jw} \tab Jaro, or Jaro-Winkler distance.\cr #' \code{soundex} \tab Distance based on soundex encoding (see below) #' } #' #' #' @section A short description of string metrics supported by \pkg{stringdist}: #' #' See \href{https://journal.r-project.org/archive/2014-1/loo.pdf}{Van der Loo #' (2014)} for an extensive description and references. The review papers of #' Navarro (2001) and Boytsov (2011) provide excellent technical overviews of #' respectively online and offline string matching algorithms. #' #' The \bold{Hamming distance} (\code{method='hamming'}) counts the number of #' character substitutions that turn \code{b} into \code{a}. If \code{a} #' and \code{b} have a different number of characters the distance is \code{Inf}. 
#' #' The \bold{Levenshtein distance} (\code{method='lv'}) counts the number of #' deletions, insertions and substitutions necessary to turn \code{b} into #' \code{a}. This method is equivalent to \code{R}'s native \code{\link[utils]{adist}} #' function. #' #' The \bold{Optimal String Alignment distance} (\code{method='osa'}) is like the Levenshtein #' distance but also allows transposition of adjacent characters. Here, each #' substring may be edited only once. (For example, a character cannot be transposed twice #' to move it forward in the string). #' #' The \bold{full Damerau-Levenshtein distance} (\code{method='dl'}) is like the optimal #' string alignment distance except that it allows for multiple edits on substrings. #' #' The \bold{longest common substring} (\code{method='lcs'}) is defined as the longest string that can be #' obtained by pairing characters from \code{a} and \code{b} while keeping the order #' of characters intact. The \bold{lcs-distance} is defined as the number of unpaired characters. #' The distance is equivalent to the edit distance allowing only deletions and insertions, #' each with weight one. #' #' A \bold{\eqn{q}-gram} (\code{method='qgram'}) is a subsequence of \eqn{q} \emph{consecutive} #' characters of a string. If \eqn{x} (\eqn{y}) is the vector of counts #' of \eqn{q}-gram occurrences in \code{a} (\code{b}), the \bold{\eqn{q}-gram distance} #' is given by the sum over the absolute differences \eqn{|x_i-y_i|}. #' The computation is aborted when \code{q} is larger than the length of #' any of the strings. In that case \code{Inf} is returned. #' #' The \bold{cosine distance} (\code{method='cosine'}) is computed as \eqn{1-x\cdot #' y/(\|x\|\|y\|)}, where \eqn{x} and \eqn{y} were defined above. #' #' Let \eqn{X} be the set of unique \eqn{q}-grams in \code{a} and \eqn{Y} the set of unique #' \eqn{q}-grams in \code{b}. The \bold{Jaccard distance} (\code{method='jaccard'}) is given by \eqn{1-|X\cap Y|/|X\cup Y|}. 
#' #' The \bold{Jaro distance} (\code{method='jw'}, \code{p=0}), is a number #' between 0 (exact match) and 1 (completely dissimilar) measuring #' dissimilarity between strings. It is defined to be 0 when both strings have #' length 0, and 1 when there are no character matches between \code{a} and #' \code{b}. Otherwise, the Jaro distance is defined as #' \eqn{1-(1/3)(w_1m/|a| + w_2m/|b| + w_3(m-t)/m)}. #' Here,\eqn{|a|} indicates the number of characters in \code{a}, \eqn{m} is #' the number of character matches and \eqn{t} the number of transpositions of #' matching characters. The \eqn{w_i} are weights associated with the characters #' in \code{a}, characters in \code{b} and with transpositions. A character #' \eqn{c} of \code{a} \emph{matches} a character from \code{b} when \eqn{c} #' occurs in \code{b}, and the index of \eqn{c} in \code{a} differs less than #' \eqn{\max(|a|,|b|)/2 -1} (where we use integer division) from the index of #' \eqn{c} in \code{b}. Two matching characters are transposed when they are #' matched but they occur in different order in string \code{a} and \code{b}. #' #' The \bold{Jaro-Winkler distance} (\code{method=jw}, \code{0