pax_global_header00006660000000000000000000000064125247626000014516gustar00rootroot0000000000000052 comment=4d6bc6c3e4955932663bc404ffaf647f9d666b2b r-cran-fastmatch-1.0-4/000077500000000000000000000000001252476260000146515ustar00rootroot00000000000000r-cran-fastmatch-1.0-4/DESCRIPTION000066400000000000000000000011601252476260000163550ustar00rootroot00000000000000Package: fastmatch Version: 1.0-4 Title: Fast match() function Author: Simon Urbanek Maintainer: Simon Urbanek Description: Package providing a fast match() replacement for cases that require repeated look-ups. It is slightly faster that R's built-in match() function on first match against a table, but extremely fast on any subsequent lookup as it keeps the hash table in memory. License: GPL-2 URL: http://www.rforge.net/fastmatch Packaged: 2012-01-21 10:09:18 UTC; svnuser Repository: CRAN Date/Publication: 2012-01-21 10:22:24 r-cran-fastmatch-1.0-4/MD5000066400000000000000000000005041252476260000151600ustar00rootroot0000000000000089f00fff119030016fece98c08b5040b *DESCRIPTION 7dd3c164abc64183f0681eaf7b85d73e *NAMESPACE 27e152f5450341fbb88d31cfbff45520 *NEWS 770a7b76ccff6f95d86152999543269b *R/fastmatch.R ddc4a8e8795d9bc6be2c7d507b7e160b *R/match.hash.R 1cf3221f784b90ed613d2454cc00a727 *man/fmatch.Rd 632693d50dad9116f97f57578ee10502 *src/fastmatch.c r-cran-fastmatch-1.0-4/NAMESPACE000066400000000000000000000001001252476260000160570ustar00rootroot00000000000000useDynLib(fastmatch) export(fmatch) S3method(print, match.hash) r-cran-fastmatch-1.0-4/NEWS000066400000000000000000000022121252476260000153450ustar00rootroot00000000000000 NEWS for fastmatch -------------------- 0.1-4 2012-01-12 o some R functions (such as subset assignment like x[1] <- 2) can create a new object (with possibly modified content) and copy all attributes including the hash cache. If the original object was used as a table in fmatch(), the hash cache will be copied into the modified object and thus its cache will be possibly out of sync with the object. fmatch() will now identify such cases and discard the hash to prevent errorneous results. 0.1-3 2011-12-21 o match() coerces POSIXlt objects into characters, but so far fmatch() performed the match on the actual objects. Now fmatch() coerces POSIXlt object into characters just like match(), but note that you will lose the ability to perform fast lookups if the table is a POSIXlt object -- please use POSIXct objects (much more efficient) or use as.character() on the POSIXlt object to create a table that you want to re-use. 0.1-2 2011-09-14 o bugfix: nomatch was ignored in the fastmatch implementation (thanks to Enrico Schumann for reporting) 0.1-1 2010-12-23 o minor cleanups 0.1-0 2010-12-23 o initial release r-cran-fastmatch-1.0-4/R/000077500000000000000000000000001252476260000150525ustar00rootroot00000000000000r-cran-fastmatch-1.0-4/R/fastmatch.R000066400000000000000000000002251252476260000171460ustar00rootroot00000000000000fmatch <- function(x, table, nomatch = NA_integer_, incomparables = NULL) .Call("fmatch", x, table, nomatch, incomparables, PACKAGE = "fastmatch") r-cran-fastmatch-1.0-4/R/match.hash.R000066400000000000000000000004421252476260000172130ustar00rootroot00000000000000# match.hash is an infomal (S3) class representing the # chain of hash tables stored in the .match.hash attribute # of tables that have been hashed # we provide a (sort of dummy) print method so # the output is not as ugly print.match.hash <- function(x, ...) { cat("\n"); x } r-cran-fastmatch-1.0-4/man/000077500000000000000000000000001252476260000154245ustar00rootroot00000000000000r-cran-fastmatch-1.0-4/man/fmatch.Rd000066400000000000000000000102471252476260000171610ustar00rootroot00000000000000\name{fmatch} \alias{fmatch} \alias{fastmatch} \title{ Fast match() replacement } \description{ \code{fmatch} is a faster version of the built-in \code{\link{match}}() function. It is slightly faster than the built-in version because it uses more specialized code, but in addition it retains the hash table within the table object such that it can be re-used, dramatically reducing the look-up time especially for large tables. Although \code{fmatch} can be used separately, in general it is also safe to use: \code{match <- fmatch} since it is a drop-in replacement. Any cases not directly handled by \code{fmatch} are passed to \code{match} with a warning. } \usage{ fmatch(x, table, nomatch = NA_integer_, incomparables = NULL) } \arguments{ \item{x}{values to be matched} \item{table}{values to be matched against} \item{nomatch}{the value to be returned in the case when no match is found. It is coerced to \code{integer}.} \item{incomparables}{a vector of values that cannot be matched. Any value other than \code{NULL} will result in a fall-back to \code{match} without any speed gains.} } \details{ See \code{\link{match}} for the purpose and details of the \code{match} function. \code{fmatch} is a drop-in replacement for the \code{match} function with the focus on performance. \code{incomparables} are not supported by \code{fmatch} and will be passed down to \code{match}. The first match against a table results in a hash table to be computed from the table. This table is then attached as the `.match.hash` attribute of the table so that it can be re-used on subsequent calls to \code{fmatch} with the same table. The hashing algorithm used is the same as the \code{match} function in R, but it is re-implemented in a slight different way to improve its performance at the cost of supporting only a subset of types (integer, real and character). For any other types \code{fmatch} falls back to \code{match} (with a warning). } \value{ A vector of the same length as \code{x} - see \code{\link{match}} for details. } %\references{ %} %\author{ %% ~~who you are~~ %} \note{ \code{fmatch} modifies the \code{table} by attaching an attribute to it. It is expected that the values will not change unless that attribute is dropped. Under normal circumstances this should not have any effect from user's point of view, but there is a theoretical chance of the cache being out of sync with the table in case the table is modified directly (e.g. by some C code) without removing attributes. Also \code{fmatch} does not convert to a common encoding so strings with different representation in two encodings don't match. } %% ~Make other sections like Warning with \section{Warning }{....} ~ \seealso{ \code{\link{match}} } \examples{ # some random speed comparison examples: # first use integer matching x = as.integer(rnorm(1e6) * 1000000) s = 1:100 # the first call to fmatch is comparable to match system.time(fmatch(s,x)) # but the subsequent calls take no time! system.time(fmatch(s,x)) system.time(fmatch(-50:50,x)) system.time(fmatch(-5000:5000,x)) # here is the speed of match for comparison system.time(base::match(s, x)) # the results should be identical identical(base::match(s, x), fmatch(s, x)) # next, match a factor against the table # this will require both x and the factor # to be cast to strings s=factor(c("1","1","2","foo","3",NA)) # because the casting will have to allocate a string # cache in R, we run a dummy conversion to take # that out of the equation dummy = as.character(x) # now we can run the speed tests system.time(fmatch(s, x)) system.time(fmatch(s, x)) # the cache is still valid for string matches as well system.time(fmatch(c("foo","bar","1","2"),x)) # now back to match system.time(base::match(s, x)) identical(base::match(s, x), fmatch(s, x)) # finally, some reals to match y = rnorm(1e6) s = c(y[sample(length(y), 100)], 123.567, NA, NaN) system.time(fmatch(s, y)) system.time(fmatch(s, y)) system.time(fmatch(s, y)) system.time(base::match(s, y)) identical(base::match(s, y), fmatch(s, y)) # this used to fail before 0.1-2 since nomatch was ignored identical(base::match(4L, 1:3, nomatch=0), fmatch(4L, 1:3, nomatch=0)) } \keyword{manip} r-cran-fastmatch-1.0-4/src/000077500000000000000000000000001252476260000154405ustar00rootroot00000000000000r-cran-fastmatch-1.0-4/src/fastmatch.c000066400000000000000000000250011252476260000175540ustar00rootroot00000000000000/* * fastmatch: fast implementation of match() in R using semi-permanent hash tables * * Copyright (C) 2010, 2011 Simon Urbanek * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ /* for speed (should not really matter in this case as most time is spent in the hashing) */ #define USE_RINTERNALS 1 #include /* for malloc/free since we handle our hash table memory separately from R */ #include /* for hashing for pointers we need intptr_t */ #include /* match5 to fall-back to R's internal match for types we don't support */ SEXP match5(SEXP itable, SEXP ix, int nmatch, SEXP incomp, SEXP env); /* ".match.hash" symbol - cached on first use */ SEXP hs; typedef int hash_index_t; typedef struct hash { int m, k, els, type; void *src; SEXP prot, parent; struct hash *next; hash_index_t ix[1]; } hash_t; /* create a new hash table with the given source and length. we store only the index - values are picked from the source so you must make sure the source is still alive when used */ static hash_t *new_hash(void *src, hash_index_t len) { hash_t *h; hash_index_t m = 2, k = 1, desired = len * 2; /* we want a maximal load of 50% */ while (m < desired) { m *= 2; k++; } h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(hash_index_t) * m)); if (!h) Rf_error("unable to allocate %.2Mb for a hash table", (double) sizeof(hash_index_t) * (double) m / (1024.0 * 1024.0)); h->m = m; h->k = k; h->src = src; return h; } /* free the hash table (and all chained hash tables as well) */ static void free_hash(hash_t *h) { if (h->next) free_hash(h->next); if (h->prot) R_ReleaseObject(h->prot); free(h); } /* R finalized for the hash table object */ static void hash_fin(SEXP ho) { hash_t *h = (hash_t*) EXTPTR_PTR(ho); if (h) free_hash(h); } /* pi-hash fn */ #define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k)) /* add the integer value at index i (0-based!) to the hash */ static void add_hash_int(hash_t *h, hash_index_t i) { int *src = (int*) h->src; int val = src[i++], addr; addr = HASH(val); #ifdef PROFILE_HASH int oa = addr; #endif while (h->ix[addr] && src[h->ix[addr] - 1] != val) { addr++; if (addr == h->m) addr = 0; } #ifdef PROFILE_HASH if (addr != oa) printf("%d: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa); #endif if (!h->ix[addr]) h->ix[addr] = i; } /* to avoid aliasing rules issues use a union */ union dint_u { double d; unsigned int u[2]; }; /* add the double value at index i (0-based!) to the hash */ static void add_hash_real(hash_t *h, hash_index_t i) { double *src = (double*) h->src; union dint_u val; int addr; /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */ val.d = (src[i] == 0.0) ? 0.0 : src[i]; if (R_IsNA(val.d)) val.d = NA_REAL; else if (R_IsNaN(val.d)) val.d = R_NaN; addr = HASH(val.u[0]+ val.u[1]); #ifdef PROFILE_HASH int oa = addr; #endif while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) { addr++; if (addr == h->m) addr = 0; } #ifdef PROFILE_HASH if (addr != oa) printf("%g: dist=%d (addr=%d, oa=%d)\n", val.d, addr - oa, addr, oa); #endif if (!h->ix[addr]) h->ix[addr] = i + 1; } /* add the pointer value at index i (0-based!) to the hash */ static void add_hash_ptr(hash_t *h, hash_index_t i) { int addr; void **src = (void**) h->src; intptr_t val = (intptr_t) src[i++]; #if (defined _LP64) || (defined __LP64__) || (defined WIN64) addr = HASH((val & 0xffffffff) ^ (val >> 32)); #else addr = HASH(val); #endif #ifdef PROFILE_HASH int oa = addr; #endif while (h->ix[addr] && (intptr_t) src[h->ix[addr] - 1] != val) { addr++; if (addr == h->m) addr = 0; } #ifdef PROFILE_HASH if (addr != oa) printf("%p: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa); #endif if (!h->ix[addr]) h->ix[addr] = i; } /* NOTE: we are returning a 1-based index ! */ static int get_hash_int(hash_t *h, int val, int nmv) { int *src = (int*) h->src; int addr; addr = HASH(val); while (h->ix[addr]) { if (src[h->ix[addr] - 1] == val) return h->ix[addr]; addr ++; if (addr == h->m) addr = 0; } return nmv; } /* NOTE: we are returning a 1-based index ! */ static int get_hash_real(hash_t *h, double val, int nmv) { double *src = (double*) h->src; int addr; union dint_u val_u; /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */ if (val == 0.0) val = 0.0; if (R_IsNA(val)) val = NA_REAL; else if (R_IsNaN(val)) val = R_NaN; val_u.d = val; addr = HASH(val_u.u[0] + val_u.u[1]); while (h->ix[addr]) { if (src[h->ix[addr] - 1] == val) return h->ix[addr]; addr++; if (addr == h->m) addr = 0; } return nmv; } /* NOTE: we are returning a 1-based index ! */ static int get_hash_ptr(hash_t *h, void *val_ptr, int nmv) { void **src = (void **) h->src; intptr_t val = (intptr_t) val_ptr; int addr; #if (defined _LP64) || (defined __LP64__) || (defined WIN64) addr = HASH((val & 0xffffffff) ^ (val >> 32)); #else addr = HASH(val); #endif while (h->ix[addr]) { if ((intptr_t) src[h->ix[addr] - 1] == val) return h->ix[addr]; addr ++; if (addr == h->m) addr = 0; } return nmv; } static SEXP asCharacter(SEXP s, SEXP env) { SEXP call, r; PROTECT(call = lang2(install("as.character"), s)); PROTECT(r = eval(call, env)); UNPROTECT(2); return r; } /* the only externally visible function to be called from R */ SEXP fmatch(SEXP x, SEXP y, SEXP nonmatch, SEXP incomp) { SEXP a; SEXPTYPE type; hash_t *h = 0; int nmv = asInteger(nonmatch), n = LENGTH(x), np = 0, y_to_char = 0, y_factor = 0; /* edge-cases of 0 length */ if (n == 0) return allocVector(INTSXP, 0); if (LENGTH(y) == 0) { /* empty table -> vector full of nmv */ int *ai; a = allocVector(INTSXP, n); ai = INTEGER(a); for (np = 0; np < n; np++) ai[np] = nmv; return a; } /* if incomparables are used we fall back straight to match() */ if (incomp != R_NilValue && !(isLogical(incomp) && LENGTH(incomp) == 1 && LOGICAL(incomp)[0] == 0)) { Rf_warning("incomparables used in fmatch(), falling back to match()"); return match5(y, x, nmv, incomp, R_BaseEnv); } /* implicitly convert factors/POSIXlt to character */ if (OBJECT(x)) { if (inherits(x, "factor")) { x = PROTECT(asCharacterFactor(x)); np++; } else if (inherits(x, "POSIXlt")) { x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */ np++; } } /* for y we may need to do that later */ y_factor = OBJECT(y) && inherits(y, "factor"); y_to_char = y_factor || (OBJECT(y) && inherits(y, "POSIXlt")); /* coerce to common type - in the order of SEXP types */ if(TYPEOF(x) >= STRSXP || TYPEOF(y) >= STRSXP) type = STRSXP; else type = (TYPEOF(x) < TYPEOF(y)) ? TYPEOF(y) : TYPEOF(x); /* we only support INT/REAL/STR */ if (type != INTSXP && type != REALSXP && type != STRSXP) { Rf_warning("incompatible type, fastmatch() is falling back to match()"); return match5(y, x, nmv, NULL, R_BaseEnv); } if (y_to_char && type != STRSXP) /* y = factor -> character -> type must be STRSXP */ type = STRSXP; /* coerce x - not y yet because we may get away with the existing cache */ if (TYPEOF(x) != type) { x = PROTECT(coerceVector(x, type)); np++; } /* find existing cache(s) */ if (!hs) hs = Rf_install(".match.hash"); a = Rf_getAttrib(y, hs); if (a != R_NilValue) { /* if there is a cache, try to find the matching type */ h = (hash_t*) EXTPTR_PTR(a); /* could the object be out of sync ? If so, better remove the hash and ignore it */ if (h->parent != y) { #if HASH_VERBOSE Rprintf(" - DISCARDING hash, its parent and the bearer don't match, taking no chances.\n"); #endif h = 0; Rf_setAttrib(y, hs, R_NilValue); } while (h && h->type != type) h = h->next; } /* if there is no cache or not of the needed coerced type, create one */ if (a == R_NilValue || !h) { h = new_hash(DATAPTR(y), LENGTH(y)); h->type = type; h->parent = y; #if HASH_VERBOSE Rprintf(" - creating new hash for type %d\n", type); #endif if (a == R_NilValue) { /* if there is no cache attribute, create one */ a = R_MakeExternalPtr(h, R_NilValue, R_NilValue); Rf_setAttrib(y, hs, a); Rf_setAttrib(a, R_ClassSymbol, Rf_mkString("match.hash")); R_RegisterCFinalizer(a, hash_fin); } else { /* otherwise append the new cache */ hash_t *lh = (hash_t*) EXTPTR_PTR(a); while (lh->next) lh = lh->next; lh->next = h; #if HASH_VERBOSE Rprintf(" (appended to the cache list)\n"); #endif } if (TYPEOF(y) != type) { #if HASH_VERBOSE if (y_to_char) Rprintf(" (need to convert table factor/POSIXlt to strings\n"); else Rprintf(" (need to coerce table to %d)\n", type); #endif y = y_to_char ? (y_factor ? asCharacterFactor(y) : asCharacter(y, R_GlobalEnv)) : coerceVector(y, type); h->src = DATAPTR(y); /* this is ugly, but we need to adjust the source since we changed it */ h->prot = y; /* since the coerced object is temporary, we let the hash table handle its life span */ R_PreserveObject(y); } /* make sure y doesn't go away while we create the hash */ /* R_PreserveObject(y); */ /* spawn a thread to create the hash */ /* nope - so far we do it serially */ { /* create the hash table */ int i, n = LENGTH(y); if (type == INTSXP) for(i = 0; i < n; i++) add_hash_int(h, i); else if (type == REALSXP) for(i = 0; i < n; i++) add_hash_real(h, i); else for(i = 0; i < n; i++) add_hash_ptr(h, i); } } { /* query the hash table */ int i, n = LENGTH(x); SEXP r = allocVector(INTSXP, n); int *v = INTEGER(r); if (type == INTSXP) { int *k = INTEGER(x); for (i = 0; i < n; i++) v[i] = get_hash_int(h, k[i], nmv); } else if (type == REALSXP) { double *k = REAL(x); for (i = 0; i < n; i++) v[i] = get_hash_real(h, k[i], nmv); } else { SEXP *k = (SEXP*) DATAPTR(x); for (i = 0; i < n; i++) v[i] = get_hash_ptr(h, k[i], nmv); } if (np) UNPROTECT(np); return r; } }