fastmatch/0000755000175100001440000000000014076471110012243 5ustar hornikusersfastmatch/NAMESPACE0000644000175100001440000000033314076406314013465 0ustar hornikusersuseDynLib(fastmatch, C_fmatch = fmatch, C_ctapply = ctapply, C_coalesce = coalesce, C_append = append, mk_hash, get_table, get_values) export(fmatch, fmatch.hash, ctapply, coalesce, "%fin%") S3method(print, match.hash) fastmatch/man/0000755000175100001440000000000014076406314013022 5ustar hornikusersfastmatch/man/ctapply.Rd0000644000175100001440000000367714076406314015002 0ustar hornikusers\name{ctapply} \alias{ctapply} \title{ Fast tapply() replacement functions } \description{ \code{ctapply} is a fast replacement of \code{tapply} that assumes contiguous input, i.e. unique values in the index are never speparated by any other values. This avoids an expensive \code{split} step since both value and the index chungs can be created on the fly. It also cuts a few corners to allow very efficient copying of values. This makes it many orders of magnitude faster than the classical \code{lapply(split(), ...)} implementation. } \usage{ ctapply(X, INDEX, FUN, ..., MERGE=c) } \arguments{ \item{X}{an atomic object, typically a vector} \item{INDEX}{numeric or character vector of the same length as \code{X}} \item{FUN}{the function to be applied} \item{...}{additional arguments to \code{FUN}. They are passed as-is, i.e., without replication or recycling} \item{MERGE}{function to merge the resulting vector or \code{NULL} if the arguments to such a functiona re to be returned instead} } \details{ Note that \code{ctapply} supports either integer, real or character vectors as indices (note that factors are integer vectors and thus supported, but you do not need to convert character vectors). Unlike \code{tapply} it does not take a list of factors - if you want to use a cross-product of factors, create the product first, e.g. using \code{paste(i1, i2, i3, sep='\01')} or multiplication - whetever method is convenient for the input types. \code{ctapply} requires the \code{INDEX} to contiguous. One (slow) way to achieve that is to use \code{\link{sort}} or \code{\link{order}}. } %\value{ %} %\references{ %} \author{ Simon Urbanek } %% ~Make other sections like Warning with \section{Warning }{....} ~ \seealso{ \code{\link{tapply}} } \examples{ i = rnorm(4e6) names(i) = as.integer(rnorm(1e6)) i = i[order(names(i))] system.time(tapply(i, names(i), sum)) system.time(ctapply(i, names(i), sum)) } \keyword{manip} fastmatch/man/coalesce.Rd0000644000175100001440000000426414076406314015075 0ustar hornikusers\name{coalesce} \alias{coalesce} \title{ Create an index that groups unique values together } \description{ \code{coalesce} makes sure that a given index vector is coalesced, i.e., identical values are grouped into contiguous blocks. This can be used as a much faster alternative to \code{\link{sort.list}} where the goal is to group identical values, but not necessarily in a pre-defined order. The algorithm is linear in the length of the vector. } \usage{ coalesce(x) } \arguments{ \item{x}{character, integer or real vector to coalesce} } \details{ The current implementation takes two passes through the vector. In the first pass it creates a hash table for the values of \code{x} counting the occurrences in the process. In the second pass it assigns indices for every element based on the index stored in the hash table. The order of the groups of unique values is defined by the first occurence of each unique value, hence it is identical to the order of \code{\link{unique}}. One common use of \code{coalesce} is to allow the use of arbitrary vectors in \code{\link{ctapply}} via \code{ctapply(x[coalesce(x)], ...)}. } \value{ Integer vector with the resulting permutation. \code{x[coalesce(x)]} gives \code{x} with contiguous unique values. } %\references{ %} \author{ Simon Urbanek } %% ~Make other sections like Warning with \section{Warning }{....} ~ \seealso{ \code{\link{unique}}, \code{\link{sort.list}}, \code{\link{ctapply}} } \examples{ i = rnorm(2e6) names(i) = as.integer(rnorm(2e6)) ## compare sorting and coalesce system.time(o <- i[order(names(i))]) system.time(o <- i[coalesce(names(i))]) ## more fair comparison taking the coalesce time (and copy) into account system.time(tapply(i, names(i), sum)) system.time({ o <- i[coalesce(names(i))]; ctapply(o, names(o), sum) }) ## in fact, using ctapply() on a dummy vector is faster than table() ... ## believe it or not ... (that that is actually wasteful, since coalesce ## already computed the table internally anyway ...) ftable <- function(x) { t <- ctapply(rep(0L, length(x)), x[coalesce(x)], length) t[sort.list(names(t))] } system.time(table(names(i))) system.time(ftable(names(i))) } \keyword{manip} fastmatch/man/fmatch.Rd0000644000175100001440000001234614076406314014561 0ustar hornikusers\name{fmatch} \alias{fmatch} \alias{\%fin\%} \alias{fmatch.hash} \alias{fastmatch} \title{ Fast match() replacement } \description{ \code{fmatch} is a faster version of the built-in \code{\link{match}}() function. It is slightly faster than the built-in version because it uses more specialized code, but in addition it retains the hash table within the table object such that it can be re-used, dramatically reducing the look-up time especially for large tables. Although \code{fmatch} can be used separately, in general it is also safe to use: \code{match <- fmatch} since it is a drop-in replacement. Any cases not directly handled by \code{fmatch} are passed to \code{match} with a warning. \code{fmatch.hash} is identical to \code{fmatch} but it returns the table object with the hash table attached instead of the result, so it can be used to create a table object in cases where direct modification is not possible. \code{\%fin\%} is a version of the built-in \code{\link{\%in\%}} function that uses \code{fmatch} instead of \code{\link{match}}(). } \usage{ fmatch(x, table, nomatch = NA_integer_, incomparables = NULL) fmatch.hash(x, table, nomatch = NA_integer_, incomparables = NULL) x \%fin\% table } \arguments{ \item{x}{values to be matched} \item{table}{values to be matched against} \item{nomatch}{the value to be returned in the case when no match is found. It is coerced to \code{integer}.} \item{incomparables}{a vector of values that cannot be matched. Any value other than \code{NULL} will result in a fall-back to \code{match} without any speed gains.} } \details{ See \code{\link{match}} for the purpose and details of the \code{match} function. \code{fmatch} is a drop-in replacement for the \code{match} function with the focus on performance. \code{incomparables} are not supported by \code{fmatch} and will be passed down to \code{match}. The first match against a table results in a hash table to be computed from the table. This table is then attached as the \code{".match.hash"} attribute of the table so that it can be re-used on subsequent calls to \code{fmatch} with the same table. The hashing algorithm used is the same as the \code{match} function in R, but it is re-implemented in a slightly different way to improve its performance at the cost of supporting only a subset of types (integer, real and character). For any other types \code{fmatch} falls back to \code{match} (with a warning). } \value{ \code{fmatch}: A vector of the same length as \code{x} - see \code{\link{match}} for details. \code{fmatch.hash}: \code{table}, possibly coerced to match the type of \code{x}, with the hash table attached. \code{\%fin\%}: A logical vector the same length as \code{x} - see \code{\link{\%in\%}} for details. } %\references{ %} \author{ Simon Urbanek } \note{ \code{fmatch} modifies the \code{table} by attaching an attribute to it. It is expected that the values will not change unless that attribute is dropped. Under normal circumstances this should not have any effect from user's point of view, but there is a theoretical chance of the cache being out of sync with the table in case the table is modified directly (e.g. by some C code) without removing attributes. In cases where the \code{table} object cannot be modified (or such modification would not survive) \code{fmatch.hash} can be used to build the hash table and return \code{table} object including the hash table. In that case no lookup is done and \code{x} is only used to determine the type into which \code{table} needs to be coerced. Also \code{fmatch} does not convert to a common encoding so strings with different representation in two encodings don't match. } %% ~Make other sections like Warning with \section{Warning }{....} ~ \seealso{ \code{\link{match}} } \examples{ # some random speed comparison examples: # first use integer matching x = as.integer(rnorm(1e6) * 1000000) s = 1:100 # the first call to fmatch is comparable to match system.time(fmatch(s,x)) # but the subsequent calls take no time! system.time(fmatch(s,x)) system.time(fmatch(-50:50,x)) system.time(fmatch(-5000:5000,x)) # here is the speed of match for comparison system.time(base::match(s, x)) # the results should be identical identical(base::match(s, x), fmatch(s, x)) # next, match a factor against the table # this will require both x and the factor # to be cast to strings s = factor(c("1","1","2","foo","3",NA)) # because the casting will have to allocate a string # cache in R, we run a dummy conversion to take # that out of the equation dummy = as.character(x) # now we can run the speed tests system.time(fmatch(s, x)) system.time(fmatch(s, x)) # the cache is still valid for string matches as well system.time(fmatch(c("foo","bar","1","2"),x)) # now back to match system.time(base::match(s, x)) identical(base::match(s, x), fmatch(s, x)) # finally, some reals to match y = rnorm(1e6) s = c(y[sample(length(y), 100)], 123.567, NA, NaN) system.time(fmatch(s, y)) system.time(fmatch(s, y)) system.time(fmatch(s, y)) system.time(base::match(s, y)) identical(base::match(s, y), fmatch(s, y)) # this used to fail before 0.1-2 since nomatch was ignored identical(base::match(4L, 1:3, nomatch=0), fmatch(4L, 1:3, nomatch=0)) } \keyword{manip} \keyword{logic} fastmatch/DESCRIPTION0000644000175100001440000000120614076471110013750 0ustar hornikusersPackage: fastmatch Version: 1.1-3 Title: Fast 'match()' Function Author: Simon Urbanek Maintainer: Simon Urbanek Description: Package providing a fast match() replacement for cases that require repeated look-ups. It is slightly faster that R's built-in match() function on first match against a table, but extremely fast on any subsequent lookup as it keeps the hash table in memory. License: GPL-2 Depends: R (>= 2.3.0) URL: http://www.rforge.net/fastmatch NeedsCompilation: yes Packaged: 2021-07-23 00:26:57 UTC; svnuser Repository: CRAN Date/Publication: 2021-07-23 07:39:52 UTC fastmatch/src/0000755000175100001440000000000014076406314013036 5ustar hornikusersfastmatch/src/fastmatch.c0000644000175100001440000003642014076406314015161 0ustar hornikusers/* * fastmatch: fast implementation of match() in R using semi-permanent hash tables * * Copyright (C) 2010, 2011 Simon Urbanek * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include "common.h" /* for malloc/free since we handle our hash table memory separately from R */ #include #include /* for hashing for pointers we need intptr_t */ #include /* match5 to fall-back to R's internal match for types we don't support */ SEXP match5(SEXP itable, SEXP ix, int nmatch, SEXP incomp, SEXP env); /* ".match.hash" symbol - cached on first use */ SEXP hs; typedef struct hash { hash_index_t m, els; /* hash size, added elements (unused!) */ int k; /* used bits */ SEXPTYPE type; /* payload type */ void *src; /* the data array of the hashed object */ SEXP prot; /* object to protect along whith this hash */ SEXP parent; /* hashed object */ struct hash *next; /* next hash table - typically for another type */ hash_index_t ix[1]; /* actual table of indices */ } hash_t; /* create a new hash table with the given source and length. we store only the index - values are picked from the source so you must make sure the source is still alive when used */ static hash_t *new_hash(void *src, hash_index_t len) { hash_t *h; int k = 1; hash_index_t m = 2, desired = len * 2; /* we want a maximal load of 50% */ while (m < desired) { m *= 2; k++; } h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(hash_index_t) * m)); if (!h) Rf_error("unable to allocate %.2fMb for a hash table", (double) sizeof(hash_index_t) * (double) m / (1024.0 * 1024.0)); h->m = m; h->k = k; h->src = src; return h; } /* free the hash table (and all chained hash tables as well) */ static void free_hash(hash_t *h) { if (h->next) free_hash(h->next); if (h->prot) R_ReleaseObject(h->prot); free(h); } /* R finalized for the hash table object */ static void hash_fin(SEXP ho) { hash_t *h = (hash_t*) EXTPTR_PTR(ho); if (h) free_hash(h); } /* pi-hash fn */ #define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k)) /* add the integer value at index i (0-based!) to the hash */ static hash_value_t add_hash_int(hash_t *h, hash_index_t i) { int *src = (int*) h->src; int val = src[i++]; hash_value_t addr = HASH(val); #ifdef PROFILE_HASH hash_value_t oa = addr; #endif while (h->ix[addr] && src[h->ix[addr] - 1] != val) { addr++; if (addr == h->m) addr = 0; } #ifdef PROFILE_HASH if (addr != oa) Rprintf("%d: dist=%d (addr=%d, oa=%d)\n", val, (int) (addr - oa), (int) addr, (int) oa); #endif if (!h->ix[addr]) h->ix[addr] = i; return addr; } /* to avoid aliasing rules issues use a union */ union dint_u { double d; unsigned int u[2]; }; /* add the double value at index i (0-based!) to the hash */ static hash_value_t add_hash_real(hash_t *h, hash_index_t i) { double *src = (double*) h->src; union dint_u val; hash_value_t addr; /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */ val.d = (src[i] == 0.0) ? 0.0 : src[i]; if (R_IsNA(val.d)) val.d = NA_REAL; else if (R_IsNaN(val.d)) val.d = R_NaN; addr = HASH(val.u[0]+ val.u[1]); #ifdef PROFILE_HASH hash_value_t oa = addr; #endif while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) { addr++; if (addr == h->m) addr = 0; } #ifdef PROFILE_HASH if (addr != oa) Rprintf("%g: dist=%d (addr=%d, oa=%d)\n", val.d, (int) (addr - oa), (int)addr, (int)oa); #endif if (!h->ix[addr]) h->ix[addr] = i + 1; return addr; } /* add the pointer value at index i (0-based!) to the hash */ static int add_hash_ptr(hash_t *h, hash_index_t i) { hash_value_t addr; void **src = (void**) h->src; intptr_t val = (intptr_t) src[i++]; #if (defined _LP64) || (defined __LP64__) || (defined WIN64) addr = HASH((val & 0xffffffff) ^ (val >> 32)); #else addr = HASH(val); #endif #ifdef PROFILE_HASH hash_value_t oa = addr; #endif while (h->ix[addr] && (intptr_t) src[h->ix[addr] - 1] != val) { addr++; if (addr == h->m) addr = 0; } #ifdef PROFILE_HASH if (addr != oa) Rprintf("%p: dist=%d (addr=%d, oa=%d)\n", val, (int)(addr - oa), (int)addr, (int)oa); #endif if (!h->ix[addr]) h->ix[addr] = i; return addr; } /* NOTE: we are returning a 1-based index ! */ static hash_index_t get_hash_int(hash_t *h, int val, int nmv) { int *src = (int*) h->src; hash_value_t addr = HASH(val); while (h->ix[addr]) { if (src[h->ix[addr] - 1] == val) return h->ix[addr]; addr++; if (addr == h->m) addr = 0; } return nmv; } /* NOTE: we are returning a 1-based index ! */ static hash_index_t get_hash_real(hash_t *h, double val, int nmv) { double *src = (double*) h->src; hash_value_t addr; union dint_u val_u; /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */ if (val == 0.0) val = 0.0; if (R_IsNA(val)) val = NA_REAL; else if (R_IsNaN(val)) val = R_NaN; val_u.d = val; addr = HASH(val_u.u[0] + val_u.u[1]); while (h->ix[addr]) { if (!memcmp(&src[h->ix[addr] - 1], &val, sizeof(val))) return h->ix[addr]; addr++; if (addr == h->m) addr = 0; } return nmv; } /* NOTE: we are returning a 1-based index ! */ static hash_index_t get_hash_ptr(hash_t *h, void *val_ptr, int nmv) { void **src = (void **) h->src; intptr_t val = (intptr_t) val_ptr; hash_value_t addr; #if (defined _LP64) || (defined __LP64__) || (defined WIN64) addr = HASH((val & 0xffffffff) ^ (val >> 32)); #else addr = HASH(val); #endif while (h->ix[addr]) { if ((intptr_t) src[h->ix[addr] - 1] == val) return h->ix[addr]; addr ++; if (addr == h->m) addr = 0; } return nmv; } static SEXP asCharacter(SEXP s, SEXP env) { SEXP call, r; PROTECT(call = lang2(install("as.character"), s)); r = eval(call, env); UNPROTECT(1); return r; } static double NA_int2real(hash_index_t res) { return (res == NA_INTEGER) ? R_NaReal : ((double) res); } /* the only externally visible function to be called from R */ SEXP fmatch(SEXP x, SEXP y, SEXP nonmatch, SEXP incomp, SEXP hashOnly) { SEXP a; SEXPTYPE type; hash_t *h = 0; int nmv = asInteger(nonmatch), np = 0, y_to_char = 0, y_factor = 0, hash_only = asInteger(hashOnly); hash_index_t n = (x == R_NilValue) ? 0 : XLENGTH(x); /* edge-cases of 0 length */ if (n == 0) return allocVector(INTSXP, 0); if (y == R_NilValue || XLENGTH(y) == 0) { /* empty table -> vector full of nmv */ int *ai; hash_index_t ii; a = allocVector(INTSXP, n); ai = INTEGER(a); for (ii = 0; ii < n; ii++) ai[ii] = nmv; return a; } /* if incomparables are used we fall back straight to match() */ if (incomp != R_NilValue && !(isLogical(incomp) && LENGTH(incomp) == 1 && LOGICAL(incomp)[0] == 0)) { Rf_warning("incomparables used in fmatch(), falling back to match()"); return match5(y, x, nmv, incomp, R_BaseEnv); } /* implicitly convert factors/POSIXlt to character */ if (OBJECT(x)) { if (inherits(x, "factor")) { x = PROTECT(asCharacterFactor(x)); np++; } else if (inherits(x, "POSIXlt")) { x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */ np++; } } /* for y we may need to do that later */ y_factor = OBJECT(y) && inherits(y, "factor"); y_to_char = y_factor || (OBJECT(y) && inherits(y, "POSIXlt")); /* coerce to common type - in the order of SEXP types */ if(TYPEOF(x) >= STRSXP || TYPEOF(y) >= STRSXP) type = STRSXP; else type = (TYPEOF(x) < TYPEOF(y)) ? TYPEOF(y) : TYPEOF(x); /* we only support INT/REAL/STR */ if (type != INTSXP && type != REALSXP && type != STRSXP) { Rf_warning("incompatible type, fastmatch() is falling back to match()"); if (np) UNPROTECT(np); return match5(y, x, nmv, NULL, R_BaseEnv); } if (y_to_char && type != STRSXP) /* y = factor -> character -> type must be STRSXP */ type = STRSXP; /* coerce x - not y yet because we may get away with the existing cache */ if (TYPEOF(x) != type) { x = PROTECT(coerceVector(x, type)); np++; } /* find existing cache(s) */ if (!hs) hs = Rf_install(".match.hash"); a = Rf_getAttrib(y, hs); if (a != R_NilValue) { /* if there is a cache, try to find the matching type */ h = (hash_t*) EXTPTR_PTR(a); /* could the object be out of sync ? If so, better remove the hash and ignore it */ if (!h || h->parent != y) { #if HASH_VERBOSE Rprintf(" - DISCARDING hash, its parent and the bearer don't match, taking no chances.\n"); #endif h = 0; Rf_setAttrib(y, hs, R_NilValue); } while (h && h->type != type) h = h->next; } #ifdef CHECKHASH hash_t *orig_h = h; h = 0; /* pretend that there is no hash */ a = R_NilValue; #endif /* if there is no cache or not of the needed coerced type, create one */ if (a == R_NilValue || !h) { h = new_hash(DATAPTR(y), XLENGTH(y)); h->type = type; h->parent = y; #if HASH_VERBOSE Rprintf(" - creating new hash for type %d\n", type); #endif if (a == R_NilValue || !EXTPTR_PTR(a)) { /* if there is no cache attribute, create one */ a = R_MakeExternalPtr(h, R_NilValue, R_NilValue); Rf_setAttrib(y, hs, a); Rf_setAttrib(a, R_ClassSymbol, Rf_mkString("match.hash")); R_RegisterCFinalizer(a, hash_fin); } else { /* otherwise append the new cache */ hash_t *lh = (hash_t*) EXTPTR_PTR(a); while (lh->next) lh = lh->next; lh->next = h; #if HASH_VERBOSE Rprintf(" (appended to the cache list)\n"); #endif } if (TYPEOF(y) != type) { #if HASH_VERBOSE if (y_to_char) Rprintf(" (need to convert table factor/POSIXlt to strings\n"); else Rprintf(" (need to coerce table to %d)\n", type); #endif y = y_to_char ? (y_factor ? asCharacterFactor(y) : asCharacter(y, R_GlobalEnv)) : coerceVector(y, type); R_PreserveObject(y); h->src = DATAPTR(y); /* this is ugly, but we need to adjust the source since we changed it */ h->prot = y; /* since the coerced object is temporary, we let the hash table handle its life span */ } /* make sure y doesn't go away while we create the hash */ /* R_PreserveObject(y); */ /* spawn a thread to create the hash */ /* nope - so far we do it serially */ { /* create the hash table */ hash_index_t i, n = XLENGTH(y); if (type == INTSXP) for(i = 0; i < n; i++) add_hash_int(h, i); else if (type == REALSXP) for(i = 0; i < n; i++) add_hash_real(h, i); else for(i = 0; i < n; i++) add_hash_ptr(h, i); } #ifdef CHECKHASH if (orig_h) { if (orig_h->type != type) /* this should never happen since we check the type */ Rf_error("Hash type mistmatch on object %p (has %d, expected %d)", y, type, orig_h->type); if (orig_h->m != h->m) Rf_error("Object %p modified, cached hash table has size %ld, but re-hashing has %ld", y, (long)orig_h->m, (long)h->m); if (memcmp(orig_h->ix, h->ix, sizeof(hash_index_t) * h->m)) { hash_index_t i = 0, n = h->m, No = 0, Nn = 0; while (i < n) { if (orig_h->ix[i]) No++; if (h->ix[i]) Nn++; i++; } if (No != Nn) Rf_error("Object %p resized (from %ld to %ld) after the hash table has been created", y, (long)No, (long)Nn); Rf_error("Object %p modified after the hash table has been created (size %ld remained constant)", y, (long)No); } } #endif } if (hash_only) { if (np) UNPROTECT(np); return y; } { /* query the hash table */ SEXP r; #ifdef LONG_VECTOR_SUPPORT if (IS_LONG_VEC(x)) { hash_index_t i, n = XLENGTH(x); double *v = REAL(r = allocVector(REALSXP, n)); if (nmv == NA_INTEGER) { /* we have to treat nmv = NA differently, because is has to be transformed into NA_REAL in the result. To avoid checking when nmv is different, we have two paths */ if (type == INTSXP) { int *k = INTEGER(x); for (i = 0; i < n; i++) v[i] = NA_int2real(get_hash_int(h, k[i], NA_INTEGER)); } else if (type == REALSXP) { double *k = REAL(x); for (i = 0; i < n; i++) v[i] = NA_int2real(get_hash_real(h, k[i], NA_INTEGER)); } else { SEXP *k = (SEXP*) DATAPTR(x); for (i = 0; i < n; i++) v[i] = NA_int2real(get_hash_ptr(h, k[i], NA_INTEGER)); } } else { /* no need to transcode nmv */ if (type == INTSXP) { int *k = INTEGER(x); for (i = 0; i < n; i++) v[i] = (double) get_hash_int(h, k[i], nmv); } else if (type == REALSXP) { double *k = REAL(x); for (i = 0; i < n; i++) v[i] = (double) get_hash_real(h, k[i], nmv); } else { SEXP *k = (SEXP*) DATAPTR(x); for (i = 0; i < n; i++) v[i] = (double) get_hash_ptr(h, k[i], nmv); } } } else #endif { /* short vector - everything is int */ int i, n = LENGTH(x); int *v = INTEGER(r = allocVector(INTSXP, n)); if (type == INTSXP) { int *k = INTEGER(x); for (i = 0; i < n; i++) v[i] = get_hash_int(h, k[i], nmv); } else if (type == REALSXP) { double *k = REAL(x); for (i = 0; i < n; i++) v[i] = get_hash_real(h, k[i], nmv); } else { SEXP *k = (SEXP*) DATAPTR(x); for (i = 0; i < n; i++) v[i] = get_hash_ptr(h, k[i], nmv); } } if (np) UNPROTECT(np); return r; } } /* FIXME: should we also attach the hash? */ SEXP coalesce(SEXP x) { SEXPTYPE type = TYPEOF(x); SEXP res; hash_index_t i, n = XLENGTH(x), dst = 0; hash_t *h; hash_index_t *count; res = PROTECT(allocVector(INTSXP, XLENGTH(x))); h = new_hash(DATAPTR(x), XLENGTH(x)); h->type = type; h->parent = x; if (!(count = calloc(h->m, sizeof(*count)))) { free_hash(h); Rf_error("Unable to allocate memory for counts"); } /* count the size of each category - we're using negative numbers since we will re-purpose the array later to hold the pointer to the index of the next entry to stroe which will be positive */ if (type == INTSXP) for(i = 0; i < n; i++) count[add_hash_int(h, i)]--; else if (type == REALSXP) for(i = 0; i < n; i++) count[add_hash_real(h, i)]--; else for(i = 0; i < n; i++) count[add_hash_ptr(h, i)]--; if (type == INTSXP) for(i = 0; i < n; i++) { hash_value_t addr = add_hash_int(h, i); if (count[addr] < 0) { /* this cat has not been used yet, reserve the index space for it*/ hash_index_t ni = -count[addr]; count[addr] = dst; dst += ni; } INTEGER(res)[count[addr]++] = i + 1; } else if (type == REALSXP) for(i = 0; i < n; i++) { hash_value_t addr = add_hash_real(h, i); if (count[addr] < 0) { hash_index_t ni = -count[addr]; count[addr] = dst; dst += ni; } INTEGER(res)[count[addr]++] = i + 1; } else for(i = 0; i < n; i++) { hash_value_t addr = add_hash_ptr(h, i); if (count[addr] < 0) { hash_index_t ni = -count[addr]; count[addr] = dst; dst += ni; } INTEGER(res)[count[addr]++] = i + 1; } free(count); free_hash(h); UNPROTECT(1); return res; } fastmatch/src/fasthash.c0000644000175100001440000003262414076406314015012 0ustar hornikusers/* * fasthash: hash table * This is very similar to fastmatch except that the payload * is stored in the hash table as well and thus can be used to * append values * * Copyright (C) 2013 Simon Urbanek * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. */ #include "common.h" /* for malloc/free since we handle our hash table memory separately from R */ #include /* for hashing for pointers we need intptr_t */ #include /* for memcpy */ #include typedef struct hash { hash_index_t m, els; /* hash size, added elements */ hash_index_t max_load; /* max. load - resize when reached */ int k, type; /* used bits, payload type */ void *src; /* the data array of the hashed object */ SEXP prot; /* object to protect along whith this hash */ SEXP parent; /* hashed object */ SEXP vals; /* values vector if used as key/value storage */ struct hash *next; hash_index_t ix[1]; } hash_t; #define MAX_LOAD 0.85 /* create a new hash table with the given type and length. Implicitly calls allocVector(type, len) to create the storage of the newly added hash values NOTE: len is the *hash* size, so it should be 2 * length(unique(x)) It will be rounded up to the next power of 2 */ static hash_t *new_hash(SEXPTYPE type, hash_index_t len) { hash_t *h; int k = 8; /* force a minimal size of 256 */ hash_index_t m = 1 << k; hash_index_t max_load; SEXP keys; while (m < len) { m *= 2; k++; } max_load = (hash_index_t) (((double) m) * MAX_LOAD); keys = allocVector(type, max_load); h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(hash_index_t) * m)); if (!h) Rf_error("unable to allocate %.2fMb for a hash table", (double) sizeof(hash_index_t) * (double) m / (1024.0 * 1024.0)); h->parent = keys; h->max_load = max_load; R_PreserveObject(h->parent); h->m = m; h->k = k; h->src = DATAPTR(h->parent); h->type = type; return h; } /* free the hash table (and all chained hash tables as well) */ static void free_hash(hash_t *h) { if (h->next) free_hash(h->next); if (h->prot) R_ReleaseObject(h->prot); R_ReleaseObject(h->parent); free(h); } /* R finalized for the hash table object */ static void hash_fin(SEXP ho) { hash_t *h = (hash_t*) EXTPTR_PTR(ho); if (h) free_hash(h); } /* pi-hash fn */ #define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k)) static int INCEL(hash_t *h) { if (h->els == h->max_load) Rf_error("Maximal hash load reached, resizing is currently unimplemented"); return h->els++; } /* add an integer value to the hash */ static int add_hash_int(hash_t *h, int val) { int *src = (int*) h->src; hash_index_t addr = HASH(val); #ifdef PROFILE_HASH int oa = addr; #endif while (h->ix[addr] && src[h->ix[addr] - 1] != val) { addr++; if (addr == h->m) addr = 0; } #ifdef PROFILE_HASH if (addr != oa) printf("%d: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa); #endif if (!h->ix[addr]) { src[INCEL(h)] = val; h->ix[addr] = h->els; } return addr; } /* to avoid aliasing rules issues use a union */ union dint_u { double d; unsigned int u[2]; }; /* add the double value at index i (0-based!) to the hash */ static int add_hash_real(hash_t *h, double val_) { double *src = (double*) h->src; union dint_u val; int addr; /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */ val.d = (val_ == 0.0) ? 0.0 : val_; if (R_IsNA(val.d)) val.d = NA_REAL; else if (R_IsNaN(val.d)) val.d = R_NaN; addr = HASH(val.u[0] + val.u[1]); #ifdef PROFILE_HASH int oa = addr; #endif while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) { addr++; if (addr == h->m) addr = 0; } #ifdef PROFILE_HASH if (addr != oa) printf("%g: dist=%d (addr=%d, oa=%d)\n", val.d, addr - oa, addr, oa); #endif if (!h->ix[addr]) { src[INCEL(h)] = val.d; h->ix[addr] = h->els; } return addr; } /* add a R object to the hash */ static int add_hash_obj(hash_t *h, SEXP val) { int addr; SEXP *src = (SEXP*) h->src; intptr_t val_i = (intptr_t) val; #if (defined _LP64) || (defined __LP64__) || (defined WIN64) addr = HASH((val_i & 0xffffffff) ^ (val_i >> 32)); #else addr = HASH(val_i); #endif #ifdef PROFILE_HASH int oa = addr; #endif while (h->ix[addr] && src[h->ix[addr] - 1] != val) { addr++; if (addr == h->m) addr = 0; } #ifdef PROFILE_HASH if (addr != oa) printf("%p: dist=%d (addr=%d, oa=%d)\n", val, addr - oa, addr, oa); #endif if (!h->ix[addr]) { src[INCEL(h)] = val; h->ix[addr] = h->els; } return addr; } /* NOTE: we are returning a 1-based index ! */ static hash_index_t get_hash_int(hash_t *h, int val) { int *src = (int*) h->src; hash_index_t addr; addr = HASH(val); while (h->ix[addr]) { if (src[h->ix[addr] - 1] == val) return h->ix[addr]; addr++; if (addr == h->m) addr = 0; } return 0; } /* NOTE: we are returning a 1-based index ! */ static hash_index_t get_hash_real(hash_t *h, double val) { double *src = (double*) h->src; hash_index_t addr; union dint_u val_u; /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */ if (val == 0.0) val = 0.0; if (R_IsNA(val)) val = NA_REAL; else if (R_IsNaN(val)) val = R_NaN; val_u.d = val; addr = HASH(val_u.u[0] + val_u.u[1]); while (h->ix[addr]) { if (src[h->ix[addr] - 1] == val) return h->ix[addr]; addr++; if (addr == h->m) addr = 0; } return 0; } /* NOTE: we are returning a 1-based index ! */ static int get_hash_obj(hash_t *h, SEXP val_ptr) { SEXP *src = (SEXP *) h->src; intptr_t val = (intptr_t) val_ptr; hash_index_t addr; #if (defined _LP64) || (defined __LP64__) || (defined WIN64) addr = HASH((val & 0xffffffff) ^ (val >> 32)); #else addr = HASH(val); #endif while (h->ix[addr]) { if ((intptr_t) src[h->ix[addr] - 1] == val) return h->ix[addr]; addr++; if (addr == h->m) addr = 0; } return 0; } static SEXP asCharacter(SEXP s, SEXP env) { SEXP call, r; PROTECT(call = lang2(install("as.character"), s)); r = eval(call, env); UNPROTECT(1); return r; } /* there are really three modes: 1) if vals in non-NULL then h->vals are populated with the values from vals corresponding to x as the keys 2) if ix is non-NULL then ix is is populated with the indices into the hash table (1-based) 3) if both are NULL then only the hash table is built */ static void append_hash(hash_t *h, SEXP x, int *ix, SEXP vals) { SEXPTYPE type = TYPEOF(x); R_xlen_t i, n = XLENGTH(x); if (type == INTSXP) { int *iv = INTEGER(x); if (vals) for(i = 0; i < n; i++) SET_VECTOR_ELT(h->vals, h->ix[add_hash_int(h, iv[i])] - 1, VECTOR_ELT(vals, i)); else if (ix) for(i = 0; i < n; i++) ix[i] = h->ix[add_hash_int(h, iv[i])]; else for(i = 0; i < n; i++) add_hash_int(h, iv[i]); } else if (type == REALSXP) { double *dv = REAL(x); if (vals) for(i = 0; i < n; i++) SET_VECTOR_ELT(h->vals, h->ix[add_hash_real(h, dv[i])] - 1, VECTOR_ELT(vals, i)); else if (ix) for(i = 0; i < n; i++) ix[i] = h->ix[add_hash_real(h, dv[i])]; else for(i = 0; i < n; i++) add_hash_real(h, dv[i]); } else { SEXP *sv = (SEXP*) DATAPTR(x); if (vals) for(i = 0; i < n; i++) SET_VECTOR_ELT(h->vals, h->ix[add_hash_obj(h, sv[i])] - 1, VECTOR_ELT(vals, i)); else if (ix) for(i = 0; i < n; i++) ix[i] = h->ix[add_hash_obj(h, sv[i])]; else for(i = 0; i < n; i++) add_hash_obj(h, sv[i]); } } static hash_t *unwrap(SEXP ht) { hash_t *h; if (!inherits(ht, "fasthash")) Rf_error("Invalid hash object"); h = (hash_t*) EXTPTR_PTR(ht); if (!h) /* FIXME: we should just rebuild the hash ... */ Rf_error("Hash object is NULL - probably unserialized?"); return h; } static SEXP chk_vals(SEXP vals, SEXP keys) { /* FIXME: requiring vals to be a list is not very flexible, but the easiest to implement. Anything else complicates the append_hash() function enormously and would require a separate solution for each combination of key and value types */ if (vals == R_NilValue) vals = 0; else { if (TYPEOF(vals) != VECSXP) Rf_error("`values' must be a list"); if (XLENGTH(vals) != XLENGTH(keys)) Rf_error("keys and values vectors must have the same length"); } return vals; } static void setval(SEXP res, R_xlen_t i, hash_index_t ix, SEXP vals) { SET_VECTOR_ELT(res, i, (ix == 0) ? R_NilValue : VECTOR_ELT(vals, ix - 1)); } /*---- API visible form R ----*/ SEXP mk_hash(SEXP x, SEXP sGetIndex, SEXP sValueEst, SEXP vals) { SEXP a, six; SEXPTYPE type; hash_t *h = 0; int np = 0, get_index = asInteger(sGetIndex) == 1; int *ix = 0; hash_index_t val_est = 0; if (TYPEOF(sValueEst) == REALSXP) { double ve = REAL(sValueEst)[0]; if (ve < 0 || R_IsNaN(ve)) Rf_error("Invalid value count estimate, must be positive or NA"); if (R_IsNA(ve)) ve = 0.0; val_est = ve; } else { int ve = asInteger(sValueEst); if (ve == NA_INTEGER) ve = 0; if (ve < 0) Rf_error("Invalid value count estimate, must be positive or NA"); val_est = ve; } vals = chk_vals(vals, x); /* implicitly convert factors/POSIXlt to character */ if (OBJECT(x)) { if (inherits(x, "factor")) { x = PROTECT(asCharacterFactor(x)); np++; } else if (inherits(x, "POSIXlt")) { x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */ np++; } } type = TYPEOF(x); /* we only support INT/REAL/STR */ if (type != INTSXP && type != REALSXP && type != STRSXP && type != VECSXP) Rf_error("Currently supported types are integer, real, chracter vectors and lists"); if (get_index) { ix = INTEGER(six = PROTECT(allocVector(INTSXP, XLENGTH(x)))); np++; } /* FIXME: determine the proper hash size */ if (!val_est) val_est = XLENGTH(x); /* check for overflow */ if (val_est * 2 > val_est) val_est *= 2; h = new_hash(TYPEOF(x), val_est); a = PROTECT(R_MakeExternalPtr(h, R_NilValue, R_NilValue)); Rf_setAttrib(a, R_ClassSymbol, Rf_mkString("fasthash")); if (ix) Rf_setAttrib(a, install("index"), six); R_RegisterCFinalizer(a, hash_fin); np++; #if HASH_VERBOSE Rprintf(" - creating new hash for type %d\n", type); #endif append_hash(h, x, ix, vals); UNPROTECT(np); return a; } SEXP append(SEXP ht, SEXP x, SEXP sGetIndex, SEXP vals) { SEXP six; SEXPTYPE type; hash_t *h = 0; int np = 0; int *ix = 0; int get_index = (asInteger(sGetIndex) == 1); h = unwrap(ht); vals = chk_vals(vals, x); /* implicitly convert factors/POSIXlt to character */ if (OBJECT(x)) { if (inherits(x, "factor")) { x = PROTECT(asCharacterFactor(x)); np++; } else if (inherits(x, "POSIXlt")) { x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */ np++; } } type = TYPEOF(x); /* we only support INT/REAL/STR */ if (type != INTSXP && type != REALSXP && type != STRSXP && type != VECSXP) Rf_error("Currently supported types are integer, real, chracter vectors and lists"); if (get_index) { /* FIXME: long vec support? */ ix = INTEGER(six = PROTECT(allocVector(INTSXP, LENGTH(x)))); np++; } append_hash(h, x, ix, vals); if (np) UNPROTECT(np); return ix ? six : ht; } SEXP get_table(SEXP ht) { R_len_t n; R_xlen_t sz = sizeof(int); SEXP res; hash_t *h = unwrap(ht); n = h->els; res = allocVector(h->type, n); if (h->type == REALSXP) sz = sizeof(double); else if (h->type != INTSXP) sz = sizeof(SEXP); sz *= n; memcpy(DATAPTR(res), DATAPTR(h->parent), sz); return res; } SEXP get_values(SEXP ht, SEXP x) { SEXP res; SEXPTYPE type; hash_t *h = 0; int np = 0; h = unwrap(ht); if (!h->vals) Rf_error("This is not a key/value hash table"); /* implicitly convert factors/POSIXlt to character */ if (OBJECT(x)) { if (inherits(x, "factor")) { x = PROTECT(asCharacterFactor(x)); np++; } else if (inherits(x, "POSIXlt")) { x = PROTECT(asCharacter(x, R_GlobalEnv)); /* FIXME: match() uses env properly - should we switch to .External ? */ np++; } } type = TYPEOF(x); /* we only support INT/REAL/STR */ if (type != INTSXP && type != REALSXP && type != STRSXP && type != VECSXP) Rf_error("Currently supported types are integer, real, chracter vectors and lists"); { R_xlen_t i, n = XLENGTH(x); res = PROTECT(allocVector(VECSXP, n)); np++; if (type == INTSXP) { int *iv = INTEGER(x); for (i = 0; i < n; i++) setval(res, i, get_hash_int(h, iv[i]), h->vals); } else if (type == REALSXP) { double *rv = REAL(x); for (i = 0; i < n; i++) setval(res, i, get_hash_real(h, rv[i]), h->vals); } else { SEXP *rv = (SEXP*) DATAPTR(x); for (i = 0; i < n; i++) setval(res, i, get_hash_obj(h, rv[i]), h->vals); } } UNPROTECT(np); return res; } fastmatch/src/common.h0000644000175100001440000000137014076406314014500 0ustar hornikusers/* fastmatch - common types */ #ifndef FM_COMMON_H__ #define FM_COMMON_H__ /* for speed (should not really matter in this case as most time is spent in the hashing) */ #define USE_RINTERNALS 1 #include #ifndef R_SHORT_LEN_MAX /* for compatibility with old R */ #define XLENGTH(X) LENGTH(X) #define IS_LONG_VEC(X) 0 typedef R_len_t R_xlen_t; #endif /* hash_index_t is big enough to cover long vectors */ #ifdef LONG_VECTOR_SUPPORT typedef R_xlen_t hash_index_t; #else typedef int hash_index_t; #endif /* hashes are always 32-bit -- this is for compatibility with the hash function used in R. This means that long vectors are fine, but they may not have more than 2^32 - 1 unique values */ typedef unsigned int hash_value_t; #endif fastmatch/src/ctapply.c0000644000175100001440000000737114076406314014666 0ustar hornikusers#include #include #define USE_RINTERNALS 1 #include #define MIN_CACHE 128 SEXP ctapply_(SEXP args) { SEXP rho, vec, by, fun, mfun, cdi = 0, cdv = 0, tmp, acc, tail; int i = 0, n, cdlen; args = CDR(args); rho = CAR(args); args = CDR(args); vec = CAR(args); args = CDR(args); by = CAR(args); args = CDR(args); fun = CAR(args); args = CDR(args); mfun= CAR(args); args = CDR(args); tmp = PROTECT(allocVector(VECSXP, 3)); acc = 0; if (TYPEOF(by) != INTSXP && TYPEOF(by) != REALSXP && TYPEOF(by) != STRSXP) Rf_error("INDEX must be either integer, real or character vector"); if (TYPEOF(vec) != INTSXP && TYPEOF(vec) != REALSXP && TYPEOF(vec) != STRSXP && TYPEOF(vec) != VECSXP) Rf_error("X must be either integer, real, character or generic vector (list)"); if ((n = LENGTH(vec)) != LENGTH(by)) Rf_error("X and INDEX must have the same length"); while (i < n) { int i0 = i, N; SEXP eres; /* find the contiguous stretch */ while (++i < n) { if ((TYPEOF(by) == INTSXP && INTEGER(by)[i] != INTEGER(by)[i - 1]) || (TYPEOF(by) == STRSXP && STRING_ELT(by, i) != STRING_ELT(by, i - 1)) || (TYPEOF(by) == REALSXP && REAL(by)[i] != REAL(by)[i - 1])) break; } /* [i0, i - 1] is the interval to run on */ N = i - i0; /* allocate cache for both the vector and index */ if (!cdi) { /* we have to guarantee named > 0 since we'll be modifying it in-place */ SET_NAMED(cdi = SET_VECTOR_ELT(tmp, 0, allocVector(TYPEOF(by), (cdlen = ((N < MIN_CACHE) ? MIN_CACHE : N)))), 1); SET_NAMED(cdv = SET_VECTOR_ELT(tmp, 1, allocVector(TYPEOF(vec), cdlen)), 1); } else if (cdlen < N) { SET_NAMED(cdi = SET_VECTOR_ELT(tmp, 0, allocVector(TYPEOF(by), (cdlen = N))), 1); SET_NAMED(cdv = SET_VECTOR_ELT(tmp, 1, allocVector(TYPEOF(vec), cdlen)), 1); } SETLENGTH(cdi, N); SETLENGTH(cdv, N); /* copy the index slice */ if (TYPEOF(by) == INTSXP) memcpy(INTEGER(cdi), INTEGER(by) + i0, sizeof(int) * N); else if (TYPEOF(by) == REALSXP) memcpy(REAL(cdi), REAL(by) + i0, sizeof(double) * N); else if (TYPEOF(by) == STRSXP) memcpy(STRING_PTR(cdi), STRING_PTR(by) + i0, sizeof(SEXP) * N); /* copy the vector slice */ if (TYPEOF(vec) == INTSXP) memcpy(INTEGER(cdv), INTEGER(vec) + i0, sizeof(int) * N); else if (TYPEOF(vec) == REALSXP) memcpy(REAL(cdv), REAL(vec) + i0, sizeof(double) * N); else if (TYPEOF(vec) == STRSXP) memcpy(STRING_PTR(cdv), STRING_PTR(vec) + i0, sizeof(SEXP) * N); else if (TYPEOF(vec) == VECSXP) memcpy(VECTOR_PTR(cdv), VECTOR_PTR(vec) + i0, sizeof(SEXP) * N); eres = eval(PROTECT(LCONS(fun, CONS(cdv, args))), rho); UNPROTECT(1); /* eval arg */ /* if the result has NAMED > 1 then we have to duplicate it see ctapply(x, y, identity). It should be uncommon, though since most functions will return newly allocated objects FIXME: check NAMED == 1 -- may also be bad if the reference is outside, but then NAMED1 should be duplicated before modification so I think we're safe */ /* Rprintf("NAMED(eres)=%d\n", NAMED(eres)); */ if (NAMED(eres) > 1) eres = duplicate(eres); PROTECT(eres); if (!acc) tail = acc = SET_VECTOR_ELT(tmp, 2, list1(eres)); else tail = SETCDR(tail, list1(eres)); { char cbuf[64]; const char *name = ""; if (TYPEOF(by) == STRSXP) name = CHAR(STRING_ELT(by, i0)); else if (TYPEOF(by) == INTSXP) { snprintf(cbuf, sizeof(cbuf), "%d", INTEGER(by)[i0]); name = cbuf; } else { /* FIXME: this one is not consistent with R ... */ snprintf(cbuf, sizeof(cbuf), "%g", REAL(by)[i0]); name = cbuf; } SET_TAG(tail, install(name)); } UNPROTECT(1); /* eres */ } UNPROTECT(1); /* tmp */ if (!acc) return R_NilValue; acc = eval(PROTECT(LCONS(mfun, acc)), rho); UNPROTECT(1); return acc; } fastmatch/NEWS0000644000175100001440000000507114076406314012751 0ustar hornikusers NEWS for fastmatch -------------------- 1.1-3 2021-07-23 o don't call XLENGTH() on NULL objects o if compiled with -DCHECKHASH fastmatch will re-compute the hash table every time and compare it to the existing table to verify its consistency. This can be used to detect incorrect use of fastmatch, i.e., cases where the object is modified and the hash table is not removed. 1.1-2 2021-07-22 o minor change for compatibility with R-devel 1.1-1 2019-04-16 o fix protection bug in case when fmatch() falls back to R's match() because of unsupported types (thanks to Tomáš Kalibera) 1.1-0 2017-01-28 o add fmatch.hash() which will create a hash table that can be used later with fmatch(). This can be used in cases where attaching the hash to the table implicitly is not reliable. o added ctapply() - a fast version of tapply() o added coalesce() - fast way of grouping unique values into contiguous groups (in linear time). o added %fin% - a fast version of %in% o fastmatch now supports long vectors. Note that the hash function is the same as in R and thus it uses at most 32-bits, hence long vectors can be used, but they must have less than 2^32 (~4e9) unique values. o bugfix: matching reals against a table that contains NA or NaNs would not match the position of those but return NA instead. o bugfix: fix crash when a newly unserialized hash table is used (since the table hash is not stored during serialization). 1.0-4 2012-01-12 o some R functions (such as subset assignment like x[1] <- 2) can create a new object (with possibly modified content) and copy all attributes including the hash cache. If the original object was used as a table in fmatch(), the hash cache will be copied into the modified object and thus its cache will be possibly out of sync with the object. fmatch() will now identify such cases and discard the hash to prevent errorneous results. 1.0-3 2011-12-21 o match() coerces POSIXlt objects into characters, but so far fmatch() performed the match on the actual objects. Now fmatch() coerces POSIXlt object into characters just like match(), but note that you will lose the ability to perform fast lookups if the table is a POSIXlt object -- please use POSIXct objects (much more efficient) or use as.character() on the POSIXlt object to create a table that you want to re-use. 1.0-2 2011-09-14 o bugfix: nomatch was ignored in the fastmatch implementation (thanks to Enrico Schumann for reporting) 1.0-1 2010-12-23 o minor cleanups 1.0-0 2010-12-23 o initial release fastmatch/R/0000755000175100001440000000000014076406314012450 5ustar hornikusersfastmatch/R/fastmatch.R0000644000175100001440000000054114076406314014545 0ustar hornikusersfmatch <- function(x, table, nomatch = NA_integer_, incomparables = NULL) .Call(C_fmatch, x, table, nomatch, incomparables, FALSE) fmatch.hash <- function(x, table, nomatch = NA_integer_, incomparables = NULL) .Call(C_fmatch, x, table, nomatch, incomparables, TRUE) `%fin%` <- function (x, table) .Call(C_fmatch, x, table, 0L, NULL, FALSE) > 0L fastmatch/R/coalesce.R0000644000175100001440000000005514076406314014351 0ustar hornikuserscoalesce <- function(x) .Call(C_coalesce, x) fastmatch/R/hash.R0000644000175100001440000000047414076406314013523 0ustar hornikusersmk.hash <- function(x, size=256L, index=FALSE, values=NULL) .Call(mk_hash, x, index, size, values) levels.fasthash <- function(x) .Call(get_table, x) map.values <- function(hash, keys) .Call(get_values, hash, keys) append.hash <- function(hash, x, index=TRUE, values=NULL) .Call(C_append, hash, x, index, values) fastmatch/R/match.hash.R0000644000175100001440000000044214076406314014611 0ustar hornikusers# match.hash is an infomal (S3) class representing the # chain of hash tables stored in the .match.hash attribute # of tables that have been hashed # we provide a (sort of dummy) print method so # the output is not as ugly print.match.hash <- function(x, ...) { cat("\n"); x } fastmatch/R/ctapply.R0000644000175100001440000000016114076406314014245 0ustar hornikusersctapply <- function(X, INDEX, FUN, ..., MERGE=c) .External(C_ctapply, parent.frame(), X, INDEX, FUN, MERGE, ...) fastmatch/MD50000644000175100001440000000127714076471110012562 0ustar hornikusers5a3306c6c67728a22f941efacde6c3a6 *DESCRIPTION 7e7ec63e6925cc4435d98adf39c5e26d *NAMESPACE 0ad0f3912966acc26f53f88e4ffef0db *NEWS f89bb99f16073fd87eb20e3a366e0d2e *R/coalesce.R c55a081862af768fb0493109ec5b898d *R/ctapply.R aa671c24c5486532d61366bd014ecb26 *R/fastmatch.R c4f0cdd605049cd165c501d88bd4f51f *R/hash.R ddc4a8e8795d9bc6be2c7d507b7e160b *R/match.hash.R f61d17ec420b9ada0a40d24277999f3b *man/coalesce.Rd b8d381ce543a5aa2a7f4421bf6c1cbdf *man/ctapply.Rd 0837fb690176702f0fcd16d2abdc0668 *man/fmatch.Rd 9049f773584158310d6eae9fc6ccbaba *src/common.h e16cc450c8e002f1b819261653706f88 *src/ctapply.c 461b9689226fc1550c15289a52c5896a *src/fasthash.c ed02ffd3ef64310b69ae74d6c12d861a *src/fastmatch.c