cba/0000755000175100001440000000000014630611511011013 5ustar hornikuserscba/MD50000644000175100001440000000611614630611511011327 0ustar hornikusers95819a01c2ed8d1d4ddaa10f24c0a044 *CHANGELOG 23c59c85287d462ed2b7475809ba7a0d *DESCRIPTION ee83dfb5779aac9143c472be1e0c33a4 *NAMESPACE b55737d69dd264044306e179d174d017 *R/ccfkms.r b3f65fab6ff0043e305d77f43f1698f1 *R/circleplot.R c72e86a160f440a10cf9510b14b13fff *R/cluster.r bc88c9402ecdd6052788df9c9de7879f *R/coding.r bf4f81f977a8b4b33913bb39089b10a5 *R/cut.ordered.r 8d6b01de986c64b57f048fe36b6f30ab *R/gknn.r feb86496c8eafc9b9eb6546cf1b71dce *R/order.r c41a46cc9d9e1c409f03426bbad0c1fb *R/plots.r 8f2d4d1812261bf2807f643d98da152b *R/proximus.r b9ec985120759f11b82f51dac96b37ee *R/rock.r 643c8ea3ec692a00994078661a279447 *R/sdists.r b477f489bcccf323f5005220358a59a3 *R/sdists.util.R 8366a3f21b1cbfd5ae6b386d08c462fc *R/stress.r 4d22d82120f151d8a1aba261ee1721e4 *TODO f61db891eb928123aba4412ca322f508 *build/partial.rdb 1d082e34751b86f2eec9002530ef0683 *data/Mushroom.rda 29252ea1e818bb6234d87dd45d7e7023 *data/Votes.rda ee0710bb02aafe991232bed602b9af89 *data/townships.rda c688803838800f21f1354b8d81585c82 *man/Mushroom.Rd f301737c62c9c8191893a61d545f2b6c *man/Votes.Rd 33bee73777d10f2b368a7f2c34131c9e *man/ccfkms.Rd e0384dd29d5be7df7a85db9beaac6530 *man/circleplot.dist.Rd dbbe9cfc0c79a2b5337060a6f2cd9e85 *man/clmplot.Rd 503dda383258bad096b43115ac527ef6 *man/cluster.dist.Rd 09a0a957026071ccaeb1520508b48e15 *man/coding.Rd 2b231b8b6e0bdc38cb7f59d6c4457f57 *man/cut.ordered.Rd ada771779c072aaa56a08a44b6c5f853 *man/fitted.proximus.Rd 79d0bd6669e7dc11b9b0a58d0546c256 *man/gknn.Rd 66a3c9718ee92b9824f78bc210f0fcbc *man/image.Rd b0208e716aa6d9e4423657a295274167 *man/lminter.Rd 940308c3cff8ccafa9a76bd465212274 *man/lmplot.Rd f8f2259574ffae733ec04247c3551b2f *man/order.Rd b78b9419e53528c84ddbc7329161e4ca *man/order.greedy.Rd c1b36ba2096a39cbcee4c9be20940746 *man/order.length.Rd 59c0d131ca165bc0f33eab6d0e5f831d *man/order.optimal.Rd 9fb50f3e806b85e2a083876805600662 *man/plot.sdists.graph.Rd 4add29e1d8ca3101c9c0f499d6ffaa3a *man/predict.ccfkms.Rd fe35e1ffc9d3b99109d7051d70b29b18 *man/predict.rock.Rd be793dd63a489009151361be90fe0d0e *man/proximus.Rd d16cc9c1022c241850648cb9632a3873 *man/rlbmat.Rd a4710e715566cbe4cc4c894cf6b83aa0 *man/rockCluster.Rd f86cfb9cf7e0d48890b0596b7eba7c52 *man/sdists.Rd 980bfcccb80195048cd88e1b72c4d68a *man/sdists.center.Rd 658471bdec54892304b264ac06622275 *man/sdists.center.align.Rd f6f2e066701e66c42079552978da753a *man/sdists.trace.Rd b400b9082051a9387ca3aa94b65a6979 *man/stress.Rd d6cf6d45781842f2e405165bb84448fd *man/summary.proximus.Rd 85110593cfcfd8e14e87834012bd95df *man/townships.Rd b4a706d255fa2aeb729a733b122586af *src/arrayIndex.c 9ca98783cd41c91a125a91185312e181 *src/ccfkms.c 111261fcfaa5901cde5a0f4cc7166a19 *src/cluster.c de9a955fe73b30af2c2aec47c7f24251 *src/coding.c ef8c122e8f74586bbad0f10e8735d795 *src/dll.c c64aa0a75887b9a61da5bff3e3354d88 *src/gknn.c 17e1e159b79439e8c6a5e6bb0757908d *src/greedy.c 754456b8a5172943507a549604994f39 *src/interpolate.c 5899a7d270acda77b1e527a0443ad0d8 *src/optimal.c c72ac54ae06b7bf17c5cfbef9c6ec481 *src/proximus.c 6704480fe9751a7f71c0e6607028c772 *src/rock.c 07399697d52e9c14e26da38542b88992 *src/sdists.c fea7854b09482f20f42dd2b41beaf991 *src/stress.c cba/R/0000755000175100001440000000000014332123330011210 5ustar hornikuserscba/R/cut.ordered.r0000644000175100001440000000127211304023136013612 0ustar hornikusers # cutting of ordinal variables # # ceeboo 2005 cut.ordered <- function(x, breaks, ...) { if (is.logical(breaks)) { if (length(breaks) != nlevels(x)) stop("levels of",paste(sQuote("x"),"and",sQuote("breaks"), "do not conform")) breaks <- which(breaks) } else breaks <- sort(unique(breaks)) if (is.character(breaks)) breaks <- pmatch(breaks, levels(x)) else breaks <- match(breaks, 1:nlevels(x)) if (any(is.na(breaks))) stop(paste(sQuote("breaks"),"invalid")) # breaks <- unique(c(breaks, nlevels(x))) levels(x) <- rep(levels(x)[breaks], diff(c(0,breaks))) x <- as.ordered(x) x } ### cba/R/circleplot.R0000644000175100001440000000153211304023136013473 0ustar hornikusers ## ceeboo 2007 circleplot.dist <- function(x, cutoff = 0.5, col = 1, circle = FALSE, scale = 1.4) { if (!inherits(x, "dist")) stop("'x' not of class dist") x <- order.dist(x) # seriation z <- seq(-pi, pi, length.out = attr(x, "Size") + 1) x0 <- cos(z) y0 <- sin(z) r <- c(-1,1) * scale plot(x0, y0, type = "p", xlim = r, ylim = r, xlab = "", ylab = "", xaxt = "n", yaxt = "n", pty="s",) if (circle) { z <- seq(-pi, pi, 0.01) lines(cos(z), sin(z), lty = 2) } text(x0, y0, labels = dimnames(x), pos = sign(x0) + 3) w <- c(cut(c(x), seq(0, cutoff, length.out = 4))) k <- !is.na(w) if (any(k)) { i <- row.dist(x)[k] j <- col.dist(x)[k] segments(x0[i], y0[i], x0[j], y0[j], lwd = w[k], col = col) } invisible() } ## cba/R/plots.r0000644000175100001440000000752112020453760012546 0ustar hornikusers # Wrapper function for interpolating a logical matrix into # non-overlapping square blocks of user-specified size. # Returns binned values of the counts of TRUE values per # block. Note that the effective number of bins is one # greater the specified number because the zero bin is # always included. # # ceeboo 2005 lminter <- function(x, block.size=1, nbin=0) { if (!is.logical(x)) stop(paste(sQuote("x"),"not logical")) if (nbin < 0) stop(paste(sQuote("nbin"),"illegal value")) storage.mode(block.size) <- storage.mode(nbin) <- "integer" x <- .Call(R_lminter, x, block.size, nbin) x } # plot a logical matrix with the option to reduce the resolution lmplot <- function(x, block.size=1, gray=FALSE, xlab="", ylab="", axes = FALSE, ...) { if (!is.logical(x)) stop(paste(sQuote("x"),"not logical")) if (block.size < 1) stop(paste(sQuote("block.size"),"illegal value")) nbin <- 0 # majority mode if (block.size > 1) { if (gray) nbin <- min(block.size, 8) # maximum palette x <- lminter(x, block.size, nbin) } # density equals opacity # this sucks! gray <- rev(gray.colors(max(2, nbin + 1), start=0, end=1) )[is.element(0:max(1, nbin), x)] implot(x, xlab=xlab, ylab=ylab, col=gray, axes = axes, ...) } # plot a logical matrix with the option to color (by rows or # columns) and to reorder by rows and columns (using hclust). clmplot <- function(x, col, col.bycol=FALSE, order=FALSE, dist.method="binary", hclust.method="average", axes=FALSE, xlab="", ylab="", ...) { if (!is.logical(x)) stop(paste(sQuote("x"),"not logical")) if (order) { ro <- hclust(dist(x, method=dist.method), method=hclust.method)$order co <- hclust(dist(t(x), method=dist.method), method=hclust.method)$order x <- x[ro, co] } else { ro <- 1:dim(x)[1] co <- 1:dim(x)[2] } if (missing(col)) col <- factor("black") else { if (length(col) != if (col.bycol) length(co) else length(ro)) stop(paste(sQuote("x"),"and",sQuote("col"),"do not conform")) if (col.bycol) col <- col[co] else col <- col[ro] if (is.character(col)) col <- as.factor(col) else { col <- as.factor(col) levels(col) <- heat.colors(nlevels(col)) } if (col.bycol) x <- x * rep(as.integer(col), each=dim(x)[1]) else x <- x * rep(as.integer(col), dim(x)[2]) } implot(structure(x, dimnames = list(ro, co)), zlim=c(1,nlevels(col)), col=levels(col), xlab=xlab, ylab=ylab, axes = axes, ...) invisible(list(rows=ro, cols=co)) } # Make a proper image plot of a matrix. That is, # the rows and columns are swapped and the order of the # columns (original rows) is reversed. implot <- function(x, xlab="", ylab="", axes = FALSE, ticks = 10, las = 2, ...) { if (inherits(x, "dist")) x <- as.matrix(x) else { if (!is.matrix(x)) stop("'x' not of class matrix") x <- t(x) } x <- x[,rev(seq_len(dim(x)[2])),drop = FALSE] image.default(seq_len(dim(x)[1]), seq_len(dim(x)[2]), x, axes=FALSE, xlab=xlab, ylab=ylab, ...) if (axes) { if (ticks < 1) stop("'ticks' invalid") ticks <- as.integer(ticks) if (length(rownames(x))) { at <- seq(1, dim(x)[1], length.out = min(ticks, dim(x)[1])) axis(1, at, labels = rownames(x)[at], las = las, line = -0.5, tick = 0, cex.axis = 0.2 + 1/log10(length(at))) } if (length(colnames(x))) { at <- seq(1, dim(x)[2], length.out = min(ticks, dim(x)[2])) axis(4, at, labels = colnames(x)[at], las = las, line = -0.5, tick = 0, cex.axis = 0.2 + 1/log10(length(at))) } } invisible(x) } ### cba/R/proximus.r0000644000175100001440000001155114332123330013264 0ustar hornikusers # # proximus.r - according to the paper: # # M. Koyutürk, A. Graham, and N. Ramakrishnan. Compression, # Clustering, and Pattern Discovery in Very High-Dimensional # Descrete-Attribute Data Sets. IEEE Transactions On Knowledge # and Data Engineering, Vol. 17, No. 4, (April) 2005 # # Contents: # # wrapper(s) for my C implementation of PROXIMUS. a better # implementation may use two sparse matrices holding the pair # of approximating matrices X and Y so that hat A = X * Y. # # Version: 0.1-1 # # (C) ceeboo, 2005 proximus <- function(x, max.radius=2, min.size=1, min.retry=10, max.iter=16, debug=FALSE) { if (!is.logical(x)) stop(paste(sQuote("x"),"not logical")) storage.mode(max.radius) <- storage.mode(min.size) <- "integer" storage.mode(min.retry) <- storage.mode(max.iter) <- "integer" storage.mode(debug) <- "logical" obj <- .Call(R_proximus, x, max.radius, min.size, min.retry, max.iter, debug) obj$max.radius <- max.radius obj$min.size <- min.size obj$rownames <- rownames(x) obj$colnames <- colnames(x) class(obj) <- c("proximus") invisible(obj) } # get the full storage representation + pattern (cluster) labels fitted.proximus <- function(object, drop=FALSE, ...) { x <- matrix(FALSE, nrow=object$nr, ncol=object$nc) c <- vector("integer", object$nr) for (i in 1:length(object$a)) { x[object$a[[i]]$x, object$a[[i]]$y] <- TRUE c[object$a[[i]]$x] <- i } k <- rep(TRUE, object$nr) # keep if (drop) { for (i in 1:length(object$a)) if (length(object$a[[i]]$x) < object$min.size || object$a[[i]]$r > object$max.radius) k[object$a[[i]]$x] <- FALSE x <- x[k,] c <- c[k] } rownames(x) <- object$rownames[k] colnames(x) <- object$colnames attr(c, "Index") <- which(k) # x <- list(x=x, pl=factor(c)) x } ### print.proximus <- function(x, ...) { cat("an object of class:",class(x),"\n") invisible(x) } summary.proximus <- function(object, ...) { n <- length(object$a) s <- as.data.frame(matrix(nrow=n, ncol=7)) names(s) <- c("Size","Length","Radius","Error","Fnorm","Jsim","Valid") e <- j <- 0 for (i in 1:n) { # pattern summaries a <- object$a[[i]] # approximation nx <- length(a$x) ny <- length(a$y) s[i,] <- c(nx, ny, a$r, (a$n - a$c) / (nx * object$nc), # Error sqrt(a$n - a$c), # Frobenius norm if (a$c == 0 && ny == 0) 1 # definition! else 1 / (1 + 2 * (a$n - a$c) / (a$c + nx * ny)),# Jaccard (nx >= object$min.size & a$r <= object$max.radius)) # valid e <- e + a$n - a$c # total Error j <- j + a$c + nx * ny # total Jaccard } storage.mode(s[,7]) <- "logical" s <- list(nr=object$nr, nc=object$nc, error=e / (object$nr * object$nc), fnorm=sqrt(e), jsim=if (j == 0 && e == 0) 1 # definition! else j / (j + e / 2), valid=sum(s$Valid), pattern=s) class(s) <- "summary.proximus" s } print.summary.proximus <- function(x, ...) { cat("approximates",x$nr,"x",x$nc, "matrix\n") cat("total Error:",format(x$error, digits=2), "\n") cat("total Fnorm:",format(x$fnorm, digits=2), "\n") cat("total Jsim:",format(x$jsim, digits=2), "\n") cat("total Valid:",x$valid,"\n") cat("Pattern Summary:\n") print(x$pattern[order(x$pattern$Size, decreasing=TRUE),], digits=2) invisible(x) } ### # Generate a matrix containing blocks of (overlapping) uniform # binary patterns on a noisy background. The perfect switch allows # for overlap between the first and last pattern block, making the # test case balanced. # # ceeboo 2005 rlbmat <- function(npat=4, rows=20, cols=12, over=4, noise=0.01, prob=0.8, perfect=FALSE) { rlmat <- function(nrow, ncol, prob=0.5) { x <- matrix(as.logical(runif(nrow*ncol) <= prob), ncol=ncol) x } nrow <- npat * rows ncol <- cols * npat + over x <- rlmat(nrow, ncol, noise) r <- c <- 1 while (r < nrow) { x[r:(r+rows-1), c:(c+cols+over-1)] <- rlmat(rows, cols+over, prob) r <- r + rows c <- c + cols } # overlap first and last block, too if (perfect) x[(r-rows):(r-1), 1:over] <- rlmat(rows, cols, prob) x } ### cba/R/order.r0000644000175100001440000000356112020453760012520 0ustar hornikusers # wrapper to the optimal leaf ordering algorithm # # ceeboo 2005 order.optimal <- function(dist, merge) { if (!inherits(dist,"dist")) stop(paste(sQuote("dist"),"not of class dist")) if (!is.matrix(merge)) stop(paste(sQuote("merge"),"not a matrix")) if (length(dim(merge)) != 2) stop(paste(sQuote("merge"),"invalid")) if (dim(merge)[1] != attr(dist,"Size")-1) stop(paste(sQuote("dist"),"and",sQuote("merge"),"do not conform")) if (!is.double(dist)) storage.mode(dist) <- "double" storage.mode(merge) <- "integer" obj <- .Call(R_order_optimal, dist, merge) names(obj) <- c("merge","order","length") names(obj$order) <- attr(dist,"Labels") obj } # wrapper to computing the lenght of the order # under a distance matrix, e.g. a tour where the # leg between the first and last city is omitted. # that this is a (Hamilton) path. # # note that this corresponds to the sum of distances # along the first off diagonal of the ordered distance # matrix. # order.length <- function(dist, order) { if (!inherits(dist,"dist")) stop(paste(sQuote("dist"),"not of class dist")) if (missing(order)) order <- 1:attr(dist, "Size") else { if (length(order) != attr(dist,"Size")) stop(paste(sQuote("order"),"invalid lenght")) } if (!is.double(dist)) storage.mode(dist) <- "double" if (!is.integer(order)) storage.mode(order) <- "integer" x <- .Call(R_order_length, dist, order) x } # wrapper to greedy ordering inspired by F. Murtagh # actually a hierarchical cluster algorithm. order.greedy <- function(dist) { if (!inherits(dist, "dist")) stop(paste(sQuote("dist"),"not of class dist")) if (!is.double(dist)) storage.mode(dist) <- "double" obj <- .Call(R_order_greedy, dist) names(obj) <- c("merge", "order", "height"); obj } ### cba/R/cluster.r0000644000175100001440000000041512020453760013061 0ustar hornikusers ### ceeboo 2006 cluster.dist <- function(x, beta) { if (!inherits(x, "dist")) stop("'x' not of class dist") storage.mode(x) <- storage.mode(beta) <- "double" obj <- .Call(R_cluster_dist, x, beta) names(obj) <- attr(x,"Labels") obj } ### cba/R/ccfkms.r0000644000175100001440000000560213037706735012665 0ustar hornikusers# # k-means based on conjugate convex functions using sparse data # structures and centering (and optionally standardizing). # # for details see the C source code. # # (C) ceeboo 2005, 2007 ccfkms_sample <- function(x, n) { if (inherits(x, "dgCMatrix")) as(t(x[,sample(dim(x)[2],n)]), "matrix") else x[sample(dim(x)[1],n),] } ccfkms <- function (x, n, p=NULL, par=2, max.iter=100, opt.std=FALSE, opt.retry=0, debug=FALSE) { ## dgRMatrix is currently broken if (inherits(x, "dgTMatrix")) x <- t(as(x, "dgCMatrix")) else if (inherits(x, "dgCMatrix")) x <- t(x) else if (!is.matrix(x)) stop(paste(sQuote("x"), "invalid argument")) if (!missing(n) && length(n) != 1) stop(paste(sQuote("n"), "invalid argument")) if (is.null(p)) p <- ccfkms_sample(x, n) else if (!is.matrix(p) || ifelse(inherits(x,"dgCMatrix"), dim(x)[1], dim(x)[2]) != dim(p)[2]) stop(paste(sQuote("p"), "invalid argument")) if (is.matrix(x) && !is.double(x)) storage.mode(x) <- "double" if (!is.double(p)) storage.mode(p) <- "double" storage.mode(par) <- "double" storage.mode(max.iter) <- "integer" storage.mode(opt.std) <- storage.mode(debug) <- "logical" obj <- .Call(R_ccfkms, x, p, par, max.iter, opt.std, debug) if (opt.retry > 0) { for (i in 1:opt.retry) { p <- ccfkms_sample(x,n) robj <- .Call(R_ccfkms, x, p, par, max.iter, opt.std, debug) if (robj[[4]] < obj[[4]]) obj <- robj } } names(obj) <- c("centers", "size", "cl", "inv.inf") rownames(obj$centers) <- names(obj$size) <- levels(obj$cl) colnames(obj$centers) <- if (inherits(x, "dgCMatrix")) rownames(x) else colnames(x) names(obj$cl) <- if (inherits(x, "dgCMatrix")) colnames(x) else rownames(x) obj <- c(obj, par=par, opt.std=opt.std) class(obj) <- "ccfkms" obj } predict.ccfkms <- function(object, x, drop=1, ...) { if (inherits(x, "dgTMatrix")) x <- t(as(x, "dgCMatrix")) else if (inherits(x, "dgCMatrix")) x <- t(x) else if (!is.matrix(x)) stop(paste(sQuote("x"), "invalid argument")) if (ifelse(inherits(x, "dgCMatrix"), dim(x)[1], dim(x)[2]) != dim(object$centers)[2]) stop(paste(sQuote("x"), "and", sQuote("object"), "do not conform")) if (drop > 0) { d <- which(object$size <= drop) if (length(d) > 0) { cat("dropping", length(d), "clusters\n") object$size <- object$size[-d] k <- !object$cl %in% d object$cl <- factor(object$cl[k]) } } x <- ccfkms(x, p=object$centers, par=object$par, opt.std=object$opt.std, max.iter=1) x$par <- x$opt.std <- NULL # prohibit reuse x } ### cba/R/sdists.r0000644000175100001440000002056313037611552012723 0ustar hornikusers # implements a wrapper to distance (similarity) computation on # collections of sequences. auto and cross distances can be # computed (compare with dist in package proxy) # # note that 1) we can supply lists of vectors or vectors of # character (strings) # 2) operation weights are in the order of # insertion/deletion, equality, and replacing # 3) the first row/column of the matrix of alphabet # weights are used for replacement with the empty # symbol (space) # 4) include NA, etc if exclude = NULL # 5) but the C function returns NA if NAs are encounterd # 6) use parallel mode only if y != NULL # # ceeboo 2006, 2008 sdists <- function(x,y=NULL, method="ow", weight=c(1,1,0,2), exclude=c(NA,NaN,Inf,-Inf), pairwise = FALSE) { METHODS <- c("ow","aw","awl") code <- pmatch(method, METHODS) if (is.na(code)) stop("invalid method") if (code == -1) stop("ambiguous method") if (is.character(x)) x <- strsplit(x,"") if (!is.list(x)) stop("'x' not a list") if (!is.null(y)) { if (is.character(y)) y <- strsplit(y,"") if (!is.list(y)) stop("'y' not a list") } if (code >= 2) { if (!is.matrix(weight)) stop("'weight' not a matrix") if (dim(weight)[1] != dim(weight)[2]) stop("'weight' not square") if (is.null(colnames(weight))) stop("'weight' no colnames") l <- colnames(weight) } else { if (length(weight) < 4) stop("'weight' invalid") # determine common symbol set l <- sort(unique(c(unlist(x),unlist(y),"")),na.last=TRUE) } x <- lapply(x,function(x) factor(x,levels=l,exclude=if(is.integer(x))NA else exclude)) if (!is.null(y)) { y <- lapply(y,function(x) factor(x,levels=l,exclude=if(is.integer(x))NA else exclude)) if (pairwise && length(x) != length(y)) stop("'pairwise', lengths of 'x' and 'y' do not conform") } if (!is.double(weight)) storage.mode(weight) <- "double" obj <- .Call(R_sdists,x,y,as.integer(code),weight,pairwise) if (is.null(y)) obj <- structure(obj, Size=length(x), class="dist", Diag=FALSE, Upper=FALSE, Labels=names(x), method=method) else if (!pairwise) { rownames(obj) <- names(x) colnames(obj) <- names(y) } obj } # as there is no unique space symbol available not 'excluding' # NA has NA as result (see the C implementation). # # if graph = TRUE the vector of transcripts is transformed into # graph data that can be supplied to 'segments', or 'grid.segments', # etc. the dynmic programming table is returned as attribute # 'table' and the traceback graph in attribute 'graph'. sdists.trace <- function(x,y, method="ow", weight=c(1,1,0,2), exclude=c(NA,NaN,Inf,-Inf), graph = FALSE, partial = FALSE) { METHODS <- c("ow","aw","awl") code <- pmatch(method, METHODS) if (is.na(code)) stop("invalid method") if (code == -1) stop("ambiguous method") if (is.character(x)) { if (length(x) != 1) stop("'x' not a scalar string") x <- strsplit(x,"")[[1]] } if (is.factor(x)) x <- as.character(x) if (!is.vector(x)) stop("'x' not a vector") if (is.character(y)) { if (length(y) != 1) stop("'y' not a scalar string") y <- strsplit(y,"")[[1]] } if (is.factor(y)) y <- as.character(y) if (!is.vector(y)) stop("'y' not a vector") if (code >= 2) { if (partial) stop("'partial' not implemented") if (!is.matrix(weight)) stop("'weight' not a matrix") if (is.null(colnames(weight))) stop("'weight' no colnames") l2 <- colnames(weight) if (is.null(rownames(weight))) { if (dim(weight)[1] != dim(weight)[2]) stop("'weight' not square") l1 <- l2 } else l1 <- rownames(weight) } else { if (length(weight) < 4) stop("'weight' invalid") if (partial) { if (length(weight) < 5) weight <- c(weight, weight[1], 0) if (length(weight) < 6) weight <- c(weight, 0) } # determine symbol sets l1 <- l2 <- sort(unique(c(x,y,"")),na.last=TRUE) } x <- factor(x,levels=l1,exclude=if(is.integer(x))NA else exclude) y <- factor(y,levels=l2,exclude=if(is.integer(y))NA else exclude) if (!is.double(weight)) storage.mode(weight) <- "double" t <- .Call(R_sdists_transcript, x, y, as.integer(code), weight, graph) if (is.na(t[1])) return(t) # reduce set of transcripts/paths if (partial) { z <- t ## reduce to maximum number of trailing inserts k <- attr(regexpr("I+$", z), "match.length") z <- z[k == max(k)] ## reduce to maximum number of matches k <- sapply(lapply(strsplit(z, ""), table), "[", "M") k <- which(k == max(k, na.rm = TRUE)) if (length(k)) z <- z[k] ## reduce to maximum number of leading inserts k <- attr(regexpr("^I+", z), "match.length") z <- z[k == max(k)] attributes(z) <- attributes(t) t <- z } if (graph) { dimnames(attr(t, "table")) <- list(x = c("", as.character(x)), y = c("", as.character(y))) attr(t, "graph") <- .Call(R_sdists_graph, t) names(attr(t, "graph")) <- c("x0", "y0", "x1", "y1", "weight") names(attr(t, "pointer")) <- c("x0", "y0", "x1", "y1") class(t) <- "sdists.graph" return(t) } z <- lapply(t, function(t) .Call(R_sdists_align, x, y, t)) names(z) <- t attr(z, "value") <- attr(t, "value") attr(z, "partial") <- attr(t, "partial") class(z) <- "sdists.trace" z } ### experimental plot function for # # idea from: http://home.uchicago.edu/~aabbott/ # # in R 2.4.x we will fix yscale = c(ny, 0) # # label in grid.xaxis cannot contain "", i.e. does not # produce output if it does. # # with pdf() it produces a garbage file that segfaults # xpdf (but not acroread) :-( # # fixme: use another line type for prefixes or suffixes # of local alignments. # # ceeboo 2006 plot.sdists.graph <- function(x, circle.col = 1, graph.col = 2, circle.scale = c("mean", "max", "last", "text"), main = "", ...) { circle.scale <- match.arg(circle.scale) g <- attr(x, "graph") b <- attr(x, "pointer") t <- attr(x, "table") nx <- dim(t)[2] ny <- dim(t)[1] if (circle.scale == "text") fontsize <- 24 ## FIXME else { t <- t - min(t) t <- t / switch(circle.scale, mean = mean(t), max = max(t), last = t[ny,nx]) } cn <- colnames(t) rn <- rownames(t) # bug?fix cn[cn == ""] <- " " rn[rn == ""] <- " " grid.newpage() grid.text(y = 0.95, label = main, gp = gpar(fontface = "bold")) vp <- viewport(xscale = c(0, nx), yscale = c(0, ny), width = nx / max(nx, ny) * 0.70, height = ny / max(nx, ny) * 0.70) pushViewport(vp) grid.grill(h = seq(ny)-0.5, v = seq(nx)-0.5, default.units = "native") grid.xaxis(at = seq(nx)-0.5, label = cn) grid.yaxis(at = seq(ny)-0.5, label = rn) if (circle.scale == "text") mapply(grid.text, label = t, x = rep(1:nx, each = ny) - 1/2, y = rep(1:ny, times = nx) - 1/2, MoreArgs = list( check.overlap = TRUE, default.units = "native", gp = gpar(col = "lightgrey", fontsize = fontsize)) ) else grid.circle(x = rep(1:nx, each = ny) - 1/2, y = rep(1:ny, times = nx) - 1/2, r = t / 2, default.units = "native", gp = gpar(col = circle.col)) grid.segments(x0 = b$y0 + 1/2, y0 = b$x0 + 1/2, x1 = b$y1 + 1/2, y1 = b$x1 + 1/2, default.units = "native", gp = gpar(lty = 3)) grid.segments(x0 = g$y0 + 1/2, y0 = g$x0 + 1/2, x1 = g$y1 + 1/2, y1 = g$x1 + 1/2, default.units = "native", gp = gpar(col = graph.col, lwd = g$weight, lty = (g$y1 > g$y0 & g$x1 > g$x0 & cn[g$y1+1] == rn[g$x1+1]) + 1)) popViewport() } ### cba/R/sdists.util.R0000644000175100001440000000744212020453760013634 0ustar hornikusers ### ### stuff for analyzing sequences ### ### ceeboo 2006, 2007 ## Find the centroid (medoid) sequence(s), i.e. which have minimum ## sum of distance among a collection of sequences. ## ## Alternativley, apply FUN to the distances and select the ## distance with the minimum value of FUN (mean, median, etc.) ## ## Option 'unique' specifies to reduce the result set to a distinct ## set of sequences. ## sdists.center <- function(x, d = NULL, method = "ow", weight = c(1, 1, 0, 2), exclude = c(NA, NaN, Inf, -Inf), FUN = NULL, ..., unique = FALSE) { if (is.null(d)) d <- sdists(x, method = method, weight = weight, exclude = exclude) r <- if (is.null(FUN)) rowSums.dist(d) else apply(as.matrix(d), 1, FUN, ...) k <- which(r == min(r)) r <- x[k] if (unique && length(r) > 1) r <- r[!duplicated(sapply(r, paste, collapse = ""))] r } ## Compute a global alignment of a collection of sequences using ## the center star tree heuristic, i.e. 'c' is assumed to be the ## center and each remaining sequence 'x' is aligned in turn ## against the (aligned) center. Spaces are inserted as needed. ## ## If transitive = TRUE the space symbols in the center are ## replaced by the (non-space) symbols in the current sequence ## the center sequence was aligned with. This results in a ## 'transitive' global alignment, i.e. each pair of sequences ## is implicitly aligned, too. However, this usually results ## in spreading out the alignments (considerably). ## ## NOTE unfortunately, there may not exist a unique alignment ## of a pair of sequences, so that the global ## alignment may not be unique either. In this case we ## make a first or random choice. ## sdists.center.align <- function(x, center, method = "ow", weight = c(1, 1, 0, 2), exclude = c(NA, NaN, Inf, -Inf), break.ties = TRUE, transitive = FALSE, to.data.frame = FALSE) { if (!is.list(x) && !is.character(x)) stop("'x' not a list") if (missing(center)) { center <- sdists.center(x, method = method, weight = weight, exclude = exclude, unique = TRUE) k <- length(center) if (break.ties && k > 1) k <- sample(k, 1) else k <- 1 center <- center[[k]] } n <- 0 ## total number of ties r <- list() for (s in x) { a <- sdists.trace(center, s, method, weight, exclude) k <- length(a) if (break.ties && k > 1) { n <- n + k k <- sample(k, 1) ## random choice } else k <- 1 ## first center <- a[[k]][[1]] na <- is.na(center) if (any(na)) { center[na] <- "" if (length(r) > 0) { ## update t <- gsub("[Dd]","?", names(a)[k]) if (regexpr("[Ii]", t) > -1) r <- lapply(r, function(x) { x <- .Call(R_sdists_align, x, center, t)[[1]] x[is.na(x)] <- "" x }) } if (transitive) center[na] <- a[[k]][[2]][na] } s <- a[[k]][[2]] s[is.na(s)] <- "" r <- c(r, list(s)) } names(r) <- names(x) if (to.data.frame) { if (is.null(names(r))) names(r) <- seq_len(length(r)) r <- data.frame(center = center, r, check.names = FALSE) } else { is.na(center) <- center == "" ## recode space symbol names(center) <- seq(length(center)) ## add positional index r <- lapply(r, function(x) { is.na(x) <- x == "" names(x) <- names(center) x }) attr(r, "center") <- center attr(r, "ties") <- n } r } ### cba/R/rock.r0000644000175100001440000000777312020453760012354 0ustar hornikusers # wrapper functions for the Rock algorithm. # # note that the behavior for other than the binary distance functions # has not been tested. therefore, the default relationship between beta # and theta may not be meaningful in all cases. # # (C) ceeboo 2005 # compute link counts (internal function) # # let me stress that the semantics are unscaled # similarities but we package as a dist object # for possible future use in different contexts. rockLink <- function(x, beta=0.5) { if (!inherits(x, "dist")) stop(paste(sQuote("x"),"not of class dist")) if (!is.double(x)) storage.mode(x) <- "double" storage.mode(beta) <- "double" obj <- .Call(R_rockLink, x, beta) obj <- structure(obj, Size=attr(x,"Size"), class="dist", Diag=FALSE, Upper=FALSE, Labels=attr(x, "Labels"), method="rock") #invisible(obj) obj } # merge into clusters (internal function) rockMerge <- function(x, n, theta=0.5, debug=FALSE) { if (!inherits(x, "dist")) stop(paste(sQuote("x"),"not of class dist")) if (n < 1) stop(paste(sQuote("n"),"illegal value")) if (theta < 0 || theta >= 1) stop(paste(sQuote("theta"),"illegal value")) if (!is.integer(x)) storage.mode(x) <- "integer" storage.mode(n) <- "integer" storage.mode(theta) <- "double" storage.mode(debug) <- "logical" obj <- .Call(R_rockMerge, x, n, theta, debug) names(obj) <- c("cl","size") names(obj$cl) <- attr(x,"Labels") invisible(obj) } # classify based on distances to clustered samples # (we have to compute these separately; for an # example wrapper see below; internal function) rockClass <- function(x, cl, beta=1-theta, theta=0.5) { if (!is.matrix(x)) stop(paste(sQuote("x"),"not a mtrix")) if (!is.factor(cl)) stop(paste(sQuote("cl"),"not a factor")) if (!is.double(x)) storage.mode(x) <- "double" storage.mode(beta) <- storage.mode(theta) <- "double" storage.mode(cl) <- "integer" obj <- .Call(R_rockClass, x, cl, beta, theta) names(obj) <- c("cl","size") names(obj$cl) <- rownames(x) invisible(obj) } # cluster interface rockCluster <- function(x, n, beta=1-theta, theta=0.5, fun="dist", funArgs=list(method="binary"), debug=FALSE) { if (!is.matrix(x)) warning(paste(sQuote("x"),"not a matrix")) if (n < 1) stop(paste(sQuote("n"),"illegal value")) if (is.function(fun)) fun <- deparse(substitute(fun)) # cluster cat("Clustering:\n") cat("computing distances ...\n") rc <- do.call(fun, c(list(x=x), as.list(funArgs))) cat("computing links ...\n") rc <- rockLink(rc, beta) cat("computing clusters ...\n") rc <- rockMerge(rc, n, theta, debug) rc <- list(x=x, cl=rc$cl, size=rc$size, beta=beta, theta=theta, fun=fun, funArgs=funArgs) class(rc) <- "rock" rc } # wrapper for predicting the class of new (or existing) samples # predict.rock <- function(object, x, drop=1, ...) { if (!is.matrix(x)) warning(paste(sQuote("x"),"not a matrix")) # drop if (drop > 0) { d <- which(object$size <= drop) if (length(d) > 0) { cat("dropping",length(d),"clusters\n") object$size <- object$size[-d] k <- !object$cl %in% d # keep object$cl <- factor(object$cl[k]) # enforce contiguous indexing !!! object$x <- object$x[k,] } } # classify cat("computing distances ...\n") x <- do.call(object$fun, c(list(x=x, y=object$x), as.list(object$funArgs))) cat("computing classes ...\n") x <- rockClass(x, object$cl, object$beta, object$theta) x } fitted.rock <- function(object, ...) predict.rock(object, object$x) print.rock <- function(x, ...) { cat(" data:",dim(x$y)[1],"x",dim(x$y)[2],"\n") cat(" beta:",x$beta,"\n") cat("theta:",x$theta,"\n") cat(" fun:",x$fun,"\n") cat(" args:",deparse(x$funArgs, control=NULL),"\n") print(x$size) invisible(x) } ### the end cba/R/gknn.r0000644000175100001440000000140412020453760012334 0ustar hornikusers # knn.r # # implements generic k-nearest neighbors, i.e. for arbitrary distance # measures, in a way that is compatible with "knn" in package class. # # ceeboo 2005 gknn <- function(x, y, k=1, l=0, break.ties=TRUE, use.all=TRUE, prob=FALSE) { if (!is.matrix(x)) stop(paste(sQuote("x"),"not a matrix")) if (!is.factor(y)) stop(paste(sQuote("y"),"not a factor")) if (length(y) != dim(x)[2]) stop(paste(sQuote("x"),"and",sQuote("y"),"non-conformable")) storage.mode(x) <- "double" storage.mode(y) <- storage.mode(k) <- storage.mode(l) <- "integer" storage.mode(break.ties) <- storage.mode(use.all) <- storage.mode(prob) <- "logical" # y <- .Call(R_gknn, x, y, k, l, break.ties, use.all, prob) y } ### the end cba/R/coding.r0000644000175100001440000000343213037611552012651 0ustar hornikusers # coding.r # # dummy coding for data mining applications # # fixme: no reverse methods implemented # # ceeboo 2005 as.dummy <- function(x, ...) UseMethod("as.dummy") as.dummy.logical <- function(x, ...) { x <- as.dummy(as.factor(x)) x } as.dummy.integer <- function(x, ...) { x <- as.dummy(as.factor(x)) x } as.dummy.factor <- function(x, ...) { x <- .Call("R_as_dummy", x) x } as.dummy.matrix <- function(x, sep=" ", drop=FALSE, ...) { if (is.null(colnames(x))) colnames(x) <- paste("V", 1:dim(x)[2], sep="") obj <- NULL levels <- NULL colnames <- NULL varnames <- NULL for (i in 1:dim(x)[2]) { z <- as.dummy(x[,i]) if (drop && nlevels(z) == 1) next obj <- cbind(obj, z) levels <- c(levels, list(levels(z))) varnames <- c(varnames, colnames(x)[i]) colnames <- c(colnames, paste(colnames(x)[i], levels(z), sep=sep)) } rownames(obj) <- rownames(x) colnames(obj) <- colnames names(levels) <- varnames attr(obj, "levels") <- levels obj } as.dummy.list <- function(x, ...) lapply(x, function(z) as.dummy(z)) as.dummy.data.frame <- function(x, sep=" ", drop=FALSE, ...) { if (is.null(names(x))) names(x) <- paste("V", 1:length(x), sep="") obj <- NULL levels <- NULL colnames <- NULL varnames <- NULL for (name in names(x)) { z <- as.dummy(x[[name]]) if (drop && nlevels(z) == 1) next obj <- cbind(obj, z) levels <- c(levels, list(levels(z))) varnames <- c(varnames, name) colnames <- c(colnames, paste(name, levels(z), sep=sep)) } rownames(obj) <- rownames(x) colnames(obj) <- colnames names(levels) <- varnames attr(obj, "levels") <- levels obj } ### cba/R/stress.r0000644000175100001440000000773713437252001012737 0ustar hornikusers # stuff for improving the presentation of tables, etc. # a.k.a. bertin matrices. # # (C) ceeboo 2005, 2006 # the interface to the stress functions allows for # arbitrary subsetting (see the wrapper in C). stress <- function(x, rows=NULL, cols=NULL, type="moore") { TYPE <- c(1,2,3) names(TYPE) <- c("moore", "neumann") if (inherits(x, "dist")) x <- as.matrix(x) if (!is.matrix(x)) stop(paste(sQuote("x"),"not a matrix")) if (!is.double(x)) storage.mode(x) <- "double" if (is.null(rows)) rows <- as.integer(1:dim(x)[1]) if (is.null(cols)) cols <- as.integer(1:dim(x)[2]) type <- as.integer(TYPE[type]) x <- .Call(R_stress, x, rows, cols, type) x } # interface to distance computation based on the above # stress functions (auto-distances only) stress.dist <- function(x, rows=NULL, cols=NULL, bycol=FALSE, type="moore") { TYPE <- c(1,2) names(TYPE) <- c("moore", "neumann") if (inherits(x, "dist")) as.matrix(x) if (!is.matrix(x)) stop(paste(sQuote("x"),"not a matrix")) if (!is.double(x)) storage.mode(x) <- "double" if (is.null(rows)) rows <- as.integer(1:dim(x)[1]) if (is.null(cols)) cols <- as.integer(1:dim(x)[2]) type <- as.integer(TYPE[type]) storage.mode(bycol) <- "logical" # obj <- .Call(R_stress_dist, x, rows, cols, bycol, type) # return dist object if (bycol) obj <- structure(obj, Size= if (bycol) dim(x)[2] else dim(x)[1], class="dist", Diag=FALSE, Upper=FALSE, Labels= if (bycol) { if (is.null(colnames(x))) cols else colnames(x)[cols] } else { if (is.null(rownames(x))) rows else rownames(x)[rows] }, method=names(TYPE[type])) obj } # reorder table like objects (we may use S3 dispatch in the # future order.dist <- function(x, index = FALSE) { if (!inherits(x, "dist")) stop("'x' not of class dist") k <- .Call(R_orderTSP, x, sample(attr(x, "Size"))) cat("length:", order.length(x, k),"\n") if (index) return(k) subset(x, k) } order.matrix <- function(x, type = "neumann", by = c("both","rows","cols"), index = FALSE) { if (!is.matrix(x)) stop("'x' not a matrix") by <- match.arg(by) if (by == "both") { r <- sample(dim(x)[1]) c <- sample(dim(x)[2]) c <- c[.Call(R_orderTSP, stress.dist(x,r,c,TRUE, type), seq(c))] r <- r[.Call(R_orderTSP, stress.dist(x,r,c,FALSE,type), seq(r))] } else if (by == "rows") { r <- sample(dim(x)[1]) c <- seq(dim(x)[2]) r <- r[.Call(R_orderTSP, stress.dist(x,r,c,FALSE,type), seq(r))] } else if (by == "cols") { r <- seq(dim(x)[1]) c <- sample(dim(x)[2]) c <- c[.Call(R_orderTSP, stress.dist(x,r,c,TRUE, type), seq(c))] } cat("stress:",stress(x,r,c,type),"\n") if (index) return(list(rows=r, cols=c)) x <- x[r,c] if (is.null(rownames(x))) rownames(x) <- r if (is.null(colnames(x))) colnames(x) <- c x } order.data.frame <- function(x, type = "neumann", by = c("both","rows","cols"), index = FALSE) { if (!inherits(x, "data.frame")) stop("'x' not a data frame") by <- match.arg(by) k <- sapply(x, function(x) is.numeric(x) || is.logical(x)) if (!any(k)) { warning("cannot order on ordinal attributes only") if (index) return(list(rows=seq(dim(x)[1]),cols=seq(dim(x)[2]))) x } z <- as.matrix(as.data.frame(lapply(x[k], function(x) { if (is.logical(x)) as.integer(x) else { m <- min(x) (x+m)/(max(x)-m) } }))) o <- order.matrix(z, type, by, index=TRUE) if (by == "cols" || by == "both") { c <- o$cols o$cols <- seq(k) o$cols[k] <- c } if (index) return(o) x[o$rows,o$cols] } ### the end cba/data/0000755000175100001440000000000012724613221011726 5ustar hornikuserscba/data/townships.rda0000644000175100001440000000063514344040324014456 0ustar hornikusers͔O0 ȧ"0k3 ,)22 6ha$t`BF$sƗKg>9!5(ٸq AD$AR g 8'Ys .J \\PZ@h"<"Q;cQ9HTM-g$d<`f :}UT̼Mb}sd"yqz #)r{\IVZboU ̯'E{W0kKC Nb&ҟթbٗTNJ6Q]w2͠Ce EGӴAE{̤c ]-Og[M,tB@QN~Pu)݇t$n8o!UrEO'0ykåe PMA!:QKr0X#HK83Ձ`Mjm|=~?\n;OZ7\3qp=@y'#D7b$$mj .vs@z>'I(NV;|PfOJ<1(*2N-WpcKޢᏗA貝W,Ua,`+&vMϦVuW[()qX#X_{FIJŵW`Z_HQ ck{7mP!w^L0֖IMgD,Gy(Lcf6rVuCcӫLX(kdoXw.W5m^f߲uDh;vRs ?ln|yGՀxhi6'lyv2PK.|WLJ1< DhS+.`rZY 3f &h`kq\D9!I}t15J;R DT+xϱ0=,ޮA|OF찆yƣtfkH҉sF %XfTzx:dyUW[o2|Q`v+>Ck*mDxf-GNm6px)DI#fC9ddň.|p&P0bU$Dxr~հ)9lyJ=Mf`goNedGY}@THKnjSvV󌷗4R?-zcX}P2f02` ]lq K]-vB2taUjv:3wA[}1LrR"8+So5ֲxu5L~qBףQb=ep\ N¤Zbra~f mdKuW{*S35%~Q@`Û i"OxM>+ n[(g+3|yZъMUΌ1(1iS[NoTyQ8$m9x)깔u73[X"2s4GLlibm{F}*R! 7jg?7,eZB!G#M1Mq#U;ZbJ:Ig*@\%l\Y͞vG!pNT7V F]2X#_ `}ش4ŻzO9fY0VZ۶uq'RO~* 6=bY],[k]q!PkO˝@ +:|QL?莙`:Nd~6쀵JwM !._%:\%WTH灻'1%ø_ >WSVӺdxV/Q!oq{:rUYqw6sNvX#ڙ/XIc}]Wor߅7P~BqO'&< ,30QpIx23:moU0y<jSM $zEȖy6ڴ="ΧL6f˒cċp]~_cU,@y HM0P(-ʽcJ4OGܜ()a A.PrT&G+觗-lfzgx29/Vxĝ3,BbqW{öhRAܸa1 ,袃ʜ)upipFlj=&6MWXN^aR̂^t9"JnX*Le28bxN3戅9Rx>deW}9X3g &á o[{D[XrDkw?zI7*ƌLK%zBVخ%J{SL$>3ς,sVέHLYiM!߳kK=>ت*:O 5).*04QC<BmVX`]{ 0vPw. Yzse`Lnqg,-W^lԺ[tL1j v */6?Z!%V@,'=9WWj0uI\{di>L8B|Nڢ~Qe,LӠ$\ڕS}^L85"TV gnfj0Ƚ^2IJUPZEu%~[;*x}I\6 ;B9wyD_6$q(iQ_c1lǺe)8NhÁ1D|\IS7u=\ּ͔xy2XT>0`{lf.$B6[{8io%Cͨ8AY65Ђ=VswkX\H3f'-i4L w.A_Ζ`8:|MaՖtW' FBqMEj  YW*GpQ/Н=Q;JŠֆG@!%wt,ϘZN# Xd2氟i2pbcJw޴6 r2x.0Tw Snx!-6Z"ʋ1P}EOr&r(}`f XxuۥWv;c9>v?lx,5Q7w0lYA0."O#| TH+a3uTRBcD)JJ6IiLJLG;T8ʵ :j/ fx(f\ުG]1\' XdbGJ̈bCGPqfXNE(00p=jN]G=~WnqURcC}xE~D衊͐B[g "e]f鎌 㜲j#:v|]1CUBwH0E{VDBL>CVu;B*0%uXH e;荌J|I/G#Ձ͆RF4X`7^Wo'NT(cqZ߿!J޶i{3vWX2a..O+NRHxxc$ lJqݣ7Nr:</4;GL*u_k:hs"Y㰢 [>!|sbi|2%y W$"_Pca3PM1sNwk5 k<rkC\~rYo`:F^ocTF<X*;{#?+f鰒SRBϥ,j.jJk.@ʊCқLfF. *v}UՁsgRE4&zR|&]@E~1K|R?B8eRgdqgRd:9^W%hn Q($On~QH׬2bÂȞ71³wphS7!%CdBJ?^>_U.zU{ҬR[A VBhI}\jR2\LbPIޯg<`B:Og޼hfaB;Fƾ'!J#_W` dL0C]VKVbw} _g/m(=C_CqQh>,s]#S&]+iEwHǒֺ{&#;$-bRqeJ&(bc*GoeAEj\*Ndyqmu#a=T*ԙW=I凤P/)NJh`JNC5@,y|E@rv2Z%W\dZUcჯUxS3,^|c ,-mtX`>›sH%TmվفFu@7 5LsMQL;@wX c7nK` iYAJɽm&B4][ f@t&+صm+tdDa窱a{D"W_A">9~o'q!6s͊bh鮁X׌x;SK`9[q;ExY7%ܔM`EWv]ZL؊C ͤLIZejpÖm?6:]#n)ͻ9!v%1~09>>x$l6ޟNG}֋%cR(Uӿ`q z9pOk_ wUxHyp-s]b;O)/RTByp7ޞpBZjp;ai0*A8ddR܁34~ \3S=/m .dhвo'hތ?S~b6/<:)7leVAE+&@Ǯ4\ j$8U)q\U,1#6 îhAF67SA7}/d-S9їg1KD hgF4'I6$6i1%14,]0RC=ВJbGXVYfooGfҁt5S8Ll~+޷%۶(K,> 0Rޤ1_E?p;s%Pz5蠿x;_A.=5/:X\'&ߋF0f $ȱy S9U%IQmLyGʺP2s%z㜼)"?JٵoMf쪃yuU6! {YG5VG'B}ZTШ#$in@b>k-`U{Ys9,g2,RURfyc!u&Hf4x&x(хwc9gN$"Vg~3^(8\d.V(y_3$5]d 1UMdlO7shƋg-z#zĆ:fګ%riU `dzX ӇBp%tm8-[4ڡ%#:ӎCt8b^"@nԂhJYOV񔁃7ERTwOC ¯ߛ^O' 'y+mYL 'VQDA-G$ zzmIzTŌSENET1~uL!=CBpBgEXM2i]9Nh UsԌ,Z,} X'Ѯ^5=%,X XI~7O`*GtגJZCk<ᅣtY gOiX64D m'u'W9q;|3#]D?ݰbZu+<&zlmL=#GG6&< /%݈I&[v >?  #+5__f&0{C`CYQ#B8̺~K ,HMfp~Dbj;h@17~;{XDsf6MO%E~(W˪a{N^+nT-Vs$ U p? ϭn`$xٖ+e޺M[?V x)QvTJDiH$2]A l\_=rLXp]^o 1*K WkêO#dǝHJ FZFHkźhFBJ,f]/p3/HA i NW#u%| 0M*b) *p1Ww גʾpJTIeU,#7ϽWpls:'\"&Wn|LpVnEH8=>wlF[ Azr |VnG.3߳LWa8xFAO'i=[z_ 5<XA-j.("݌dDXVTwͿQm '*㢔ryE\M2;[lx5/4?s_[WK`(i$HL2yBFI^!;[! K.eg;m<±*{1mxNqv%x'~>&ݸڐwuvϋ|j&A懑`=IRGT?Qd_Ȼ&#kFRSU3ShԮ N!Z '] nTQ 3I6(SSQErf2b'mA[Ep| F7} K5Z/^GK13:g|1J !3 ~0: Zjr)Ё]ٴ tzo2ynvFԤ>oq-eA@ЈDDQޝ Κ|shn60ҌW(G'ԁ2Խ]9N{4V֯=^硤ݘl5>gNvk'=u7@:s4L2J6.B$ GJ s`eoEkE Z6 ю ?[Y_r-(:hݠ(,*TpT=HE xzOGboBDg6Ma-ʍ ^0tZic7úUchz. ìL@sĨCϋm䝯aN̈́Eܢ6J-h(gߥHkդ*R=&߄{/>zqs1F(.r=nʼn㉙~m/*8 .e:&LcN{ѐl.^ǃZ19.ezdCy Q"&lN }K쉬0tDdYD|ҴֺT TG3pT'z5dLD 9DGZh'$&\ 'lS!WN&+WQ.LvD$&dqzP;_Aq~byrhr'b5 QFR&**! 5rofz"m}>:NVAĜLo~4ͽJ a+O:Y_`}6u ""ٜ9Z}ItNY*{>Mp7$"@dr &>9Xs2! Hyb^5^c4d Ԟ:{|xAe?K_Ԏn Yo}bM3r{|{iNs0㩌CRҳ|65PHq%Fy%wÿul`"=68Fz/Xq^<= `3mexfȓWRy~>( ba{.$I7ZgPeЏ 0V}RP>KΥs(Z$ofaRӮLY6|@ WՌ+ču#&` rՌs?^oz\%G}NUڂ=s9HeThymn4}j7tPCډi Phyɾ`LptFIH6` iG qvπ.P=aM*:l{^JOgHs$zeTܲRuRAP:'G ]1ć-DCg$ B|퐯k)#yuF?Y=+!Jl$eXcX@)~jڧXoxC(1.IKɉzC l:}Gm8qD~)%_3wv]GE[.'}n~z* /ޢ?y>D0!W-CG 7A^^i$e2j])9ŖzW,`7 9?)@6ig+%+` RBGNK ytfO1|iN9P|eA,mo3wǾ|Y;SWSs|Lk/i?v"AMEI  lZmFv9mO:4`h&˵e$+0y2-dj#zs%jbswgoH8@4mhO ߁p8̻k5^#Bxg8TkwjS+||K“vdA))fV^o.h.mmSҫSrw o$F > ?Y4U|>>"W!fqRƼ0P=KҍD21"}I- YVB6>mZ 2DD#ve,c陼z׮m4ek85^ik_O^ӜݭR*1u ? tAN3 t{:h!6){ʭ eU8gǭ6*`|cED m66I_4}(F@ Op05 : ,b|(W:J06#tD݈U\IƓ.5]lT~T-#|ǰŜt6Px;VԽs i yDߞɟ2[P{ tWXLBqᾒ"TmIe?o0c <>鶋j~r$ K0C SwRлiL2Tg3gӲʌjloaBvWeL}'*x铿]3YyH< 0Vt´4>AQ@N(4F0O(#bf["{@|pR< ZP(v[Z;l $۸fF!2.6 mߊQ#يT~[YclG"BxWQ&WIŦ>akEco$g'!e_R$6J"c OCw@BZZ,Wlc 4/Ø(DDa"rRODBe𶠞NL oLeGwL$|inY.M6#Bt #Kk,8#W }/ pYά5y!HE$ u5ٺOGT괽 _SQTF\])=pH }~i Dݪ fX宩m'yfvȺ%Ѳk%a!LLF[yz>H)N#hTi*.T6sFt`t$0:T#I  vwHa&D&YW-l ڛ\wgnԕ+@E XGr88 4c', >MH[M'cpW?Quiޭ>u`NqN&V족K\(1P, J+X DaD{#39yMKY):=_IX޽ Ǖ sT9K͘,wi28k%"c$٠F۴0GEp)UØz "Z557P.+ATbZ8֓i.w[cxeſ$mH? ĺ>~q25sIjku *>r'hER˿]!ȏE N 7hN#ʼn *Q*YQg.뀍Ae:UTJV`PCg(c #<v (<\ѹfX)1]/{a>szWLH'\E]٠1BZHaoC[3=p;NO@>l`Zqfg5.EqRC=l ֕ dD%hT/(SV>2(A(3"3̓#Y tV;I:#Njj)~i=1!.y=0:EaId>^FpFR^lcn9n\]v;?%˳ $qne u>Uc[cX@zEJm*mR@R77(NT(biYZ8{7{SRg^ *4xe"Zჾ <dO;h t UJg{F˞JcEZҫ+/ N&6[sgp-`mmhxD(Pb=)/VB|rDQF.7'kYR5ZZ8in޿]SgY푏ȇ\pMs|{w}lKIz$lae,+Cy-$/,/ÑG&NN(#%ͩ0+y+~LWl re$ֹccVLvl5̻*y?ΥWxĭ~fh8ζ{@IKVkA!4& zፊ%¶&/YkȨnxf4$ƚb\N'_4^Uګ1 H|Ǹ`qNEW? ԟ.!- ĉ\aFaż4hF)g*>Hkbg0d\ijd"Ѝ&U33AQI}ڍ߅;ƯN8 M"rp!O -UuwPG>"#2ڨ0('(&SxDjϿ~WG2d8j4km'څ1i+Imئ&u͏S/dLj2b[ G/t 4'Gj*8qR)ԏd;Z^x taVb4bN 4|+:ҀמWg.ȹoiRn(zY;8q+f >DY ('rGށ;Y< T*>s67yغd./b\"*e=t/LpAQl+8&~Vj s1uɒ@#lm =gWϊKB;o^MT|>V7ŋ5Z1{Dƅfäh]Zw rm(jgr76?mSɳ{ pd>I0<ud8REdCsb ՘< [y6bWuy⍇$%l4>⮃VFfX:7٦~ܧ{jGDÓCk(!q]犤_Mf]ʑnqvGJ񬧖`Ա:>>$iCiӮ0Ħx#Xu23(j`CV[從mBz=1t&%)^G;]нR2"K 2I pR k# *6bOЯev_$5]jB^$e}!LĬgF< Wx> IDeH߸̷B|قla[C@>pGNadCݕ)P,QLX`jR&NDj~O.w揓Մm $~2I< bW[(io-ii46?w~CK cܹ +YT͌Knr VM;ܟ'Bo#:qkNmi9D2Z Ε36l]FV6@ҘgP 9 vƞ0T؅Sy^S`]՟'(bke C!+#5\vQ/%J\RPUpG>׎vW؉’3f0R%<ޮۆ⽏Bx0K~K{R_}p:YrZ\p_lbX싿u[b_Brt˼Rz\w&~YMh|_-G|u=/ vמ ڼd?Xa4qcFx~+%Ԃ*[Q$VWsnH#,*Fi?s%sҽ,בur,~9{N@0+Ķ2lnQw.^P~S;vQ2boG-,K7 ce'қ 1]%q)\9|uutMevm{Jۅ Dޙ f M=`7H:Lkmؙ9Ws^~w(ܪ"~jBk09ϔς P:AF]Hp8׍*}Mב7\տ. Ԓʧ&- HAw5s5; gaiM^SwqT:E+oelP!~Vm7 99}\ͪyB򛽣H48]s\jxʽU7[vRxiZ%8*$)]ȾD>zB`d+hLVJ79VPߤ0%g|ɑZFOWDB[Yz@"lj75&8]MS' . Frほ{6\Hja{0YA/.9=X~c|#7:C U\9tg_0  P˜ R 1];8}c5]l:%|1ۛ2yIR]0xTޕ@(|-~2gNIp <R+j]뙸2w -Y9I'͌z̚O,Z(e\o¡<p.QrBVLE%u!wJu`KHU2 ~S.# 9*F͟/\^8 A>{=&ПEBmP 2Qf/ 4g,e㶖+G!R_z]30$n9=S>j'ϑyMSR<'J@/'Dǂ g(٬^-˧yNeSAZЀ`$?wIDR! .Om]aNW/1鈙6u{!3o:@s-&/?d?R+)#l3r+enTNXHtk B X8Td]Zg  Ǟ" o͚ȓ Q!H&iD6{@s"v_ b(}w,%4csnh}3kaCͽyТHJJol(-#ǀ~ 4o8t0[{ooQ:я 1,OȍaK&ìKPIc9ܡwZ 98*zAVQ]5Ckܠ6Lh,|?{VU9.UH_ޟZq0ks? }Ok[a;/ϒQR`7[F%bcL6)4uΖ(0PCmυx3zk,ΰ"so<,įG=-X鹼Qk"GU׳z;I) AⶎT|=:?Ա昛DMAr[Ե#-L: ĝM+R,3\|t t*hj4No.o,{)7stXĉȂ*gZ}Շ=L ap*@+U[X~* b^ ;`jVP M6:I1w@trbcn&dnǬֿ\C%A=v@Fr&&D`B"!4[bXr ? F"ٖ*F,2+k;t饵lsuԃ,mnӴ%+ DIgc޶ϭ~V\fv`EzC0>4\Gyt1eIz~{^ebpӈYopҖf&>#ldd@0e0A$%bB0뢐6]@!)TG{0T|awƿ^s)$=uqx,sfֽqc*xhv@neBYʩ2\`TC>~v2*˔NIn{:c~y)L%x:N0XL;7.Z0,V#=c&SЭƑ]o",B'%W9۷:6[D:9>֒-uD/g 6 v0m3ܑ&/B*! ǚ>4Ct`ZI/mmy[% ŻP/f=T%Hz1Ili6C~5#xbG6ϯ>n =8o!jMW(~U6*؄j3Dl,N.j;",{Q ĪBd~wv"VK,c d6 am-e=ɕ6Ny^4_p^Lp&gQ~L>>bv`k \X*@? LOJ$ Ncd)+)y H~lsAIid!/%X-VfROuq69ܪtG=m$c ,,j1n_vfI;REuh: <5^C9MWaB2Ao#2UriGvQY7tlkS9o|Z^9_dS R4<V=d3D:"YUZYSB@͠<< m\Y=їlG˕T0}ڞ?ˎrw2,C.-NTg@zt1*c|*|L~=\$Xwl({p!Rl*nLQ7 }{o'8}^cm+T+!z'Jj|ZT u=X.;5گUf׀:sUf~"u(fN(V+ea 2ߍ<&#hz:+#멜8m'~25!¹\*? ų2えnȰdbMn\ef kM3t">*ٸC:+ 4ekp}?y)R|_pyIz& NQWF⭔M?&96YkOvweׯj%]3'thj Dze#*tB|$Pʃ6xIԶD==c:U&,5 v!فe?QpC {(7Zw#O# $vb/st  JܬuLjHX >Ve i(񊪻')珵$ymlHݠ1l^x F 4|eC"/p|J?v/\< >eepO(W3c=W^r.Rj <#UI\\-z}(A 1mW52Bh,g>=^{+n55ʮTU>Ss yW`%B)KTtQUUҶS:c5:+o\+"6'#+ CO1֌WRSFЉx^n iE0:7l>s5x|6XVʟGRkÍʕvly ~n?0v_^06@UY3(xS*3wC9l4i?K,G{avhr|0yDG ̜Fѭ b$Րk>tMWշOe#K4 Q0Sf!Ņ6$c\SE`4Ǟ~@$#IϦp5WPt ;7[+vdI){;N&#+ ӆ( U\S0(ijᔞ"J)rA+Z!.N3.cWUr @eyZ?%lܹӗVfyYtň֮`3L?62y}!C|$ԫFP]Y)6YFGԜt-MB1YMC!6[1 q2|ʩAie%ry<7F&%3t)o9v&WXO5x]Cu=uaQ/ N[0uѻ% Ros'$ π`a0Wu^ZS0v"mBYIo,so|;`]2. 픯٭liSIa Y _N@07Hq5,-))X2-f9/Z:$Ʋnm䉿M334`,F}|_kڣV—F1EC` lens}#l F&(;wcO2mA&?YO;6Lz[5$>mB!DlX;}Va;!G}.kڞA{S2~e2؀b#N7\u3-5Cq#[xॳ}tì([Q=O0pZLNd~Ը;':U}xLs "ߋh3-'ƴn2EC J> #`kr(q'noW0w5=,qYzMt¶3`dPIVp(8 4)2xv[CŻznt%޴9cj5IKor6:9{)ۙ7NE&soGREڭ1{#nQ=kȮʈQfrnP}g XUkc!^q|U@ȰlI@CbBΜJ:%e!8?9fHej eirFuǟ'Eʝ"n(C#!SĴ UCo ȧ_ Սq^MCD6˂?B&.4*hxbOM<*%>Jy@pBwwz QJǰ৯}l5cO(_q%[cm b5|jyT~hO$ڝU8"!ǭtz@l cK٧'6n}M0ș%V 7aI>Ïv2 ӓ|Z6akPj0}r~D 㹶2A@5qs[kts{,<ȯ)ͬdb4$ԵlyCLU֜~/)0t -+/uG:~| [>w)%@f~f >>/+__- V(vGkaKM͔txfɻzb x,栈gZIFxғ03eg-ڼvwئp1cS`rI9lnq͢X a)_YX-l馛h 0|ZuTem\SLjrNlξv\{ߟ|-q2WvTtFr%GgLjB`دG(N0ewH.֌ h{0Ky..Җ1jSn`I)Dmu>͊ %y> r=Xtt"17b/7vG&onRowpbξ,T\Ўr$,h1hIޙ<K4k \ԄˬC/N`a~OtJƂ]H$yv?@]WP]݁۲m0O.wIPw9-t*1JƓT>TS7.-rMvʊtR@j r;&(m "/@>`T\[hnBiwL2Xc+o=fT6ä~> o"h[ g.A+YOAw*x3`}UW]WnCA/P?~ש|%Xq10|}{,7B&.b&q;䵲6Y8@n3cnB? ȁE!s\tot&# VyGK Zݙ/= lF8&M^xXRoLfZQkC,RނP42hSjp^hSKY]HM.8ܐ?v:W HU)d%]NIS0efh=PxIRPki_T[|a7eQ?IJKKCB ,b"KH7Wu(<1bq&B^:v`k 3F0IP7 H]} SK7rA7^*j ,*]L@ΨŸ':ZFOWfN}CxVIzs_iD:]s-괣q儇O34c*uk"XPX qt@n-ԃ/9>Dh+!u?pS'ATD$.V+ⳖcHI=Ź, tc'@V_`opF9L|< `~jLoXpNOV&„Fr4AҢQM nE5K0֫ԞGryIo g>$~+LnNf !b V.Rh`AՅ-bP7J[×C~.Z 1((kG9,p7ĥ&j[Ț}`گpxЬn35gWA'#p4ѕ/lf2JVXTk]wJqXn?ri%ZT`/ӣa&뱞e_+өގB89/[J{H5"l`ƙ!x`XX@"1Nɾ'me^LEȧeRr1YlK[q\GѶBR\lNE =鸪D籟|ʰZKp`Izi(]&;iMYӳ Ύ_hZhkoIPlV)\\ri7Z-m7Kb(jO5:#BzWηvU{]9D(:5(9.ܘ  ;ƿa^#!ZsceĄO242 \AI!I h FwBS[pԐi [0{wڱ-D;!)lyk{ӂ+a:1c**1ۏo<8zE}>%x D.x ՟V֭%(O\` HΌC`"U50%IQDN[Txx s^9+ pi]JѰ2 w/%s,%!Pn|X^y^}̎#crZ^ K;?eH,N`[ 2O_Qeǽ\ϫPΚ=:nq]+5F@\}Kzz I9a:)l;! n'%H[X췐jEX?_^۾368\!Ԁ-%r41$AS]%D>?K%\vfYa`&zRαQ(t<5~j] DE"E1vz 1Y cB15=[`6>j,-o V/" F2C3jDj!d"r6|f{I-R۷:}0[+œ$j#g[Š|ޞm|< 9s +ڜ٤\D]- ݏͩ1&dֱɲ&hyR]i2"ܠfxUsn:Q=k\`nQgnq,m~'wW_p,#ƣCs)^j45 +IH9m9e=5Ӌ8~m.~z9Sv~"[D*:( 筠V_U6kX cd0zxVpU2L;#TT"\HqJfiq^`=K7E -::y!09PTo=8@Ѫ)Nt=PW+Zx>\5 'VOk A-H>R[.U$tƮ ' ֚?H KGť`H)R^_so>fYV@5noԯO g#t)}-g#hqԐ3݀TUVT==k8p%T^4s=oiIշ% L:|*nCYO7!RR F^*yF֊SO4^]{CCC,ވ?,Z͈`~Iן #WdlO ! 4nFCԇoefFt܍Z`;Ll:aA3(:儿P#b E7LJo%K)G m=ע6բX"ư*ܲ%CG"r`X3P#EVoy,EpQ k?J$M* BdJi5.QsF^Y]sֹ*ۢiF|aEUAYD,{ ܭ~bIe3@&gSH 5ɛ"d gKVHߖ2VS앻\$0ߚ]5>نMe+[~>&摱Z|`VՓEP?5XRp&g"OA(bB ;zdT;h?)T3e%^6KɼTMJ&oKhdW͞6eN޶YN,_A1bA(V[vJs[5b;O׫XŲUCF+7C0h,OwvU^G>7 T=Xѷ2K_rJ]ceeY:@?,[?:P-ڗBbF2`6P!wN*Ƃ>x+/Z6" y"zӾ2BAL~KmDm8gRc߆G*wC ~zԶ?ZJwޫapt6Ey~zpj}ٹrf{Ps0[[kᠭ0=dSH5˿ قH~mQ`L")9}*@Dc."H'tR?"38պtW~K bcfp֍zb\N ~ʦ U+0s-jR ՞ vnK;f(ʒ҂Xs Qݤ}5뿋8rJSqq(-A>VPEE{ahp"_,BQ<&,L;o"کRQH9F{h蓤OnzgD#B妬$9 >Fr~p98TbLU(-]p* 2yQmq˪jfuWa<>ptWf"ēlm-|"N.8]Ih?>4-}FEHY \Cz-IFYq5Hf b"C`7.wOEӞe/_J0`Mf9h\%U=H^ ^ ir춠{RZ XQc{w%{-b룜;$XL̚[Q[6m! =[ol؉rVrkA% |Gr۳:[L>.P2ͳm#V @fPKCd@l/e#ZWO"`g?RU!e&OTc \7H7Dd G ppF۴w8bip˹C6GuXbDd}\ ZH2!V4CI LiVLCn7B@rtdC HY4b^$[U`"7Xn=,P yv^`JF{ %)Zr*7Ph3\ v*9vĪfLoo'= 9tXNl+ bV(ш2ؿa(銒gPX%&,PFAo[3»g>/hC,b#z,qdՅgX N>O !'nSa~6#l(nhhUj4]ΒѴtns zUxzJ=%ntlfWMJ-Lݻ/[7^j'>AjR`zN.*Į0k2GDX4O1__/M^:kqк9m*)\Y *y?J:cA4 6OlTx֧Z4ב,e`ﺬ'ɨ|ռQ1S.F&F4\D.G!!tN[+KʢٲQ<13Kk)k$ 2ЁCk$hOWҠ8Z`P#^Tc@ ycsE8o4 1k^o_z}pj-ƦwpMz3Y/LIQ <1bPdzގ;8=;V;:(S0j361QAtNv*ޣhҔ=~T IE8,E>m'}@E2?ٽD@P2S/#({FD5_kk4K 1A0,XZ cfdEZ?Nd#:ԓ@Sː)`>re2)E.kdB~-Yo/XlAn5X ȥ|j?H}Jre{l貙Iѱbv-+UvB#pŧ+s ;%H_gEMHM67&}0W4V*ơ[ x96^]nswT|B&s ZxT oOjPbգt h(Ë4Yd~D{ב F%{{UaV(@Q3qf*" XmZXs=b0yReJD[<u^ܤv&\ dX.q(BSpb7oit89{Z8_Hߐ6oU&Y ^*eCɉ*@,;uS%O`h_ކ&;-ϦעNFZ`sў_= ܶK4-Vψ0B[i&pm:\a챡O tO6ySWgrvOήT FV[3s/~ %G[\R`O7E<5,LAB%Pv7~+W(NHOS؃9=i*ELTV,2ox|_d /}DZu%>$4uCIi_~5 XZ]k7y9+${Yx#" &U")]5 MUӑC5 /(r8-AG&7ݸ r37:pIe( VrQ=f'bN͙+lQ \JrD .@U0U>D8o_M  a'u~GYatKT^dX9e*r{ v'L碭Bާgka =*_&Hzp  w( Gv_@k^bM2$e?HĎG\lTwFLtx"2*ۀ gZtW)64N{cSI^砦]+Q0:-2L}bB7+ Kh[va_P୿D㼖WC'lQm"}>Z탆ձDi@pыlTjj %vK-K/">0 YZcba/src/0000755000175100001440000000000014627400032011602 5ustar hornikuserscba/src/optimal.c0000644000175100001440000003406711304023136013420 0ustar hornikusers #include #include /* compute the lenght of an order, i.e. the sum of * the edge weights along the path defined by the * order. * * note that the order is a tour with the leg between * the first and the last city omitted. * * ceeboo 2005 */ static double orderLength(double *x, int *o, int n) { double v, z; int i, j, k; z = 0; /* path length */ i = o[0]; for (k = 0; k < n-1; k++) { j = o[k+1]; if (i > j) v = x[i+j*(n-1)-j*(j+1)/2-1]; else if (i == j) return NA_REAL; else v = x[j+i*(n-1)-i*(i+1)/2-1]; if (!R_FINITE(v)) return NA_REAL; z += v; i = j; } return z; } /* R wrapper */ SEXP order_length(SEXP R_dist, SEXP R_order) { int n, k; int *o; SEXP R_obj; n = 1 + (int) sqrt(2 * LENGTH(R_dist)); if (LENGTH(R_dist) < 1 || LENGTH(R_dist) != n*(n-1)/2) error("order_cost: invalid length"); if (LENGTH(R_order) != n) error("order_length: \"dist\" and \"order\" do not match"); o = Calloc(n, int); for (k = 0; k < n; k++) /* offset to C indexing */ o[k] = INTEGER(R_order)[k]-1; PROTECT(R_obj = NEW_NUMERIC(1)); REAL(R_obj)[0] = orderLength(REAL(R_dist), o, n); Free(o); UNPROTECT(1); return R_obj; } /* check validity of a merge tree representation */ int checkRmerge(int *x, int n) { int k, v; if (x[0] > 0 || x[n-1] > 0) /* initial merge */ return 0; for (k = 0; k < 2*(n-1); k++) { v = x[k]; if (v < -n || v > n-1) return 0; if (v > 0 && v > k+1) return 0; } return 1; } /* Z. Bar-Joseph, E. D. Demaine, D. K. Gifford, and T. Jaakkola. * (2001) Fast Optimal Leaf Ordering for Hierarchical Clustering. * Bioinformatics, Vol. 17 Suppl. 1, pp. 22-29. * * this implementation builds on the improvements of a more recent paper * available at the website of Bar-Joseph! * * as input we exepct a matrix with the distances in the lower triangle, * a merge tree, i.e. two arrays holding n-1 indexes of the left and right * subtrees (or leaves) merged at the kth step (for details see dist and * hclust). * * returns a list with a matrix (merge) and two vectors (order and length). * * The algorithm has the following stages: * * 1) find a leaf ordering consistent with the supplied merge tree. * the order of the leaves of a tree consists of the order of the * leaves in the left subtree followed by the order of the leaves * in the right subtree. * * note that the tree (leaf) indexes must have an offset of one because * the leaves are coded as negative numbers. subtrees are referenced by * their position in the merge sequence (see hclust). this sucks! * * we compute for each left and right subtree the offset of the leftmost * leaf in the total order of leaves, and the number of leaves in both * trees, i.e. in the parent tree. * * 2) recursively compute for each pair of outer endpoints, i.e. a left * endpoint from the left subtree and a right endpoint from the right * subtree the length of the optimal ordering of the leaves. * * the temporary tables are stored in the lower triangle as well as the * similarities. the lengths of the best linear orderings are stored in * the upper triangle. * * for the improved computations at the root the diagonal is used as * storage for temporary results. * * the time complexity of finding all the partial optimal leaf orderings * is O(n^3). * * the suggested improvement based on early termination of the search is * currently not implemented. however, ties are broken randomly. * * 3) recursively find the total optimal leaf ordering. * * 4) find the merge tree corresponding to the optimal ordering. * * fixme: using similarities would allow a remapping of non-finite * values to zero and thus sanitizing of overflows. also for * missing values this would be a more user friendly approach. * * (C) ceeboo 2005 */ static int calcAllOrder(double *x, int *e, int *oi, int *ok, int *oj, int ci, int ck, int cj, int n) { int i, ii, j, jj, k, kk, h = 0, l; double s, z; for (i = 0; i < ci; i++) { ii = oi[i]; for (j = 0; j < cj; j++) { jj = oj[j]; l = 0; z = R_PosInf; for (k = 0; k < ck; k++) { kk = ok[k]; if (ii > kk) s = x[kk+ii*n]; else s = x[ii+kk*n]; if (kk > jj) s += x[kk+jj*n]; else s += x[jj+kk*n]; if (s < z) { z = s; h = kk; l = 1; } else if (s == z) { if (unif_rand() > (double) l/(l+1)) h = kk; l++; } } if (!R_FINITE(z)) return 0; /* error */ if (ii > jj) x[jj+ii*n] = z; else x[ii+jj*n] = z; e[ii+jj*n] = h; } } return 1; } static int calcEndOrder(double *x, int *e, int *oi, int *ok, int ci, int ck, int n) { int i, ii, k, kk, h = 0, l; double s, z; for (i = 0; i < ci; i++) { ii = oi[i]; l = 0; z = R_PosInf; for (k = 0; k < ck; k++) { kk = ok[k]; if (ii > kk) s = x[kk+ii*n]; else s = x[ii+kk*n]; if (s < z) { z = s; h = kk; l = 1; } else if (s == z) { if (unif_rand() > (double) l/(l+1)) h = kk; l++; } } if (!R_FINITE(z)) return 0; x[ii+ii*n] = z; e[ii+ii*n] = h; } return 1; } static int debug = FALSE; SEXP order_optimal(SEXP R_dist, SEXP R_merge) { int n, i, ii, j, jj, k, kk, h, a = 0, b = 0; int cl = 0, cll = 0, clr = 0, cr = 0, crl = 0, crr = 0; int *l, *r, *c, *e; int *left, *right, *o, *ol = 0, *oll = 0, *olr = 0, *or = 0, *orl = 0, *orr = 0; double s, z, zz; double *x; SEXP R_obj; n = 1 + (int) sqrt(2 * LENGTH(R_dist)); if (LENGTH(R_dist) < 3 || LENGTH(R_dist) != n*(n-1)/2) error("order_optimal: invalid length"); if (LENGTH(GET_DIM(R_merge)) != 2) error("order_optimal: \"merge\" invalid"); if (INTEGER(GET_DIM(R_merge))[0] != n-1) error("order_optimal: \"dist\" and \"merge\" do not conform"); if (!checkRmerge(INTEGER(R_merge), n)) error("order_optimal: \"merge\" invalid"); /* copy similarities into lower triangle */ x = Calloc(n*n, double); /* data + part order lengths + temporary */ k = 0; for (i = 0; i < n-1; i++) for (j = i+1; j < n; j++) { z = REAL(R_dist)[k++]; if (!R_FINITE(z)) { Free(x); error("order_optimal: \"dist\" invalid"); } else x[j+i*n] = z; } PROTECT(R_obj = NEW_LIST(3)); /* result list */ SET_ELEMENT(R_obj, 0, duplicate(R_merge)); /* merge */ SET_ELEMENT(R_obj, 1, NEW_INTEGER(n)); /* order */ SET_ELEMENT(R_obj, 2, NEW_NUMERIC(1)); /* length */ left = INTEGER(VECTOR_ELT(R_obj, 0)); right = INTEGER(VECTOR_ELT(R_obj, 0))+n-1; o = INTEGER(VECTOR_ELT(R_obj, 1)); GetRNGstate(); l = Calloc(n, int); /* offset of leftmost leaf of left tree */ r = Calloc(n, int); /* offset of leftmost leaf of right tree; * reverse mapping of order */ c = Calloc(n-1, int); /* number of leaves in a tree */ e = Calloc(n*n, int); /* inner endpoints */ /* for each tree count the number of leaves. */ for (k = 0; k < n-1; k++) { if (left[k] > 0) c[k] += c[left[k]-1]; else c[k] = 1; if (right[k] > 0) c[k] += c[right[k]-1]; else c[k] += 1; } /* backpropagate the counts to obtain the current * leaf order and the offset of the leftmost leaf * of the left and right subtree. */ for (k = n-2; k >= 0; k--) { if (left[k] > 0) { h = l[k] + c[left[k]-1]; if (right[k] > 0) l[right[k]-1] = h; else o[h] = -right[k]-1; l[left[k]-1] = l[k]; } else { h = l[k] + 1; if (right[k] > 0) l[right[k]-1] = h; else o[h] = -right[k]-1; o[l[k]] = -left[k]-1; } r[k] = h; } /* determine for each subtree the optimal order * for each pair of left and right endpoints * (leaves). this is done in the order provided * by the merge tree. */ for (k = 0; k < n-1; k++) { ol = o + l[k]; /* order of left subtree */ or = o + r[k]; /* order of right subtree */ cl = r[k] - l[k]; /* number of leaves in left subtree */ cr = c[k] - cl; /* number of leaves in right subtree */ if (cl > 1) { /* a left tree */ h = left[k]-1; oll = o + l[h]; olr = o + r[h]; cll = r[h] - l[h]; clr = c[h] - cll; } else { /* a left leaf */ oll = olr = ol; cll = clr = cl; } if (cr > 1) { /* a right tree */ h = right[k]-1; orl = o + l[h]; orr = o + r[h]; crl = r[h] - l[h]; crr = c[h] - crl; } else { /* a right leaf */ orl = orr = or; crl = crr = cr; } if (k == n-2) /* optimized search at the root */ break; /* compute temporary sums for all endpoints */ if (!calcAllOrder(x, e, oll, olr, or, cll, clr, cr, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (olr != oll) if (!calcAllOrder(x, e, olr, oll, or, clr, cll, cr, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } /* copy temporary sums to lower triangle */ for (i = 0; i < cl; i++) { ii = ol[i]; for (j = 0; j < cr; j++) { jj = or[j]; if (ii > jj) x[ii+jj*n] = x[jj+ii*n]; else x[jj+ii*n] = x[ii+jj*n]; } } /* compute best orders for all endpoints */ if (!calcAllOrder(x, e, orl, orr, ol, crl, crr, cl, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (orr != orl) if (!calcAllOrder(x, e, orr, orl, ol, crr, crl, cl, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } /* now that we know both endpoints we can store * the inner endpoint from the left tree at the * correct addresse. */ for (i = 0; i < cr; i++) { ii = or[i]; for (j = 0; j < cl; j++) { jj = ol[j]; kk = e[ii+jj*n]; if (ii > jj) x[ii+jj*n] = (double) e[jj+kk*n]; else x[jj+ii*n] = (double) e[jj+kk*n]; } } /* copy back */ for (i = 0; i < cl; i++) { ii = ol[i]; for (j = 0; j < cr; j++) { jj = or[j]; if (ii > jj) e[ii+jj*n] = (int) x[ii+jj*n]; else e[ii+jj*n] = (int) x[jj+ii*n]; } } } /* find the best linear order for each endpoint * of the left and right subtree of the root */ if (!calcEndOrder(x, e, oll, olr, cll, clr, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (olr != oll) if (!calcEndOrder(x, e, olr, oll, clr, cll, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (!calcEndOrder(x, e, orl, orr, crl, crr, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (orr != orl) if (!calcEndOrder(x, e, orr, orl, crr, crl, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } /* find the best linear order at the root */ k = 0; z = R_PosInf; for (i = 0; i < cl; i++) { ii = ol[i]; zz = x[ii+ii*n]; for (j = 0; j < cr; j++) { jj = or[j]; s = zz + x[jj+jj*n]; if (ii > jj) s += x[ii+jj*n]; else s += x[jj+ii*n]; if (s < z) { z = s; a = ii; b = jj; k = 1; } else if (s == z) { if (unif_rand() > (double) k/(k+1)) { a = ii; b = jj; } k++; } } if (!R_FINITE(z)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } } REAL(VECTOR_ELT(R_obj, 2))[0] = z; /* set length */ /* the order can be found by double recursion. * for this we use a stack, one for the left * and one for the right endpoints. */ l[0] = b; /* push endpoints of right tree on the stack*/ r[0] = e[b+b*n]; i = e[a+a*n]; /* start with endpoints of left tree */ j = a; h = 0; k = 1; while (h < n) { if (i == j) { /* backtrack */ o[h++] = i; k--; if (k < 0) break; i = l[k]; /* pop endpoints */ j = r[k]; } else { l[k] = e[j+i*n]; /* push endpoints of right tree on the stack */ r[k] = j; k++; j = e[i+j*n]; /* recurse left tree */ } } /* adjust the merge tree to the optimal order * * 1) for each pair of leaves from a left and right * subtree the order relation is the same. thus, * use the leftmost leaves as representatives. * * 2) if the order is reversed we must swap the * subtrees at the parent. */ for (k = 0; k < n; k++) /* reverse mapping of optimal order */ r[o[k]] = k; for (k = 0; k < n-1; k++) { if (left[k] > 0) /* left leaf in left subtree */ i = l[left[k]-1]; else i = -left[k]-1; if (right[k] > 0) /* left leaf in right subtree */ j = l[right[k]-1]; else j = -right[k]-1; if (r[i] > r[j]) { /* swap the subtrees */ h = right[k]; right[k] = left[k]; left[k] = h; } l[k] = i; /* left leaf in parent tree */ } for (k = 0; k < n; k++) /* offset to R indexing */ o[k]++; if (debug) { i = e[a+a*n]; j = e[b+b*n]; if (i > j) x[j+i*n] = z; else x[i+j*n] = z; for (k = 0; k < n-1; k++) { if (left[k] > 0) l[k] = l[left[k]-1]; else l[k] = -left[k]-1; if (right[k] > 0) r[k] = r[right[k]-1]; else r[k] = -right[k]-1; i = l[k]; j = r[k]; if (i > j) z = x[j+i*n]; else z = x[i+j*n]; Rprintf(" %3i | %4i %4i | %3i %3i | %f\n", k+1, left[k], right[k], i+1, j+1, z); } } Free(x); Free(l); Free(r); Free(c); Free(e); PutRNGstate(); UNPROTECT(1); return R_obj; } /**/ cba/src/dll.c0000644000175100001440000000474513041567251012541 0ustar hornikusers #include #include #include extern SEXP ccfkms(SEXP R_x, SEXP R_p, SEXP R_par, SEXP R_max_iter, SEXP R_opt_std, SEXP R_debug); extern SEXP cluster_dist(SEXP R_x, SEXP R_beta); extern SEXP as_dummy(SEXP R_x); extern SEXP gknn(SEXP R_x, SEXP R_y, SEXP R_k, SEXP R_l, SEXP R_break_ties, SEXP R_use_all, SEXP R_prob); extern SEXP order_optimal(SEXP R_dist, SEXP R_merge); extern SEXP order_length(SEXP R_dist, SEXP R_order); extern SEXP order_greedy(SEXP R_dist); extern SEXP lminter(SEXP R_x, SEXP R_block_size, SEXP R_nbin); extern SEXP proximus(SEXP R_mat, SEXP R_max_radius, SEXP R_min_size, SEXP R_min_retry, SEXP R_max_iter, SEXP R_debug); extern SEXP rockLink(SEXP R_x, SEXP R_beta); extern SEXP rockMerge(SEXP R_x, SEXP R_n, SEXP R_theta, SEXP R_debug); extern SEXP rockClass(SEXP R_x, SEXP R_l, SEXP R_beta, SEXP R_theta); extern SEXP sdists(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_weight, SEXP R_pairwise); extern SEXP sdists_transcript(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_weight, SEXP R_table); extern SEXP sdists_graph(SEXP x); extern SEXP sdists_align(SEXP R_x, SEXP R_y, SEXP t); extern SEXP stress(SEXP R_x, SEXP R_r, SEXP R_c, SEXP R_type); extern SEXP stress_dist(SEXP R_x, SEXP R_r, SEXP R_c, SEXP R_bycol, SEXP R_type); extern SEXP orderTSP(SEXP x, SEXP t); static const R_CallMethodDef CallEntries[] = { {"R_ccfkms", (DL_FUNC) ccfkms, 6}, {"R_cluster_dist", (DL_FUNC) cluster_dist, 2}, {"R_as_dummy", (DL_FUNC) as_dummy, 1}, {"R_gknn", (DL_FUNC) gknn, 7}, {"R_order_optimal", (DL_FUNC) order_optimal, 2}, {"R_order_length", (DL_FUNC) order_length, 2}, {"R_order_greedy", (DL_FUNC) order_greedy, 1}, {"R_lminter", (DL_FUNC) lminter, 3}, {"R_proximus", (DL_FUNC) proximus, 6}, {"R_rockLink", (DL_FUNC) rockLink, 2}, {"R_rockMerge", (DL_FUNC) rockMerge, 4}, {"R_rockClass", (DL_FUNC) rockClass, 4}, {"R_sdists", (DL_FUNC) sdists, 5}, {"R_sdists_transcript", (DL_FUNC) sdists_transcript, 5}, {"R_sdists_graph", (DL_FUNC) sdists_graph, 1}, {"R_sdists_align", (DL_FUNC) sdists_align, 3}, {"R_stress", (DL_FUNC) stress, 4}, {"R_stress_dist", (DL_FUNC) stress_dist, 5}, {"R_orderTSP", (DL_FUNC) orderTSP, 2}, {NULL, NULL, 0} }; void R_init_cba(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } cba/src/stress.c0000644000175100001440000003017612265434154013310 0ustar hornikusers #include #include // arrayIndex.c extern SEXP _int_array_subscript(int, SEXP, const char *, const char *, SEXP, Rboolean, SEXP); /* compute the stress measure based on Moor Neighborhoods, i.e. the * sums of the squared distances of a point to its eight (five at the * margins and three at the corners) adjacent neighbors as defined by * the row and column indexes (or subsets of it). * * this function counts each edge distance only once! so, if you * prefer the measure from the paper you have to take twice the * value. * * note that NAs are omitted. however, the function does not return * NA if there was no legal edge at all. */ double stressMoore(double *x, int *r, int *c, int nr, int nc, int nrx) { double d, v, z; int i, j, l, ll, k, kk; z = 0; l = r[0]; for (i = 0; i < nr-1; i++) { ll = r[i+1]; k = c[0] * nrx; for (j = 0; j < nc-1; j++) { kk = c[j+1] * nrx; v = x[l+k]; if (!ISNAN(v)) { d = v - x[ll+k]; if (!ISNAN(d)) z += d * d; d = v - x[ll+kk]; if (!ISNAN(d)) z += d * d; d = v - x[l+kk]; if (!ISNAN(d)) z += d * d; } d = x[ll+k] - x[l+kk]; k = kk; if (!ISNAN(d)) z += d * d; } d = x[l+k] - x[ll+k]; l = ll; if (!ISNAN(d)) z += d * d; R_CheckUserInterrupt(); } k = c[0] * nrx; for (j = 0; j < nc-1; j++) { kk = c[j+1] * nrx; d = x[l+k] - x[l+kk]; k = kk; if (!ISNAN(d)) z += d * d; } return z; } /* same as above but use a von Neumann neighborhood, i.e. the * neighboring points on the diagonals are excluded. */ double stressNeumann(double *x, int *r, int *c, int nr, int nc, int nrx) { double d, v, z; int i, j, l, ll, k, kk; z = 0; l = r[0]; for (i = 0; i < nr-1; i++) { ll = r[i+1]; k = c[0] * nrx; for (j = 0; j < nc-1; j++) { kk = c[j+1] * nrx; v = x[l+k]; if (!ISNAN(v)) { d = v - x[ll+k]; if (!ISNAN(d)) z += d * d; d = v - x[l+kk]; if (!ISNAN(d)) z += d * d; } k = kk; } d = x[l+k] - x[ll+k]; l = ll; if (!ISNAN(d)) z += d * d; R_CheckUserInterrupt(); } k = c[0] * nrx; for (j = 0; j < nc-1; j++) { kk = c[j+1] * nrx; d = x[l+k] - x[l+kk]; k = kk; if (!ISNAN(d)) z += d * d; } return z; } /* R wrapper to the stress functions */ SEXP stress(SEXP R_x, SEXP R_r, SEXP R_c, SEXP R_type) { int nrx, nr, nc; int k; int *r, *c; SEXP R_obj; #ifdef _COMPAT_ PROTECT(R_r = arraySubscript(0, R_r, GET_DIM(R_x), getAttrib, (STRING_ELT), R_x)); PROTECT(R_c = arraySubscript(1, R_c, GET_DIM(R_x), getAttrib, (STRING_ELT), R_x)); #else PROTECT(R_r = _int_array_subscript(0, R_r, "dim", "dimnames", R_x, TRUE, R_NilValue)); PROTECT(R_c = _int_array_subscript(1, R_c, "dim", "dimnames", R_x, TRUE, R_NilValue)); #endif nrx = INTEGER(GET_DIM(R_x))[0]; /* number of rows */ nr = LENGTH(R_r); nc = LENGTH(R_c); /* remap R indexes to C indexes * this sucks! */ r = Calloc(nr, int); c = Calloc(nc, int); /* copy and shift indexes */ for (k = 0; k < nr; k++) r[k] = INTEGER(R_r)[k]-1; for (k = 0; k < nc; k++) c[k] = INTEGER(R_c)[k]-1; PROTECT(R_obj = NEW_NUMERIC(1)); switch (INTEGER(R_type)[0]) { case 1: REAL(R_obj)[0] = stressMoore(REAL(R_x), r, c, nr, nc, nrx); break; case 2: REAL(R_obj)[0] = stressNeumann(REAL(R_x), r, c, nr, nc, nrx); break; default: Free(r); Free(c); error("stress: type not implemented"); } Free(r); Free(c); UNPROTECT(3); return R_obj; } /* calculate the Moore distances between all pairs of rows or columns. * of a matrix. for a given (fixed) row or column ordering the distances * could be used to search for an optimal column or row ordering using * an alternating scheme. * * if the calculation are over the rows ncx = 1, otherwise the roles * of rows and columns are swapped and nrx = 1. * * the caller must provide the result array d and the temporary array t. * * the distances are arranged in lower triangular column format (compare * the R function dist). * * note that the edge distances are computed only once! * * (C) ceeboo 2005, 2006 */ void distMoore(double *x, int *r, int *c, int nr, int nc, int nrx, int ncx, double *d, double *t) { double v, w, z; int i, ii, j, jj, k, kk, kkk, l; for (k = 0; k < nr*(nr-1)/2; k++) /* initialize distances */ d[k] = 0; for (i = 0; i < nr; i++) { z = 0; ii = r[i] * ncx; kk = c[0] * nrx; for (k = 0; k < nc-1; k++) { kkk = c[k+1] * nrx; w = x[ii+kk] - x[ii+kkk]; if (!ISNAN(w)) z += w * w; kk = kkk; } t[i] = z; R_CheckUserInterrupt(); } l = 0; for (i = 0; i < nr-1; i++) { ii = r[i] * ncx; for (j = i+1; j < nr; j++) { z = t[i] + t[j]; jj = r[j] * ncx; kk = c[0] * nrx; for (k = 0; k < nc-1; k++) { kkk = c[k+1] * nrx; v = x[ii+kk]; if (!ISNAN(v)) { w = v - x[jj+kk]; if (!ISNAN(w)) z += w * w; w = v - x[jj+kkk]; if (!ISNAN(w)) z += w * w; } w = x[jj+kk] - x[ii+kkk]; if (!ISNAN(w)) z += w * w; kk = kkk; } w = x[ii+kk] - x[jj+kk]; if (!ISNAN(w)) z += w * w; d[l++] = z; R_CheckUserInterrupt(); } } } /* calculate the von Neumann distances over the rows or columns of a * matrix. * * compare above. */ void distNeumann(double *x, int *r, int *c, int nr, int nc, int nrx, int ncx, double *d, double *t) { double w, z; int i, ii, j, jj, k, kk, kkk, l; for (k = 0; k < nr*(nr-1)/2; k++) /* initialize distances */ d[k] = 0; for (i = 0; i < nr; i++) { z = 0; ii = r[i] * ncx; kk = c[0] * nrx; for (k = 0; k < nc-1; k++) { kkk = c[k+1] * nrx; w = x[ii+kk] - x[ii+kkk]; if (!ISNAN(w)) z += w * w; kk = kkk; } t[i] = z; R_CheckUserInterrupt(); } l = 0; for (i = 0; i < nr-1; i++) { ii = r[i] * ncx; for (j = i+1; j < nr; j++) { z = t[i] + t[j]; jj = r[j] * ncx; for (k = 0; k < nc-1; k++) { kk = c[k] * nrx; w = x[ii+kk]- x[jj+kk]; if (!ISNAN(w)) z += w * w; } kk = c[k] * nrx; w = x[ii+kk] - x[jj+kk]; if (!ISNAN(w)) z += w * w; d[l++] = z; R_CheckUserInterrupt(); } } } /* R wrapper */ SEXP stress_dist(SEXP R_x, SEXP R_r, SEXP R_c, SEXP R_bycol, SEXP R_type) { int nrx, nr, nc; int k; int *r, *c; double *d, *t; SEXP R_obj = R_NilValue; /* compiler hack */ #ifdef _COMPAT_ PROTECT(R_r = arraySubscript(0, R_r, GET_DIM(R_x), getAttrib, (STRING_ELT), R_x)); PROTECT(R_c = arraySubscript(1, R_c, GET_DIM(R_x), getAttrib, (STRING_ELT), R_x)); #else PROTECT(R_r = _int_array_subscript(0, R_r, "dim", "dimnames", R_x, TRUE, R_NilValue)); PROTECT(R_c = _int_array_subscript(1, R_c, "dim", "dimnames", R_x, TRUE, R_NilValue)); #endif nrx = INTEGER(GET_DIM(R_x))[0]; /* number of rows */ nr = LENGTH(R_r); nc = LENGTH(R_c); /* remap R indexes to C indexes * this sucks! */ r = Calloc(nr, int); c = Calloc(nc, int); /* copy and shift indexes */ for (k = 0; k < nr; k++) r[k] = INTEGER(R_r)[k]-1; for (k = 0; k < nc; k++) c[k] = INTEGER(R_c)[k]-1; switch(LOGICAL(R_bycol)[0]) { case 0: PROTECT(R_obj = NEW_NUMERIC(nr*(nr-1)/2)); d = REAL(R_obj); t = Calloc(nr, double); switch(INTEGER(R_type)[0]) { case 1: distMoore(REAL(R_x), r, c, nr, nc, nrx, 1, d, t); break; case 2: distNeumann(REAL(R_x), r, c, nr, nc, nrx, 1, d, t); break; default: Free(r); Free(c); Free(t); error("stress_dist: \"type\" not implemented"); } Free(t); break; case 1: PROTECT(R_obj = NEW_NUMERIC(nc*(nc-1)/2)); d = REAL(R_obj); t = Calloc(nc, double); switch(INTEGER(R_type)[0]) { case 1: distMoore(REAL(R_x), c, r, nc, nr, 1, nrx, d, t); break; case 2: distNeumann(REAL(R_x), c, r, nc, nr, 1, nrx, d, t); break; default: Free(r); Free(c); Free(t); error("stress_dist: type not implemented"); } Free(t); break; default: Free(r); Free(c); error("stress_dist: \"bycol\" invalid"); } Free(r); Free(c); UNPROTECT(3); return R_obj; } /* in ordering problems we want find a path of minimum length through * a distance graph. this is a TSP problem with a dummy city that is * equally distant from all other cities, i.e. the length of the path * i -> 0 -> j, closing the tour is irrelevant. however, the length of * the leg i -> j is greater or equal than any remaining leg k -> l on * the optimum tour. * * see: Climer, S. and Zhang, W. (2006) Rearrangement Clustering: * Pitfalls, Remedies, and Applications. Journal of Machine * Learning Research 7, pp. 919-943. * * orderTSP implements a greedy heuristic that exchanges two edges * immediately if this improves the tour length and stops if no further * improvement (over all combinations of edges) is possible. exchanging * edges amounts to reversing subpaths. * * the time complexity is O(n^2) with n the number of cities. * * note: the algorithm could easily be extended to a simulated * annealing algorithm. the code is slightly optimized. */ SEXP orderTSP(SEXP x, SEXP t) { if (TYPEOF(x) != REALSXP) error("'x' invalid storage type"); if (TYPEOF(t) != INTSXP) error("'t' invalid storage type"); int i, n, f = 0; n = 1 + (int) sqrt(2*LENGTH(x)); if (LENGTH(x) != n*(n-1)/2) error("'x' invalid length"); if (LENGTH(t) != n) error("'t' invalid length"); for (i = 0; i < n; i++) if (INTEGER(t)[i] < 1 || INTEGER(t)[i] > n) error("'t' invalid"); PROTECT(t = duplicate(t)); do { int i, j, k = 0, l = 0, c1, c2, c3, c4 = n-1; double e23, e13, e12, e34, e24, e31, e41; f = 0; c1 = INTEGER(t)[0]-1; for (i = 1; i < n-1; i++) { c2 = INTEGER(t)[i]-1; c3 = INTEGER(t)[i+1]-1; if (c2 > c3) e23 = REAL(x)[c2+c3*(n-1)-c3*(c3+1)/2-1]; else e23 = REAL(x)[c3+c2*(n-1)-c2*(c2+1)/2-1]; if (c1 > c3) e13 = REAL(x)[c1+c3*(n-1)-c3*(c3+1)/2-1]; else e13 = REAL(x)[c3+c1*(n-1)-c1*(c1+1)/2-1]; if (e23 > e13) { f++; for (k = 0; k < (i+1)/2; k++) { l = INTEGER(t)[i-k]; INTEGER(t)[i-k] = INTEGER(t)[k]; INTEGER(t)[k] = l; } c1 = INTEGER(t)[0]-1; } } for (i = 0; i < n-3; i++) { c1 = INTEGER(t)[i]-1; c2 = INTEGER(t)[i+1]-1; if (c1 > c2) e12 = REAL(x)[c1+c2*(n-1)-c2*(c2+1)/2-1]; else e12 = REAL(x)[c2+c1*(n-1)-c1*(c1+1)/2-1]; for (j = i+2; j < n-1; j++) { c3 = INTEGER(t)[j]-1; c4 = INTEGER(t)[j+1]-1; if (c3 > c4) e34 = REAL(x)[c3+c4*(n-1)-c4*(c4+1)/2-1]; else e34 = REAL(x)[c4+c3*(n-1)-c3*(c3+1)/2-1]; if (c2 > c4) e24 = REAL(x)[c2+c4*(n-1)-c4*(c4+1)/2-1]; else e24 = REAL(x)[c4+c2*(n-1)-c2*(c2+1)/2-1]; if (c3 > c1) e31 = REAL(x)[c3+c1*(n-1)-c1*(c1+1)/2-1]; else e31 = REAL(x)[c1+c3*(n-1)-c3*(c3+1)/2-1]; if (e12+e34 > e24+e31) { f++; for (k = 0; k < (j-i)/2; k++) { l = INTEGER(t)[j-k]; INTEGER(t)[j-k] = INTEGER(t)[i+1+k]; INTEGER(t)[i+1+k] = l; } c2 = INTEGER(t)[i+1]-1; if (c1 > c2) e12 = REAL(x)[c1+c2*(n-1)-c2*(c2+1)/2-1]; else e12 = REAL(x)[c2+c1*(n-1)-c1*(c1+1)/2-1]; } } if (c4 > c1) e41 = REAL(x)[c4+c1*(n-1)-c1*(c1+1)/2-1]; else e41 = REAL(x)[c1+c4*(n-1)-c4*(c4+1)/2-1]; if (e12 > e41) { f++; for (k = 0; k < (j-i)/2; k++) { l = INTEGER(t)[j-k]; INTEGER(t)[j-k] = INTEGER(t)[i+1+k]; INTEGER(t)[i+1+k] = l; } } R_CheckUserInterrupt(); } } while (f); UNPROTECT(1); return t; } /**/ cba/src/arrayIndex.c0000644000175100001440000001662112265434154014072 0ustar hornikusers #include #include // workaround i18n #define _(x) (x) // copied from 2.14-2 src/main/subscript.c // // ceeboo 2011/11 2014/1 // #define ECALL(call, yy) if(call == R_NilValue) error(yy); else errorcall(call, yy); static SEXP nullSubscript(int n) { int i; SEXP indx; indx = allocVector(INTSXP, n); for (i = 0; i < n; i++) INTEGER(indx)[i] = i + 1; return indx; } static SEXP logicalSubscript(SEXP s, int ns, int nx, int *stretch, SEXP call) { int canstretch, count, i, nmax; SEXP indx; canstretch = *stretch; if (!canstretch && ns > nx) { ECALL(call, _("(subscript) logical subscript too long")); } nmax = (ns > nx) ? ns : nx; *stretch = (ns > nx) ? ns : 0; if (ns == 0) return(allocVector(INTSXP, 0)); count = 0; for (i = 0; i < nmax; i++) if (LOGICAL(s)[i%ns]) count++; indx = allocVector(INTSXP, count); count = 0; for (i = 0; i < nmax; i++) if (LOGICAL(s)[i%ns]) { if (LOGICAL(s)[i%ns] == NA_LOGICAL) INTEGER(indx)[count++] = NA_INTEGER; else INTEGER(indx)[count++] = i + 1; } return indx; } static SEXP negativeSubscript(SEXP s, int ns, int nx, SEXP call) { SEXP indx; int stretch = 0; int i, ix; PROTECT(indx = allocVector(LGLSXP, nx)); for (i = 0; i < nx; i++) LOGICAL(indx)[i] = 1; for (i = 0; i < ns; i++) { ix = INTEGER(s)[i]; if (ix != 0 && ix != NA_INTEGER && -ix <= nx) LOGICAL(indx)[-ix - 1] = 0; } s = logicalSubscript(indx, nx, nx, &stretch, call); UNPROTECT(1); return s; } static SEXP positiveSubscript(SEXP s, int ns, int nx) { SEXP indx; int i, zct = 0; for (i = 0; i < ns; i++) { if (INTEGER(s)[i] == 0) zct++; } if (zct) { indx = allocVector(INTSXP, (ns - zct)); for (i = 0, zct = 0; i < ns; i++) if (INTEGER(s)[i] != 0) INTEGER(indx)[zct++] = INTEGER(s)[i]; return indx; } else return s; } static SEXP integerSubscript(SEXP s, int ns, int nx, int *stretch, SEXP call) { int i, ii, min, max, canstretch; Rboolean isna = FALSE; canstretch = *stretch; *stretch = 0; min = 0; max = 0; for (i = 0; i < ns; i++) { ii = INTEGER(s)[i]; if (ii != NA_INTEGER) { if (ii < min) min = ii; if (ii > max) max = ii; } else isna = TRUE; } if (max > nx) { if(canstretch) *stretch = max; else { ECALL(call, _("subscript out of bounds")); } } if (min < 0) { if (max == 0 && !isna) return negativeSubscript(s, ns, nx, call); else { ECALL(call, _("only 0's may be mixed with negative subscripts")); } } else return positiveSubscript(s, ns, nx); return R_NilValue; } /* This uses a couple of horrible hacks in conjunction with * VectorAssign (in subassign.c). If subscripting is used for * assignment, it is possible to extend a vector by supplying new * names, and we want to give the extended vector those names, so they * are returned as the use.names attribute. Also, unset elements of the vector * of new names (places where a match was found) are indicated by * setting the element of the newnames vector to NULL. */ /* The original code (pre 2.0.0) used a ns x nx loop that was too * slow. So now we hash. Hashing is expensive on memory (up to 32nx * bytes) so it is only worth doing if ns * nx is large. If nx is * large, then it will be too slow unless ns is very small. */ static SEXP stringSubscript(SEXP s, int ns, int nx, SEXP names, int *stretch, Rboolean in, SEXP call) { SEXP indx, indexnames; int i, j, nnames, sub, extra; int canstretch = *stretch; /* product may overflow, so check factors as well. */ Rboolean usehashing = in && ( ((ns > 1000 && nx) || (nx > 1000 && ns)) || (ns * nx > 15*nx + ns) ); PROTECT(s); PROTECT(names); PROTECT(indexnames = allocVector(VECSXP, ns)); nnames = nx; extra = nnames; /* Process each of the subscripts. First we compare with the names * on the vector and then (if there is no match) with each of the * previous subscripts, since (if assigning) we may have already * added an element of that name. (If we are not assigning, any * nonmatch will have given an error.) */ if(usehashing) { /* must be internal, so names contains a character vector */ /* NB: this does not behave in the same way with respect to "" and NA names: they will match */ PROTECT(indx = match(names, s, 0)); /* second pass to correct this */ for (i = 0; i < ns; i++) if(STRING_ELT(s, i) == NA_STRING || !CHAR(STRING_ELT(s, i))[0]) INTEGER(indx)[i] = 0; for (i = 0; i < ns; i++) SET_VECTOR_ELT(indexnames, i, R_NilValue); } else { PROTECT(indx = allocVector(INTSXP, ns)); for (i = 0; i < ns; i++) { sub = 0; if (names != R_NilValue) { for (j = 0; j < nnames; j++) { SEXP names_j = STRING_ELT(names, j); if (!in && TYPEOF(names_j) != CHARSXP) { ECALL(call, _("character vector element does not have type CHARSXP")); } if (NonNullStringMatch(STRING_ELT(s, i), names_j)) { sub = j + 1; SET_VECTOR_ELT(indexnames, i, R_NilValue); break; } } } INTEGER(indx)[i] = sub; } } for (i = 0; i < ns; i++) { sub = INTEGER(indx)[i]; if (sub == 0) { for (j = 0 ; j < i ; j++) if (NonNullStringMatch(STRING_ELT(s, i), STRING_ELT(s, j))) { sub = INTEGER(indx)[j]; SET_VECTOR_ELT(indexnames, i, STRING_ELT(s, j)); break; } } if (sub == 0) { if (!canstretch) { ECALL(call, _("subscript out of bounds")); } extra += 1; sub = extra; SET_VECTOR_ELT(indexnames, i, STRING_ELT(s, i)); } INTEGER(indx)[i] = sub; } /* We return the new names as the names attribute of the returned subscript vector. */ if (extra != nnames) setAttrib(indx, install("use.names"), indexnames); if (canstretch) *stretch = extra; UNPROTECT(4); return indx; } /* Array Subscripts. dim is the dimension (0 to k-1) s is the subscript list, dn is the attribute name of dim dnn is the attribute name of dimnames x is the array to be subscripted. */ SEXP _int_array_subscript(int dim, SEXP s, const char *dn, const char *dnn, SEXP x, Rboolean in, SEXP call) { int nd, ns, stretch = 0; SEXP dnames, tmp; ns = LENGTH(s); nd = INTEGER(getAttrib(x, install(dn)))[dim]; switch (TYPEOF(s)) { case NILSXP: return allocVector(INTSXP, 0); case LGLSXP: return logicalSubscript(s, ns, nd, &stretch, call); case INTSXP: return integerSubscript(s, ns, nd, &stretch, call); case REALSXP: PROTECT(tmp = coerceVector(s, INTSXP)); tmp = integerSubscript(tmp, ns, nd, &stretch, call); UNPROTECT(1); return tmp; case STRSXP: dnames = getAttrib(x, install(dnn)); if (dnames == R_NilValue) { ECALL(call, _("no 'dimnames' attribute for array")); } dnames = VECTOR_ELT(dnames, dim); return stringSubscript(s, ns, nd, dnames, &stretch, in, call); case SYMSXP: if (s == R_MissingArg) return nullSubscript(nd); default: if (call == R_NilValue) error(_("invalid subscript type '%s'"), type2char(TYPEOF(s))); else errorcall(call, _("invalid subscript type '%s'"), type2char(TYPEOF(s))); } return R_NilValue; } // R interface SEXP R_arraySubscript(SEXP x, SEXP dim, SEXP s, SEXP dn, SEXP dnn) { // FIXME return _int_array_subscript(INTEGER(dim)[0], s, (const char *) CHAR(STRING_ELT(dn, 0)), (const char *) CHAR(STRING_ELT(dnn, 0)), x, TRUE, R_NilValue); } // cba/src/interpolate.c0000644000175100001440000000311011304023136014262 0ustar hornikusers #include #include /* interpolate a logical matrix to a lower resolution. * * note 1) that we currently use the full storage representation * of a binary matrix and 2) that some rows and/or columns at the * lower and left margins may get cut off * * (C) ceeboo 2005 */ SEXP lminter(SEXP R_x, SEXP R_block_size, SEXP R_nbin) { int nr, nc, np, zr, zc; int i, j; int *x, *z; SEXP R_obj, R_dim; nr = INTEGER(GET_DIM(R_x))[0]; /* number of rows */ nc = INTEGER(GET_DIM(R_x))[1]; /* number of columns */ x = LOGICAL(R_x); np = INTEGER(R_block_size)[0]; /* number of pixels */ zr = nr / np; /* reduced number of rows */ zc = nc / np; /* reduced number of columns */ PROTECT(R_obj = NEW_INTEGER(zr * zc)); z = INTEGER(R_obj); for (j = 0; j < zr * zc; j++) /* this sucks! */ z[j] = 0; for (j = 0; j < zc * np; j++) for (i = 0; i < zr * np; i++) z[i / np + (j / np) * zr] += x[i + j * nr]; i = INTEGER(R_nbin)[0]; /* number of bins */ if (i < 0 || i > np) error("lminter: invalid number of bins"); if (i == 0) { /* majority */ i = np * np / 2 + 1; for (j = 0; j < zr * zc; j++) z[j] /= i; } else { /* bins */ i = np * np / i; for (j = 0; j < zr * zc; j++) z[j] = ceil((double) z[j] / i); } PROTECT(R_dim = NEW_INTEGER(2)); INTEGER(R_dim)[0] = zr; INTEGER(R_dim)[1] = zc; SET_DIM(R_obj, R_dim); UNPROTECT(2); return R_obj; } /**/ cba/src/coding.c0000644000175100001440000000142111304023136013202 0ustar hornikusers #include #include /* dummy code a factor where NAs are ignored, * i.e. all indicators are FALSE */ SEXP as_dummy(SEXP R_x) { int n, l, i, j; SEXP R_obj, R_tmp; n = LENGTH(R_x); l = LENGTH(GET_LEVELS(R_x)); if (l == 0) return R_NilValue; PROTECT(R_obj = NEW_LOGICAL(n*l)); for (i = 0; i < n*l; i++) /* this sucks! */ LOGICAL(R_obj)[i] = FALSE; for (i = 0; i < n; i++) { j = INTEGER(R_x)[i]; if (j == NA_INTEGER) continue; LOGICAL(R_obj)[i+(j-1)*n] = TRUE; } PROTECT(R_tmp = NEW_INTEGER(2)); INTEGER(R_tmp)[0] = n; INTEGER(R_tmp)[1] = l; SET_DIM(R_obj, R_tmp); UNPROTECT(1); SET_LEVELS(R_obj, duplicate(GET_LEVELS(R_x))); UNPROTECT(1); return R_obj; } /**/ cba/src/greedy.c0000644000175100001440000000604311304023136013223 0ustar hornikusers #include #include /* greedy endpoint ordering based on arbitrary similarities. * this is trivial. * * input is a lower triangular distance matrix. returns the * merge tree), the corresponding order, and the height (see * hclust). * * note that the height need not be monotonically increasing! * * (C) ceeboo 2005 */ typedef struct { double v; int i; } MDS; static MDS minDist(double *x, int j, int *c, int *p, int n) { int i, k, l; double v; MDS m = {R_PosInf, 0}; l = 0; for (k = 0; k < n; k++) { i = c[k]; if (i > j) v = x[i+p[j]]; else v = x[j+p[i]]; if (v < m.v) { m.v = v; m.i = i; l = 1; } else if (v == m.v) { if (unif_rand() > (double) l/(l+1)) m.i = i; l++; } } return m; } /* swap */ static void swap(int *x1, int *x2) { int x = *x1; *x1 = *x2; *x2 = x; } SEXP order_greedy(SEXP R_dist) { int n, i, j, h, k; int *left, *right, *order, *c, *p; double *x, *height; MDS l, ll = {R_NaN, 0}, r, rr = {R_NaN, 0}; SEXP R_obj; n = 1 + (int) sqrt(2 * LENGTH(R_dist)); if (LENGTH(R_dist) != n*(n-1)/2) error("order_greedy: \"dist\" invalid length"); PROTECT(R_obj = NEW_LIST(3)); SET_ELEMENT(R_obj, 0, allocMatrix(INTSXP, n-1, 2)); /* merge */ SET_ELEMENT(R_obj, 1, NEW_INTEGER(n)); /* order */ SET_ELEMENT(R_obj, 2, NEW_NUMERIC(n-1)); /* height */ left = INTEGER(VECTOR_ELT(R_obj, 0)); right = INTEGER(VECTOR_ELT(R_obj, 0))+n-1; order = INTEGER(VECTOR_ELT(R_obj, 1)); height = REAL(VECTOR_ELT(R_obj, 2)); x = REAL(R_dist); /* distance matrix */ GetRNGstate(); p = Calloc(n-1, int); /* column pointers */ c = Calloc(n, int); for (k = 0; k < n-1; k++) { c[k] = k; /* candidate leaves */ p[k] = k*(n-1)-k*(k+1)/2-1; order[k] = k; /* here backreference */ } c[k] = k; order[k] = k; i = (int) (unif_rand() * n); /* initial leaf */ h = l.i = ll.i = r.i = rr.i = i; for (k = 0; k < n-1; k++) { swap(c+order[h], c+n-k-1); swap(order+h, order+c[order[h]]); if (ll.i == h) ll = minDist(x, l.i, c, p, n-k-1); if (k == 0) rr = ll; else if (rr.i == h) rr = minDist(x, r.i, c, p, n-k-1); if (!R_FINITE(ll.v) || !R_FINITE(rr.v)) { Free(c); Free(p); error("order_greedy: non-finite values"); } if (ll.v < rr.v) { l = ll; h = l.i; left[k] = -h-1; right[k] = k; height[k] = l.v; } else { r = rr; h = r.i; left[k] = k; right[k] = -h-1; height[k] = r.v; } } left[0] = -i-1; /* in each step a leaf was merged. so, we can simply * descend the tree and place it on the next left * or right position. */ i = 0; j = n-1; for (k = n-2; k >= 0; k--) if (left[k] > 0) order[j--] = -right[k]; else order[i++] = -left[k]; order[j] = -right[0]; Free(c); Free(p); PutRNGstate(); UNPROTECT(1); return R_obj; } /**/ cba/src/proximus.c0000644000175100001440000003270214332123330013634 0ustar hornikusers/* * proximus.c * * implements the paper: * * M. Koyutürk, A. Graham, and N. Ramakrishnan. Compression, Clustering, * and Pattern Discovery in Very High-Dimensional Descrete-Attribute Data * Sets. IEEE Transactions On Knowledge and Data Engineering, Vol. 17, * No. 4, (April) 2005 * * As data mining algorithms are supposed to deal with large amounts of * data this code was optimized for low memory footprint and low execution * times. Fasten your seat belts ;-) * * * (C) ceeboo 2005 */ #include #include static int debug = FALSE; /* user defined */ /* copy a variable to R */ static SEXP var2R(int v) { SEXP R_obj; R_obj = NEW_INTEGER(1); INTEGER(R_obj)[0] = v; return R_obj; } /* vector for counting and indexing */ typedef struct { int *v; /* pointer to array of values */ int n; /* number elements */ } VEC; static VEC *newVec(int n) { int *v = Calloc(n, int); VEC *p = Calloc(1, VEC); p->v = v; p->n = n; return p; } static VEC *copyVec(VEC *v) { int i; VEC *r = newVec(v->n); for (i = 0; i < v->n; i++) r->v[i] = v->v[i]; return r; } static void freeVec(VEC *v) { if (v->v != NULL) Free(v->v); Free(v); } /* copy a vector to R where an offset is added * to each element */ static SEXP vec2R(VEC *v, int o) { int j; SEXP R_obj; R_obj = NEW_INTEGER(v->n); for (j = 0; j < v->n; j++) INTEGER(R_obj)[j] = v->v[j] + o; return R_obj; } /* for debugging static void VecPrintf(VEC *v, char *s) { int j; Rprintf("%s", s); for (j = 0; j < v->n; j++) Rprintf(" (%i,%i)", j, v->v[j]); Rprintf("\n"); } */ /* matrix for binary data in sparse column format */ typedef struct { int *ri; /* pointer to array of row indexes */ int *ci; /* pointer to array of column start indexes */ int nr; /* number of rows */ int nc; /* number of columns */ } MAT; static void freeMat(MAT *m) { Free(m->ri); Free(m->ci); Free(m); } /* copy and transpose R matrix to sparse matrix */ static MAT *R_mat2mat(SEXP R_mat) { extern int debug; int nr, nc; int i, j, k, n; int *x, *ci, *ri; MAT *m; x = INTEGER(R_mat); nr = INTEGER(GET_DIM(R_mat))[0]; /* number of rows */ nc = INTEGER(GET_DIM(R_mat))[1]; /* number of columns */ ci = Calloc(nr+1, int); /* column start */ n = 1024; /* initial size */ ri = Calloc(n, int); /* row indexes */ k = 0; for (j = 0; j < nr; j++) { ci[j] = k; for (i = 0; i < nc; i++) if (x[i * nr + j] == 1) { if (k == n) { /* used up */ n *= 2; /* double size */ ri = Realloc(ri, n, int); } ri[k++] = i; } } ci[j] = k; /* length of ri */ if (n > k) /* free unused */ ri = Realloc(ri, k, int); if (debug) { Rprintf("Non-Zero: %i\n", k); Rprintf("Sparsity: %4.2f\n",k / (double) (nr * nc)); } m = Calloc(1, MAT); m->ri = ri; m->ci = ci; m->nr = nc; m->nc = nr; return m; } /* multiply a matrix in sparse column format (m) with a sparse * vector (v) from the left using a subset of the columns (s). the * caller is reponsible for providing a proper results vector (r). */ static void matLeft(VEC *r, VEC *v, VEC *s, MAT *m) { int i, j, k, z; for (i = 0; i < s->n; i++) { /* columns */ z = 0; k = 0; j = m->ci[s->v[i]]; do { /* rows */ if (m->ri[j] == v->v[k]) { z++; j++; k++; } else if (m->ri[j] < v->v[k]) j++; else k++; } while (j < m->ci[s->v[i] + 1] && k < v->n); r->v[i] = z; } r->n = s->n; } /* as above but multiply from the right */ static void matRight(VEC *r, VEC *v, MAT *m) { int i, j; for (i = 0; i < m->nr; i++) r->v[i] = 0; r->n = m->nr; for (i = 0; i < v->n; i++) for (j = m->ci[v->v[i]]; j < m->ci[v->v[i] + 1]; j++) r->v[m->ri[j]]++; } /* linked list of approximation results */ typedef struct resNode { VEC *x; /* presence vector (column indexes) */ VEC *y; /* dominant pattern vector (row indexes) */ int n; /* number of ones ... */ int c; /* approximation criterion */ int r; /* hamming radius */ struct resNode *next; /* pointer to result element */ } RES; static int res_cnt; /* number of result elements */ static RES *res_last; /* last element of result list */ static int freeRes(RES *r) { int i; RES *p, *q; i = 0; for (p = r; p != NULL; p = q) { q = p->next; freeVec(p->x); freeVec(p->y); Free(p); i++; } return i; } /* copy result list to R and clean up * * fixme: pointer protection should be * on the level of the caller */ static SEXP res2R(RES *r, MAT *m) { int i, nr, nc; RES *p, *q; SEXP R_ret, R_obj, R_lst, R_res; nc = m->nr; /* transpose */ nr = m->nc; PROTECT(R_ret = NEW_LIST(3)); /* results header */ SET_ELEMENT(R_ret, 0, PROTECT(var2R(nr))); SET_ELEMENT(R_ret, 1, PROTECT(var2R(nc))); UNPROTECT(2); PROTECT(R_obj = NEW_STRING(3)); SET_STRING_ELT(R_obj, 0, mkChar("nr")); SET_STRING_ELT(R_obj, 1, mkChar("nc")); SET_STRING_ELT(R_obj, 2, mkChar("a")); SET_NAMES(R_ret, R_obj); UNPROTECT(1); PROTECT(R_lst = NEW_LIST(res_cnt)); /* results list */ i = 0; for (p = r; p != NULL; p = q) { q = p->next; PROTECT(R_res = NEW_LIST(5)); SET_ELEMENT(R_res, 0, PROTECT(vec2R(p->x,1))); SET_ELEMENT(R_res, 1, PROTECT(vec2R(p->y,1))); UNPROTECT(2); SET_ELEMENT(R_res, 2, PROTECT(var2R(p->n))); SET_ELEMENT(R_res, 3, PROTECT(var2R(p->c))); SET_ELEMENT(R_res, 4, PROTECT(var2R(p->r))); UNPROTECT(3); freeVec(p->x); freeVec(p->y); Free(p); PROTECT(R_obj = NEW_STRING(5)); SET_STRING_ELT(R_obj, 0, mkChar("x")); SET_STRING_ELT(R_obj, 1, mkChar("y")); SET_STRING_ELT(R_obj, 2, mkChar("n")); SET_STRING_ELT(R_obj, 3, mkChar("c")); SET_STRING_ELT(R_obj, 4, mkChar("r")); SET_NAMES(R_res, R_obj); UNPROTECT(1); if (i == res_cnt) { i += freeRes(q); freeMat(m); error("res2R result count error [%i:%i]", i, res_cnt); } SET_ELEMENT(R_lst, i++, R_res); UNPROTECT(1); } if (i != res_cnt) error("res2R result count error [%i:%i]", i, res_cnt); SET_ELEMENT(R_ret, 2, R_lst); UNPROTECT(2); return R_ret; } /* compute the rank-one approximation of a column subset of a * matrix. the code is optimized for minimal memory usage */ static int min_size = 1; /* user defined */ static int max_iter = 16; /* user defined */ static RES *approximate(VEC *s, MAT *m) { extern int min_size; /* minimum set size */ extern int max_iter; /* maximum iterations */ extern int debug; int i, j, l, c, z; VEC *x, *y, *v; RES *p; x = newVec(s->n); /* presence set (column indexes) */ y = newVec(m->nr); /* dominant pattern (row indexes) */ v = newVec((s->n > m->nr) ? s->n : m->nr); /* result vector (counts) */ if (s->n > min_size) { i = (int) (unif_rand() * s->n); /* sample a column */ y->n = 0; for (j = m->ci[s->v[i]]; j < m->ci[s->v[i] + 1]; j++) y->v[y->n++] = m->ri[j]; } else { for (j = 0; j < s->n; j++) x->v[j] = s->v[j]; } z = 0; /* number of ones in pattern */ c = -1; /* stopping criterion */ i = 0; while (i < max_iter) { if (s->n > min_size) { matLeft(v, y, s, m); x->n = 0; for (j = 0; j < v->n; j++) if (2 * v->v[j] >= y->n) /* holds for at least one */ x->v[x->n++] = s->v[j]; } matRight(v, x, m); z = 0; y->n = 0; for (j = 0; j < v->n; j++) if (2 * v->v[j] >= x->n) { /* may not hold for any */ z += v->v[j]; y->v[y->n++] = j; } l = c; c = 2 * z - x->n * y->n; if (c == l) /* convergence */ break; i++; if (debug > 1) Rprintf("%2i %6i %i\n", i, x->n, c); } if (i == max_iter) /* no convergence */ warning("approximation: no convergence"); /* compute the Hamming radius of the presence set */ matLeft(v, y, x, m); l = 0; for (i = 0; i < x->n; i++) { j = m->ci[x->v[i] + 1] - m->ci[x->v[i]]; j += y->n - 2 * v->v[i]; if (j > l) l = j; } freeVec(v); x->v = Realloc(x->v, x->n, int); /* see above */ if (y->n) /* see above */ y->v = Realloc(y->v, y->n, int); else { Free(y->v); y->v = NULL; } /* package result */ p = Calloc(1, RES); p->x = x; p->y = y; p->n = z; p->c = c; p->r = l; p->next = NULL; return p; } /* produce a presence set. for now, draw a pattern * and select additional patterns that are within the user * defined radius. this may result in a singular set and has * nothing todo with the approximation idea! this is more * like vodoo. */ static int max_radius = 1; /* user defined */ static VEC *presenceSet(VEC *s, MAT *m) { extern int debug; extern int max_radius; int i, j, z; VEC *y, *x; y = newVec(m->nr); /* pattern vector */ x = newVec(s->n); /* presenece vector */ i = (int) (unif_rand() * s->n); /* sample a column */ y->n = 0; for (j = m->ci[s->v[i]]; j < m->ci[s->v[i] + 1]; j++) y->v[y->n++] = m->ri[j]; /* select all rows that are within the * radius of the selected pattern */ matLeft(x, y, s, m); x->n = 0; for (i = 0; i < s->n; i++) { z = m->ci[s->v[i] + 1] - m->ci[s->v[i]]; z += y->n - 2 * x->v[i]; if (z <= max_radius) x->v[x->n++] = s->v[i]; } if (debug > 1) Rprintf(" %i %i\n", s->n, x->n); freeVec(y); x->v = Realloc(x->v, x->n, int); return x; } /* remove the set x from set s (column indexes). note: this is * not a general implementation of the setminus operation. */ static void remSet(VEC *x, VEC *s) { int i, j, k; j = 0; k = 0; for (i = 0; i < s->n; i++) if (j < x->n && x->v[j] == s->v[i]) j++; else s->v[k++] = s->v[i]; s->n = k; } /* partition a binary matrix over the columns. the code is optimized * for minimal memory usage. for the sake of algorithmic clarity * shortcuts with respect to terminal nodes are not implemented. */ static int min_retry = 10; /* user defined */ static RES *partition(VEC *s, MAT *m, int d, int i) { extern int max_radius; extern int min_retry; extern int min_size; /* see approximation */ extern int debug; extern int res_cnt; extern RES *res_last; VEC *xx, *ss; RES *z, *zz; z = approximate(s, m); if (debug) Rprintf("%3i [%i,%i,%i] %i", d, s->n, z->x->n, z->r, i); if (z->x->n == s->n) { /* pure */ if (z->r <= max_radius || /* homogenous */ z->x->n <= min_size) { /* min size */ res_cnt++; if (debug) Rprintf(" * %i\n", res_cnt); return res_last = z; } else if (min_retry && s->n >= min_retry * i) { /* retry */ if (debug) Rprintf(" +\n"); freeRes(z); return partition(s, m, d, i+1); } else { /* vodoo !!! */ if (debug) Rprintf(" >>\n"); freeRes(z); /* compare below */ xx = presenceSet(s, m); ss = copyVec(xx); zz = partition(ss, m, d+1, i); freeVec(ss); remSet(xx, s); freeVec(xx); if (s->n) { z = res_last; z->next = partition(s, m, d+1, i); } if (debug) Rprintf("%3i <<\n", d); return zz; } } if (debug) Rprintf(" >\n"); /* in order to prevent excessive memory consumption we reuse the * subset vector for the next zero set. as its contents may get * changed in the recursion, the next one set must be a copy of * the current one set. */ ss = copyVec(z->x); zz = partition(ss, m, d+1, i); freeVec(ss); remSet(z->x, s); freeRes(z); z = res_last; z->next = partition(s, m, d+1, i); if (debug) Rprintf("%3i <\n", d); return zz; } /* R interface */ SEXP proximus(SEXP R_mat, SEXP R_max_radius, SEXP R_min_size, SEXP R_min_retry, SEXP R_max_iter, SEXP R_debug) { extern int max_radius; /* see partition */ extern int min_size; extern int min_retry; extern int max_iter; /* see approximation */ extern int debug; extern int res_cnt; int j; VEC *s; MAT *m; RES *r; SEXP R_res; if (!LENGTH(R_max_radius) || !LENGTH(R_min_size ) || !LENGTH(R_min_retry ) || !LENGTH(R_max_iter ) || !LENGTH(R_debug )) error("proximus: missing parameter"); max_radius = INTEGER(R_max_radius)[0]; min_size = INTEGER(R_min_size )[0]; min_retry = INTEGER(R_min_retry )[0]; max_iter = INTEGER(R_max_iter )[0]; debug = LOGICAL(R_debug )[0]; if (!IS_LOGICAL(R_mat)) error("proximus: matrix not logical"); m = R_mat2mat(R_mat); s = newVec(m->nc); /* column subset vector */ for (j = 0; j < s->n; j++) s->v[j] = j; GetRNGstate(); res_cnt = 0; /* reset results counter */ r = partition(s, m, 0, 1); /* recursion */ PutRNGstate(); freeVec(s); R_res = res2R(r, m); freeMat(m); return R_res; } /***/ cba/src/sdists.c0000644000175100001440000005305514627400025013271 0ustar hornikusers #include #include #define min(A,B) ((A)>(B) ? (B) : (A)) #define max(A,B) ((A)>(B) ? (A) : (B)) /* compute auto-distances, or cross-distances on lists of * sequences, i.e. vectors of integers representing the * alphabet used. * * D. Gusfield (1997) Algorithms on Strings, Trees, and * Sequences. Cambridge University Press. * * for an interface for two (atomic) sequences which returns * the distance (similarity) as well as the complete edit * (alignment) traceback see below. * * fixme: 1) distance between two empty sequences. * 2) FASTA does not seem to be GPL but we may ask * if we could use parts of it in R. * * note that we do not implement the most efficient algorithmic * concepts known in the field. * * (C) ceeboo 2005, 2006, 2011 */ /* compute the operation weighted edit distance of two * sequences, i.e. insertion, deletion and substitution * may have different costs. * * note that the occurence of missing values results * in NA as a match or mismatch cannot be determined. */ double edist_ow(int *x, int *y, double *w, int nx, int ny, int nw, double *z0, char *b, double *v) { int i, j, x0 = 0, y0 = 0; double z1 = 0, z2 = 0, s0 = 0, s1 = 0, s2 = 0; for (i = 0; i <= nx; i++) { for (j = 0; j <= ny; j++) if (i == 0) { if (j == 0) { z0[j] = z2 = 0; if (b) b[0] = 0; if (v) v[0] = 0; } else { if (y[j-1] == NA_INTEGER) return NA_REAL; z2 = z0[j] = j * (nw > 5 ? w[5] : w[1]); if (b) b[j*(nx+1)] = 2; if (v) v[j*(nx+1)] = z2; } } else if (j == 0) { x0 = x[i-1]; if (x0 == NA_INTEGER) return NA_REAL; z1 = z2 = i * (nw > 4 ? w[4] : w[0]); if (b) b[i] = 1; if (v) v[i] = z1; } else { y0 = y[j-1]; s0 = z0[j] + w[0]; s1 = z1 + w[1]; s2 = z0[j-1] + ((x0 == y0) ? w[2] : w[3]); z2 = min(s0, s1); z2 = min(z2, s2); if (b) b[i+j*(nx+1)] = (s0 == z2) + (s1 == z2) * 2 + (s2 == z2) * ((x0 != y0) ? 4 : 8); if (v) v[i+j*(nx+1)] = z2; z0[j-1] = z1; if (j == ny) z0[j] = z2; else z1 = z2; } } return z2; } /* compute the alphabet-weighted distance. actually, we compute * the global sequential alignment with maximum similarity (see * Gusfield pp. 225) and return it as a negative number. */ double edist_aw(int *x, int *y, double *w, int nx, int ny, int nw, double *z0, char *b, double *v) { int i, j, x0 = 0, y0 = 0; double z1 = 0, z2 = 0, z3 = 0, s0 = 0, s1 = 0, s2 = 0; for (i = 0; i <= nx; i++) { for (j = 0; j <= ny; j++) if (i == 0) { if (j == 0) { z0[j] = z2 = z3 = w[0]; if (b) b[0] = 0; if (v) v[0] = z3; } else { y0 = y[j-1]; if (y0 == NA_INTEGER) return NA_REAL; z2 = z0[j] = z0[j-1] + w[(y0-1)*nw]; if (b) b[j*(nx+1)] = 2; if (v) v[j*(nx+1)] = z2; } } else if (j == 0) { x0 = x[i-1]; if (x0 == NA_INTEGER) return NA_REAL; z3 += w[(x0-1)]; z1 = z2 = z3; if (b) b[i] = 1; if (v) v[i] = z1; } else { y0 = y[j-1]; s0 = z0[j] + w[(x0-1)]; s1 = z1 + w[(y0-1)*nw]; s2 = z0[j-1] + w[(x0-1)+(y0-1)*nw]; z2 = max(s0, s1); z2 = max(z2, s2); if (b) b[i+j*(nx+1)] = (s0 == z2) + (s1 == z2) * 2 + (s2 == z2) * ((x0 != y0) ? 4 : 8); if (v) v[i+j*(nx+1)] = z2; z0[j-1] = z1; if (j == ny) z0[j] = z2; else z1 = z2; } } return -z2; } /* as above but align locally instead of globally. * * notes: 1) a value of zero indicates the empty sequence. * 2) an optimal non-empty solution is indicated by * the fifth bit in the traceback table */ double edist_awl(int *x, int *y, double *w, int nx, int ny, int nw, double *z0, char *b, double *v) { int i, j, x0 = 0, y0 = 0, k = 0, l = 0, *p = 0; double z1 = 0, z2 = 0, z = 0, s0 = 0, s1 = 0, s2 = 0; if (b) p = Calloc(nx*ny, int); for (i = 0; i <= nx; i++) { for (j = 0; j <= ny; j++) if (i == 0) { if (j == 0) { z0[j] = z = 0; if (b) b[0] = 0; if (v) v[0] = 0; } else { if (y[j-1] == NA_INTEGER) return NA_REAL; z0[j] = 0; if (b) b[j*(nx+1)] = 2; if (v) v[j*(nx+1)] = 0; } } else if (j == 0) { x0 = x[i-1]; if (x0 == NA_INTEGER) return NA_REAL; z1 = 0; if (b) b[i] = 1; if (v) v[i] = 0; } else { y0 = y[j-1]; s0 = z0[j] + w[(x0-1)]; s1 = z1 + w[(y0-1)*nw]; s2 = z0[j-1] + w[(x0-1)+(y0-1)*nw]; z2 = max(0, s0); z2 = max(z2, s1), z2 = max(z2, s2); if (b) { k = i+j*(nx+1); b[k] = (z2 > 0 && s0 == z2) + (z2 > 0 && s1 == z2) * 2 + (z2 > 0 && s2 == z2) * ((x0 != y0) ? 4 : 8); if (z2 > z) { l = 0; p[l++] = k; } else if (z2 > 0 && z2 == z) p[l++] = k; } if (v) v[i+j*(nx+1)] = z2; if (z2 > z) z = z2; z0[j-1] = z1; if (j == ny) z0[j] = z2; else z1 = z2; } } if (b) { while (l-- > 0) b[p[l]] = b[p[l]] + 16; Free(p); } return -z; } /* provide a common interface to all internal functions that * compute distances (similarities) of sequences. * * we expect two lists of integer vectors, an integer code for * the internal function, and a double vector (matrix) for the * weights to use. * * internal functions for distance computation have six + three * arguments: the first two are pointers to arrays of integer (the * sequences). the third is a pointer to an array of double (the * weights). the seventh is a pointer to an array of double long * enough to hold temporary results. the eighth is either null or * a pointer to a temporary array of character used in the computation * of the edit transcripts or alignments (see below). the ninth is * either null or a pointer to an array of double large enough to * hold the values of the dynamic programming table. * * returns either a vector in lower triangular format (see dist) * or a matrix of distances. * * in the case of auto-distances we check for asymmetric weights * as these may result in asymmetric distances. * * todo: warning if NA results are encountered (?) */ // test for exact symmetry int is_symmetric(double *x, int n) { int i, j, r = 1; // true for (i = 0; i < n-1; i++) for (j = i+1; j < n; j++) if (x[i+j*n] != x[j+i*n]) { r = 0; break; } return r; } SEXP sdists(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_weight, SEXP R_pairwise) { if (TYPEOF(R_x) != VECSXP || (!isNull(R_y) && TYPEOF(R_y) != VECSXP)) error("invalid sequence parameters"); if (TYPEOF(R_method) != INTSXP) error("invalid method parameter"); if (TYPEOF(R_weight) != REALSXP) error("invalid weight parameter"); if (TYPEOF(R_pairwise) != LGLSXP) error("invalid pairwise parameter"); double (*sdfun)(int *, int *, double *, int, int, int, double *, char *, double *v) = NULL; int nx, ny, nw; int i, j, k, n, m = 0; /* default symmetric */ SEXP x, y, t, r; /* return value */ nw = LENGTH(R_weight); switch (INTEGER(R_method)[0]) { case 1: sdfun = edist_ow; break; case 2: if (!isMatrix(R_weight)) error("invalid weight parameter"); sdfun = edist_aw; nw = INTEGER(GET_DIM(R_weight))[0]; break; case 3: if (!isMatrix(R_weight)) error("invalid weight parameter"); sdfun = edist_awl; nw = INTEGER(GET_DIM(R_weight))[0]; break; default: error("method not implemented"); } if (isNull(R_y)) { if ((isMatrix(R_weight) && !is_symmetric(REAL(R_weight), nw)) || (!isMatrix(R_weight) && REAL(R_weight)[0] != REAL(R_weight)[1])) error("auto-similarities for asymmetric weights not implemented"); R_y = R_x; } else if (LOGICAL(R_pairwise)[0] == TRUE) m = 2; else m = 1; nx = LENGTH(R_x); ny = LENGTH(R_y); if (m == 2 && nx != ny) error("invalid number of rows for pairwise mode"); if (m == 0) PROTECT(r = allocVector(REALSXP, nx*(nx-1)/2)); else if (m == 1) PROTECT(r = allocMatrix(REALSXP, nx, ny)); else PROTECT(r = allocVector(REALSXP, nx)); PROTECT(t = allocVector(REALSXP, 256)); /* temporary storage */ k = 0; n = nx; for (j = 0; j < ny; j++) { if (m == 0) i = j + 1; else if (m == 1) i = 0; else { i = j; n = j + 1; } y = VECTOR_ELT(R_y, j); if (LENGTH(y) >= LENGTH(t)) { /* more storage */ UNPROTECT(1); PROTECT(t = allocVector(REALSXP, LENGTH(y) * 2)); } for (; i < n; i++) { x = VECTOR_ELT(R_x, i); REAL(r)[k++] = (*sdfun)(INTEGER(x), INTEGER(y), REAL(R_weight), LENGTH(x), LENGTH(y), nw, REAL(t), 0, 0); R_CheckUserInterrupt(); } } UNPROTECT(2); return r; } /* get the next edit transcript. the input arguments are a pointer * to an array of traceback codes, the lenghts of the sequences * compared, and pointers to the transcript and its length. the * possible edit oparations are indicated by four bits. the lowest * bit is decoded and if more than one bit is set it is set to zero. * * returns -1 on error, 0 if no more transcripts are left, and * otherwise the backtrack position in the code array. */ static int next_transcript(char *b, int i, int j, char *s, int *l) { int b0 = 0, k0 = 0, k1 = 0, k = 0, n = i+1; while (i > 0 || j > 0) { if (i < 0 || j < 0) { REprintf("next_transcript: coding error\n"); return -1; } k0 = i+j*n; b0 = b[k0]; if (b0 & 1) { s[k++] = 'D'; if (b0 & 2 || b0 & 4 || b0 & 8) k1 = k0; i--; } else if (b0 & 2) { s[k++] = 'I'; if (b0 & 4 || b0 & 8) k1 = k0; j--; } else { if (b0 == 4) s[k++] = 'R'; else if (b0 == 8) s[k++] = 'M'; else { REprintf("edit_transcript: coding error\n"); return -1; } i--; j--; } } *l = k; s[k] = (char)0; if (k1) { b0 = b[k1]; if (b0 & 1) b[k1] = b0 ^ 1; else if (b0 & 2) b[k1] = b0 ^ 2; } return k1; } /* get the next transcript for a local alignment. first we have to find * the endpoint of a local alignment (if any). then we proceed as above * until we hit zero or either of the two sequences is exhausted. * remaining prefixes or suffixes are aligned by padding with wildcards * where insertions or deletions at the ends are used to account for * differences in lenghths (shifting the shorter prefix or suffix in the * direction of the local alignment). we use the codes {'i', 'd', '?'} * in order to distinguish these edit operations from those necessary * to obtain the local alignment. * * endpoints within a local alignment are ignored as we seek only * alignments of maximum length. bits 6 and 7 are used as temporary * storage for bits 1 and 2 which we restore after all solutions for * one endpoint have been generated. */ static int next_local_transcript(char *b, int i, int j, char *s, int *l) { int b0 = 0, k0 = 0, k1 = 0, k2 = 0, k = 0, n = i+1, m = j+1; for (; i > 0; i--) for (j = m-1; j > 0; j--) { k0 = i+j*n; if (b[k0] & 16) { k2 = k0; goto next; } } return 0; next: while (k < n-i-m+j) s[k++] = 'd'; while (k < m-j-n+i) s[k++] = 'i'; while (k < n-i-1 || k < m-j-1) s[k++] = '?'; while (i > 0 && j > 0) { k0 = i+j*n; b0 = b[k0]; if (b0 == 0) break; else if (b0 & 16) b[k0] = b0 = b0 ^ 16; if (b0 & 1) { s[k++] = 'D'; if (b0 & 2 || b0 & 4 || b0 & 8) k1 = k0; i--; } else if (b0 & 2) { s[k++] = 'I'; if (b0 & 4 || b0 & 8) k1 = k0; j--; } else { if (b0 & 4) s[k++] = 'R'; else if (b0 & 8) s[k++] = 'M'; else { REprintf("edit_transcript: coding error\n"); return -1; } i--; j--; } } for (; i > 0 && j > 0; i--, j--) s[k++] = '?'; for (; i > 0; i--) s[k++] = 'd'; for (; j > 0; j--) s[k++] = 'i'; *l = k; s[k] = (char)0; if (k1) { b0 = b[k1]; if (b0 & 1) b[k1] = (b0 ^ 1) | 32; else if (b0 & 2) b[k1] = (b0 ^ 2) | 64; b[k2] |= 16; } else for (k = 1; k < k2; k++) { b0 = b[k]; if (b0 & 16) k1 = k; if (b0 & 32) b0 = (b0 ^ 32) | 1; if (b0 & 64) b0 = (b0 ^ 64) | 2; b[k] = b0; } return k1; } /* compute the distance for two sequences and the corresponding set * of equivalent edit transcripts. the input arguments are the same * as above with the exception that the first two are integer vectors * instead of lists. * * the transcripts are coded as strings over the alphabet {'I', 'D', * 'R', 'M'} indicating an insert, delete, replace, or match operation * to be applied to the first (second) sequence (supplied). for the * extended symbol set for local alignments see above. * * the distance is returned as attribute 'value'. the values of the * dynamic programming table may be returned as attribute 'table' for * plotting, etc. Attribute 'pointer' contains an R ''segments'' * compatible representation of the (back)pointers (see also below). */ SEXP sdists_transcript(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_weight, SEXP R_table) { if (TYPEOF(R_x) != INTSXP || TYPEOF(R_y) != INTSXP) error("invalid sequence parameters"); if (TYPEOF(R_method) != INTSXP) error("invalid method parameter"); if (TYPEOF(R_weight) != REALSXP) error("invalid weight parameter"); if (TYPEOF(R_table) != LGLSXP) error("invalid option parameter"); double (*sdfun)(int *, int *, double *, int, int, int, double *, char *, double *) = NULL; int (*stfun)(char *, int, int, char *, int *) = NULL; int i, j, k, n, nx, ny, nw; double d, *v = 0, *t; // temporary storage char c, *b, *s; // temporary storage SEXP r, tv = (SEXP)0, tb = (SEXP)0; nw = length(R_weight); switch (INTEGER(R_method)[0]) { case 1: sdfun = edist_ow; stfun = next_transcript; break; case 2: if (!isMatrix(R_weight)) error("invalid weight parameter"); sdfun = edist_aw; stfun = next_transcript; nw = INTEGER(GET_DIM(R_weight))[0]; break; case 3: if (!isMatrix(R_weight)) error("invalid weight parameter"); sdfun = edist_awl; stfun = next_local_transcript; nw = INTEGER(GET_DIM(R_weight))[0]; break; default: error("method not implemented"); } nx = length(R_x); ny = length(R_y); if (LOGICAL(R_table)[0] == TRUE) { PROTECT(tv = allocMatrix(REALSXP, nx+1, ny+1)); PROTECT(tb = allocVector(VECSXP, 4)); v = REAL(tv); } // R-2.9.x b = (char *) RAW(PROTECT(allocVector(RAWSXP, (nx+1)*(ny+1)))); t = Calloc(ny+1, double); d = (*sdfun)(INTEGER(R_x),INTEGER(R_y), REAL(R_weight), nx, ny, nw, t, b, v); Free(t); if (!R_FINITE(d)) { UNPROTECT(1); if (LOGICAL(R_table)[0] == TRUE) UNPROTECT(2); return ScalarReal(d); } #ifdef TB_DEBUG Rprintf("traceback codes: 1 = up, 2 = left, 4 = replace, 8 = match\n\n"); for (i = 0; i <= nx; i++) { Rprintf("[%2i]", i); for (j = 0; j <= ny; j++) if (b[i+j*(nx+1)] & 16) Rprintf("(%2i)",b[i+j*(nx+1)] ^ 16); else Rprintf(" %2i ",b[i+j*(nx+1)]); Rprintf("\n"); } #endif if (LOGICAL(R_table)[0] == TRUE) { int b0; SEXP x0, y0, x1, y1; k = 0; for (i = 1; i < (nx+1)*(ny+1); i++) { b0 = b[i]; k += ((b0 & 1) == 1) + ((b0 & 2) == 2) + (((b0 & 4) == 4) || ((b0 & 8) == 8)); } SET_VECTOR_ELT(tb, 0, (x0 = allocVector(INTSXP, k))); SET_VECTOR_ELT(tb, 1, (y0 = allocVector(INTSXP, k))); SET_VECTOR_ELT(tb, 2, (x1 = allocVector(INTSXP, k))); SET_VECTOR_ELT(tb, 3, (y1 = allocVector(INTSXP, k))); k = 0; for (i = 0; i <= nx; i++) for (j = 0; j <= ny; j++) { b0 = b[i+j*(nx+1)]; if (b0 & 1) { INTEGER(x0)[k] = i-1; INTEGER(y0)[k] = j; INTEGER(x1)[k] = i; INTEGER(y1)[k] = j; k++; } if (b0 & 2) { INTEGER(x0)[k] = i; INTEGER(y0)[k] = j-1; INTEGER(x1)[k] = i; INTEGER(y1)[k] = j; k++; } if (b0 & 4 || b0 & 8) { INTEGER(x0)[k] = i-1; INTEGER(y0)[k] = j-1; INTEGER(x1)[k] = i; INTEGER(y1)[k] = j; k++; } } } // R-2.9.x s = (char *) RAW(PROTECT(allocVector(RAWSXP, nx+ny+1))); r = R_NilValue; do { n = (*stfun)(b, nx, ny, s, &k); for (i = 0; i < k/2; i++) { c = s[i]; s[i] = s[k-i-1]; s[k-i-1] = c; } PROTECT(r); r = CONS(mkChar(s), r); UNPROTECT(1); PROTECT(r); R_CheckUserInterrupt(); UNPROTECT(1); } while (n); UNPROTECT(2); PROTECT(r); r = PairToVectorList(r); UNPROTECT(1); PROTECT(r); r = coerceVector(r, STRSXP); UNPROTECT(1); PROTECT(r); setAttrib(r, install("value"), PROTECT(ScalarReal(d))); UNPROTECT(1); if (LOGICAL(R_table)[0] == TRUE) { setAttrib(r, install("table"), tv); setAttrib(r, install("pointer"), tb); UNPROTECT(3); } else UNPROTECT(1); return r; } // align two sequences according to an edit transcript SEXP sdists_align(SEXP R_x, SEXP R_y, SEXP t) { if (TYPEOF(R_x) != INTSXP || TYPEOF(R_y) != INTSXP) error("invalid sequence parameter(s)"); if (TYPEOF(t) != STRSXP || LENGTH(t) != 1) error("invalid transcript parameter"); int i, j, k, i0, j0; SEXP r, x = (SEXP)0, y = (SEXP)0; t = STRING_ELT(t, 0); PROTECT(r = allocVector(VECSXP, 2)); SET_VECTOR_ELT(r, 0, (x = allocVector(INTSXP, LENGTH(t)))); SET_VECTOR_ELT(r, 1, (y = allocVector(INTSXP, LENGTH(t)))); if (isFactor(R_x)) { SET_LEVELS(x, GET_LEVELS(R_x)); setAttrib(x, install("class"), PROTECT(mkString("factor"))); UNPROTECT(1); } if (isFactor(R_y)) { SET_LEVELS(y, GET_LEVELS(R_y)); setAttrib(y, install("class"), PROTECT(mkString("factor"))); UNPROTECT(1); } i = j = i0 = j0 = 0; for (k = 0; k < LENGTH(t); k++) { if (i > LENGTH(R_x) || j > LENGTH(R_y)) error("invalid edit transcript"); switch (CHAR(t)[k]) { case 'i': case 'I': INTEGER(x)[i0++] = NA_INTEGER; INTEGER(y)[j0++] = INTEGER(R_y)[j++]; break; case 'd': case 'D': INTEGER(x)[i0++] = INTEGER(R_x)[i++]; INTEGER(y)[j0++] = NA_INTEGER; break; case 'R': case 'M': case '?': INTEGER(x)[i0++] = INTEGER(R_x)[i++]; INTEGER(y)[j0++] = INTEGER(R_y)[j++]; break; default: error("invalid edit symbol"); } } if (i < LENGTH(R_x) || j < LENGTH(R_y)) error("invalid edit transcript"); UNPROTECT(1); return r; } /* * transform a vector of transcripts into a graph, i.e. a set of edges * with weights the number of times the edge is a member of a path in * the dynamic programming table. * * returns a list of 4 vectors of coordinates for use with 'segments', * x0, y0, x1, y1, where x denotes the first and y the second sequence, * and a vector of edge frequencies. * * notes: we code the edges into scalar integers so that we can sort * and thus efficiently count them. the cells of the dynamic * programming table are numbered column by column. an edit * path is therfore transformed into a sequence of indexes and * pairs of consecutive indexes indicate entries in the edge * table. the latter we number again by columns. the time * complexity thus depends on sorting. * * fixme: Calloc may raise an error so that we cannot free memory * previously allocated with calloc or Calloc. * * ceeboo 2006 */ SEXP sdists_graph(SEXP x) { if (TYPEOF(x) != STRSXP) error("invalid type"); int i = 0, j = 0, h, k, l, p = 0, q = 0, k0, k1, nx = 0, ny = 0, n = 0; int *i0, *i1; SEXP r, x0, y0, x1, y1, f; k0 = 0; for (k = 0; k < LENGTH(x); k++) k0 += LENGTH(STRING_ELT(x, k)); i0 = Calloc(k0, int); k0 = 0; for (h = 0; h < LENGTH(x); h++) { SEXP c = STRING_ELT(x, h); if (h == 0) { nx = ny = LENGTH(c); for (k = 0; k < LENGTH(c); k++) switch (CHAR(c)[k]) { case 'i': case 'I': nx--; break; case 'd': case 'D': ny--; } n = (nx+1) * (ny+1); } p = q = LENGTH(c); i = l = 0; for (k = 0; k < LENGTH(c); k++) { switch (CHAR(c)[k]) { case 'i': case 'I': i += nx+1; p--; break; case 'd': case 'D': i += 1; q--; break; case 'R': case 'M': case '?': i += nx+2; break; default: Free(i0); error("invalid symbol"); } i0[k0++] = l + i * n; l = i; } if (p != nx || q != ny) { Free(i0); error("transcripts do not conform"); } } R_isort(i0, k0); i1 = Calloc(k0, int); l = i0[0]; k1 = 0; for (k = 0; k < k0; k++) { if (i0[k] != l) { l = i0[k]; i0[++k1] = l; } i1[k1]++; } k1++; PROTECT(r = allocVector(VECSXP, 5)); SET_VECTOR_ELT(r, 0, (x0 = allocVector(INTSXP, k1))); SET_VECTOR_ELT(r, 1, (y0 = allocVector(INTSXP, k1))); SET_VECTOR_ELT(r, 2, (x1 = allocVector(INTSXP, k1))); SET_VECTOR_ELT(r, 3, (y1 = allocVector(INTSXP, k1))); SET_VECTOR_ELT(r, 4, (f = allocVector(INTSXP, k1))); for (k = 0; k < k1; k++) { l = i0[k]; i = l % n; j = (l - i) / n; INTEGER(x0)[k] = l = i % (nx+1); INTEGER(y0)[k] = (i - l) / (nx+1); INTEGER(x1)[k] = l = j % (nx+1); INTEGER(y1)[k] = (j - l) / (nx+1); INTEGER(f )[k] = i1[k]; } Free(i0); Free(i1); UNPROTECT(1); return r; } /**/ cba/src/gknn.c0000644000175100001440000001106611304023136012702 0ustar hornikusers #include #include /* gknn.c * * generic k-nearest neighbor algorithm which operates on a * user-supplied distance matrix. * * implements: * * 1) inclusion of tied (equi-distant) kth neighbors (see option * use_all) * 2) random selection of tied (equi-distant) kth neighbors (see * option use_all) * 3) minimum vote test (otherwise NA is returned for 'doubt') * 3) breaking of tied votes * * expects as input a cross-distance matrix, a factor of class values, * the number of neighbors to use, the minimum number of votes * (for a definite decision), options for for handling ties in votes * and/or distances (see above), and an option for inclusion of the * proportions of winning votes. note that classe values may be NA * but missing distance values are ignored. * * returns a factor of class predictions and, optionally, a vector * of proportions of winning votes as attribute "prob". * * ceeboo (2005) */ SEXP gknn(SEXP R_x, SEXP R_y, SEXP R_k, SEXP R_l, SEXP R_break_ties, SEXP R_use_all, SEXP R_prob) { int nr, n, nc, nn, nv; int break_ties, use_all; int *y, *o, *c; int i, j, k, l, m, v; double *x; SEXP R_obj, R_pro, R_str; nr = INTEGER(GET_DIM(R_x))[0]; /* number of test instances */ n = INTEGER(GET_DIM(R_x))[1]; /* number of training instances */ if (LENGTH(R_y) != n) error("gknn: \"x\" and \"y\" do not conform"); nc = LENGTH(GET_LEVELS(R_y)); /* number of classes */ if (nc < 1) error("gknn: \"y\" invalid number of levels"); if (STRING_ELT(GET_LEVELS(R_y), nc-1) == NA_STRING) error("gknn: \"y\" invalid level"); y = INTEGER(R_y); /* class indexes (R shifted) */ for (i = 0; i < n; i++) if (y[i] == NA_INTEGER || y[i] < 1 || y[i] > nc) error("gknn: \"y\" invalid value"); nn = INTEGER(R_k)[0]; /* number of neighbors */ if (nn < 1 || nn > n) error("gknn: invalid number of neighbors"); nv = INTEGER(R_l)[0]; /* number of minimum votes */ if (nv < 0 || nv > nn) error("gknn: invalid minimum number of votes"); break_ties = LOGICAL(R_break_ties)[0]; /* tie breaking */ use_all = LOGICAL(R_use_all)[0]; /* use all neighbors */ o = Calloc(n, int); /* order */ c = Calloc(nc+1, int); /* class counts */ x = Calloc(n, double); /* distances */ PROTECT(R_obj = NEW_INTEGER(nr)); if (LOGICAL(R_prob)[0]) { /* return proportions */ PROTECT(R_pro = NEW_NUMERIC(nr)); setAttrib(R_obj, install("prob"), R_pro); UNPROTECT(1); } else R_pro = R_NilValue; GetRNGstate(); for (i = 0; i < nr; i++) { for (j = 0; j < n; j++) { o[j] = j; x[j] = REAL(R_x)[i+j*nr]; /* copy distances */ } rsort_with_index(x, o, n); for (j = 1; j < nc+1; j++) /* R shifted */ c[j] = 0; k = 0; /* invalid class */ for (j = 0; j < nn; j++) { /* count classes */ if (ISNAN(x[j])) break; k = y[o[j]]; c[k]++; } if (use_all) { /* use tied */ while (j < n && x[j] == x[j-1]) { k = y[o[j++]]; c[k]++; } } else { /* draw from tied */ while (j < n && x[j] == x[j-1]) j++; if (j > nn) { l = nn - 1 + (int) (unif_rand() * (j-nn+1)); l = y[o[l]]; if (l != k) { c[k]--; k = l; c[k]++; } } } l = 0; /* number of ties */ v = 0; /* number of votes */ m = 0; /* max votes */ for (j = 1; j < nc+1; j++) { v += c[j]; if (c[j] > m) { m = c[j]; k = j; l = 1; } else if (l > 0 && c[j] == m) { if (unif_rand() > (double) l/(l+1)) k = j; l++; } } if (R_pro != R_NilValue) { if (v > 0) REAL(R_pro)[i] = (double) m/v; else REAL(R_pro)[i] = NA_REAL; } if (nv > m) { /* below minimum vote */ INTEGER(R_obj)[i] = NA_INTEGER; } else { if (l > 0) { if (break_ties) INTEGER(R_obj)[i] = k; else { if (l > 1) INTEGER(R_obj)[i] = NA_INTEGER; else INTEGER(R_obj)[i] = k; } } else INTEGER(R_obj)[i] = NA_INTEGER; } } Free(o); Free(c); Free(x); PutRNGstate(); SET_LEVELS(R_obj, duplicate(GET_LEVELS(R_y))); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str , 0, mkChar("factor")); SET_CLASS(R_obj, R_str); UNPROTECT(1); UNPROTECT(1); return R_obj; } /**/ cba/src/cluster.c0000644000175100001440000000564414344034315013443 0ustar hornikusers #include #include /* cluster_dist * * cluster an undirected graph as representable by an R dist object, * i.e. find all the disconnected subgraphs of graph. * * as input we expect R_x the vector storage representation of the * upper/lower triangle of a distance matrix (see dist) and R_beta * the distance threshold. * * returns a factor of cluster labels (integers 1,2, ..., k, with * k the number of clusters). * * NA or NaN distance values are interpreted as no link! this is a * simplification as we do not want to check for the 2^k possible * clusterings given each indeterminate link is actually either above * or below the threshold. * * NA or NaN threshold values result in an error as we do not want * to to check for all the possible clusterings given the threshold * assumes a value in the range of the distances. * * fixme: 1) can we do this in less than O(n^2) time? * 2) can we use a strict threshold? * * ceeboo 2006 */ SEXP cluster_dist(SEXP R_x, SEXP R_beta) { if (TYPEOF(R_x) != REALSXP) error("cluster_dist: 'x' invalid storage type"); if (TYPEOF(R_beta) != REALSXP) error("cluster_dist: 'beta' invalid storage type"); int i, j, k, l, n, o, na, *c, *b; char *s; double beta, *x; SEXP R_str, R_obj; n = (int) sqrt(2 * length(R_x)) + 1; if (n < 3 || n * (n - 1) / 2 != length(R_x)) error("cluster_dist: 'x' invalid length"); beta = REAL(R_beta)[0]; /* distance threshold */ if (ISNAN(beta)) error("cluster_dist: 'beta' NA or NaN"); PROTECT(R_obj = NEW_INTEGER(n)); c = INTEGER(R_obj); for (i = 0; i < n; i++) c[i] = i; x = REAL(R_x); k = na = 0; for (i = 0; i < n - 1; i++) for (j = i + 1; j < n; j++) { if (ISNAN(x[k])) { na++; continue; } if (beta >= x[k++]) { if (c[j] == c[i]) continue; if (c[j] == j) c[j] = c[i]; else { o = c[j]; for (l = 0; l < n; l++) if (c[l] == o) c[l] = c[i]; } } } if (na) warning("cluster_dist: found NA (NaN) distance values, different solutions may be possible."); /* make indexes contiguous */ b = Calloc(n, int); k = 0; for (i = 0; i < n; i++) { j = c[i]; if (b[j] == 0) b[j] = ++k; c[i] = b[j]; } Free(b); /* make return value a factor */ int sn = k/10+2; s = Calloc(sn, char); /* stringified integers */ PROTECT(R_str = NEW_STRING(k)); for (j = 0; j < k; j++) { snprintf(s,sn,"%i",j+1); SET_STRING_ELT(R_str, j, mkChar(s)); } Free(s); SET_LEVELS(R_obj, R_str); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str, 0, mkChar("factor")); SET_CLASS(R_obj, R_str); UNPROTECT(1); /* we are done */ UNPROTECT(1); return R_obj; } cba/src/rock.c0000644000175100001440000003331214344034315012711 0ustar hornikusers #include #include /* rock.c * * implements the paper: * * S. Guha, R. Rastogi, and K. Shim. Rock: A Robust Clustering Algorithm * for Categorical Attributes. Information Systems, Vol. 25, No. 5, 2000. * * The implementation uses a lower triangular matrix representation and * comes in three parts: a function that computes link counts from * distances, another that constructs a cluster solution by merging, and * a function that classifies samples. Implementation of the clustering * problem by separate functions is slightly inefficient but allows for * reuse and further experimentation. * * The auxiliary functions for computation of "binary" distances are * considerably faster than R dist, and the second usage even does not * seem to be available in R (moved to dists.c). * * Release: 0.1-1 * Release date: 2005-06-30 * * (C) ceeboo 2005 */ /* * Compute Rock link counts (of the number of common neighbors of * any two data points). * * As input we exepct a lower triangular matrix organized by columns * but with the diagonal omitted which contains the distance values * for any pair of data points. Fixme: currently, the vector has no * attribute indicating its type. * * The same data structure is used as the return value. * * we now test for NA and NaN (at the cost of runtime) as we do not * want to rely on the implementation detail that NA is a large number. * as befor we settle for ignoring such values. although this is not * correct we avoid computing 2^k results (see also cluster.dist) * * ceeboo 2005, 2006 */ SEXP rockLink(SEXP R_x, SEXP R_beta) { if (TYPEOF(R_x) != REALSXP) error("rockLink: 'x' invalid storage type"); if (TYPEOF(R_beta) != REALSXP) error("rockLink: 'beta' invalid storage type"); int m, n; int i, j, k, kk, l; int *v, *p; double z, beta; double *x; SEXP R_obj; m = LENGTH(R_x); n = 1 + (int) sqrt(2*m); if (m < 3 || m != n*(n-1)/2) /* logical constraint */ error("rockLink: 'x' invalid length"); x = REAL(R_x); beta = REAL(R_beta)[0]; if (ISNAN(beta)) error("rockLink: 'beta' NA or NaN"); PROTECT(R_obj = NEW_INTEGER(m)); for (l = 0; l < m; l++) INTEGER(R_obj)[l] = 0; /* this sucks! */ v = Calloc(n, int); p = Calloc(n, int); /* column offset */ for (k = 0; k < n; k++) p[k] = k*(n-1)-k*(k+1)/2-1; for (i = 0; i < n; i++) { l = 0; for (k = 0; k < i; k++) { z = x[i+p[k]]; if (ISNAN(z)) continue; if (beta >= z) v[l++] = k; } kk = p[i]; for (k = i+1; k < n; k++) { z = x[k+kk]; if (ISNAN(z)) continue; if (beta >= z) v[l++] = k; } for (j = 1; j < l; j++) for (k = 0; k < j; k++) { kk = p[v[k]]; INTEGER(R_obj)[v[j]+kk]++; } } Free(p); Free(v); UNPROTECT(1); return R_obj; } /* * Successively merge two clusters, either unitl the desired * number of clusters is reached, or stop if there are all but * zero link counts left. * * returns a list containing a factor with levels labled * contiguously and starting with "1", and a table of cluster * sizes. * * This code is optimized for low memory footprint and computation * time. However, currently it does not use sparse representations. * * The search for the maximum merger among n candiates has time * complexity O(n*(n-1)/2) in the worst case and O(n-1) in the * best case. * * Note that we do not check the link count matrix for NAs or NaNs * because 1) the above function should not return these values, * and 2) the present function is conceptually internal. * * The neighborhood paramtere is constrained to the interval [0,1) * because inclusion of one results in divison by zero in all * calculations of the merging criterion. For zero link counts * this would result in NaNs and undesirable additional checks. * * Fixme: tie breaking! */ SEXP rockMerge(SEXP R_x, SEXP R_n, SEXP R_theta, SEXP R_debug) { int debug, m, n, nn; int i, j, k, l, ii, jj, kk, ll, iii, jjj, kkk; int *x, *o, *c, *f, *p, *w; double y, z; double *t, *v; char *s; SEXP R_obj, R_tmp, R_str, R_dim; debug = INTEGER(R_debug)[0]; m = LENGTH(R_x); n = 1 + (int) sqrt(2*m); /* number of samples */ if (m < 3 || m != n*(n-1)/2) error("rockMerge: invalid vector length"); nn = INTEGER(R_n)[0]; /* number of clusters */ if (nn < 1) error("rockMerge: invalid number of clusters"); z = REAL(R_theta)[0]; /* neigborhood parameter */ if (z < 0 || z >= 1) error("rockMerge: invalid neigborhood parameter"); z = 1 + 2 * (1-z) / (1+z); x = Calloc(m, int); /* link counts */ Memcpy(x, INTEGER(R_x), m); o = Calloc(n, int); /* sample index */ c = Calloc(n, int); /* cluster index */ f = Calloc(n, int); /* cluster size */ p = Calloc(n, int); /* column offset in dist */ t = Calloc(n+1, double); /* table of powers */ v = Calloc(n-1, double); /* column maximum */ w = Calloc(n-1, int); /* row index */ for (k = 0; k < n; k++) { o[k] = k; c[k] = -1; f[k] = 1; p[k] = k*(n-1)-k*(k+1)/2-1; t[k+1] = pow(k+1, z); } /* find the maximum of a column (in the lower * triangular part of it) and the corresponding * row index. */ y = t[2]-2*t[1]; /* initially constant */ k = 0; for (i = 0; i < n-1; i++) { v[i] = -1; for (j = i+1; j < n; j++) { z = x[k++] / y; if (z > v[i]) { v[i] = z; w[i] = j; } } } if (debug) Rprintf(" #cls clids sizes goodness\n"); m = n; while (m > nn) { z = -1; /* find the maximum */ for (ii = 0; ii < m-1; ii++) if (v[ii] > z) { z = v[ii]; i = ii; } if (z == 0) break; ii = o[i]; j = w[i]; jj = o[j]; if (debug) { Rprintf(" %4i %4i %4i [%4i,%4i] %12.6f", m, ii, jj, f[ii], f[jj], z); if (f[ii] > 1 && f[jj] > 1) Rprintf("+\n"); else Rprintf("\n"); } /* merge the frequencies and link counts; check * if the new cluster provides a new column maximum; * this is slightly inefficient in the worst case. */ f[ii] += f[jj]; for (k = 0; k < i; k++) { kk = o[k]; kkk = p[kk]; x[ii+kkk] += x[jj+kkk]; z = x[ii+kkk] / (t[f[ii]+f[kk]] - t[f[ii]] - t[f[kk]]); if (z > v[k]) { v[k] = z; w[k] = i; } else if (w[k] == i || w[k] == j) { v[k] = -1; /* be deterministic */ for (l = k+1; l < m; l++) { if (l == j) continue; ll = o[l]; z = x[ll+kkk] / (t[f[kk]+f[ll]] - t[f[kk]] - t[f[ll]]); if (z > v[k]) { v[k] = z; w[k] = l; } } } } v[i] = -1; /* column changed */ iii = p[ii]; for (k = i+1; k < j; k++) { kk = o[k]; kkk = p[kk]; x[kk+iii] += x[jj+kkk]; z = x[kk+iii] / (t[f[ii]+f[kk]] - t[f[ii]] - t[f[kk]]); if (z > v[i]) { v[i] = z; w[i] = k; } if (w[k] == j) { v[k] = -1; for (l = k+1; l < m; l++) { if (l == j) continue; ll = o[l]; z = x[ll+kkk] / (t[f[kk]+f[ll]] - t[f[kk]] - t[f[ll]]); if (z > v[k]) { v[k] = z; w[k] = l; } } } } jjj = p[jj]; for (k = j+1; k < m; k++) { kk = o[k]; x[kk+iii] += x[kk+jjj]; z = x[kk+iii] / (t[f[ii]+f[kk]] - t[f[ii]] - t[f[kk]]); if (z > v[i]) { v[i] = z; w[i] = k; } } /* reorganize the indexes of the clusters, * of the rows corresponding to the maxima, * and shrink the vectors. */ if (c[ii] == -1) c[ii] = ii; if (c[jj] == -1) c[jj] = c[ii]; else { iii = c[ii]; jjj = c[jj]; for (k = 0; k < n; k++) if (c[k] == jjj) c[k] = iii; } for (k = 0; k < j; k++) /* for clarity here */ if (w[k] > j) w[k]--; for (k = j+1; k < m-1; k++) { o[k-1] = o[k]; v[k-1] = v[k]; w[k-1] = w[k]-1; } if (k < m) o[k-1] = o[k]; m--; } Free(x); Free(p); Free(t); Free(v); Free(w); if (m > nn) Rprintf("rockMerge: terminated with %i clusters\n", m); PROTECT(R_obj = NEW_LIST(2)); PROTECT(R_tmp = NEW_INTEGER(n)); /* reorganize the indexes of the * clusters to be contiguous and * to start with one. */ for (k = 0; k < n; k++) o[k] = -1; m = 0; for (k = 0; k < n; k++) { if (c[k] == -1) c[k] = k; kk = c[k]; if (o[kk] == -1) o[kk] = ++m; INTEGER(R_tmp)[k] = o[kk]; } int sn = (int) log10(m) + 2; s = Calloc(sn, char); /* stringified integers */ PROTECT(R_str = NEW_STRING(m)); for (j = 0; j < m; j++) { snprintf(s,sn,"%i",j+1); SET_STRING_ELT(R_str, j, mkChar(s)); } Free(s); SET_LEVELS(R_tmp, R_str); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str, 0, mkChar("factor")); SET_CLASS(R_tmp, R_str); UNPROTECT(1); SET_ELEMENT(R_obj, 0, R_tmp); UNPROTECT(1); PROTECT(R_tmp = NEW_INTEGER(m)); for (k = 0; k < n; k++) { kk = c[k]; if (o[kk] != -1) { INTEGER(R_tmp)[o[kk]-1] = f[kk]; o[kk] = -1; } } Free(o); Free(c); Free(f); PROTECT(R_dim = NEW_INTEGER(1)); INTEGER(R_dim)[0] = m; SET_DIM(R_tmp, R_dim); UNPROTECT(1); PROTECT(R_dim = NEW_LIST(1)); SET_ELEMENT(R_dim, 0, GET_LEVELS(VECTOR_ELT(R_obj, 0))); SET_DIMNAMES(R_tmp, R_dim); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str, 0, mkChar("table")); SET_CLASS(R_tmp, R_str); UNPROTECT(1); SET_ELEMENT(R_obj, 1, R_tmp); UNPROTECT(1); UNPROTECT(1); return R_obj; } /* * compute a classification based on a Rock clustering. since we * use a threshold on distances a data point may be assigned to * more than one cluster, or even none. * * we expect the cluster indexes to be a factor, i.e. to be contiguous * and to start with one. the supplied distances have to be equal or * greater than zero. NAs and NaNs are allowed (see the explanation * above). * * note: * * 1) ties are broken at random (this may obfuscate that the data * actually has no structure). * 2) points that are not in any neighborhood are assigned the class * value NA. * * */ SEXP rockClass(SEXP R_x, SEXP R_l, SEXP R_beta, SEXP R_theta) { int nr, nc, nl, na; int i, j, h, k; int *l, *c, *cf; double beta; double t, z, y; double *n, *x; SEXP R_lev, R_obj, R_tmp, R_str, R_dim; nr = INTEGER(GET_DIM(R_x))[0]; nc = INTEGER(GET_DIM(R_x))[1]; if (LENGTH(R_l) != nc) error("rockClass: invalid vector length or number of columns"); R_lev = GET_LEVELS(R_l); nl = LENGTH(R_lev); t = REAL(R_theta)[0]; if (t < 0 || t > 1) error("rockMerge: invalid neigborhood parameter"); t = 1 + 2 * (1-t) / (1+t); l = INTEGER(R_l); /* number of levels */ n = Calloc(nc, double); /* expected neighbors */ /* check the validity of the indexes and * compute the expected number of neighbors */ for (j = 0; j < nc; j++) { i = l[j]; if (i == NA_INTEGER || i < 1 || i > nl) { Free(n); error("rockClass: invalid cluster index(es)"); } n[i-1]++; } for (j = 0; j < nl; j++) { z = n[j]; if (z == 0) { /* not contiguous */ Free(n); error("rockClass: invalid cluster index(es)"); } n[j] = pow(1+z, t); } x = REAL(R_x); /* distances */ beta = REAL(R_beta)[0]; /* threshold */ c = Calloc(nl, int); PROTECT(R_obj = NEW_LIST(2)); PROTECT(R_tmp = NEW_INTEGER(nr)); /* class indexes */ cf = Calloc(nl+1, int); /* class frequencies */ GetRNGstate(); for (j = 0; j < nl; j++) cf[j] = 0; for (i = 0; i < nr; i++) { for (j = 0; j < nl; j++) /* initialize */ c[j] = 0; for (j = 0; j < nc; j++) /* count neighbors */ if (beta >= x[i+j*nr]) c[l[j]-1]++; k = nl; /* include NAs */ h = 0; /* compiler hack */ z = 0; for (j = 0; j < nl; j++) { /* determine maximum */ y = c[j] / n[j]; if (y > z) { z = y; k = j; h = 1; } else if (h > 0 && y == z) { /* break ties */ if (unif_rand() > (double) h/(h+1)) k = j; h++; } } cf[k]++; INTEGER(R_tmp)[i] = k+1; } PutRNGstate(); Free(n); Free(c); na = nl+(cf[nl]>0); PROTECT(R_str = NEW_STRING(na)); for (j = 0; j < nl; j++) SET_STRING_ELT(R_str, j, STRING_ELT(R_lev, j)); if (na>nl) SET_STRING_ELT(R_str, j, NA_STRING); SET_LEVELS(R_tmp, R_str); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str, 0, mkChar("factor")); SET_CLASS(R_tmp, R_str); UNPROTECT(1); SET_ELEMENT(R_obj, 0, R_tmp); UNPROTECT(1); PROTECT(R_tmp = NEW_INTEGER(na)); Memcpy(INTEGER(R_tmp), cf, na); Free(cf); PROTECT(R_dim = NEW_INTEGER(1)); INTEGER(R_dim)[0] = na; SET_DIM(R_tmp, R_dim); UNPROTECT(1); PROTECT(R_dim = NEW_LIST(1)); SET_ELEMENT(R_dim, 0, GET_LEVELS(VECTOR_ELT(R_obj, 0))); SET_DIMNAMES(R_tmp, R_dim); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str , 0, mkChar("table")); SET_CLASS(R_tmp, R_str); UNPROTECT(1); SET_ELEMENT(R_obj, 1, R_tmp); UNPROTECT(1); UNPROTECT(1); return R_obj; } /**/ cba/src/ccfkms.c0000644000175100001440000002710414344034315013223 0ustar hornikusers/* * ccfkms.c * * parameter and logistic k-means based on conjugate covex functions * using sparse data structures and centering (or optionally * standardizing) of the data. * * For details see: * * Helmut Strasser and Klaus Poetzelberger. Data Compression by * Unsupervised Classification. SFB Report Series, No. 10, 1997. * * convex function: f(x) = |x|^q/q * * kohonen k-means: q = 1 * ordinary k-means: q = 2 * * convex function: f(x) = 2*ln(cosh(|x|))/2 * * logistic k-means: q = 0 * * Sparse data structure means that zero data values in the data are not * stored because they can be ignored in all vector operations, so that * computations get a considerable boost if the data are highly sparse, * i.e. the ratio of the number of non-zero data values to the number of * all data values is low. * * Centering means that the mean of the data is subtracted from the * samples, and standardizing that we further devide by the standard * deviation. Note that only for Euclidian distances the solution does * not depend on centering. * * In the case of degenerate and non-convergent solutions the program * gives a warning message. * * fixme: 1) second, ... winner not implemented. * 2) maybe return a flag indicating convergence issues? * * Note that the code is prepared for direct intefacing with sparse * data structures, such as dgRMatrix from the R package Matrix. * * (C) ceeboo 2003, 2004, 2005, 2007 */ #include #include static int debug = FALSE; /* be silent */ /* matrix structure for data in sparse row format */ typedef struct { int *ri; /* pointer to array of column start indexes */ int *ci; /* pointer to array of column indexes */ double *cv; /* pointer to array of column values */ int nr; /* number of rows */ int nc; /* number of columns */ int s; // non-sparse input data = 0 } SMAT; static void FreeSMat(SMAT *m) { if (m->s != 0) return; Free(m->ri); Free(m->ci); Free(m->cv); Free(m); } /* copy R matrix in full-storage representation to sparse * representation. treat as read only (!) */ static SMAT *matrix2smat(SEXP R_mat) { extern int debug; int nr, nc, n; int i, j, k; int *ri, *ci; double z; double *x, *cv; SMAT *m; nr = INTEGER(GET_DIM(R_mat))[0]; nc = INTEGER(GET_DIM(R_mat))[1]; x = REAL(R_mat); ri = Calloc(nr+1, int); /* row start indexes */ n = 1024; /* initial memory */ ci = Calloc(n, int); /* column indexes */ cv = Calloc(n, double); /* column values */ k = 0; for (i = 0; i < nr; i++) { /* rows */ ri[i] = k; for (j = 0; j < nc; j++) { /* columns */ z = x[i+j*nr]; if (R_FINITE(z) && z != 0.0) { if (k == n) { n *= 2; /* double memory */ ci = Realloc(ci, n, int); cv = Realloc(cv, n, double); } ci[k] = j; cv[k++] = z; } } } ri[i] = k; if (n > k) { ci = Realloc(ci, k, int); cv = Realloc(cv, k, double); } if (debug) { Rprintf("Non-Zero: %i\n", k); Rprintf("Sparsity: %4.2f\n",k / (double) (nr * nc)); } m = Calloc(1, SMAT); m->ri = ri; m->ci = ci; m->cv = cv; m->nr = nr; m->nc = nc; return m; } static SMAT *dgRMatrix2smat(SEXP R_x) { SMAT *m; SEXP x = getAttrib(R_x, install("x")); if (TYPEOF(x) != REALSXP) error("ccfkms: slot 'x' of dgRMatrix not of storage type real"); m = (SMAT *) malloc(sizeof(SMAT)); m->ri = INTEGER(getAttrib(R_x, install("p"))); m->ci = INTEGER(getAttrib(R_x, install("j"))); m->cv = REAL(x); m->nr = INTEGER(getAttrib(R_x, install("Dim")))[0]; m->nc = INTEGER(getAttrib(R_x, install("Dim")))[1]; m->s = 1; return m; } static SMAT *dgCMatrix2smat(SEXP R_x) { SMAT *m; SEXP x = getAttrib(R_x, install("x")); if (TYPEOF(x) != REALSXP) error("ccfkms: slot 'x' of dgCMatrix not of storage type real"); m = (SMAT *) malloc(sizeof(SMAT)); m->ri = INTEGER(getAttrib(R_x, install("p"))); m->ci = INTEGER(getAttrib(R_x, install("i"))); m->cv = REAL(x); m->nr = INTEGER(getAttrib(R_x, install("Dim")))[1]; m->nc = INTEGER(getAttrib(R_x, install("Dim")))[0]; m->s = 1; return m; } SEXP ccfkms(SEXP R_x, SEXP R_p, SEXP R_par, SEXP R_max_iter, SEXP R_opt_std, SEXP R_debug) { extern int debug; int opt_std, np, max_iter; int i, j, k, l, iter, ap; int *pf, *pm; double par; double x = 0, y, z, max_var, max_inf, old_inf, inf, var; double *am, *as, *p, *pt, *cc, *ct; char *s; SMAT *m = NULL; SEXP R_obj, R_tmp; debug = INTEGER(R_debug)[0]; if (inherits(R_x, "dgRMatrix")) m = dgRMatrix2smat(R_x); else if (inherits(R_x, "dgCMatrix")) m = dgCMatrix2smat(R_x); else m = matrix2smat(R_x); /* data matrix */ /* compute attribute means. standardization * is optional. if used we transform so that * we do not need to revert to a full-storage * representation. */ opt_std = INTEGER(R_opt_std)[0]; /* standardization option */ am = Calloc(m->nc, double); /* attribute means */ as = NULL; /* attribute standard deviations */ if (opt_std) as = Calloc(m->nc, double); for (i = 0; i < m->nr; i++) for (j = m->ri[i]; j < m->ri[i + 1]; j++) { am[m->ci[j]] += m->cv[j]; if (opt_std) as[m->ci[j]] += pow(m->cv[j], 2); } for (i = 0; i < m->nc; i++) { am[i] /= m->nr; if (opt_std) { as[i] = sqrt(as[i] / m->nr - pow(am[i], 2)); if (as[i] == 0) { Free(am); if (opt_std) Free(as); FreeSMat(m); error("ccfkms: zero standard deviation"); } am[i] /= as[i]; } } if (opt_std) /* prepere data */ for (i = 0; i < m->nr; i++) for (j = m->ri[i]; j < m->ri[i + 1]; j++) m->cv[j] /= as[m->ci[j]]; /* get initial protoypes and allocate * R result object. */ np = INTEGER(GET_DIM(R_p))[0]; /* number of prototypes */ if (INTEGER(GET_DIM(R_p))[1] != m->nc) { /* check */ Free(am); if (opt_std) Free(as); FreeSMat(m); error("ccfkms: \"x\" and \"p\" do not conform"); } PROTECT(R_obj = NEW_LIST(4)); /* result object */ PROTECT(R_tmp = allocMatrix(REALSXP, np, m->nc)); /* prototypes */ Memcpy(REAL(R_tmp), REAL(R_p), np * m->nc); /* copy prototypes */ p = REAL(R_tmp); SET_VECTOR_ELT(R_obj, 0, R_tmp); UNPROTECT(1); /* center (standardize) initial prototypes */ for (i = 0; i < np; i++) for (j = 0; j < m->nc; j++) { if (opt_std) p[i + j * np] /= as[j]; p[i + j * np] -= am[j]; } /* get parameter */ par = REAL(R_par)[0]; /* get maximum number of iterations */ max_iter = INTEGER(R_max_iter)[0]; /* compute the maximum information and variance, * i.e., each point is a prototype */ z = 0; for (i = 0; i < m->nc; i++) z += pow(am[i], 2); max_var = 0; max_inf = 0; for (i = 0; i < m->nr; i++) { y = z; for (j = m->ri[i]; j < m->ri[i + 1]; j++) y += m->cv[j] * (m->cv[j] - 2 * am[m->ci[j]]); max_var += y; max_inf += pow(sqrt(y), par) / par; } max_var /= m->nr; max_inf /= m->nr; /* allocate remaining result objects * and iterate to a fixpoint solution */ PROTECT(R_tmp = NEW_INTEGER(np)); /* prototype frequencies */ pf = INTEGER(R_tmp); SET_VECTOR_ELT(R_obj, 1, R_tmp); UNPROTECT(1); PROTECT(R_tmp = NEW_INTEGER(m->nr)); /* prototype memberships */ pm = INTEGER(R_tmp); SET_VECTOR_ELT(R_obj, 2, R_tmp); UNPROTECT(1); GetRNGstate(); pt = Calloc(np * m->nc, double); /* prototype temporary */ cc = Calloc(np, double); /* conjugate convex */ ct = Calloc(np, double); /* conjugate convex temporary */ if (debug) Rprintf("\n %3s %5s %5s %3s\n","#","inf","var","nap"); old_inf = -1; inf = 0; iter = 0; while(inf > old_inf && iter < max_iter) { /* map prototype means into domain of dual problem * and compute conjugate convex function */ for (i = 0; i < np; i++) { y = 0; for (j = 0; j < m->nc; j++) y += pow(p[i + j * np], 2); y = sqrt(y); if (par) z = pow(y, par-2); else { /* logistic */ x = (exp(y) - 1) / (exp(y) + 1); z = x / y; } for (j = 0; j < m->nc; j++) p[i + j * np] *= z; z = 0; for (j = 0; j < m->nc; j++) z += p[i + j * np] * am[j]; if (par) z += pow(y, par) * (par-1) / par; else z += (1 + x) * log(1 + x) + (1 - x) * log(1 - x); cc[i] = z; pf[i] = 0; /* initialize */ for (j=0; j < m->nc; j++) pt[i + j * np] = 0; } /* determine partition and * calculate prototype means */ for (i = 0; i < m->nr; i++) { for (k = 0; k < np; k++) { ct[k] = -cc[k]; for (j = m->ri[i]; j < m->ri[i + 1]; j++) ct[k] += m->cv[j] * p[k + m->ci[j] * np]; } /* find the closest prototype. * note that tie breaking is used. */ l = 1; k = 0; z = ct[0]; for (j = 1; j < np; j++) if (z < ct[j]) { k = j; z = ct[j]; } else if (z == ct[j]) { if (unif_rand() > l/(l+1)) k = j; l++; } pm[i] = k; /* update prototype frequency and means */ pf[k]++; for (j = m->ri[i]; j < m->ri[i + 1]; j++) pt[k + m->ci[j] * np] += m->cv[j]; } /* update the stopping criterion. compute the means * and the information and variance of the partition */ old_inf = inf; ap = 0; inf = 0; var = 0; for (i = 0; i < np; i++) { if (pf[i] != 0) { ap++; for (j = 0; j < m->nc; j++) p[i + j * np] = pt[i + j * np] / pf[i] - am[j]; z =0; for (j = 0; j < m->nc; j++) z += pow(p[i + j * np], 2); var += z * pf[i] / m->nr; z = sqrt(z); if (par) z = pow(z, par) / par; else { z = 2 * log((exp(z / 2) + exp(-z / 2)) / 2); } inf += z * pf[i] / m->nr; } } iter++; if (debug) Rprintf(" %3i %5.3f %5.3f %3i\n", iter, inf / max_inf, var / max_var, ap); /* degenrate solution */ if (old_inf > inf) warning("ccfkms: decrease in information"); } Free(pt); Free(cc); Free(ct); PutRNGstate(); if (max_iter > 1 && old_inf != inf) warning("ccfkms: no convergence"); /* invert the information */ inf = max_inf - inf; PROTECT(R_tmp = NEW_NUMERIC(1)); REAL(R_tmp)[0] = inf; SET_VECTOR_ELT(R_obj, 3, R_tmp); UNPROTECT(1); /* decenter (destandardize) the prototype means */ for (i = 0; i < np; i++) { for (j = 0; j < m->nc; j++) { p[i + j * np] += am[j]; if (opt_std) p[i + j * np] *= as[i]; } } Free(am); if (opt_std) Free(as); /* offset memberships to R indexing. */ for (i = 0; i < m->nr; i++) pm[i]++; FreeSMat(m); /* levels attribute */ int sn = np/10+2; s = Calloc(sn, char); /* stringified integers */ PROTECT(R_tmp = NEW_STRING(np)); for (j = 0; j < np; j++) { snprintf(s,sn,"%i",j+1); SET_STRING_ELT(R_tmp, j, mkChar(s)); } Free(s); SET_LEVELS(VECTOR_ELT(R_obj, 2), R_tmp); UNPROTECT(1); UNPROTECT(1); return R_obj; } /**/ cba/NAMESPACE0000644000175100001440000000217113037706735012250 0ustar hornikusersimport("grid") import("proxy") export(proximus, rockCluster, rockLink, ccfkms, sdists,sdists.trace,sdists.center,sdists.center.align, cluster.dist, gknn, as.dummy, lmplot,clmplot, rlbmat, order.length,order.optimal,order.greedy, order.dist,order.matrix,order.data.frame, stress,stress.dist, implot, circleplot.dist) S3method("as.dummy",logical) S3method("as.dummy",integer) S3method("as.dummy",factor) S3method("as.dummy",matrix) S3method("as.dummy",list) S3method("as.dummy","data.frame") S3method(predict,"rock") S3method(fitted,"rock") S3method(print,rock) S3method(fitted,proximus) S3method(print,proximus) S3method(summary,proximus) S3method(print,"summary.proximus") S3method(predict,ccfkms) S3method(cut,ordered) S3method(plot,"sdists.graph") useDynLib("cba", .registration = TRUE) ## 2015/7 importFrom("methods", "as") importFrom("stats", "hclust", "runif") importFrom("graphics", "plot", "lines", "text", "segments", "axis", "image.default") importFrom("grDevices", "heat.colors", "gray.colors") cba/TODO0000644000175100001440000000301011304023136011470 0ustar hornikusers * check which functions are now in seriation and remove such from cba. * move sdists and friends to a seperate package. ceeboo 2009 * complete this list with known issues. * improve the documentation. * consider an option for handling of ties in clusterers (which support it). * standard methods for ccfkms are missing * unit tests to detect bugs caused by changes in the R source code. at least my trust is gone with the copy on write issue. * reconsider overloading of image.matrix * replace Calloc with calloc, etc. and use own checks and cleanups. * replace initializations of R vector memory that use a for loop with, e.g. memset(REAL(x), sizeof(double) * n) * dist and sdist could issue a warning if NAs occur in the result. * check again the duplicate or not issue :-( [current experience indicates it should work] * recheck the handling of NA and NaN in dists and elsewhere. * check if the R interface to ccfkms checks for unique initial solutions. reconsider how these solutions are generated. consider providing a default interface that searches for a "proper" number of clusters. ceeboo 2006 + rethink the current approach to dists in favor of the dist C level interface. this might be more developper friendly in the long run. + we cannot optionally include package Matrix. + the fixation on double for dists computations may be too restrictive. at least we need to shift more checking to the C level functions due to the migration to package proxy. ceeboo 2007 cba/CHANGELOG0000644000175100001440000001113514630405156012234 0ustar hornikusers Realease 0.2-24 fixed SET_TYPEOF in sdists. fixed invalid URLs in documentation. Realease 0.2-23 sprintf was not an issue. Release 0.2-22 change encoding of source files. Release 0.2-21 fixed UNPROTECT in proximus. fixed missing PROTECT in sdists. Release 0.2-20 changed UN / PROTECT approach in proximus. Release 0.2-18 added DLL registry. Release 0.2-17 fixed duplicated factor levels in Mushroom data. Release 0.2-16 fixed colnames in fitted.proximus. Release 0.2-15 Release date: 2015-07-23 fixed imports. Release 0.2-14 renamed int_arraySubscript. Release 0.2-13 adjusted package dependencies and imports. Release 0.2-12 Release date: 2013-04-30 fixed invalid memory access in rockMerge. Release 0.2-11 Release date: 2012-11-30 duplicated arraySubscript for code isolation. Release 0.2-10 Release date: 2012-08-31 resolved native symbols. Release 0.2-9 Release date: 2012-01-18 extended the weight argument in all sdists interfaces. added approximate substring matching to sdists.trace. added text option to plot.sdists.graph. Release 0.2-8 Release date: 2011-09-15 fixed lty issue in plot.sdists.graph. fixed possible issue on sparc-solaris in proximus.c. Release 0.2-7 Release date: 2011-09-09 changed image.matrix to implot so that other packages which use image work with cba. fixed justification issues in documentation. Release 0.2-6 Release date: 2009-01-07 dists, row.dist, col.dist, colSums.dist, colMeans.dist, rowSums.dist, rowMeans.dist, [[.dist, subset.dist, dim.dist, dimnames.dist, dapply, dapply.list were removed as the same functionality is provided in package proxy. cluproxplot and seriation were removed as these are provided and further developed in package seriation. in sdists.c changed temporary storage from CHARSXP to RAWSXP for compatibility with R-2.9.x. added pairwise option to sdists. added sdists.center and sdists.center.align. reversed CHANGELOG. Release 0.2-5 Release date: 2008-05-25 fixed encoding declaration in proximus documentation (C) fixed example in rockCluster documentation (C) Release 0.2-1 Release date: 2006-09-04 fixed Windoze C99 versus gnu99 issues (C) fixed order.matrix and order.data.frame (C) Release 0.2-0 Release date: 2006-08-31 added stress (C) fixed future problems with strict type checking in (C) by using SET_STRING_ELT instead of SET_ELEMENT. Release 0.1-9 Release date: see next fixed asymmetric weights bug in sdists (C) added traceback function for sdists (C) fixed bug due to copy on change in all distance functions (C) added rowSums.dist and dapply (C) added interrupt checks to all distance functions (C) added traceback plot for sdists (C) added cluster.dist (C) Release 0.1-8 Release date: 2006-03-01 fixed random-access bug in subset_dist C code which made the return value dependent on the order of the subset indexes (C) added names to cluster labels in rockCluster and ccfksm (C) added C code for fuzzy binary distance computation (C) added C code for distance computation on sequences (C) reduced cluproxplots memory needs (M) introduced hcl colorspace and fixed colorkey for cluproxplot (M) Release 0.1-7 Release date: 2006-01-27 added cluProxMatrix class w/ print and plot methods (M) simplified the cluproxplot interface by using plotOptions for all plotting related options (m) added threshold to cluproxplot (M) Release 0.1-6 Release date: 2005-12-12 cluproxplot uses now really grid (M) added colorkey for cluproxplot (M) Release 0.1-5 Release date: 2005-10-26 Fixed similarity conversion for intraClusterSim in clusimplot (Michael) renamed clusimplot to cluproxplot (Cluster Proximity plot) (M) cluproxplot uses grid now and includes a silhouettes plot (M) added interface for seriation (M) Release 0.1-4 Release date: 2005-09-12 Added drop option to as.dummy and sparse K-means code. Added clusimplot (Michael) Added Mushroom data set (Michael) Release 0.1-3 Release date: 2005-09-08 The package develops into a trash bag: moved optimal leaf ordering and stuff in. This sucks! (C) Release 0.1-2 Release date: 2005-08-30 Major changes include merging in of Proximus. Visualization is sketchy but useable. Most of the stuff definitely belongs to separate packages. However, it all ended up here. The C code is at least -Wall -pedantic clean. Release 0.1-1 Release date: 2005-07-02 Bug fixes, better fitting in with existing R functionality (e.g. class dist). Added efficient conversion of nominal variables to dummy coding. Release 0.1 Release date: 2005-05-30 Alpha release of the Rock package which implements the Rock algorithm. Focus is on efficient implementation of time critical functions: distance computation and merging. cba/build/0000755000175100001440000000000014630406263012120 5ustar hornikuserscba/build/partial.rdb0000644000175100001440000000007414630406263014246 0ustar hornikusersb```b`a 00 FN ͚Z d@$$7cba/man/0000755000175100001440000000000014630404676011602 5ustar hornikuserscba/man/order.Rd0000644000175100001440000000446211633160276013205 0ustar hornikusers\name{order} \alias{order.dist} \alias{order.matrix} \alias{order.data.frame} \title{Improving the Presentation of Matrix Objects} \description{ High-level functions that improve the presentation of a matrix or data frame by reordering their rows and columns. } \usage{ order.dist(x, index = FALSE) order.matrix(x, type = "neumann", by = c("both","rows","cols"), index = FALSE) order.data.frame(x, type = "neumann", by = c("both","rows","cols"), index = FALSE) } \arguments{ \item{x}{an object of class \code{dist}, \code{matrix}, or \code{data.frame}.} \item{type}{the type of stress measure to use (see details).} \item{by}{option to order either by rows, or columns, or both.} \item{index}{option to return the order index(ex) instead of the reordered object.} } \details{ These functions try to improve the presentation of an object of class \code{dist}, \code{matrix}, or\cr\code{data.frame} by reordering the rows and columns such that similar entries are grouped together. \code{order.dist} uses a simple heuristic to solve the TSP problem of finding an ordering of minimum length (see \code{order.length}) for an object of class \code{dist}. Note that the heuristic used is quick but more elaborate TSP algorithms will produce better orderings. \code{order.matrix} tries to minimize the stress measure of a matrix (see \code{stress} by using the same TSP heuristic as above, once for the column and once for the row ordering (while the other dimension is fixed) if \code{by = "both"}. \code{order.data.frame} uses attributes of type \code{numeric} and \code{logical} only, combines them into a normalized matrix and finds an ordering as above. } \value{ Either the reordered object supplied, or a vector of subscripts (for \code{reorder.dist}), or a list with components \code{rows} and \code{columns} containing the order indexes (for \code{reorder.matrix} and \code{reorder.data.frame}). } %\references{} \author{Christian Buchta} \note{ This is experimental code that may be integrated in a separate packages in the future. } \seealso{\code{\link{dist}}, \code{stress}, \code{stress.dist}.} \examples{ ## not a hard problem data(iris) d <- dist(iris[1:4]) implot(order.dist(d)) data(townships) x <- order.data.frame(townships) x } \keyword{cluster} cba/man/image.Rd0000644000175100001440000000244711633160276013155 0ustar hornikusers\name{image} \alias{implot} \title{Matrix Image Plots} \description{ Implements a wrapper function to \code{image} for proper plotting of objects of class \code{matrix} and \code{dist}. } \usage{ implot(x, xlab = "", ylab = "", axes = FALSE, ticks = 10, las = 2, ...) } \arguments{ \item{x}{an object of class \code{matrix} or \code{dist}.} \item{xlab, ylab}{labels for the x and y axis.} \item{axes}{logical, indicating whether \code{dimnames(x)} should be drawn on the plot.} \item{ticks}{the number of tick-marks to use.} \item{las}{the axis style to use (see \code{par}).} \item{\dots}{further arguments to \code{image}.} } \details{ Plots an object of class \code{matrix} in its original row and column orientation. This means, in a plot the columns become the x-coordinates and the reversed rows the y-coordinates. If \code{x} is of class \code{dist} it is coerced to full-storage \code{matrix} representation. } \value{ Returns the transformed \code{x} \emph{invisibly}. } \author{Christian Buchta} \seealso{ \code{\link{image}} and \code{\link{par}} for details.} \examples{ x <- matrix(sample(c(FALSE, TRUE),100,rep=TRUE),ncol=10, dimnames=list(1:10,LETTERS[1:10])) implot(x) implot(x, col=c("white","black"), axes = TRUE) } \keyword{cluster} \keyword{hplot} cba/man/order.optimal.Rd0000644000175100001440000000433711633160276014652 0ustar hornikusers\name{order.optimal} \alias{order.optimal} \title{Optimal Leaf Ordering of Binary Trees.} \description{ Find an optimal linear leaf ordering of a binary merge tree as produced by a hierarchical cluster algorithm. } \usage{ order.optimal(dist, merge) } \arguments{ \item{dist}{an object of class \code{dist}.} \item{merge}{a binary merge tree (see \code{\link{hclust}}).} } \details{ A binary tree has \eqn{2^{n-1}}{2^(n-1)} internal nodes (subtrees) and the same number of leaf orderings. That is, at each internal node the left and right subtree (or leaves) can be swapped, or, in terms of a dendrogram, be flipped. An objective measure of a leaf ordering is the sum of the distances along the path connecting the leaves in the given order. An ordering with a minimal path length is defined to be an optimal ordering. This function provides an interface to the optimal leaf ordering algorithm (see references) for tree representations that are used by hierarchical cluster algorithms such as \code{\link{hclust}}. Note that non-finite distance values are not allowed. } \value{ A list with the following components: \item{merge}{a matrix containing the merge tree corresponding with the optimal leaf order.} \item{order}{a vector containing the optimal leaf order.} \item{length}{the length of the ordering.} } \references{ Z. Bar-Joseph, E. D. Demaine, D. K. Gifford, and T. Jaakkola. (2001). Fast Optimal Leaf Ordering for Hierarchical Clustering. \emph{Bioinformatics}, Vol. 17 Suppl. 1, pp. 22-29. } \author{Christian Buchta} \note{The time complexity of the algorithm is \eqn{O(n^3)}.} \seealso{ \code{\link{hclust}} for hierarchical clustering and \code{\link{order.length}} for computing the objective value of a leaf ordering. } \examples{ d <- dist(matrix(runif(30), ncol=2)) hc <- hclust(d) co <- order.optimal(d, hc$merge) ### compare dendrograms ho <- hc ho$merge <- co$merge ho$order <- co$order op <- par(mfrow=c(2,2), pty="s") plot(hc, main="hclust") plot(ho, main="optimal") # compare images implot(d[[hc$order]]) implot(d[[co$order]]) par(op) ### compare lengths order.length(d, hc$order) order.length(d, co$order) cat("compare: ",co$length,"\n") } \keyword{hplot} \keyword{cluster} cba/man/clmplot.Rd0000644000175100001440000000534011633160276013540 0ustar hornikusers\name{clmplot} \alias{clmplot} \title{Plotting Logical Matrices} \description{ A wrapper function to \code{image} that produces a level plot with the option to color the rows (or columns) based on a clustering and/or classification of the data, and the option to reorder the rows and columns for better presentation. } \usage{ clmplot(x, col, col.bycol = FALSE, order=FALSE, dist.method = "binary", hclust.method = "average", axes = FALSE, xlab = "", ylab = "", ...) } \arguments{ \item{x}{an logical matrix.} \item{col}{an optional vector defining a coloring.} \item{col.bycol}{option to color by columns.} \item{order}{option to (pre)order the rows and columns.} \item{dist.method}{method to be used by \code{dist}.} \item{hclust.method}{method to be used by \code{hclust}.} \item{axes}{option to plot axes.} \item{xlab, ylab}{labels for the x and y axis.} \item{\dots}{further arguments to \code{image}.} } \details{ For dummy coded data the level \code{FALSE} is assumed to carry no information and is colored \code{white}. Thus, the level \code{TRUE} can be colored according to some classification of the rows (or columns) of the data matrix. If no color specification is supplied the default color \code{black} is used. If \code{col} is of type character it is assumed to contain color codes. Otherwise, it must be a factor and the levels are assigned colors from \code{\link{heat.colors}}. If \code{order} is \code{TRUE} the rows and columns are ordered by \code{hclust} where the distances are computed by \code{dist}. Note that an axis is only plotted if the number of elements (rows or columns) is less than 100. } \value{ A list with the following components: \item{rows}{the row order.} \item{cols}{the column order.} } \author{Christian Buchta} \seealso{ \code{\link{lmplot}} for plotting of logical data at reduced resolutions, \code{\link{heatmap}} for ordered plotting of real-valued data, and package \pkg{gclus} for ordering functions. } \examples{ x <- matrix(sample(c(FALSE,TRUE),100,rep=TRUE),ncol=10) clmplot(x, order=TRUE, axes=TRUE) clmplot(x, col=rep(c(1,2),each=5)) clmplot(x, col=rep(c("red","blue"),each=5)) clmplot(x, col=rep(c("red","blue"),each=5), col.bycol=TRUE) \dontrun{ ### continue example (see rockCluster) col <- Votes$Class # color by party levels(col) <- c("red","blue") op <- par(mfrow=c(1,2), pty="s") clmplot(x, order=TRUE, col=as.character(col), main="Parties") col <- rf$cl # color by cluster levels(col) <- c("blue","red","green", "black") # map NA to black clmplot(x, order=TRUE, col=as.character(col), main="Clusters") par(op) } } \keyword{cluster} \keyword{hplot} cba/man/rockCluster.Rd0000644000175100001440000000676411304023136014365 0ustar hornikusers\name{rockCluster} \alias{rockCluster} \alias{rockLink} \title{Rock Clustering} \description{ Cluster a data matrix using the Rock algorithm. } \usage{ rockCluster(x, n, beta = 1-theta, theta = 0.5, fun = "dist", funArgs = list(method="binary"), debug = FALSE) rockLink(x, beta = 0.5) } \arguments{ \item{x}{a data matrix; for \code{rockLink} an object of class \code{dist}.} \item{n}{the number of desired clusters.} \item{beta}{optional distance threshold.} \item{theta}{neighborhood parameter in the range [0,1).} \item{fun}{distance function to use.} \item{funArgs}{a \code{list} of named parameter arguments to \code{fun}.} \item{debug}{turn on/off debugging output.} } \details{ The intended area of application is the clustering of binary (logical) data. For instance in a preprocessing step in data mining. However, arbitrary distance metrics could be used (see \code{\link[proxy:dist]{dist}}). According to the reference (see below) the distance threshold and the neighborhood parameter are coupled. Thus, higher values of the neighborhood parameter \code{theta} pose a tighter constraint on the neighborhood. For any two data points the latter is defined as the number of other data points that are neighbors to both. Further, points only are neighbors (or linked) if their distance is less than or equal \code{beta}. Note that for a tight neighborhood specification the algorithm may be running out of clusters to merge, i.e. may terminate with more than the desired number of clusters. The \code{debug} option can help in determining the proper settings by examining lines suffixed with a plus which indicates that non-singleton clusters were merged. Note that tie-breaking is not implemented, i.e. the first max encountered is used. However, permuting the order of the data can help in determining the dependence of a solution on ties. Function \code{rockLink} is provided for applications that need to compute link count distances efficiently. Note that \code{NA} and \code{NaN} distances are ignored but supplying such values for the threshold \code{beta} results in an error. } \value{ \code{rockCluster} returns an object of class \code{rock}, a list with the following components: \item{x}{the data matrix or a subset of it.} \item{cl}{a factor of cluster labels.} \item{size}{a vector of cluster sizes.} \item{beta}{see above.} \item{theta}{see above.} \code{rockLink} returns an object of class \code{dist}. } \references{ S. Guha, R. Rastogi, and K. Shim. ROCK: A Robust Clustering Algorithm for Categorical Attributes. \emph{Information Science}, Vol. 25, No. 5, 2000. } \author{Christian Buchta} \seealso{ \code{\link[proxy:dist]{dist}} for common distance functions, \code{\link{predict}} for classifying new data samples, and \code{\link{fitted}} for classifying the clustered data samples. } \examples{ ### example from paper data(Votes) x <- as.dummy(Votes[-17]) rc <- rockCluster(x, n=2, theta=0.73, debug=TRUE) print(rc) rf <- fitted(rc) table(Votes$Class, rf$cl) \dontrun{ ### large example from paper data("Mushroom") x <- as.dummy(Mushroom[-1]) rc <- rockCluster(x[sample(dim(x)[1],1000),], n=10, theta=0.8) print(rc) rp <- predict(rc, x) table(Mushroom$class, rp$cl) } ### real valued example gdist <- function(x, y=NULL) 1-exp(-dist(x, y)^2) xr <- matrix(rnorm(200, sd=0.6)+rep(rep(c(1,-1),each=50),2), ncol=2) rcr <- rockCluster(xr, n=2, theta=0.75, fun=gdist, funArgs=NULL) print(rcr) } \keyword{cluster} % cba/man/sdists.center.Rd0000644000175100001440000000231311703753613014654 0ustar hornikusers\name{sdists.center} \alias{sdists.center} \title{Centroid Sequences} \description{ Find centroid sequences among a collection of sequences. } \usage{ sdists.center(x, d = NULL, method = "ow", weight = c(1, 1, 0, 2), exclude = c(NA, NaN, Inf, -Inf), FUN = NULL, ..., unique = FALSE) } \arguments{ \item{x}{a list (of vectors) of a vector of character.} \item{d}{a matrix or an object of class \code{dist}.} \item{method}{argument to \code{sdists}.} \item{weight}{argument to \code{sdists}.} \item{exclude}{argument to \code{sdists}.} \item{FUN}{a function to rank distances.} \item{\dots}{additional arguments to \code{FUN}.} \item{unique}{a logical specifying whether to return a unique set of sequences.} } \details{ This function provides a wrapper to computing the distances among the sequences in \code{x}, unless \code{d} is supplied, and the subsequent selection of a set of centroid sequences with minimum sum of distances to any other sequence. } \value{ A subset of \code{x}. } \author{Christian Buchta} \seealso{\code{\link{sdists}} for distance computation.} \examples{ x <- c("ABCD", "AD", "BCD", "ACF", "CDF", "BC") sdists.center(x) } \keyword{cluster} cba/man/rlbmat.Rd0000644000175100001440000000176211304023136013337 0ustar hornikusers\name{rlbmat} \alias{rlbmat} \title{Block Uniform Logical Matrix Deviates} \description{ Generate a uniform logical matrix deviate with a possibly overlapping block structure. } \usage{ rlbmat(npat = 4, rows = 20, cols = 12, over = 4, noise = 0.01, prob = 0.8, perfect = FALSE) } \arguments{ \item{npat}{number of patterns.} \item{rows}{number of rows per pattern.} \item{cols}{number of columns per pattern.} \item{over}{number of additional columns per pattern that overlap.} \item{noise}{the probability of observing a one in the background (non-pattern) matrix.} \item{prob}{the probability of observing \code{TRUE} in a pattern.} \item{perfect}{option for overlap of the first and the last pattern.} } \details{ Implements a test case for \code{proximus}. } \value{ A logical matrix } \author{Christian Buchta} %\note{} \seealso{ \code{\link{lmplot}} and \code{\link{clmplot}} for plotting a logical matrix } \examples{ x <- rlbmat() lmplot(x) } \keyword{cluster} cba/man/lminter.Rd0000644000175100001440000000237511304023136013531 0ustar hornikusers\name{lminter} \alias{lminter} \title{Interpolating Logical Matrices} \description{ Interpolate a logical matrix into a lower-resolution representation. } \usage{ lminter(x, block.size = 1, nbin = 0) } \arguments{ \item{x}{a logical matrix.} \item{block.size}{the interpolation block size.} \item{nbin}{the number of density bins.} } \details{ Partitions a binary matrix into square blocks of specified size (length) and interpolates the number of \code{TRUE} values per block using the specified number of bins. Note that the effective number of bins is one greater than the specified number because the zero bin is always included. Excess rows and columns at the lower or right margins of the matrix are ignored. If the number of bins is null counts are mapped to zero and one thresholding at half of the number of distinct count values including zero. Thus, for even numbered block sizes there is a bias towards zero. } \value{ An integer matrix of bin numbers. } \author{Christian Buchta} \note{Package internal function.} \seealso{\code{\link{lmplot}} for plotting logical matrices.} \examples{ \dontrun{ x <- matrix(sample(c(FALSE, TRUE), 4 ,rep=TRUE), ncol=2) lminter(x, block.size=2, nbin=2) }} \keyword{cluster} \keyword{hplot} cba/man/gknn.Rd0000644000175100001440000000424011304023136013005 0ustar hornikusers\name{gknn} \alias{gknn} \title{Generalized k-Nearest Neighbor Classification} \description{ Compute the k-nearest neighbor classification given a matrix of cross-distances and a factor of class values. For each row the majority class is found, where ties are broken at random (default). If there are ties for the kth nearest neighbor, all candidates are included in the vote (default). } \usage{ gknn(x, y, k = 1, l = 0, break.ties = TRUE, use.all = TRUE, prob = FALSE) } \arguments{ \item{x}{a cross-distances matrix.} \item{y}{a factor of class values of the columns of \code{x}.} \item{k}{number of nearest neighbors to consider.} \item{l}{minimum number of votes for a definite decision.} \item{break.ties}{option to break ties.} \item{use.all}{option to consider all neighbors that are tied with the kth neighbor.} \item{prob}{optionally return proportions of winning votes.} } \details{ The rows of the cross-distances matrix are interpreted as referencing the test samples and the columns as referencing the training samples. The options are fashioned after \code{knn} in package \pkg{class} but are extended for tie breaking of votes, e.g. if only definite (majority) votes are of interest. Missing class values are not allowed because that would collide with a missing classification result. Missing distance values are ignored but with the possible consequence of missing classification results. Note that this depends on the options settings, e.g. } \value{ Returns a factor of class values (of the rows of \code{x}) which may be \code{NA} in the case of doubt (no definite decision), ties, or missing neighborhood information. The proportions of winning votes are returned as attribute \code{prob} (if option \code{prob} was used). } \author{Christian Buchta} \seealso{\code{\link[proxy:dist]{dist}} for efficient computation of cross-distances.} \examples{ \dontrun{ ### extend Rock example data(Votes) x <- as.dummy(Votes[-17]) rc <- rockAll(x, n=2, m=100, theta=0.73, predict=FALSE, debug=TRUE) gc <- gknn(dist(x, rc$y, method="binary"), rc$cl, k=3) table(gc[rc$s], rc$cl) } } \keyword{cluster} \keyword{classif} cba/man/order.length.Rd0000644000175100001440000000233511304023136014446 0ustar hornikusers\name{order.length} \alias{order.length} \title{Conciseness of Presentation Measures} \description{ Compute the length of a Hamilton path through a distance matrix. } \usage{ order.length(dist, order) } \arguments{ \item{dist}{an object of class \code{dist}.} \item{order}{an optional permutation of the row (column) indexes.} } \details{ Ordering a distance matrix such that low distance values are placed close to the diagonal may improve its presentation. The length of an order is the corresponding objective measure. The order corresponds to a path through a graph where each node is visited only once, i.e. a Hamilton path. The length of a path is defined as the sum of the edge weights, i.e. distances. If \code{order} is missing the identity order is used. If \code{order} is not unique \code{NA} is returned. If there are non-finite distance values \code{NA} is returned. } \value{ A scalar real value. } \references{ R. Sedgewick. (2002). \emph{Algorithms in C. Part 5. Graph Algorithms}. 3rd Edition, Addison-Wesley. } \author{Christian Buchta} \examples{ d <- dist(matrix(runif(10),ncol=2)) order.length(d) o <- sample(5,5) # random order order.length(d, o) } \keyword{hplot} \keyword{cluster} cba/man/cluster.dist.Rd0000644000175100001440000000264411304023136014501 0ustar hornikusers\name{cluster.dist} \alias{cluster.dist} \title{Clustering a Sparse Symmetric Distance Matrix} \description{ Compute a clustering on a sparse symmetric distance matrix using graph cutting. } \usage{ cluster.dist(x, beta) } \arguments{ \item{x}{an object of class \code{dist}.} \item{beta}{the distance threshold.} } \details{ This function computes a clustering on an object of class \code{dist} by cutting the graph induced by the threshold \code{beta} into all disconnected subgraphs (the clusters). Two nodes are connected by a link if their distance is below the specified threshold. Note that the threshold is not strict, i.e. \code{>=}. Note that distances of value \code{NA} and \code{NaN} are ignored. This is not strictly correct but avoids computing \eqn{2^k} possible solutions if there are k \code{NA} values. The time complexity is \eqn{O(n^2)} with n the number of rows/columns. } \value{ A factor of cluster labels (indexed 1,2,\dots,k). } %\references{ % fixme %} \author{Christian Buchta} \note{ Fixme: can the time complexity be improved? } \seealso{ \code{\link{dist}} and \code{\link{sdists}} for distance computation.} \examples{ ## 3 clusters (1 = connected) x <- matrix(c(1,1,0,0,0,0, 1,1,0,0,0,0, 0,0,1,1,0,0, 0,0,1,1,0,0, 0,0,0,0,1,1, 0,0,0,0,1,1), ncol=6) c <- cluster.dist(as.dist(!x), beta = 0) # invert and note that 0 >= 0 c } \keyword{cluster} cba/man/ccfkms.Rd0000644000175100001440000000714711304023136013327 0ustar hornikusers\name{ccfkms} \alias{ccfkms} \title{Clustering with Conjugate Convex Functions} \description{ Partition a data set into convex sets using conjugate convex functions. } \usage{ ccfkms(x, n, p = NULL, par = 2, max.iter = 100, opt.std = FALSE, opt.retry = 0, debug = FALSE) } \arguments{ \item{x}{a data matrix.} \item{n}{optional number of prototypes.} \item{p}{a matrix of initial prototypes.} \item{par}{type or parameter of conjugate convex function.} \item{max.iter}{maximum number of iterations.} \item{opt.std}{optionally standardize the data.} \item{opt.retry}{number of retries.} \item{debug}{optionally turn on debugging output.} } \details{ Two types of conjugate convex functions are available: one that is based on powers of the norm of the prototype vectors and another that is based on a logarithmic transformation of the norm. Both are intended to obtain more robust partitions. Using \code{par} = 2 is equivalent to performing ordinary k-means with Euclidean distances. \code{par} = 1 is equivalent to LVQ of Kohonen type (the directions of the prototypes from the center of the data are used), and \code{par} = 0 is equivalent to using 2*ln(cosh(|p|))/2. Internally the algorithm uses sparse data structures and avoids computations with zero data values. Thus, the data must not be centered (the algorithm does this internally with the option to further standardize the data). For dense data this is slightly inefficient. If initial prototypes are omitted the number of prototypes must be specified. In this case the initial prototypes are drawn from the data (without replacement). If the number of retries is greater than zero the best among that number of trial solutions is returned. Note that the number of prototypes must be specified as the initial prototypes are sampled from the data. The debugging output shows the iteration number, the inverted information and the variance of the current partition as a percentage of the total (if each data point were a cluster), and the number of active prototypes (those with at least one member, i.e. a data point that is not closer to any other prototype). Note that the algorithm uses tie-breaking when it determines the cluster memberships of the samples. } \value{ A list with the following components: \item{centers}{a matrix of cluster means (final prototypes).} \item{size}{a vector of cluster sizes.} \item{cl}{a factor of cluster labels (indexes).} \item{inv.inf}{the inverted information of the partition.} \item{par}{see above.} \item{opt.std}{see above.} } \references{ Helmut Strasser and Klaus Poetzelberger. Data Compression by Unsupervised Classification. \emph{SFB Report Series}, No. 10, 1997. } \author{Christian Buchta} \note{ Support for data matrices \code{x} in sparse \code{dgTMatrix} and \code{dgCMatrix} format (see package \pkg{Matrix}) is experimental. Support for the \code{dgRMatrix} format is currently suspended due to problems with package \pkg{Matrix}. } \seealso{ \code{\link{kmeans}}, \code{cmeans}, \code{kkmeans} for similar or related clustering techniques. } \examples{ ### extend proximus example x <- rlbmat() rownames(x) <- seq(dim(x)[1]) cm <- ccfkms(x, n=4, opt.retry=10) pcm <- predict(cm, x) \dontrun{ ### using sparse data may be more time-efficient ### depending on the goodness of the implementation ### of subset, etc. in package Matrix. require(Matrix) #sx <- as(x, "dgRMatrix") # currently broken sx <- as(x, "dgCMatrix") system.time(scm <- ccfkms(sx, n=4, opt.retry=50)) system.time(cm <- ccfkms(x, n=4, opt.retry=50)) } } \keyword{cluster} cba/man/predict.rock.Rd0000644000175100001440000000330011304023136014433 0ustar hornikusers\name{predict.rock} \alias{predict.rock} \alias{fitted.rock} \title{Rock Clustering} \description{ Classify the rows of a data matrix using the Rock classifier. } \usage{ \method{predict}{rock}(object, x, drop = 1, ...) \method{fitted}{rock}(object, ...) } \arguments{ \item{object}{an object of class \code{rock}.} \item{x}{a data matrix containing test or new samples.} \item{drop}{cluster size threshold.} \item{\dots}{further (unused) arguments.} } \details{ Provides a wrapper function to the Rock classifier: cluster memberships of the (row) samples are determined by majority vote using the size (of the cluster) weighted number of links. The classifier uses random tie-breaking and assigns samples with zero link counts to \code{NA}. This allows for detection of possible outliers, or interesting patterns. In the case a \code{drop} value greater than zero is specified, all clusters with size equal or less than this value are removed from the classifier. Especially, \code{fitted} uses a threshold of one because for singleton clusters the neighborhood is empty. Note that for the training data the predicted memberships need not necessarily be the same as those obtained from the cluster algorithm. } \value{ A list with the following components: \item{cl}{a factor of cluster memberships of the samples.} \item{size}{a vector of cluster sizes.} } \seealso{ \code{\link{rockCluster}} for obtaining a rock classifier and \code{\link{gknn}} for k-nearest neighbor classification. } \examples{ ### example from paper data(Votes) x <- as.dummy(Votes[-17]) rc <- rockCluster(x, n=2, theta=0.73, debug=TRUE) rp <- predict(rc, x) table(rp$cl) } \keyword{cluster} %% cba/man/stress.Rd0000644000175100001440000000432311633160276013411 0ustar hornikusers\name{stress} \alias{stress} \alias{stress.dist} \title{Conciseness of Presentation Measures} \description{ Compute different types of conciseness of presentation measures. } \usage{ stress(x, rows = NULL, cols = NULL, type = "moore") stress.dist(x, rows = NULL, cols = NULL, bycol = FALSE, type = "moore") } \arguments{ \item{x}{a matrix object.} \item{rows}{a subscript vector indexing the rows.} \item{cols}{a subscript vector indexing the columns.} \item{bycol}{logical for computation over the columns.} \item{type}{the type of neighborhood to use.} } \details{ Function \code{stress} computes the sum of squared distances of each matrix entry from its adjacent entries. The following types of neighborhoods are available: \describe{ \item{\code{moore}:}{comprises the eight adjacent entries (five at the margins and three at the corners).} \item{\code{neumann}:}{comprises the four adjacent entries (three at the margins and two at the corners).} } Function \code{stress.dist} computes the auto-distance matrix for each pair of rows (or columns) given one of the above stress measures. Note that the result depends on the ordering of the unused dimension. As the computation can be reduced to summing the edge distances between any two neighboring points, only half of the value of the proposed measures is reported. Row and/or column indexes (or labels) can be supplied to test specific orderings, as well as subsets of indexes (labels). Note that the matrix should be normalized so that the distance computation is meaningful. } \value{ \code{stress} returns a scalar real, i.e. half of the global stress measure. \code{stress.dist} returns an object of class \code{\link{dist}}, i.e. a lower triangular matrix in column format. } %\references{} \author{Christian Buchta} %\note{} \seealso{ \code{\link[proxy:dist]{dist}} for general distance computation. } \examples{ ## x1 <- matrix(sample(c(FALSE,TRUE),25,rep=TRUE),ncol=5) stress(x1) stress(x1, type="neumann") ## x2 <- cbind(rbind(matrix(1,4,4),matrix(0,4,4)), rbind(matrix(0,4,4),matrix(1,4,4))) stress.dist(x2) stress.dist(x2, bycol=TRUE) stress.dist(x2, type="neumann") } \keyword{hplot} \keyword{cluster} cba/man/plot.sdists.graph.Rd0000644000175100001440000000463411703753613015462 0ustar hornikusers\name{plot.sdists.graph} \alias{plot.sdists.graph} \title{Plotting Edit Transcripts and Sequence Alignments} \description{ Function for visualizing the optimal transformation (alignment) graph for two symbol sequences. } \usage{ \method{plot}{sdists.graph}(x, circle.col = 1, graph.col = 2, circle.scale = c("mean", "max", "last", "text"), main = "", ...) } \arguments{ \item{x}{an object of class \code{sdists.graph.}} \item{circle.col}{color to be used for circles.} \item{graph.col}{color to be used for the graph.} \item{circle.scale}{scaling to be used for circles.} \item{main}{plot title.} \item{\dots}{further unused arguments.} } \details{ This function plots the dynamic programming table, the (back)pointers and the combined graph of optimal edit transcripts (alignments) computed with \code{sdist.trace}. The fist sequence is represented by the y-axis and the second by the x-axis. The circumference of a circle is proportional to the minimum cost (maximum weight) of edit (alignment) operations leading to a table cell. \code{circle.scale} specifies the type of normalization performed where \code{last} means the last table entry (containing the optimum value), and \code{text} draws the values instead of circles. The (back)pointers, defining possible (traceback) paths, are plotted as \code{dotted} edges. Note that a traceback starts in the last cell of the table and ends at the origin. For the edges of the graph that is the union of all optimal paths, two line types are used: \code{solid} for insert, delete, and replace operations, and \code{dashed} for a match. The line width indicates the number of times an edge is on a path, but note that the interpretation is device-specific (compare \code{\link{par}}). } %\value{ %} \references{ D. Gusfield (1997). \emph{Algorithms on Strings, Trees, and Sequences}. Cambridge University Press, Chapter 11. Inspired by: \url{http://home.uchicago.edu/~aabbott/}. } \author{Christian Buchta} \note{ Some issues with grid were fixed in R.2.4.x (Fixme ?). } \seealso{\code{\link{sdists.trace}}, \code{\link{plot}}} \examples{ ## continue example in sdists.trace x1 <- "vintner" y1 <- "writers" b11 <- sdists.trace(x1, y1, weight=c(1,1,0,1), graph = TRUE) b11 plot(b11) plot(b11, circle.scale = "text") ## partial b12 <- sdists.trace(x1, y1, weight=c(1,1,0,1), graph = TRUE, partial = TRUE) b12 plot(b12) } \keyword{cluster} cba/man/sdists.center.align.Rd0000644000175100001440000000366311703753613015756 0ustar hornikusers\name{sdists.center.align} \alias{sdists.center.align} \title{Align Sequences to a Center} \description{ Find a global alignment of a collection of sequences using the center-star-tree heuristic. } \usage{ sdists.center.align(x, center, method = "ow", weight = c(1, 1, 0, 2), exclude = c(NA, NaN, Inf, -Inf), break.ties = TRUE, transitive = FALSE, to.data.frame = FALSE) } \arguments{ \item{x}{a list (of vectors) or a vector of character.} \item{center}{a vector} \item{method}{argument to \code{sdists}.} \item{weight}{argument to \code{sdists}.} \item{exclude}{arguments to \code{sdists}.} \item{break.ties}{a logical specifying whether random tie-breaking should be performed. Otherwise the first alignment is used.} \item{transitive}{a logical specifying whether the sequences in \code{x} should be aligned with each other, too.} \item{to.data.frame}{a logical specifying whether the result should be converted to \code{data.frame}.} } \details{ Each component of \code{x} is aligned with \code{center} in turn such that the latter is aligned with all sequences processed so far. If \code{center} is missing \code{i\link{sdists.center}} is used to compute an initial center. } \value{ Either a list of sequences with attributes \code{center} and \code{ties}, or a \code{data.frame} with the sequences in the columns. } \references{ D. Gusfield (1997). \emph{Algorithms on Strings, Trees, and Sequences}. Cambridge University Press, Chapter XX. } \author{Christian Buchta} \note{ The global alignment may depend on the order of \code{x}. } \seealso{ \code{\link{sdists}} for computation of distances, \code{\link{sdists.center}} for computation of centroids. } \examples{ ## continue example x <- c("ABCD", "AD", "BCD", "ACF", "CDF", "BC") sdists.center.align(x) sdists.center.align(x, transitive = TRUE, to.data.frame = TRUE) } \keyword{cluster} cba/man/fitted.proximus.Rd0000644000175100001440000000202511304023136015213 0ustar hornikusers\name{fitted.proximus} \alias{fitted.proximus} \title{Extract from a Proximus Object} \description{ Get the full storage representation of the approximated matrix and the pattern labels of the original data samples from an object of class \code{proximus}. } \usage{ \method{fitted}{proximus}(object, drop = FALSE, ...) } \arguments{ \item{object}{an object of class \code{proximus}.} \item{drop}{optionally drop patterns that do not meet the mining criteria.} \item{\dots}{further (unused) arguments.} } \details{ If option \code{drop} is \code{TRUE} only patterns that satisfy the minimum size and maximum radius constraint are extracted. } \value{ A list with the following components: \item{x}{the fitted data matrix.} \item{pl}{a factor of pattern (cluster) labels. The indexes of the original data samples are provided as attribute \code{Index}.} } \author{Christian Buchta} \seealso{ \code{\link{proximus}} for pattern mining with the Proximus algorithm. } \examples{ ### see proximus } \keyword{cluster} cba/man/summary.proximus.Rd0000644000175100001440000000306411304023136015435 0ustar hornikusers\name{summary.proximus} \alias{summary.proximus} \alias{print.summary.proximus} \title{Summarizing Proximus Objects} \description{ \code{summary} method for an object of class \code{proximus}. } \usage{ \method{summary}{proximus}(object, ...) } \arguments{ \item{object}{an object of class \code{proximus}.} \item{\dots}{further (unused) arguments.} } \value{ An object of class \code{summary.proximus} with the following elements: \item{nr}{the number of rows of the approximated matrix.} \item{nc}{the number of columns of the approximated matrix.} \item{error}{the relative error of the total approximation.} \item{fnorm}{the Frobenius norm of the total approximation.} \item{jsim}{the Jaccard similarity of the total approximation.} \item{valid}{the number of patterns that satisfy the mining constraints.} \item{pattern}{a \code{data.frame} of pattern summaries:} \item{pattern$Size}{the absolute size of the presence set.} \item{pattern$Length}{the number of \code{TRUE} values of the dominant pattern.} \item{pattern$Radius}{the Hamming radius of the presence set.} \item{pattern$Error}{the relative error of the presence set.} \item{pattern$Fnorm}{the Frobenius norm of the presence set.} \item{pattern$Jsim}{the Jaccard similarity of the presence set.} \item{pattern$Valid}{a logical indicating if the constraints are satisfied.} } \author{Christian Buchta} \section{Warning}{The function may change in future releases} \seealso{ \code{\link{proximus}} for obtaining a result object. } \examples{ ### see proximus } \keyword{cluster} cba/man/cut.ordered.Rd0000644000175100001440000000173111304023136014270 0ustar hornikusers\name{cut.ordered} \alias{cut.ordered} \title{Converting Ordered Factors} \description{ Reduce the levels of an ordered factor. } \usage{ \method{cut}{ordered}(x, breaks, ...) } \arguments{ \item{x}{an ordered factor.} \item{breaks}{a logical, character, or index vector of cut points.} \item{\dots}{further (unused) arguments.} } \details{ If \code{breaks} is of class \code{logical} it must have the same length as the number of levels of \code{x}. If \code{breaks} is of class \code{character} partial matching with the levels of \code{x} is attempted. Otherwise \code{breaks} is assumed to index the levels. } \value{ An ordered factor. } \references{Functions Missing in R: A Never Ending Story ;-)} \author{Christian Buchta} %\note{} %\section{Warning }{} \seealso{ \code{\link{cut}} for converting numeric vectors to factor. } \examples{ x <- ordered(sample(letters[1:3],10,rep=TRUE)) cut(x, c(FALSE,TRUE,FALSE)) cut(x, "b") cut(x, 2) } \keyword{manip} cba/man/lmplot.Rd0000644000175100001440000000271311633160276013376 0ustar hornikusers\name{lmplot} \alias{lmplot} \title{Plotting Logical Matrices} \description{ Implements a wrapper function to \code{image} that produces a black and white or gray-scale plot of a logical matrix. } \usage{ lmplot(x, block.size = 1, gray = FALSE, xlab = "", ylab = "", axes = FALSE, ...) } \arguments{ \item{x}{a logical matrix.} \item{block.size}{the interpolation block size.} \item{gray}{optionally use a gray scale.} \item{xlab}{title for the x axis.} \item{ylab}{title for the y axis.} \item{axes}{option to plot axes.} \item{\dots}{further arguments to \code{image}.} } \details{ \code{TRUE} is represented by the color white and \code{FALSE} by the color black. A lower resolution can be obtained by specifying an (interpolation) block size greater than one. Block densities can then be visualized by using the gray scale option. The number of levels of the palette corresponds to the block size but is capped to 8 levels (excluding white). Note that the opacity (blackness) corresponds with density (as on photographic film). } \author{Christian Buchta} %\section{Warning}{} \seealso{ \code{\link{lminter}} for interpolating logical matrices and \code{\link{image}} for further plotting options} \examples{ ### x <- matrix(sample(c(FALSE, TRUE), 64, rep=TRUE), ncol=8) lmplot(x) ### use lower resolution lmplot(x, block.size=2) ### use gray scale lmplot(x, block.size=2, gray=TRUE) } \keyword{cluster} \keyword{hplot} cba/man/Votes.Rd0000644000175100001440000000610614630403262013162 0ustar hornikusers\name{Votes} \alias{Votes} \docType{data} \title{Congressional Votes 1984 Data Set} \description{ This data set includes votes for each of the U.S. House of Representatives Congressmen on the 16 key votes identified by the CQA. The CQA lists nine different types of votes: voted for, paired for, and announced for (these three simplified to yea), voted against, paired against, and announced against (these three simplified to nay), voted present, voted present to avoid conflict of interest, and did not vote or otherwise make a position known (these three simplified to an unknown disposition). } \usage{data(Votes)} \format{ A data frame with 435 observations on the following 17 variables. \describe{ \item{\code{handicapped-infants}}{a factor with levels \code{n} and \code{y}} \item{\code{water-project-cost-sharing}}{a factor with levels \code{n} and \code{y}} \item{\code{adoption-of-the-budget-resolution}}{a factor with levels \code{n} and \code{y}} \item{\code{physician-fee-freeze}}{a factor with levels \code{n} and \code{y}} \item{\code{el-salvador-aid}}{a factor with levels \code{n} and \code{y}} \item{\code{religious-groups-in-schools}}{a factor with levels \code{n} and \code{y}} \item{\code{anti-satellite-test-ban}}{a factor with levels \code{n} and \code{y}} \item{\code{aid-to-nicaraguan-contras}}{a factor with levels \code{n} and \code{y}} \item{\code{mx-missile}}{a factor with levels \code{n} and \code{y}} \item{\code{immigration}}{a factor with levels \code{n} and \code{y}} \item{\code{synfuels-corporation-cutback}}{a factor with levels \code{n} and \code{y}} \item{\code{education-spending}}{a factor with levels \code{n} and \code{y}} \item{\code{superfund-right-to-sue}}{a factor with levels \code{n} and \code{y}} \item{\code{crime}}{a factor with levels \code{n} and \code{y}} \item{\code{duty-free-exports}}{a factor with levels \code{n} and \code{y}} \item{\code{export-administration-act-south-africa}}{a factor with levels \code{n} and \code{y}} \item{\code{Class}}{a factor with levels \code{democrat} and \code{republican}} } } \details{ The records are drawn from: \emph{Congressional Quarterly Almanac}, 98th Congress, 2nd session 1984, Volume XL: Congressional Quarterly Inc. Washington, D.C., 1985. It is important to recognize that \code{NA} in this database does not mean that the value of the attribute is unknown. It means simply, that the value is not "yea" or "nay" (see above). The current version of the UC Irvine Machine Learning Repository Congressional Voting Records data set is available from \doi{10.24432/C5C01P}. Blake, C.L. & Merz, C.J. (1998). UCI Repository of Machine Learning Databases. Irvine, CA: University of California, Department of Information and Computer Science. Formerly available from \samp{http://www.ics.uci.edu/~mlearn/MLRepository.html}. } %\source{ %\url{https://archive.ics.uci.edu/dataset/105/congressional+voting+records} %} \examples{ data(Votes) summary(Votes) ## maybe str(Votes) ; plot(Votes) ... } \keyword{datasets} cba/man/townships.Rd0000644000175100001440000000254711304023136014116 0ustar hornikusers\name{townships} \alias{townships} \docType{data} \title{Bertin's Characteristics and Townships Data Set} \description{ This data set was used to illustrate that the conciseness of presentation can be improved by reordering the rows and columns. } \usage{data(townships)} \format{ A data frame with 16 observations on the following 10 variables. \describe{ \item{\code{Township}}{a factor with levels \code{A} \code{B} \code{C} \code{D} \code{E} \code{F} \code{G} \code{H} \code{I} \code{J} \code{K} \code{L} \code{M} \code{N} \code{O} \code{P}} \item{\code{High.School}}{a logical vector} \item{\code{Agricultural.Coop.}}{a logical vector} \item{\code{Railway.Station}}{a logical vector} \item{\code{One.Room.School}}{a logical vector} \item{\code{Veterinary}}{a logical vector} \item{\code{No.Doctor}}{a logical vector} \item{\code{No.Water.Supply}}{a logical vector} \item{\code{Police.Station}}{a logical vector} \item{\code{Land.Reallocation}}{a logical vector} } } \details{ \code{townships} is a data set with 16 \code{logical} variables indicating the presence (\code{TRUE}) or absence (\code{FALSE}) of characteristics of townships. } %\source{} \references{ Bertin, J. (1981) \emph{Graphics and Graphic Information Processing}. Berlin, Walter de Gruyter. } \examples{ ## see order.data.frame } \keyword{datasets} cba/man/sdists.trace.Rd0000644000175100001440000001245511703753613014502 0ustar hornikusers\name{sdists.trace} \alias{sdists.trace} \title{Edit Transcripts and Sequence Alignments} \description{ This function computes and returns the set of all optimal but equivalent edit transcripts that transforms one sequences into another at minimum cost, as well as the corresponding aligned sequences, or, alternatively a combined edit graph. } \usage{ sdists.trace(x, y, method = "ow", weight = c(1, 1, 0, 2), exclude = c(NA, NaN, Inf, -Inf), graph = FALSE, partial = FALSE) } \arguments{ \item{x,y}{a numeric or string vector.} \item{method}{a mnemonic string referencing a distance measure.} \item{weight}{vector or matrix of parameter values.} \item{exclude}{argument to factor.} \item{graph}{option to compute the combined edit graph.} \item{partial}{option to compute an approximate substring match.} } \details{ Function \code{sdists.trace} complements the distance computation between sequences by \code{\link{sdists}}. So, please, see the details of \code{method}, \code{weight}, and \code{exclude} there. However, note the following differences: 1) you can supply only two sequences, either as vectors of numeric symbol codes, factors, or as strings, i.e. scalar vectors of type \code{character}. 2) you can supply a weight matrix with the rownames and colnames representing the symbol sets of the first and second sequence. For instance, this allows you to align a sequence with the profile of a multiple alignment. 3) if \code{method = "ow"} the space symbol \code{""} is included in the factor levels so that you can conveniently replace \code{NA} in the aligned sequences. A transcript uses the character codes \code{I}, \code{D}, \code{R}, and \code{M}, for insert, delete, replace, and match operations, which transform the first into the second sequence. Thus, conceptually a symbol has to be inserted into the first, deleted from the second, replaced in the first sequence, or matched in both, to obtain the second sequence. However, in the aligned sequences you will see \code{NA}, where an insert or delete would take place, indicating space. In the case of a local alignment different symbols are used for the prefix and/or suffix of the alignment: \code{i}, \code{d}, and \code{?} for insert, delete, and replace or match operations. However, note that their sole purpose is to obtain a common representation of the two sequences. Finally, only alignments of maximal length are reported. The time complexity of finding a transcript is \eqn{O(n+m)} for two sequences of length n and m, respectively \eqn{O(n*m)} for the local alignment problem. However, note that the runtime for generating all transcripts can be \eqn{O((n*m)^3)} in the worst case. If \code{partial = FALSE} computes an approximate substring match of \code{x} (the pattern) in \code{y}, for \code{method = "ow"} only. Returns the subset of paths which require the maximum number of match and initial and final insert operations. } \value{ A list with components each a list of two factors, the aligned sequences. The names of the components are the edit transcripts, and the attribute \code{value} contains the minimum cost, i.e. the distance (or negative similarity). If \code{graph = TRUE} a vector of edit transcripts is returned with attributes \code{value}, \code{table}, \code{pointer}, and \code{graph}. The second contains the values of the dynamic programming table and the third a list of vectors \code{x0, y0, x1, y1} representing the (back)pointers. Similarly, the fourth attribute is a list of vectors \code{x0, y0, x1, y1, weight} representing the edge set of all optimal paths. That is, each tuple contains the \code{from} and \code{to} coordinates as used by \code{segments}, each representing a pair of indexes into the first and second sequence, and the number of times an edge occurs on a path. Note that the origin of the coordinate system (0,0) corresponds to the element of \code{table} indexed by (\code{""},\code{""}), where \code{""} indicates the space symbol. Thus, if used as subscripts the coordinates have to be offset by one. } \references{ D. Gusfield (1997). \emph{Algorithms on Strings, Trees, and Sequences}. Cambridge University Press, Chapter 11. } \author{Christian Buchta} \section{Warning}{The interface is experimental and may change in the future} \seealso{ \code{\link{sdists}} for computation of distances between sequences, \code{\link{segments}} for plotting of edge sets, \code{\link{plot.sdists.graph}} for visualizing alignments. } \examples{ ### from the book x1 <- "vintner" y1 <- "writers" b1 <- sdists.trace(x1, y1, weight=c(1,1,0,1)) b1 ## longest common subsequence ? sdists.trace("a","b", weight=c(0,0,-1,0)) ## from the book w2 <- matrix(-2,ncol=13,nrow=13) w2[1,] <- w2[,1] <- -1 diag(w2) <- c(0,rep(2,12)) x2 <- "pqraxabcstvq" y2 <- "xyaxbacsll" colnames(w2) <- c("",unique(strsplit(paste(x2, y2, sep = ""),"")[[1]])) b2 <- sdists.trace(x2, y2, method="awl", weight=w2) b2 ## alignment with different symbol sets x3 <- "121314" y3 <- "ABACAD" w3 <- matrix(-1,nrow=5,ncol=5) diag(w3) <- 0 rownames(w3) <- c("","1","2","3","4") colnames(w3) <- c("","A","B","C","D") b3 <- sdists.trace(x3, y3, method="aw", weight=w3) b3 ## partial b4 <- sdists.trace(x1, y1, weight=c(1,1,0,1), partial = TRUE) b4 } \keyword{cluster} cba/man/proximus.Rd0000644000175100001440000001040014332123330013732 0ustar hornikusers\encoding{utf-8} \name{proximus} \alias{proximus} \title{Proximus} \description{ Cluster the rows of a logical matrix using the Proximus algorithm. The compression rate of the algorithm can be influenced by the choice of the maximum cluster radius and the minimum cluster size. } \usage{ proximus(x, max.radius = 2, min.size = 1, min.retry = 10, max.iter = 16, debug = FALSE) } \arguments{ \item{x}{a logical matrix.} \item{max.radius}{the maximum number of bits a member in a row set may deviate from its dominant pattern.} \item{min.size}{the minimum split size of a row set.} \item{min.retry}{number of retries to split a pure rank-one approximation (translates into a resampling rate).} \item{max.iter}{the maximum number of iterations for finding a local rank-one approximation.} \item{debug}{optional debugging output.} } \details{ The intended area of application is the compression of high-dimensional binary data into representative patterns. For instance, purchase incidence (market basket data) or term-document matrices may be preprocessed by Proximus for later association rule mining. The algorithm is of a recursive partitioning type. Specifically, at each step a binary split is attempted using a local rank-one approximation of the current submatrix (row set). That is a specialization of principal components to binary data which represents a matrix as the outer product of two binary vectors. The node expansion stops if a submatrix is pure, i.e., the column (presence set) vector indicates all the rows and the Hamming distances from the row (dominant attribute set) pattern vector, or the size of the row set, are less than or equal the specified threshold. In the case the rank-one approximation does not result in a split but the radius constraint is violated, the matrix is split using a random row and the radius constraint. The debug option can be used to gain some insight into how the algorithm proceeds: a right angle bracket indicates a split and the return to a recursion level is indicated by a left one. Leafs in the recursion tree are indicated by an asterisk and retries by a plus sign. The number of retries is bounded by the size of the current set divided by \code{min.retry}. Double angle brackets indicate a random split (see above). The numbers between square brackets indicate the current set size, the size of the presence (sub)set, and its radius. The adjoining numbers indicate the depth of the recursion and the count of retries. Finally, a count of the leaf nodes found so far is shown to the right of an asterisk. } \value{ An object of class \code{proximus} with the following components: \item{nr}{the number of rows of the data matrix.} \item{nc}{the number of columns of the data matrix.} \item{a}{a list containing the approximations (patterns).} \item{a$x}{a vector of row (presence set) indexes.} \item{a$y}{a vector of column (dominant attribute set) indexes.} \item{a$n}{the number of ones in the approximated submatrix.} \item{a$c}{the absolute error reduction by the approximation.} \item{max.radius}{see arguments.} \item{min.size}{see arguments.} \item{rownames}{rownames of the data matrix.} \item{colnames}{colnames of the data matrix.} } \references{ M. Koyutürk, A. Graham, and N. Ramakrishnan. Compression, Clustering, and Pattern Discovery in Very High-Dimensional Discrete-Attribute Data Sets. \emph{IEEE Transactions On Knowledge and Data Engineering}, Vol. 17, No. 4, (April) 2005. } \author{Christian Buchta} \note{ The size of a set need not be equal or greater than the user defined threshold. } \section{Warning}{Deep recursions may exhaust your computer.} \seealso{ \code{\link{summary.proximus}} for summaries, \code{\link{fitted}} for obtaining the approximated matrix and the pattern labels of the samples, and \code{\link{lmplot}} for plotting logical matrices. } \examples{ x <- matrix(sample(c(FALSE, TRUE), 200, rep=TRUE), ncol=10) pr <- proximus(x, max.radius=8) summary(pr) ### example from paper x <- rlbmat() pr <- proximus(x, max.radius=8, debug=TRUE) op <- par(mfrow=c(1,2), pty="s") lmplot(x, main="Data") box() lmplot(fitted(pr)$x, main="Approximation") box() par(op) } \keyword{cluster} cba/man/Mushroom.Rd0000644000175100001440000001120314630404674013675 0ustar hornikusers\name{Mushroom} \alias{Mushroom} \docType{data} \title{Mushroom Data Set} \description{ A data set with descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family, classified according to their edibility as (definitely) \sQuote{edible} or \sQuote{poisonous} (definitely poisonous, or of unknown edibility and not recommended). } \usage{ data("Mushroom") } \format{ A data frame with 8124 observations on the following 23 variables. \describe{ \item{\code{class}}{a factor with levels \code{edible} and \code{poisonous}.} \item{\code{cap-shape}}{a factor with levels \code{bell}, \code{conical}, \code{convex}, \code{flat}, \code{knobbed}, \code{sunken}.} \item{\code{cap-surface}}{a factor with levels \code{fibrous}, \code{grooves}, \code{scaly}, \code{smooth}.} \item{\code{cap-color}}{a factor with levels \code{brown}, \code{buff}, \code{cinnamon}, \code{gray}, \code{green}, \code{pink}, \code{purple}, \code{red}, \code{white}, \code{yellow}.} \item{\code{bruises?}}{a factor with levels \code{bruises} and \code{no}.} \item{\code{odor}}{a factor with levels \code{almond}, \code{anise}, \code{creosote}, \code{fishy}, \code{foul}, \code{musty}, \code{none}, \code{pungent}, \code{spicy}.} \item{\code{gill-attachment}}{a factor with levels \code{attached} and \code{free}. } \item{\code{gill-spacing}}{a factor with levels \code{close} and \code{crowded}.} \item{\code{gill-size}}{a factor with levels \code{broad} and \code{narrow}.} \item{\code{gill-color}}{a factor with levels \code{black}, \code{brown}, \code{buff}, \code{chocolate}, \code{gray}, \code{green}, \code{orange}, \code{pink}, \code{purple}, \code{red}, \code{white}, and \code{yellow}.} \item{\code{stalk-shape}}{a factor with levels \code{enlarging} and \code{tapering}.} \item{\code{stalk-root}}{a factor with levels \code{bulbous}, \code{club}, \code{equal}, and \code{rooted}. } \item{\code{stalk-surface-above-ring}}{a factor with levels \code{fibrous}, \code{scaly}, \code{silky}, and \code{smooth}.} \item{\code{stalk-surface-below-ring}}{a factor with levels \code{fibrous}, \code{scaly}, \code{silky}, and \code{smooth}.} \item{\code{stalk-color-above-ring}}{a factor with levels \code{brown}, \code{buff}, \code{cinnamon}, \code{gray}, \code{orange}, \code{pink}, \code{red}, \code{white}, and \code{yellow}.} \item{\code{stalk-color-below-ring}}{a factor with levels \code{brown}, \code{buff}, \code{cinnamon}, \code{gray}, \code{orange}, \code{pink}, \code{red}, \code{white}, and \code{yellow}.} \item{\code{veil-type}}{a factor with levels \code{partial}.} \item{\code{veil-color}}{a factor with levels \code{brown}, \code{orange}, \code{white}, and \code{yellow}.} \item{\code{ring-number}}{a factor with levels \code{one}, \code{one}, and \code{two}.} \item{\code{ring-type}}{a factor with levels \code{evanescent}, \code{flaring}, \code{large}, \code{none}, and \code{pendant}.} \item{\code{spore-print-color}}{a factor with levels \code{black}, \code{brown}, \code{buff}, \code{chocolate}, \code{green}, \code{orange}, \code{purple}, \code{white}, and \code{yellow}.} \item{\code{population}}{a factor with levels \code{abundant}, \code{clustered}, \code{numerous}, \code{scattered}, \code{several}, and \code{solitary}.} \item{\code{habitat}}{a factor with levels \code{grasses}, \code{leaves}, \code{meadows}, \code{paths}, \code{urban}, \code{waste}, and \code{woods}.} } } \details{ The records are drawn from G. H. Lincoff (1981) (Pres.), \emph{The Audubon Society Field Guide to North American Mushrooms}. New York: Alfred A. Knopf. (See pages 500--525 for the Agaricus and Lepiota Family.) The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like \dQuote{leaflets three, let it be} for Poisonous Oak and Ivy. Unused levels in the original data were dropped. The current version of the UC Irvine Machine Learning Repository Mushroom data set is available from \doi{10.24432/C5959T} Blake, C.L. & Merz, C.J. (1998). UCI Repository of Machine Learning Databases. Irvine, CA: University of California, Department of Information and Computer Science. Formerly available from \samp{http://www.ics.uci.edu/~mlearn/MLRepository.html}. } %\source{ %\url{https://archive.ics.uci.edu/dataset/73/mushroom} %} \examples{ data("Mushroom") summary(Mushroom) } \keyword{datasets} cba/man/circleplot.dist.Rd0000644000175100001440000000217511633160276015173 0ustar hornikusers\name{circleplot.dist} \alias{circleplot.dist} \title{Plotting Distance Graphs} \description{ Function for visualizing distance graphs using a circular layout. } \usage{ circleplot.dist(x, cutoff = 0.5, col = 1, circle = FALSE, scale = 1.4) } \arguments{ \item{x}{an object of class \code{dist}.} \item{cutoff}{a numeric value specifying the threshold for edge exclusion.} \item{col}{a number or string specifying the edge color to use.} \item{circle}{a logical value specifying if a circle connecting the nodes should be drawn.} \item{scale}{a numeric value specifying the plot range (the default accommodates node labels).} } \details{ Plots the distance graph of \code{x} placing its nodes on a circle such that the number of crossing edges is approximately minimized. This is achieved by using \code{order.dist} for seriation. } %\value{} %\references{} \author{Christian Buchta} %\note{} \seealso{\code{\link{order.dist}}.} \examples{ ## data(iris) d <- dist(iris[,-5])[[1:26]] circleplot.dist(d, col = 2, scale = 1) dimnames(d) <- LETTERS[1:26] circleplot.dist(d) } \keyword{hplot} \keyword{cluster} cba/man/coding.Rd0000644000175100001440000000310313037611552013322 0ustar hornikusers\name{coding} \alias{as.dummy} \alias{as.dummy.matrix} \alias{as.dummy.data.frame} \title{Dummy Coding} \description{ Functions that convert R objects to a dummy coded matrix (or a list of matrices). } \usage{ as.dummy(x, ...) \method{as.dummy}{matrix}(x, sep = " ", drop = FALSE, ...) \method{as.dummy}{data.frame}(x, sep = " ", drop = FALSE, ...) } \arguments{ \item{x}{an R object (see below).} \item{sep}{separator used for construction of colnames.} \item{drop}{drop factors with less than two levels.} \item{\dots}{other (unused) arguments.} } \details{ The generic is applicable to factor and to R objects that can be converted to factor, i.e. \code{logical}, \code{integer}, or \code{character}. For numeric data a discretization method has to be used. A factor is converted to as many logical variables as there are levels where the value \code{TRUE} indicates the presence of a level. The colnames are made of the concatenation of a variable name and the level, separated by \code{sep}. For \code{matrix} and \code{data.frame} variable names are created if necessary. A value of \code{NA} is mapped to \code{FALSE} across all levels. } \value{ A \code{matrix} with a \code{levels} attribute which contains a list of the levels of the coded variables. } \author{Christian Buchta} \section{Warning}{This is experimental code which may change in the future.} \seealso{\code{\link{as.logical}}.} \examples{ ### x <- as.integer(sample(3,10,rep=TRUE)) as.dummy(x) is.na(x) <- c(3,5) as.dummy(x) x <- as.data.frame(x) as.dummy(x) } \keyword{cluster} cba/man/predict.ccfkms.Rd0000644000175100001440000000213211304023136014745 0ustar hornikusers\name{predict.ccfkms} \alias{predict.ccfkms} \title{Clustering with Conjugate Convex Functions.} \description{ Classify the rows of a data matrix using conjugate convex functions. } \usage{ \method{predict}{ccfkms}(object, x, drop = 1, ...) } \arguments{ \item{object}{an object of class \code{ccfkms}.} \item{x}{a data matrix containing test or new samples.} \item{drop}{cluster size threshold.} \item{\dots}{other (unused) arguments.} } \details{ This is a wrapper to \code{\link{ccfkms}} which uses a single iteration for classifying the data. In the case a \code{drop} value greater than zero is specified, all clusters with size equal or less than this value are removed from the classifier. } \value{ A list with the following components: \item{centers}{a matrix of cluster means.} \item{size}{a vector of cluster sizes.} \item{cl}{a factor of cluster labels (indexes).} \item{inv.inf}{the inverted information of the partition.} } \author{Christian Buchta} %\note{} \seealso{ \code{\link{ccfkms}} for obtaining a classifier.} \examples{ ### see ccfkms } \keyword{cluster} cba/man/order.greedy.Rd0000644000175100001440000000354411633160276014463 0ustar hornikusers\name{order.greedy} \alias{order.greedy} \title{Hierarchical Greedy Ordering} \description{ Compute a hierarchical greedy ordering of a data matrix. } \usage{ order.greedy(dist) } \arguments{ \item{dist}{an object of class \code{dist}.} } \details{ A single cluster is constructed by merging in each step the leaf closest to one of the two endpoints of the cluster. The algorithm starts with a random leaf and uses tie-breaking. Clearly, the algorithm is more an ordering than a cluster algorithm. However, it constructs a binary merge tree so that the linear ordering of its leaves could be further improved. } \value{ A list with the following components: \item{merge}{a matrix containing the merge tree.} \item{order}{a vector containing the leaf ordering.} \item{height}{a vector containing the merge heights.} } \references{ F. Murtagh (1985). Multidimensional Cluster Algorithms. \emph{Lectures in Computational Statistics}, Physica Verlag, pp. 15. } \author{Christian Buchta} \note{The merge heights may not be monotonic.} \seealso{ \code{\link{hclust}} for hierarchical clustering, \code{\link{order.optimal}} for optimal leaf ordering, and \code{\link{order.length}} for computing the objective value of a leaf ordering. } \examples{ d <- dist(matrix(runif(20), ncol=2)) hc <- hclust(d) co <- order.optimal(d, hc$merge) md <- -as.dist(crossprod(as.matrix(d, diag = 0))) # Murtagh's distances hg <- order.greedy(md) go <- order.optimal(md, hg$merge) ### compare images op <- par(mfrow=c(2,2), pty="s") implot(d[[hc$order]], main="hclust") implot(d[[co$order]], main="hlcust + optimal") implot(d[[hg$order]], main="greedy") implot(d[[go$order]], main="greedy + optimal") par(op) # compare lengths order.length(d, hc$order) order.length(d, co$order) order.length(d, hg$order) order.length(d, go$order) } \keyword{hplot} \keyword{cluster} cba/man/sdists.Rd0000644000175100001440000001105711703753613013402 0ustar hornikusers\name{sdists} \alias{sdists} \title{Sequence Distance Computation} \description{ This function computes and returns the auto-distance matrix between the vectors of a list or between the character strings of a vector treating them as sequences of symbols, as well as the cross-distance matrix between two such lists or vectors. } \usage{ sdists(x, y = NULL, method = "ow", weight = c(1, 1, 0, 2), exclude = c(NA, NaN, Inf, -Inf), pairwise = FALSE) } \arguments{ \item{x,y}{a list (of vectors) or a vector of character.} \item{method}{a mnemonic string referencing a distance measure.} \item{weight}{vector or matrix of parameter values.} \item{exclude}{argument to factor.} \item{pairwise}{compute distances for the parallel pairs of \code{x} and \code{y} only.} } \details{ This function provides a common interface to different methods for computation of distances between sequences, such as the edit a.k.a. Levenshtein distance. Conversely, in the context of sequence alignment the similarity of the maximizing alignment is computed. Note that negative similarities are returned as distances. So be careful to use a proper weighting (scoring) scheme. The following methods are currently implemented: \describe{ \item{\code{ow}:}{operation-weight edit distance. Weights have to be specified for deletion, insertion, match, and replacement. Other weights for initial operations can be specified as \code{weight[5:6]}.} \item{\code{aw}:}{alphabet-weight sequential alignment similarity. A matrix of weights (scores) for all possible symbol replacements needs to be specified with the convention that the first row/column defines the replacement with the empty (space) symbol. The colnames of this matrix are used as the levels argument for the encoding as \code{factor}. Consequently, unspecified symbols are mapped to \code{NA}.} \item{\code{awl}:}{alphabet-weight local sequential alignment similarity. The weight matrix must be as described above. However, note that zero acts as threshold for a 'restart' of the search for a local alignment and at the same time indicates that the solution is the empty substring. Thus, you normally would use non-negative scores for matches and non-positive weights otherwise.} } Missing (and non-finite) values should be avoided, i.e. either be removed or recoded (and appropriately weighted). By default they are excluded when coercing to factor and therefore mapped to \code{NA}. The result is then defined to be \code{NA} as we cannot determine a match! The time complexity is O(n*m) for two sequences of length n and m. Note that in the case of auto-distances the weight matrix must be (exactly) symmetric. Otherwise, for asymmetric weights \code{y} must not be \code{NULL}. For instance, \code{x} may be supplied twice (see the examples). } \value{ Auto distances are returned as an object of class \code{dist} and cross-distances as an object of class \code{matrix}. } \references{ D. Gusfield (1997). \emph{Algorithms on Strings, Trees, and Sequences}. Cambridge University Press, Chapter 11. } \author{Christian Buchta} \section{Warning}{The interface is experimental and may change in the future} \seealso{ \code{\link{sdists.trace}} for computation of edit transcripts and sequence alignments, \code{\link[proxy:dist]{dist}} for computation of common distances, \code{\link{agrep}} for searches for approximate matches. } \examples{ ### numeric data sdists(list(c(2,2,3),c(2,4,3))) # 2 sdists(list(c(2,2,3),c(2,4,3)),weight=c(1,1,0,1)) # 1 ### character data w <- matrix(-1,nrow=8,ncol=8) # weight/score matrix for diag(w) <- 0 # longest common subsequence colnames(w) <- c("",letters[1:7]) x <- sapply(rbinom(3,64,0.5),function(n,x) paste(sample(x,n,rep=TRUE),collapse=""), colnames(w)[-1]) x sdists(x,method="aw",weight=w) sdists(x,x,method="aw",weight=w) # check ## pairwise sdists(x,rev(x),method="aw",weight=w,pairwise = TRUE) diag(w) <- seq(0,7) sdists(x,method="aw", weight=w) # global alignment sdists(x,method="awl",weight=w) # local alignment ## empty strings sdists("", "FOO") sdists("", list(c("F","O","O"))) sdists("", list("")) # space symbol sdists("", "abc", method="aw", weight=w) sdists("", list(""), method="aw", weight=w) ### asymmetric weights w[] <- matrix(-sample(0:5,64,TRUE),ncol=8) diag(w) <- seq(0,7) sdists(x,x,method="aw", weight=w) sdists(x,x,method="awl",weight=w) ### missing values sdists(list(c(2,2,3),c(2,NA,3)),exclude=NULL) # 2 (include anything) sdists(list(c(2,2,3),c(2,NA,3)),exclude=NA) # NA } \keyword{cluster} cba/DESCRIPTION0000644000175100001440000000113714630611511012523 0ustar hornikusersPackage: cba Type: Package Title: Clustering for Business Analytics Version: 0.2-24 Author: Christian Buchta and Michael Hahsler Maintainer: Christian Buchta Description: Implements clustering techniques such as Proximus and Rock, utility functions for efficient computation of cross distances and data manipulation. Depends: R (>= 2.10), grid, proxy Imports: stats, graphics, grDevices, methods Suggests: gclus, colorspace Encoding: UTF-8 License: GPL-2 NeedsCompilation: yes Packaged: 2024-06-06 19:23:31 UTC; buchta Repository: CRAN Date/Publication: 2024-06-07 14:03:53 UTC