# CVST/0000755000176000001440000000000012251616014011057 5ustar ripleyusers
# CVST/NAMESPACE0000644000176000001440000000003712133035115012272 0ustar ripleyusers
# exportPattern("^[[:alpha:]]+")
# CVST/R/0000755000176000001440000000000012251610222011253 5ustar ripleyusers
# CVST/R/methods.R0000644000176000001440000000736512251610222013054 0ustar ripleyusers

# Build a CVST.learner around kernlab's nu-SVR (ksvm with type="nu-svr").
# The parameter list must contain 'kernel', 'nu' and 'C'; all remaining
# entries are handed to ksvm as kernel parameters (kpar). C is divided by
# the number of training points before it is passed on.
constructSVRLearner = function() {
  learn.svr = function(data, params) {
    #require(kernlab)
    stopifnot(isRegression(data))
    kpar = params[setdiff(names(params), c("kernel", "nu", "C"))]
    return(ksvm(data$x, data$y, kernel=params$kernel, kpar=kpar, type="nu-svr",
                nu=params$nu, C=params$C / getN(data), scale=FALSE))
  }
  predict.svr = function(model, newData) {
    stopifnot(isRegression(newData))
    return(predict(model, newData$x))
  }
  return(constructLearner(learn.svr, predict.svr))
}

# Build a CVST.learner around kernlab's nu-SVM (ksvm with type="nu-svc").
# The parameter list must contain 'kernel' and 'nu'; all remaining entries
# are handed to ksvm as kernel parameters (kpar).
constructSVMLearner = function() {
  learn.svm = function(data, params) {
    #require(kernlab)
    stopifnot(isClassification(data))
    kpar = params[setdiff(names(params), c("kernel", "nu"))]
    return(ksvm(data$x, data$y, kernel=params$kernel, kpar=kpar, type="nu-svc",
                nu=params$nu, scale=FALSE))
  }
  predict.svm = function(model, newData) {
    stopifnot(isClassification(newData))
    return(predict(model, newData$x))
  }
  return(constructLearner(learn.svm, predict.svm))
}

# Build a CVST.learner for two-class kernel logistic regression.
# The parameter list must contain 'kernel', 'lambda', 'tol' and 'maxiter';
# all remaining entries are used to construct the kernel via do.call.
# lambda is multiplied by the number of training points.
constructKlogRegLearner = function() {
  learn.klogreg = function(data, params) {
    #require(kernlab)
    stopifnot(isClassification(data))
    # convert the factor to numeric 0/1
    if (nlevels(data$y) > 2) {
      stop("klogreg does not support multiclass experiments")
    }
    y = (data$y != levels(data$y)[1]) + 0
    kpar = params[setdiff(names(params), c("kernel", "lambda", "tol", "maxiter"))]
    kernel = do.call(params$kernel, kpar)
    model = .klogreg(data$x, kernel, y, getN(data) * params$lambda, params$tol, params$maxiter)
    # remember the original labels so predictions can be mapped back
    model$yLevels = levels(data$y)
    return(model)
  }
  predict.klogreg = function(model, newData) {
    stopifnot(isClassification(newData))
    pred = .klogreg.predict(model, newData$x)
    f = factor(pred, c("0", "1"), model$yLevels, ordered=FALSE)
    return(f)
  }
  return(constructLearner(learn.klogreg, predict.klogreg))
}

# Build a CVST.learner for kernel ridge regression.
# The parameter list must contain 'kernel' and 'lambda'; all remaining
# entries are used to construct the kernel via do.call. lambda is
# multiplied by the number of training points.
constructKRRLearner = function() {
  learn.krr = function(data, params) {
    #require(kernlab)
    stopifnot(isRegression(data))
    kpar = params[setdiff(names(params), c("kernel", "lambda"))]
    kernel = do.call(params$kernel, kpar)
    return(.krr(data$x, kernel, data$y, getN(data) * params$lambda))
  }
  predict.krr = function(model, newData) {
    stopifnot(isRegression(newData))
    return(as.matrix(.krr.predict(newData$x, model)))
  }
  return(constructLearner(learn.krr, predict.krr))
}

# Fit kernel ridge regression: alpha = (K + lambda * I)^-1 y.
# Returns a list holding the training data, the kernel and alpha.
.krr = function(data, kernel, y, lambda) {
  #require(kernlab)
  #require(Matrix)
  K = kernelMatrix(kernel, data)
  N = nrow(K)
  alpha = solve(Matrix(K + diag(lambda, N))) %*% y
  return(list(data=data, kernel=kernel, alpha=alpha))
}

# Predict with a model returned by .krr: k(newData, trainData) %*% alpha.
.krr.predict = function(newData, krr) {
  #require(kernlab)
  k = kernelMatrix(krr$kernel, newData, krr$data)
  return(k %*% krr$alpha)
}

# Fit kernel logistic regression by iteratively solving a reweighted
# linear system until the coefficients change by at most 'tol', the solve
# fails, NaNs appear, or more than 'maxiter' iterations were performed.
# Returns a list with the training data, the kernel, the coefficients
# alpha and the fitted probabilities pi of the last iteration.
.klogreg = function(data, kernel, labels, lambda, tol, maxiter) {
  # labels should be 0/1
  #require(kernlab)
  #require(Matrix)
  K = Matrix(kernelMatrix(kernel, data)@.Data)
  N = nrow(K)
  alpha = rep(1/N, N)
  iter = 1
  while (TRUE) {
    Kalpha = as.vector(K %*% alpha)
    spec = 1 + exp(-Kalpha)
    pi = 1 / spec
    diagW = pi * (1 - pi)
    e = (labels - pi) / diagW
    q = Kalpha + e
    theSol = try(solve(K + lambda * Diagonal(x=1/diagW), q))
    # inherits() is safe for results whose class vector has length > 1
    if (inherits(theSol, "try-error")) {
      break
    }
    alphan = as.vector(theSol)
    if (any(is.nan(alphan)) || all(abs(alphan - alpha) <= tol)) {
      break
    } else if (iter > maxiter) {
      cat("klogreg:maxiter!")
      break
    } else {
      alpha = alphan
      iter = iter + 1
    }
  }
  return(list(data=data, kernel=kernel, alpha=as.vector(alpha), pi=pi))
}

# Predict 0/1 labels with a model returned by .klogreg; a point is
# labelled 1 if its estimated probability is at least 0.5.
.klogreg.predict = function(klogreg, newData) {
  #require(kernlab)
  K = kernelMult(klogreg$kernel, newData, klogreg$data, klogreg$alpha)
  pi = 1 / (1 + exp(-as.vector(K)))
  return((pi >= .5) + 0)
}

# CVST/R/CV.R0000644000176000001440000002040412133035115011707 0ustar ripleyusers

# Ordinary k-fold cross-validation.
#
# data: the data set as CVST.data
#
# learner: the learner as CVST.learner
#
# params: parameter grid as CVST.params
#
# fold: number of folds (1 <= fold <= number of data points)
#
# verbose: print the mean error of every configuration?
#
# Returns the element of params with the smallest mean error over all
# folds (as a list of length one), or NULL if no error could be computed.
CV = function(data, learner, params, fold=5, verbose=TRUE) {
  stopifnot(inherits(learner, "CVST.learner"),
            inherits(data, "CVST.data"),
            inherits(params, "CVST.params"))
  nParams = length(params)
  nData = getN(data)
  stopifnot(fold >= 1, fold <= nData)
  dimnames = list(as.character(1:fold), names(params))
  results = matrix(0, fold, nParams, dimnames=dimnames)
  # Integer fold boundaries: fold f validates on (bounds[f]+1):bounds[f+1].
  # If nData is a multiple of fold these are the equally sized slices; in
  # all other cases every data point still ends up in exactly one fold.
  bounds = floor(seq(0, nData, length.out=fold + 1) + 0.5)
  for (ind in seq_len(nParams)) {
    p = params[[ind]]
    for (f in 1:fold) {
      validationIndex = (bounds[f] + 1):bounds[f + 1]
      curTrain = getSubset(data, -validationIndex)
      curTest = getSubset(data, validationIndex)
      # either mean squared error or mean classification error
      results[f, ind] = mean(.getResult(curTrain, curTest, learner, p))
    }
    if (verbose) {
      cat(names(params)[ind], "(", mean(results[, ind]), ")\n")
    }
  }
  winner = which.min(apply(results, 2, mean))
  if (length(winner) == 0) {
    return(NULL)
  } else {
    return(params[winner])
  }
}

# the function to perform fast cross-validation:
#
# train: training data CVST.data
#
# learner: the learner as CVST.learner
#
# params: list of parameters for the learner as CVST.params
#
# setup: setup of the CVST as CVST.setup
#
# test: either the test data for fixed test error setting or NULL, if
# the adjusted test error setting should be used
#
# verbose: print progress information?
#
# Returns the winning element of params (as a list of length one).
fastCV = function(train, learner, params, setup, test=NULL, verbose=TRUE) {
  stopifnot(inherits(learner, "CVST.learner"),
            inherits(train, "CVST.data"),
            inherits(params, "CVST.params"),
            inherits(setup, "CVST.setup"),
            is.null(test) || inherits(test, "CVST.data"))
  isClassificationTask = isClassification(train)
  regressionSimilarityViaOutliers = setup$regressionSimilarityViaOutliers
  earlyStopping = setup$earlyStoppingSignificance
  similarity = setup$similaritySignificance
  # we use nested modeling, i.e. we start with the first minimalModel number of
  # data points and in each step subsequently add minimalModel data points to it
  earlyStoppingWindow = setup$earlyStoppingWindow
  if (is.null(test)) {
    # we are in the adjusted test error setting, therefore we have to keep
    # an additional slice of the data for the last test
    minimalModel = getN(train) / (setup$steps + 1)
    n = getN(train) - minimalModel
  } else {
    minimalModel = getN(train) / setup$steps
    n = getN(train)
  }
  N = seq(minimalModel, n, by=minimalModel)
  st = getCVSTTest(setup$steps, setup$beta, setup$alpha)
  nParams = length(params)
  if (verbose) {
    cat("Total number of params:", nParams, "\n")
  }
  dimnames = list(names(params), as.character(N))
  traces = matrix(0, nParams, length(N), dimnames=dimnames)
  success = matrix(0, nParams, length(N), dimnames=dimnames)
  skipCalculation = rep(FALSE, nParams)
  isEarlyStopping = FALSE
  stoppedAt = length(N)
  activeConfigurations = matrix(FALSE, nParams, length(N), dimnames=dimnames)
  configurationsLeft = nParams
  for (ind in seq_along(N)) {
    n = N[ind]
    if (!isClassificationTask && regressionSimilarityViaOutliers) {
      # keep the signed residuals for the outlier test below, but rank the
      # configurations by their mean squared error
      err = .calculateErrors(train, test, n, learner, params, skipCalculation, squared=FALSE)
      success[, ind] = apply(err^2, 1, mean)
    } else {
      err = .calculateErrors(train, test, n, learner, params, skipCalculation)
      success[, ind] = apply(err, 1, mean)
    }
    indByError = sort.list(success[, ind], decreasing=FALSE, na.last=TRUE)
    traces[indByError[1], ind] = 1
    # drop=FALSE keeps the matrix shape if there is only one configuration
    sortedErrors = t(err[indByError, , drop=FALSE])
    if (!isClassificationTask && regressionSimilarityViaOutliers) {
      # turn residuals into 0/1 outlier indicators per configuration
      s = apply(sortedErrors, 2, sd)
      sortedErrors = t(abs(t(sortedErrors)) > s * qnorm(1 - (similarity / 2)))
    }
    adjustedSignificance = similarity / (configurationsLeft - 1)
    # grow the set of top configurations until it differs significantly
    for (k in seq_along(indByError)[-1]) {
      if (is.na(success[indByError[k], ind])) {
        # we either have an insufficient model, which gives us NA as result...
        # ... or reached the skipCalculation, so we can stop our procedure
        break
      }
      if (isClassificationTask) {
        pvalue = cochranq.test(sortedErrors[, 1:k])$p.value
      } else {
        if (regressionSimilarityViaOutliers) {
          pvalue = cochranq.test(sortedErrors[, 1:k])$p.value
        } else {
          pvalue = friedman.test(sortedErrors[, 1:k])$p.value
        }
      }
      # is.na() also covers NaN; an undefined p-value counts as "similar"
      if (!is.na(pvalue) && pvalue <= adjustedSignificance) {
        break
      }
      traces[indByError[k], ind] = 1
    }
    if (verbose) {
      cat("(sim:", sum(traces[, ind]), "alpha:", similarity, "left:", configurationsLeft, ")")
    }
    # do the testing here: check for losers
    if (ind > 1) {
      testResults = apply(traces[, 1:ind, drop=FALSE], 1, testSequence, st=st)
      skipCalculation = (testResults == -1)
      if (verbose) {
        cat("Skipped configurations:", sum(skipCalculation), " ")
      }
    }
    configurationsLeft = nParams - sum(skipCalculation)
    activeConfigurations[, ind] = !skipCalculation
    # check for early stopping
    if (earlyStoppingWindow >= 2 && ind > earlyStoppingWindow && earlyStopping < 1.0) {
      # check, whether all remaining parameters perform similar
      if (sum(!skipCalculation) > 1) {
        pvalue = cochranq.test(t(traces[!skipCalculation, (ind-earlyStoppingWindow+1):ind]))$p.value
      } else {
        pvalue = 1.0
      }
      if (!is.na(pvalue) && pvalue > earlyStopping) {
        if (verbose) {
          cat("EARLY STOPPING!")
        }
        isEarlyStopping = TRUE
        stoppedAt = ind
        break
      }
      # just go on, if they are significantly dissimilar!
    }
  }
  if (verbose) {
    cat("\n")
  }
  theWinners = !skipCalculation
  ret = list(traces=traces, success=success)
  ret$numberOfPotentialWinners = sum(theWinners)
  ret$isEarlyStopping = isEarlyStopping
  ret$stoppedAt = stoppedAt
  ret$activeConfigurations = activeConfigurations
  ret$earlyStoppingWindow = earlyStoppingWindow
  winningConfiguration = .getOptimalSolution(ret)
  ret$param = params[winningConfiguration]
  ret$winningConfiguration = winningConfiguration
  return(params[winningConfiguration])
}

# returns a (# configuration) X (# testsamples) matrix containing 0/1 or squared error at
# position i, j if the model learned on N data points of traindata
# with configuration i labeled point j of the testdata
# correctly or not. skipCalculation controls, which configuration should be
# skipped. A NA in the returned matrix corresponds to skipped configuration.
# With squared=FALSE the signed residual is returned for regression data.
.calculateErrors = function(traindata, testdata, N, learner, params, skipCalculation, squared=TRUE) {
  nestModel = TRUE
  nPars = length(params)
  if (nestModel) {
    # N may be fractional (see fastCV); 1:N then stops at floor(N)
    sampleIndex = 1:N
  } else {
    sampleIndex = sample.int(getN(traindata), N)
  }
  # if no test data is available, we have the adjusted test error settings,
  # i.e. we use the rest of the train data, which is not used for model building
  # to determine the test error
  if (is.null(testdata)) {
    testdata = getSubset(traindata, -sampleIndex)
  }
  # initialize results
  results = matrix(NA, nPars, getN(testdata))
  # calculate results
  curTrain = getSubset(traindata, sampleIndex)
  for (ind in seq_len(nPars)) {
    param = params[[ind]]
    if (!is.null(skipCalculation) && skipCalculation[ind]) {
      next
    }
    results[ind, ] = as.vector(.getResult(curTrain, testdata, learner, param, squared=squared))
  }
  return(results)
}

# Select the winning configuration of a finished race.
# paramRace is the list assembled at the end of fastCV. Returns a logical
# vector with one entry per configuration and TRUE for the winner.
.getOptimalSolution = function(paramRace) {
  remainingConfs = paramRace$activeConfigurations[, paramRace$stoppedAt]
  if (sum(remainingConfs) == 1) {
    return(remainingConfs)
  }
  # pick the race, which has the smallest mean rank inside
  # the earlyStoppingWindow (clamped to the steps actually available):
  windowStart = max(1, paramRace$stoppedAt - paramRace$earlyStoppingWindow + 1)
  lastSuccess = paramRace$success[remainingConfs, windowStart:paramRace$stoppedAt, drop=FALSE]
  meanRank = apply(apply(lastSuccess, 2, rank), 1, mean)
  # breaks ties at random
  overallWinner = which(remainingConfs)[.which.is.min(meanRank)]
  ret = rep(FALSE, nrow(paramRace$traces))
  names(ret) = rownames(paramRace$traces)
  ret[overallWinner] = TRUE
  return(ret)
}

# Index of the minimum of x; ties are broken at random.
.which.is.min = function (x) {
  y = seq_along(x)[x == min(x)]
  if (length(y) > 1) {
    y = sample(y, 1)
  }
  return(y)
}

# Construct the sequential test used by fastCV for the given number of
# steps: H0 is pi = 0.5, H1 is pi1 = 0.5 * ((1 - beta) / alpha)^(1/steps).
getCVSTTest = function(steps, beta=.1, alpha=.01) {
  pi1 = .5 * ((1 - beta) / alpha)^(1/steps)
  sst = constructSequentialTest(.5, pi1, beta, alpha)
  sst$steps = steps
  return(sst)
}

# CVST/R/util.R0000644000176000001440000001715312133035115012363 0ustar ripleyusers

# data is a list
# x: either a list or a matrix containing the data rowwise
# y: vector of labels/values
constructData = function(x, y) {
  stopifnot(is.list(x) || is.vector(x) || is.matrix(x))
  stopifnot(is.list(y) || is.vector(y) || is.factor(y))
  data = list(x=x, y=y)
  class(data) = "CVST.data"
  return(data)
}

# Number of data points: length of x for lists/vectors, rows otherwise.
getN = function(data) {
  stopifnot(inherits(data, "CVST.data"))
  if (is.list(data$x) || is.vector(data$x)) {
    N = length(data$x)
  } else {
    N = nrow(data$x)
  }
  return(N)
}

# Return the data set with its points in random order.
shuffleData = function(data) {
  stopifnot(inherits(data, "CVST.data"))
  shuffle = sample.int(getN(data))
  return(getSubset(data, shuffle))
}

# Return the points selected by the index set 'subset' as CVST.data.
getSubset = function(data, subset) {
  stopifnot(inherits(data, "CVST.data"))
  x = getX(data, subset)
  y = data$y[subset]
  ret = constructData(x=x, y=y)
  return(ret)
}

# Return the feature part of the data, optionally restricted to 'subset'.
# Matrix data is subset by rows and always stays a matrix.
getX = function(data, subset=NULL) {
  stopifnot(inherits(data, "CVST.data"))
  if (is.null(subset)) {
    ret = data$x
  } else {
    if (is.list(data$x) || is.vector(data$x)) {
      ret = data$x[subset]
    } else {
      ret = data$x[subset, ,drop=FALSE]
    }
  }
  return(ret)
}

# A data set is a classification task if its y is a factor.
isClassification = function(data) {
  stopifnot(inherits(data, "CVST.data"))
  return(is.factor(data$y))
}

# A data set is a regression task if it is not a classification task.
isRegression = function(data) {
  stopifnot(inherits(data, "CVST.data"))
  return(!isClassification(data))
}

# Bundle a learn(data, params) and a predict(model, newData) function
# into a CVST.learner object.
constructLearner = function(learn, predict) {
  stopifnot(is.function(learn) && is.function(predict))
  learner = list(learn=learn, predict=predict)
  class(learner) = "CVST.learner"
  return(learner)
}

# Collect the settings of the CVST procedure in a CVST.setup object.
constructCVSTModel = function(steps=10, beta=.1, alpha=.01, similaritySignificance=.05,
                              earlyStoppingSignificance=.05, earlyStoppingWindow=3,
                              regressionSimilarityViaOutliers=FALSE) {
  ret = list(steps=steps,
             beta=beta,
             alpha=alpha,
             similaritySignificance=similaritySignificance,
             earlyStoppingSignificance=earlyStoppingSignificance,
             earlyStoppingWindow=earlyStoppingWindow,
             regressionSimilarityViaOutliers=regressionSimilarityViaOutliers)
  class(ret) = "CVST.setup"
  return(ret)
}

# Expand the named arguments into the full parameter grid (expand.grid).
# Returns a CVST.params object: a list with one named parameter list per
# grid point, named "name1=value1 name2=value2 ...".
constructParams = function(...) {
  pn = names(substitute(c(...)))[-1]
  ret = expand.grid(..., stringsAsFactors=FALSE, KEEP.OUT.ATTRS = FALSE)
  # drop=FALSE keeps the row a data frame, so a grid with a single
  # parameter still yields a named list
  params = lapply(seq_len(nrow(ret)), function(ind) as.list(ret[ind, , drop=FALSE]))
  paramNames = lapply(seq_len(nrow(ret)),
                      function(ind) paste(pn, ret[ind, , drop=FALSE], sep="=", collapse=" "))
  names(params) = paramNames
  class(params) = "CVST.params"
  return(params)
}

# Learn a model on 'train' with the parameters 'param' and evaluate it on
# 'test'. Returns one value per test point: TRUE/FALSE for
# misclassification, or the squared (squared=TRUE) or signed
# (squared=FALSE) residual for regression. If learning or prediction
# throws an error, all predictions are NA.
.getResult = function(train, test, learner, param, squared=TRUE) {
  stopifnot(inherits(learner, "CVST.learner"),
            inherits(train, "CVST.data"),
            inherits(test, "CVST.data"))
  model = try(learner$learn(train, param))
  # inherits() instead of class(.) == "try-error": models and predictions
  # may carry several classes (e.g. a matrix is c("matrix", "array"))
  if (inherits(model, "try-error")) {
    pred = rep(NA, length(test$y))
  } else {
    pred = try(learner$predict(model, test))
    if (inherits(pred, "try-error")) {
      pred = rep(NA, length(test$y))
    }
  }
  if (isClassification(test)) {
    res = (test$y != pred)
  } else {
    if (squared) {
      res = (pred - test$y)^2
    } else {
      res = (pred - test$y)
    }
  }
  return(res)
}

# Cochran's Q test on a 0/1 matrix. Degenerate input (vector, at most one
# row/column, at most one informative row) yields p.value = 1; small
# problems are delegated to the permutation version.
# Returns an object of class "htest".
cochranq.test = function(mat) {
  cochransQtest = list(statistic = 0, parameter = 0, p.value = 1,
                       method = "Cochran's Q Test",
                       data.name = deparse(substitute(mat)))
  class(cochransQtest) = "htest"
  if (is.vector(mat) || any(dim(mat) <= 1)) {
    return(cochransQtest)
  }
  # we expect the individuals in the rows, repetitions/treatments in the columns
  m = ncol(mat)
  df = m - 1
  L = apply(mat, 1, sum)
  index = (L > 0 & L < m)
  if (sum(index) <= 1) {
    # all rows are either one or zero... no effect!
    return(cochransQtest)
  }
  if (sum(index) * m <= 24) {
    return(.perm.cochranq.test(mat[index, ]))
  }
  L = L[index]
  colTotals = apply(mat[index, ], 2, sum)
  Q = ((m-1) * (m * sum(colTotals^2) - sum(colTotals)^2)) / (m * sum(L) - sum(L^2))
  names(df) = "df"
  names(Q) = "Cochran's Q"
  if (is.nan(Q)) {
    p.val = 1.0
  } else {
    p.val = pchisq(Q, df, lower.tail=FALSE)
  }
  cochransQtest$statistic = Q
  cochransQtest$parameter = df
  cochransQtest$p.value = p.val
  return(cochransQtest)
}

# Monte-Carlo version of Cochran's Q test: the p-value is the fraction of
# 'nperm' random within-row permutations whose Q is at least as large as
# the observed one. Returns an object of class "htest".
.perm.cochranq.test = function(mat, nperm=1000) {
  if (is.vector(mat) || any(dim(mat) <= 1)) {
    cochransQtest = list(statistic = 0, parameter = 0, p.value = 1,
                         method = "Cochran's Q Test",
                         data.name = deparse(substitute(mat)))
    class(cochransQtest) = "htest"
    return(cochransQtest)
  }
  # we expect no straight zero or one-rows in mat
  m = ncol(mat)
  df = m - 1
  L = apply(mat, 1, sum)
  colTotals = apply(mat, 2, sum)
  quot = (m * sum(L) - sum(L^2))
  Q = ((m-1) * (m * sum(colTotals^2) - sum(colTotals)^2)) / quot
  names(df) = "df"
  names(Q) = "Cochran's Q"
  permFun = function() {
    newPerm = mat
    for (i in seq_len(nrow(mat))) {
      newPerm[i, ] = mat[i, sample(m)]
    }
    permTotals = apply(newPerm, 2, sum)
    Q = ((m-1) * (m * sum(permTotals^2) - sum(permTotals)^2)) / quot
    return(Q)
  }
  QS = replicate(nperm, permFun())
  p.value = mean(QS >= Q)
  cochransQtest = list(statistic = Q, parameter = df, p.value = p.value,
                       method = "Cochran's Q Test (monte-carlo)",
                       data.name = deparse(substitute(mat)))
  class(cochransQtest) = "htest"
  return(cochransQtest)
}

# Construct a sequential test for a sequence of 0/1 values.
# The returned CVST.sequentialTest holds the intercepts a1 and a0 and the
# common slope b of the two decision lines used by testSequence.
constructSequentialTest = function(piH0=.5, piH1=.9, beta, alpha) {
  a1 = log((1 - beta) / alpha) / (log(piH1 / piH0) + log((1 - piH0) / (1 - piH1)))
  a0 = -log(beta / (1 - alpha)) / (log(piH1 / piH0) + log((1 - piH0) / (1 - piH1)))
  b = log((1 - piH0) / (1 - piH1)) / (log(piH1 / piH0) + log((1 - piH0) / (1 - piH1)))
  ret = list(a1=a1, a0=a0, b=b, piH0=piH0, piH1=piH1, alpha=alpha, beta=beta)
  class(ret) = "CVST.sequentialTest"
  return(ret)
}

# Plot the cumulative sum of the 0/1 sequence s together with the two
# decision lines of the sequential test st.
plotSequence = function(st, s) {
  y = cumsum(s)
  if (!is.null(st$steps)) {
    plot(y, xlim=c(1, st$steps), ylim=c(1, st$steps))
  } else {
    plot(y)
  }
  abline(a=st$a1, b=st$b, col="red")
  abline(a=-st$a0, b=st$b, col="red", lty=2)
  abline(h=0)
  abline(a=0, b=1)
  title(sprintf("one-sided H0:%0.2f; H1:%0.2f", st$piH0, st$piH1))
}

# Apply the sequential test st to the 0/1 sequence s. Returns 1 if the
# cumulative sum reached the upper decision line, -1 if it reached the
# lower one, and 0 otherwise.
testSequence = function(st, s) {
  stopifnot(inherits(st, "CVST.sequentialTest"))
  n = length(s)
  y = cumsum(s)
  ret = 0
  if (y[n] >= st$b * n + st$a1) {
    ret = 1
  } else if (y[n] <= st$b * n - st$a0) {
    ret = -1
  }
  return(ret)
}

# Regression toy data based on the sinc function. If n has more than one
# element it is used as the x values, otherwise n points are drawn
# uniformly from [-pi, pi].
noisySinc = function(n, dim=2, sigma=0.1) {
  if (length(n) > 1) {
    x = n
  } else {
    x = runif(n, -pi, pi)
  }
  sinc = function(d) sin(d) / (d)
  y = sinc(4 * x) + 0.2 * sin(15 * x * dim) + sigma*rnorm(n)
  # sinc is 0/0 at the origin; use its limit value there
  y[is.nan(y)] = 1
  return(constructData(x=as.matrix(x), y=y))
}

# Classification toy data: the label is the sign of a noisy sine wave
# evaluated at n points drawn uniformly from [0, 2 * pi * dim].
noisySine = function(n, dim=5, sigma=.25) {
  x = runif(n, 0, 2 * pi * dim)
  y = sin(x)
  if (!is.null(sigma) && sigma > 0) {
    y = y + rnorm(n, sd=sigma)
  }
  label = factor(y == abs(y))
  return(constructData(x=as.matrix(x), y=label))
}

# Regression toy data: 'fun' (one of blocks, bumps, heavisine, doppler)
# evaluated at n uniform points in [0, 1] plus Gaussian noise.
noisyDonoho = function(n, fun=doppler, sigma=1) {
  x = matrix(runif(n, 0, 1), n, 1)
  y = as.vector(fun(x)) + rnorm(n, sd=sigma)
  return(constructData(x=x, y=y))
}

# Piecewise constant test function: a sum of steps at positions t with
# heights h, multiplied by 'scale'.
blocks = function(x, scale=3.656993) {
  t = c(0.1, 0.13, 0.15, 0.23, 0.25, 0.40, 0.44, 0.65, 0.76, 0.78, 0.81)
  h = c(4, -5, 3, -4, 5, -4.2, 2.1, 4.3, -3.1, 2.1, -4.2)
  ret = t(sapply(x, function(xx) (1 + sign(xx - t)) / 2)) %*% h
  ret = ret * scale
  return(ret)
}

# Test function made of bumps at positions t with heights h and widths w,
# multiplied by 'scale'.
bumps = function(x, scale=10.52884) {
  t = c(0.1, 0.13, 0.15, 0.23, 0.25, 0.40, 0.44, 0.65, 0.76, 0.78, 0.81)
  h = c(4, 5, 3, 4, 5, 4.2, 2.1, 4.3, 3.1, 5.1, 4.2)
  w = c(0.005, 0.005, 0.006, 0.01, 0.01, 0.03, 0.01, 0.01, 0.005, 0.008, 0.005)
  ret = t(sapply(x, function(xx) (1 + abs((xx - t) / w))^-4 )) %*% h
  ret = ret * scale
  return(ret)
}

# Sine wave with two jumps (at x = 0.3 and x = 0.72), multiplied by 'scale'.
heavisine = function(x, scale=2.356934) {
  ret = 4 * sin(4 * pi * x) - sign(x - 0.3) - sign(0.72 - x)
  ret = ret * scale
  return(ret)
}

# Doppler test function, multiplied by 'scale'.
doppler = function(x, scale=24.22172) {
  ret = sqrt(x * (1 - x)) * sin((2.1 * pi) / (x + 0.05))
  ret = ret * scale
  return(ret)
}

# CVST/README.md0000644000176000001440000000030412210445232012330 0ustar ripleyusersCVST ==== Fast
Cross-Validation via Sequential Testing The package CVST is hosted on CRAN, so install.packages("CVST") library(CVST) example(CVST) will give you a first impression.CVST/MD50000644000176000001440000000154012251616014011367 0ustar ripleyusers635d31fd315c55401a3ce30a4f096481 *DESCRIPTION 8b54e5a89fbda3af5e077053d40bec76 *NAMESPACE 209666bfdd1d76a79ee11af8fa919a61 *R/CV.R 807866a5ca52a90f7ce267e28e41704b *R/methods.R cf33cdc93c9b2aa2929369a658eab977 *R/util.R d397398aac7e3352aa7e1ed6234dc491 *README.md 889feb1c2dba681c0b85199ec33693b9 *man/CV.Rd ebb263537ff83916c7c7ffd97b89e97d *man/CVST-package.Rd 22f0b3ced43046e9c6c832213c612a99 *man/cochranq.test.Rd 2c840dfb7798526a5726f93bbc8e5d50 *man/constructCVSTModel.Rd 101d0d1145f69a67cc133b224ac3a5f0 *man/constructData.Rd bc6a2af0e2ba849b839fd41a26768399 *man/constructLearner.Rd 53cdd42ecc89dd8f52f45af2367fcc78 *man/constructParams.Rd 1a7e9e2bc332e7bae0a611cfe5797269 *man/constructSequentialTest.Rd 5f8fb944813989e373ff9c9f24513cc7 *man/fastCV.Rd d4befef987e30fa1eaa09def9e5dc3f1 *man/noisyDonoho.Rd 3bfc25412298e3b10174bbf080dc2573 *man/noisySine.Rd CVST/DESCRIPTION0000644000176000001440000000234112251616014012565 0ustar ripleyusersPackage: CVST Type: Package Title: Fast Cross-Validation via Sequential Testing Version: 0.2-1 Date: 2013-12-10 Depends: kernlab,Matrix Author: Tammo Krueger, Mikio Braun Maintainer: Tammo Krueger Description: This package implements the fast cross-validation via sequential testing (CVST) procedure. CVST is an improved cross-validation procedure which uses non-parametric testing coupled with sequential analysis to determine the best parameter set on linearly increasing subsets of the data. By eliminating underperforming candidates quickly and keeping promising candidates as long as possible, the method speeds up the computation while preserving the capability of a full cross-validation. 
In addition to CVST, the package contains an implementation of the ordinary k-fold cross-validation with a flexible and powerful set of helper objects and methods to handle the overall model selection process. The implementations of the Cochran's Q test with permutations and the sequential testing framework of Wald are generic and can therefore also be used in other contexts. License: GPL (>= 2.0) Packaged: 2013-12-10 13:06:36 UTC; tammok NeedsCompilation: no Repository: CRAN Date/Publication: 2013-12-10 14:50:04 CVST/man/0000755000176000001440000000000012133340376011636 5ustar ripleyusersCVST/man/CVST-package.Rd0000644000176000001440000000434412133340376014302 0ustar ripleyusers\name{CVST-package} \alias{CVST-package} \alias{CVST} \docType{package} \title{ Fast Cross-Validation via Sequential Testing } \description{ This package implements the fast cross-validation via sequential testing (CVST) procedure. CVST is an improved cross-validation procedure which uses non-parametric testing coupled with sequential analysis to determine the best parameter set on linearly increasing subsets of the data. By eliminating underperforming candidates quickly and keeping promising candidates as long as possible, the method speeds up the computation while preserving the capability of a full cross-validation. In addition to CVST, the package contains an implementation of the ordinary k-fold cross-validation with a flexible and powerful set of helper objects and methods to handle the overall model selection process. The implementations of the Cochran's Q test with permutations and the sequential testing framework of Wald are generic and can therefore also be used in other contexts. } \details{ \tabular{ll}{ Package: \tab CVST\cr Type: \tab Package\cr Version: \tab 0.2\cr Date: \tab 2013-03-25\cr License: \tab GPL (>=2.0)\cr } } \author{Tammo Krueger, Mikio Braun Maintainer: Tammo Krueger } \references{ Tammo Krueger, Danny Panknin, and Mikio Braun.
Fast cross-validation via sequential analysis. \emph{Neural Information Processing Systems (NIPS), Big Learning Workshop}, 2011. URL \url{http://biglearn.org/2011/index.php/Papers\#paper2}. Tammo Krueger, Danny Panknin, and Mikio Braun. Fast cross-validation via sequential testing. \emph{CoRR}, abs/1206.2248, 2012. URL \url{http://arxiv.org/abs/1206.2248}. Abraham Wald. \emph{Sequential Analysis}. Wiley, 1947. W. G. Cochran. The comparison of percentages in matched samples. \emph{Biometrika}, 37 (3-4):256--266, 1950. M. Friedman. The use of ranks to avoid the assumption of normality implicit in the analysis of variance. \emph{Journal of the American Statistical Association}, 32 (200):675--701, 1937. } \keyword{ package } \examples{ ns = noisySine(100) svm = constructSVMLearner() params = constructParams(kernel="rbfdot", sigma=10^(-3:3), nu=c(0.05, 0.1, 0.2, 0.3)) opt = fastCV(ns, svm, params, constructCVSTModel()) } CVST/man/constructLearner.Rd0000644000176000001440000000656612133340376015477 0ustar ripleyusers\name{constructLearner} \alias{constructLearner} \alias{constructKlogRegLearner} \alias{constructKRRLearner} \alias{constructSVMLearner} \alias{constructSVRLearner} %- Also NEED an '\alias' for EACH other topic documented here. \title{ Construction of Specific Learners for CVST } \description{ These methods construct a \code{CVST.learner} object suitable for the CVST method. These objects provide the common interface needed for the \code{\link{CV}} and \code{\link{fastCV}} methods. We provide kernel logistic regression, kernel ridge regression, support vector machines and support vector regression as fully functional implementation templates. } \usage{ constructLearner(learn, predict) constructKlogRegLearner() constructKRRLearner() constructSVMLearner() constructSVRLearner() } %- maybe also 'usage' for other objects documented here. \arguments{ \item{learn}{ The learning method, which takes a \code{CVST.data} object and a list of parameters and returns a model.
} \item{predict}{ The prediction method which takes a model and \code{CVST.data} and returns the corresponding predictions. } } \details{ The nu-SVM and nu-SVR are built on top of the corresponding implementations of the \code{kernlab} package (see reference). In the list of parameters these implementations expect an entry named \code{kernel}, which gives the name of the kernel that should be used, an entry named \code{nu} specifying the nu parameter, and an entry named \code{C} giving the C parameter for the nu-SVR. The KRR and KLR also expect \code{kernel} and necessary other parameters to construct the kernel. Both methods expect a lambda parameter and KLR additionally a tol and maxiter parameter in the parameter list. Note that the lambda of KRR/KLR and the C parameter of SVR are scaled by the data set size to allow for comparable results in the fast CV loop. } \value{ Returns a learner of type \code{CVST.learner} suitable for \code{\link{CV}} and \code{\link{fastCV}}. } \references{ Alexandros Karatzoglou, Alexandros Smola, Kurt Hornik, Achim Zeileis. kernlab - An S4 Package for Kernel Methods in R \emph{Journal of Statistical Software} Vol. 11, Issue 9, Nov 2004. URL: \url{http://www.jstatsoft.org/v11/i09}. Volker Roth. Probabilistic discriminative kernel classifiers for multi-class problems. In \emph{Proceedings of the 23rd DAGM-Symposium on Pattern Recognition}, pages 246--253, 2001.
} \author{ Tammo Krueger } \seealso{ \code{\link{CV}} \code{\link{fastCV}} } \examples{ # SVM ns = noisySine(100) svm = constructSVMLearner() p = list(kernel="rbfdot", sigma=100, nu=.1) m = svm$learn(ns, p) nsTest = noisySine(1000) pred = svm$predict(m, nsTest) sum(pred != nsTest$y) / getN(nsTest) # Kernel logistic regression klr = constructKlogRegLearner() p = list(kernel="rbfdot", sigma=100, lambda=.1/getN(ns), tol=10e-6, maxiter=100) m = klr$learn(ns, p) pred = klr$predict(m, nsTest) sum(pred != nsTest$y) / getN(nsTest) # SVR ns = noisySinc(100) svr = constructSVRLearner() p = list(kernel="rbfdot", sigma=100, nu=.1, C=1*getN(ns)) m = svr$learn(ns, p) nsTest = noisySinc(1000) pred = svr$predict(m, nsTest) sum((pred - nsTest$y)^2) / getN(nsTest) # Kernel ridge regression krr = constructKRRLearner() p = list(kernel="rbfdot", sigma=100, lambda=.1/getN(ns)) m = krr$learn(ns, p) pred = krr$predict(m, nsTest) sum((pred - nsTest$y)^2) / getN(nsTest) } CVST/man/constructSequentialTest.Rd0000644000176000001440000000425012133340376017045 0ustar ripleyusers\name{constructSequentialTest} \alias{constructSequentialTest} \alias{getCVSTTest} \alias{testSequence} \alias{plotSequence} %- Also NEED an '\alias' for EACH other topic documented here. \title{ Construct and Handle Sequential Tests. } \description{ These functions handle the construction and calculation with sequential tests as introduced by Wald (1947). \code{getCVSTTest} constructs a special sequential test as introduced in Krueger (2011). \code{testSequence} tests whether a sequence of 0/1 values is distributed according to H0 or H1. } \usage{ constructSequentialTest(piH0 = 0.5, piH1 = 0.9, beta, alpha) getCVSTTest(steps, beta = 0.1, alpha = 0.01) testSequence(st, s) plotSequence(st, s) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{piH0}{ Probability of the binomial distribution for H0. } \item{piH1}{ Probability of the binomial distribution for H1. } \item{beta}{ Significance level for H0.
} \item{alpha}{ Significance level for H1. } \item{steps}{ Number of steps the CVST procedure should be executed. } \item{st}{ A sequential test of type \code{CVST.sequentialTest}. } \item{s}{ A sequence of 0/1 values. } } \value{ \code{constructSequentialTest} and \code{getCVSTTest} return a \code{CVST.sequentialTest} with the specified properties. \code{testSequence} returns 1 if H1 can be accepted, -1 if H0 can be accepted, and 0 if the test needs more data for a decision. \code{plotSequence} gives a graphical impression of this testing procedure. } \references{ Abraham Wald. \emph{Sequential Analysis}. Wiley, 1947. Tammo Krueger, Danny Panknin, and Mikio Braun. Fast cross-validation via sequential analysis. \emph{Neural Information Processing Systems (NIPS), Big Learning Workshop}, 2011. URL \url{http://biglearn.org/2011/index.php/Papers\#paper2}. Tammo Krueger, Danny Panknin, and Mikio Braun. Fast cross-validation via sequential testing. \emph{CoRR}, abs/1206.2248, 2012. URL \url{http://arxiv.org/abs/1206.2248}. } \author{ Tammo Krueger } \seealso{ \code{\link{fastCV}} } \examples{ st = getCVSTTest(10) s = rbinom(10,1, .5) plotSequence(st, s) testSequence(st, s) } CVST/man/fastCV.Rd0000644000176000001440000000434612133340376013322 0ustar ripleyusers\name{fastCV} \alias{fastCV} %- Also NEED an '\alias' for EACH other topic documented here. \title{ The Fast Cross-Validation via Sequential Testing (CVST) Procedure } \description{ CVST is an improved cross-validation procedure which uses non-parametric testing coupled with sequential analysis to determine the best parameter set on linearly increasing subsets of the data. By eliminating underperforming candidates quickly and keeping promising candidates as long as possible, the method speeds up the computation while preserving the capability of a full cross-validation. } \usage{ fastCV(train, learner, params, setup, test = NULL, verbose = TRUE) } %- maybe also 'usage' for other objects documented here.
\arguments{ \item{train}{ The data set as \code{CVST.data} object. } \item{learner}{ The learner as \code{CVST.learner} object. } \item{params}{ the parameter grid as \code{CVST.params} object. } \item{setup}{ A \code{CVST.setup} object containing the necessary parameters for the CVST procedure. } \item{test}{ An independent test set that should be used at each step. If \code{NULL} then the remaining data after learning a model at each step is used instead. } \item{verbose}{ Should the procedure report the performance after each step? } } \value{ Returns the optimal parameter settings as determined by fast cross-validation via sequential testing. } \references{ Tammo Krueger, Danny Panknin, and Mikio Braun. Fast cross-validation via sequential analysis. \emph{Neural Information Processing Systems (NIPS), Big Learning Workshop}, 2011. URL \url{http://biglearn.org/2011/index.php/Papers\#paper2}. Tammo Krueger, Danny Panknin, and Mikio Braun. Fast cross-validation via sequential testing. \emph{CoRR}, abs/1206.2248, 2012. URL \url{http://arxiv.org/abs/1206.2248}. } \author{ Tammo Krueger } \seealso{ \code{\link{CV}} \code{\link{constructCVSTModel}} \code{\link{constructData}} \code{\link{constructLearner}} \code{\link{constructParams}} } \examples{ ns = noisySine(100) svm = constructSVMLearner() params = constructParams(kernel="rbfdot", sigma=10^(-3:3), nu=c(0.05, 0.1, 0.2, 0.3)) opt = fastCV(ns, svm, params, constructCVSTModel()) } CVST/man/constructData.Rd0000644000176000001440000000302312133340376014741 0ustar ripleyusers\name{constructData} \alias{constructData} \alias{getN} \alias{getSubset} \alias{getX} \alias{shuffleData} \alias{isClassification} \alias{isRegression} %- Also NEED an '\alias' for EACH other topic documented here. \title{ Construction and Handling of \code{CVST.data} Objects } \description{ The CVST methods need a structured interface to both regression and classification data sets.
These helper methods allow the construction and consistent handling of these types of data sets. } \usage{ constructData(x, y) getN(data) getSubset(data, subset) getX(data, subset = NULL) shuffleData(data) isClassification(data) isRegression(data) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{x}{ The feature data as vector or matrix. } \item{y}{ The observed values (regressands/labels) as list, vector or factor. } \item{data}{ A \code{CVST.data} object generated via \code{constructData}. } \item{subset}{ An index set. } } \value{ \code{constructData} returns a \code{CVST.data} object. \code{getN} returns the number of data points in the data set. \code{getSubset} returns a subset of the data as a \code{CVST.data} object, while \code{getX} just returns the feature data. \code{shuffleData} returns a randomly shuffled instance of the data. } \author{ Tammo Krueger } \examples{ nsine = noisySine(10) isClassification(nsine) isRegression(nsine) getN(nsine) getX(nsine) nsineShuffeled = shuffleData(nsine) getX(nsineShuffeled) getSubset(nsineShuffeled, 1:3) } CVST/man/CV.Rd0000644000176000001440000000301012133340376012427 0ustar ripleyusers\name{CV} \alias{CV} %- Also NEED an '\alias' for EACH other topic documented here. \title{ Perform a k-fold Cross-validation } \description{ Performs the usual k-fold cross-validation procedure on a given data set, parameter grid and learner. } \usage{ CV(data, learner, params, fold = 5, verbose = TRUE) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{data}{ The data set as \code{CVST.data} object. } \item{learner}{ The learner as \code{CVST.learner} object. } \item{params}{ The parameter grid as \code{CVST.params} object. } \item{fold}{ The number of folds that should be generated for each set of parameters. } \item{verbose}{ Should the procedure report the performance for each model? } } \value{ Returns the optimal parameter settings as determined by k-fold cross-validation.
} \references{ M. Stone. Cross-validatory choice and assessment of statistical predictions. \emph{Journal of the Royal Statistical Society. Series B}, 36(2):111--147, 1974. Sylvain Arlot, Alain Celisse, and Paul Painleve. A survey of cross-validation procedures for model selection. \emph{Statistics Surveys}, 4:40--79, 2010. } \author{ Tammo Krueger } \seealso{ \code{\link{fastCV}} \code{\link{constructData}} \code{\link{constructLearner}} \code{\link{constructParams}} } \examples{ ns = noisySine(100) svm = constructSVMLearner() params = constructParams(kernel="rbfdot", sigma=10^(-3:3), nu=c(0.05, 0.1, 0.2, 0.3)) opt = CV(ns, svm, params) } CVST/man/noisySine.Rd0000644000176000001440000000262212133340376014107 0ustar ripleyusers\name{noisySine} \alias{noisySine} \alias{noisySinc} %- Also NEED an '\alias' for EACH other topic documented here. \title{ Regression and Classification Toy Data Set } \description{ Regression and Classification Toy Data Set based on the sine and sinc function. } \usage{ noisySine(n, dim = 5, sigma = 0.25) noisySinc(n, dim = 2, sigma = 0.1) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{n}{ Number of data points that should be generated. } \item{dim}{ Intrinsic dimensionality of the data set (see references for details). } \item{sigma}{ Standard deviation of the noise component. } } \value{ Returns a data set of type CVST.data } \references{ Tammo Krueger, Danny Panknin, and Mikio Braun. Fast cross-validation via sequential analysis. \emph{Neural Information Processing Systems (NIPS), Big Learning Workshop}, 2011. URL \url{http://biglearn.org/2011/index.php/Papers\#paper2}. Tammo Krueger, Danny Panknin, and Mikio Braun. Fast cross-validation via sequential testing. \emph{CoRR}, abs/1206.2248, 2012. URL \url{http://arxiv.org/abs/1206.2248}. 
} \author{ Tammo Krueger } \seealso{ \code{\link{constructData}} } \examples{ nsine = noisySine(1000) plot(nsine, col=nsine$y) nsinc = noisySinc(1000) plot(nsinc) } % Add one or more standard keywords, see file 'KEYWORDS' in the % R documentation directory. \keyword{datasets} CVST/man/constructParams.Rd0000644000176000001440000000156412133340376015323 0ustar ripleyusers\name{constructParams} \alias{constructParams} %- Also NEED an '\alias' for EACH other topic documented here. \title{ Construct a Grid of Parameters } \description{ This is a helper function which, given a named list of parameter choices, expands the complete grid and returns a \code{CVST.params} object suitable for \code{\link{CV}} and \code{\link{fastCV}}. } \usage{ constructParams(...) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{\dots}{ The parameters that should be expanded. } } \value{ Returns a \code{CVST.params} object, which is basically a named list of possible parameter values. } \author{ Tammo Krueger } \seealso{ \code{\link{fastCV}} } \examples{ params = constructParams(kernel="rbfdot", sigma=10^(-1:5), nu=c(0.1, 0.2)) # the expanded grid contains 14 parameter lists: length(params) } CVST/man/noisyDonoho.Rd0000644000176000001440000000272512133340376014443 0ustar ripleyusers\name{noisyDonoho} \alias{noisyDonoho} \alias{heavisine} \alias{doppler} \alias{bumps} \alias{blocks} %- Also NEED an '\alias' for EACH other topic documented here. \title{ Generate Donoho's Toy Data Sets } \description{ This function allows the generation of noisy variants of the toy signals introduced by Donoho (see reference section). The scaling is chosen to reflect the setting as discussed in the original paper. } \usage{ noisyDonoho(n, fun = doppler, sigma = 1) blocks(x, scale = 3.656993) bumps(x, scale = 10.52884) doppler(x, scale = 24.22172) heavisine(x, scale = 2.356934) } %- maybe also 'usage' for other objects documented here.
\arguments{ \item{n}{ Number of data points that should be generated. } \item{fun}{ Function to use to generate the data. } \item{sigma}{ Standard deviation of the noise component. } \item{x}{ Number of data points that should be generated. } \item{scale}{ Scaling parameter. } } \value{ Returns a data set of type CVST.data } \references{ David L. Donoho and Iain M. Johnstone. Ideal spatial adaptation by wavelet shrinkage. \emph{Biometrika}, 81 (3):425--455, 1994. } \author{ Tammo Krueger } \seealso{ \code{\link{constructData}} } \examples{ bumpsSet = noisyDonoho(1000, fun=bumps) plot(bumpsSet) dopplerSet = noisyDonoho(1000, fun=doppler) plot(dopplerSet) } % Add one or more standard keywords, see file 'KEYWORDS' in the % R documentation directory. \keyword{datasets} CVST/man/constructCVSTModel.Rd0000644000176000001440000000317512133340376015640 0ustar ripleyusers\name{constructCVSTModel} \alias{constructCVSTModel} %- Also NEED an '\alias' for EACH other topic documented here. \title{ Setup for a CVST Run. } \description{ This is a helper object of type \code{CVST.setup} containing all necessary parameters for a CVST run. } \usage{ constructCVSTModel(steps = 10, beta = 0.1, alpha = 0.01, similaritySignificance = 0.05, earlyStoppingSignificance = 0.05, earlyStoppingWindow = 3, regressionSimilarityViaOutliers = FALSE) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{steps}{ Number of steps CVST should run. } \item{beta}{ Significance level for H0. } \item{alpha}{ Significance level for H1. } \item{similaritySignificance}{ Significance level of the similarity test. } \item{earlyStoppingSignificance}{ Significance level of the early stopping test. } \item{earlyStoppingWindow}{ Size of the early stopping window. } \item{regressionSimilarityViaOutliers}{ Should the less strict outlier-based similarity measure for regression tasks be used? } } \value{ A \code{CVST.setup} object suitable for \code{\link{fastCV}}.
} \references{ Tammo Krueger, Danny Panknin, and Mikio Braun. Fast cross-validation via sequential analysis. \emph{Neural Information Processing Systems (NIPS), Big Learning Workshop}, 2011. URL \url{http://biglearn.org/2011/index.php/Papers\#paper2}. Tammo Krueger, Danny Panknin, and Mikio Braun. Fast cross-validation via sequential testing. \emph{CoRR}, abs/1206.2248, 2012. URL \url{http://arxiv.org/abs/1206.2248}. } \author{ Tammo Krueger } \seealso{ \code{\link{fastCV}} } CVST/man/cochranq.test.Rd0000644000176000001440000000266412251610727014712 0ustar ripleyusers\name{cochranq.test} \alias{cochranq.test} %- Also NEED an '\alias' for EACH other topic documented here. \title{ Cochran's Q Test with Permutation } \description{ Performs Cochran's Q test on the data. If the data matrix contains too few elements, the chi-square distribution of the test statistic is replaced by a permutation variant. } \usage{ cochranq.test(mat) } %- maybe also 'usage' for other objects documented here. \arguments{ \item{mat}{ The data matrix with the individuals in the rows and treatments in the columns. } } \value{ Returns an \code{htest} object with the usual entries. } \references{ W. G. Cochran. The comparison of percentages in matched samples. \emph{Biometrika}, 37 (3-4):256--266, 1950. Kashinath D. Patil. Cochran's Q test: Exact distribution. \emph{Journal of the American Statistical Association}, 70 (349):186--189, 1975. Merle W. Tate and Sara M. Brown. Note on the {Cochran Q} test. \emph{Journal of the American Statistical Association}, 65 (329):155--160, 1970. } \author{ Tammo Krueger } \examples{ mat = matrix(c(rep(0, 10), 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1), ncol=4) cochranq.test(mat) mat = matrix(c(rep(0, 7), 1, rep(0, 12), 1, 1, 0, 1, rep(0, 5), 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1), nrow=8) cochranq.test(mat) }