DiscriMiner/
DiscriMiner/NAMESPACE
S3method(plot,plsda)
S3method(print,desda)
S3method(print,disqual)
S3method(print,geoda)
S3method(print,linda)
S3method(print,plsda)
S3method(print,quada)
S3method(print,qualmca)
export(FRatio)
export(betweenCov)
export(betweenSS)
export(binarize)
export(classify)
export(corRatio)
export(desDA)
export(discPower)
export(disqual)
export(easyMCA)
export(geoDA)
export(getWithin)
export(groupMeans)
export(groupMedians)
export(groupQuants)
export(groupStds)
export(groupVars)
export(linDA)
export(plsDA)
export(quaDA)
export(totalCov)
export(totalSS)
export(withinCov)
export(withinSS)
DiscriMiner/data/
DiscriMiner/data/insurance.rda  (binary R data; payload not shown)
DiscriMiner/data/infarctus.rda  (binary R data; payload not shown)
DiscriMiner/data/bordeaux.rda  (binary R data; payload not shown)
DiscriMiner/R/linDA.R
# NOTE: the file header and opening lines were garbled in the archive; the
# opening below is a minimal reconstruction inferred from the intact
# geoDA.R/plsDA.R files and from the surviving body of this function.
linDA <- function(variables, group, prior = NULL, validation = NULL,
                  learn = NULL, test = NULL, prob = FALSE)
{
  # Perform a linear discriminant analysis
  # check inputs
  verify_Xy = my_verify(variables, group, na.rm=FALSE)
  X = verify_Xy$X
  y = verify_Xy$y
  # type of validation
  if (is.null(validation)) {
    validation = "none"
  } else {
    vali = validation %in% c("crossval", "learntest")
    if (!vali)
      stop("\nIncorrect type of validation")
  }
  # how many observations
  n = nrow(X)
  # how many variables
  p = ncol(X)
  # how many groups
  ng = nlevels(y)
  # how many obs in each group
  nobs_group = as.vector(table(y))
  # prior probabilities
  if (!is.null(prior)) {
    if (any(prior > 1) || any(prior < 0))
      stop("'prior' probabilities must range between [0,1]")
    if (round(sum(prior), 5) != 1)
      stop("'prior' probabilities don't add to 1")
    props = prior
  } else {
    # prior as proportions
    prior = nobs_group / n
    props = prior
  }
  # group levels
  glevs = levels(y)

  ## linDA with no validation
  if (validation == "none") {
    get_linda = my_linDA(X, y, 1:n, 1:n, prior, prob)
    err = 1 - sum(diag(get_linda$conf)) / n
  }

  ## linDA with learn-test sets validation
  if (validation == "learntest") {
    if (any(learn) <= 0 || any(learn) > n)
      stop("\nsubscript out of bounds in 'learn' set")
    if (any(test) <= 0 || any(test) > n)
      stop("\nsubscript out of bounds in 'test' set")
    # apply linDA
    get_linda = my_linDA(X, y, learn, test, prior, prob)
    # misclassification error rate
    err = 1 - sum(diag(get_linda$conf))/length(test)
  }

  ## linDA with crossvalidation
  if (validation == "crossval") {
    # linDA for all observations
    get_linda = my_linDA(X, y, 1:n, 1:n, prior, prob)
    # elements in each group
    elems_group = vector("list", ng)
    for (k in 1:ng) {
      elems_group[[k]] = which(group == glevs[k])
    }
    # misclassification error rate
    mer = 0
    # 10 crossvalidation samples
    for (r in 1:10) {
      test = vector("list", ng)
      test_sizes = floor(n * props / 10)
      for (k in 1:ng) {
        test[[k]] = sample(elems_group[[k]], test_sizes[k])
      }
      test = unlist(test)
      learn = (1:n)[-test]
      # apply linDA
      linda_cv = my_linDA(X, y, learn, test, prior, prob)
      # misclassification error rate
      mer = mer + sum(diag(linda_cv$conf))/n
    }
    # total misclassification error rate
    err = 1 - mer
  }

  ## specifications
  specs = list(n=n, p=p, ng=ng, glevs=glevs, nobs_group=nobs_group,
               validation=validation)
  ## results
  structure(list(functions = get_linda$FDF,
                 confusion = get_linda$conf,
                 scores = get_linda$Disc,
                 classification = get_linda$pred_class,
                 error_rate = err,
                 specs = specs),
            class = "linda")
}
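## A minimal usage sketch for linDA (not run). It assumes the package is
## loaded; iris ships with base R. The calls mirror the intact examples of
## geoDA/plsDA rather than the (garbled) original docs of this file.
# lda1 = linDA(iris[,1:4], iris$Species)
# lda1$functions    # Fisher's classification functions, one column per group
# lda1$confusion    # rows = original class, columns = predicted class
# lda1$error_rate
# # scores in probability terms, with 10-fold random cross-validation
# lda2 = linDA(iris[,1:4], iris$Species, validation="crossval", prob=TRUE)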
DiscriMiner/R/withinCov.R
#' Within-class Covariance Matrix
#'
#' Calculates the within-class covariance matrix
#'
#' When \code{div_by_n=TRUE} the covariance matrices are divided by n (number
#' of observations), otherwise they are divided by n-1
#'
#' @param variables matrix or data frame with explanatory variables (No missing
#' values are allowed)
#' @param group vector or factor with group memberships (No missing values are
#' allowed)
#' @param div_by_n logical indicating division by number of observations
#' @author Gaston Sanchez
#' @seealso \code{\link{withinSS}}, \code{\link{betweenCov}},
#' \code{\link{totalCov}}
#' @export
#' @examples
#'
#' \dontrun{
#' # load iris dataset
#' data(iris)
#'
#' # within-class covariance matrix (dividing by n-1)
#' withinCov(iris[,1:4], iris[,5])
#'
#' # within-class covariance matrix (dividing by n)
#' withinCov(iris[,1:4], iris[,5], div_by_n=TRUE)
#' }
#'
withinCov <- function(variables, group, div_by_n=FALSE)
{
  # within-class pooled covariance matrix
  # variables: matrix or data frame with explanatory variables
  # group: vector or factor with group memberships
  # div_by_n: logical indicating division by num of observations

  # check inputs
  verify_Xy = my_verify(variables, group, na.rm=FALSE)
  X = verify_Xy$X
  y = verify_Xy$y
  # how many observations
  n = nrow(X)
  # how many variables
  p = ncol(X)
  # group levels and number of levels
  glevs = levels(y)
  ng = nlevels(y)

  # within cov matrix
  Within = matrix(0, p, p)
  for (k in 1:ng)
  {
    tmp <- y == glevs[k]
    nk = sum(tmp)
    if (div_by_n) {
      Wk = ((nk-1)/n) * var(X[tmp,])
    } else {
      # R version / SPSS
      #Wk = ((nk-1)/(n-ng)) * var(X[tmp,])
      Wk = ((nk-1)/(n-1)) * var(X[tmp,])
    }
    Within = Within + Wk
  }
  # add names
  if (is.null(colnames(variables))) {
    var_names = paste("X", 1:ncol(X), sep="")
    dimnames(Within) = list(var_names, var_names)
  } else {
    dimnames(Within) = list(colnames(variables), colnames(variables))
  }
  # result
  Within
}
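## Sketch (not run): with the default div_by_n=FALSE, withinCov pools the
## per-group covariances as W = sum_k ((n_k - 1)/(n - 1)) * S_k.
## The hand computation below, using iris, should agree with withinCov:
# W = withinCov(iris[,1:4], iris$Species)
# S_k = lapply(split(iris[,1:4], iris$Species), var)
# n_k = table(iris$Species); n = nrow(iris)
# W_hand = Reduce(`+`, Map(function(S, nk) ((nk-1)/(n-1)) * S, S_k, n_k))
# all.equal(W, as.matrix(W_hand), check.attributes=FALSE)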
DiscriMiner/R/my_plsDA_old.R
# version 1.0 of my_plsDA
my_plsDA_ver1 <- function(X, y, learn, test, autosel, comps, scaled)
{
  # Perform a PLS discriminant analysis
  # X: matrix or data.frame with explanatory variables
  # y: vector or factor with group membership
  # learn: vector of learning observations
  # test: vector of testing observations
  # autosel: logical indicating automatic selection of PLS comps
  # comps: number of PLS components (only when autosel=FALSE)
  # scaled: logical indicating whether to scale the data

  ## prepare ingredients
  ntest = length(test)
  # binarize y
  Y = my_tdc(data.frame(y[learn]))
  glevs = levels(y[learn])
  # dimensions
  n = nrow(X[learn,])
  p = ncol(X)
  q = ncol(Y)

  # determine number of PLS components to be computed
  # taking into account rank of X
  Xsvd = svd(X[learn,], nu=0, nv=0)
  rank_X = sum(Xsvd$d > 0.0001)
  if (rank_X == 0)
    stop("\nrank = 0: variables are numerically constant")
  nc = min(n, rank_X)
  if (!autosel) {
    if (comps < nc) nc = comps
  }
  if (nc == n) nc = n - 1

  # standardizing data
  X.old = scale(X[learn,], scale=scaled)
  Y.old = scale(Y)

  # creating matrices to store results
  Wh = matrix(0, p, nc)
  Uh = matrix(0, n, nc)
  Th = matrix(0, n, nc)
  Ch = matrix(0, q, nc)
  Ph = matrix(0, p, nc)
  bh = rep(0, nc)
  RSS = rbind(rep(n-1,q), matrix(NA, nc, q))
  PRESS = matrix(NA, nc, q)
  Q2 = matrix(NA, nc, q)

  ## PLS2 algorithm
  for (h in 1:nc)
  {
    # "arbitrary" vector (first column of Y.old)
    u.new = Y.old[,1]
    w.old = rep(1, p)
    iter = 1
    repeat
    {
      w.new = t(X.old) %*% u.new / sum(u.new^2)
      w.new = w.new / sqrt(sum(w.new^2))  # normalize w.new
      t.new = X.old %*% w.new
      c.new = t(Y.old) %*% t.new / sum(t.new^2)
      u.new = Y.old %*% c.new / sum(c.new^2)
      w.dif = w.new - w.old
      w.old = w.new
      if (sum(w.dif^2)<1e-06 || iter==100) break
      iter = iter + 1
    }
    p.new = t(X.old) %*% t.new / sum(t.new^2)

    # leave-one-out cross validation
    RSS[h+1,] = colSums((Y.old - t.new%*%t(c.new))^2)
    press = matrix(0, n, q)
    for (i in 1:n)
    {
      uh.si = Y.old[-i,1]
      wh.siold = rep(1,p)
      itcv = 1
      repeat
      {
        wh.si = t(X.old[-i,]) %*% uh.si / sum(uh.si^2)
        wh.si = wh.si / sqrt(sum(wh.si^2))
        th.si = X.old[-i,] %*% wh.si
        ch.si = t(Y.old[-i,]) %*% th.si / sum(th.si^2)
        uh.si = Y.old[-i,] %*% ch.si / sum(ch.si^2)
        wsi.dif = wh.si - wh.siold
        wh.siold = wh.si
        if (sum(wsi.dif^2)<1e-06 || itcv==100) break
        itcv = itcv + 1
      }
      Yhat.si = (X.old[i,] %*% wh.si) %*% t(ch.si)
      press[i,] = (Y.old[i,] - Yhat.si)^2
    }
    PRESS[h,] = colSums(press)
    Q2[h,] = 1 - PRESS[h,]/RSS[h,]
    X.old = X.old - (t.new %*% t(p.new))
    Y.old = Y.old - (t.new %*% t(c.new))
    Wh[,h] = w.new
    Uh[,h] = u.new
    Th[,h] = t.new
    Ch[,h] = c.new
    Ph[,h] = p.new
    bh[h] = t(u.new) %*% t.new
  } # finish PLS algorithm

  ## selection of PLS components
  # Q2 global
  Q2G = 1 - rowSums(PRESS)/rowSums(RSS[-nc,])
  # automatic selection of PLS components?
  ncs = nc
  if (autosel) {
    # Rule 1: Q2G >= 0.05 (Perez & Tenenhaus, 2003)
    selcom = which(Q2G >= 0.05)
    # Rule 2: at least one Q2hk >= 0.095
    #aux = apply(Q2, 1, function(x) sum(x>=0.0975))
    #selcom = which(aux > 0)
    ncs = length(selcom)
    # selecting elements
    Wh = Wh[,selcom]
    Uh = Uh[,selcom]
    Ph = Ph[,selcom]
    Th = Th[,selcom]
    Ch = Ch[,selcom]
  }

  ## PLS results
  # weights
  Ws = Wh %*% solve(t(Ph)%*%Wh)
  # standardized regression coefficients
  Bs = Ws %*% t(Ch)
  # regression coeffs non-standardized
  Br = diag(1/apply(X[learn,],2,sd)) %*% Bs %*% diag(apply(Y,2,sd))
  cte = as.vector((apply(Y,2,mean) - apply(X[learn,],2,mean)%*%Br))
  # Q2 global accumulated
  Q2T = cbind(Q2, Q2G)
  q2 = c(paste(rep("Q2",q),colnames(Y),sep="."),"Q2.global")
  # correlations and redundancies
  cor_tx = cor(X[learn,], Th)
  cor_ty = cor(Y, Th)
  R2x = cor(X[learn,], Th)^2  # R2 coefficients
  R2y = cor(Y, Th)^2  # R2 coefficients
  Rdx = colMeans(R2x)
  Rdy = colMeans(R2y)
  R2 = cbind(Rdx, cumsum(Rdx), Rdy, cumsum(Rdy))
  Rd.mat = matrix(0, ncs, ncs)
  for (j in 1:ncs)
    Rd.mat[1:j,j] = Rdy[1:j]
  # variable importance
  VIP = sqrt((Wh^2) %*% Rd.mat %*% diag(p/cumsum(Rdy), ncs, ncs))

  ## adding names
  dimnames(Th) = list(rownames(X[learn,]), paste(rep("t",ncs),1:ncs,sep=""))
  dimnames(Ph) = list(colnames(X), paste(rep("p",ncs),1:ncs,sep=""))
  dimnames(Bs) = list(colnames(X), colnames(Y))
  dimnames(Br) = list(colnames(X), colnames(Y))
  dimnames(cor_tx) = list(colnames(X), paste(rep("t",ncs),1:ncs,sep=""))
  dimnames(cor_ty) = list(colnames(Y), paste(rep("t",ncs),1:ncs,sep=""))
  dimnames(Q2T) = list(paste(rep("t",nc),1:nc,sep=""), q2)
  dimnames(R2) = list(paste(rep("t",ncs),1:ncs,sep=""),
                      c("R2X","R2Xcum","R2Y","R2Ycum"))
  dimnames(VIP) = list(colnames(X), paste(rep("t",ncs),1:ncs,sep=""))

  ## Discrimination
  coeffs = rbind(INTERCEPT=cte, Br)
  Disc = X[test,] %*% Br + matrix(rep(cte,each=ntest), ntest, q)
  # predicted class
  pred_class = factor(max.col(Disc), levels=seq_along(glevs), labels=glevs)
  # confusion matrix
  conf = table(original=y[test], predicted=pred_class)
  # results
  res = list(coeffs=coeffs, conf=conf, Disc=Disc, pred_class=pred_class,
             components=Th, Q2T=Q2T, R2=R2, VIP=VIP,
             cor_tx=cor_tx, cor_ty=cor_ty)
  res
}
# version 2.0 of my_plsDA
my_plsDA_ver2 <- function(X, y, learn, test, autosel, comps, cv = "LOO", k = NA)
{
  ## prepare ingredients
  ntest = length(test)
  # binarize y
  Y = my_tdc(data.frame(y[learn]))
  glevs = levels(y[learn])
  # dimensions
  n = nrow(X[learn,])
  p = ncol(X)
  q = ncol(Y)
  # added in k for leave-k-out cross validation
  #k=10

  # determine number of PLS components to be computed
  # taking into account rank of X
  Xsvd = svd(X[learn,], nu=0, nv=0)
  rank_X = sum(Xsvd$d > 0.0001)
  if (rank_X == 0)
    stop("\nrank = 0: variables are numerically constant")
  nc = min(n, rank_X)
  if (!autosel) {
    if (comps < nc) nc = comps
  }
  if (nc == n) nc = n - 1

  # standardizing data
  X.old = scale(X[learn,])
  Y.old = scale(Y)

  # creating matrices to store results
  Wh = matrix(0, p, nc)
  Uh = matrix(0, n, nc)
  Th = matrix(0, n, nc)
  Ch = matrix(0, q, nc)
  Ph = matrix(0, p, nc)
  bh = rep(0, nc)
  RSS = rbind(rep(n-1,q), matrix(NA, nc, q))
  PRESS = matrix(NA, nc, q)
  Q2 = matrix(NA, nc, q)

  ### random folds for leave-k-out
  if (cv=="LKO") {
    fold = split(sample(1:n), rep(1:k, length=n))
  }

  ## PLS2 algorithm
  for (h in 1:nc)
  {
    # "arbitrary" vector (first column of Y.old)
    u.new = Y.old[,1]
    w.old = rep(1, p)
    iter = 1
    repeat
    {
      w.new = t(X.old) %*% u.new / sum(u.new^2)
      w.new = w.new / sqrt(sum(w.new^2))  # normalize w.new
      t.new = X.old %*% w.new
      c.new = t(Y.old) %*% t.new / sum(t.new^2)
      u.new = Y.old %*% c.new / sum(c.new^2)
      w.dif = w.new - w.old
      w.old = w.new
      if (sum(w.dif^2)<1e-06 || iter==100) break
      iter = iter + 1
    }
    p.new = t(X.old) %*% t.new / sum(t.new^2)

    # Cross validation
    RSS[h+1,] = colSums((Y.old - t.new%*%t(c.new))^2)
    press = matrix(0, n, q)
    ### Random leave-k-out
    if (cv=="LKO") {
      for (i in 1:k)
      {
        # rows held out in fold i
        omit = fold[[i]]
        uh.si <- Y.old[-omit,1]
        wh.siold <- rep(1,p)
        itcv <- 1
        repeat
        {
          wh.si <- t(X.old[-omit,]) %*% uh.si / sum(uh.si^2)
          wh.si <- wh.si / sqrt(sum(wh.si^2))
          th.si <- X.old[-omit,] %*% wh.si
          ch.si <- t(Y.old[-omit,]) %*% th.si / sum(th.si^2)
          uh.si <- Y.old[-omit,] %*% ch.si / sum(ch.si^2)
          wsi.dif <- wh.si - wh.siold
          wh.siold <- wh.si
          if (sum(wsi.dif^2)<1e-06 || itcv==100) break
          itcv <- itcv + 1
        }
        Yhat.si = (X.old[omit,] %*% wh.si) %*% t(ch.si)
        press[omit,] = (Y.old[omit,] - Yhat.si)^2
      }
    }
    # Leave-One-Out
    if (cv=="LOO") {
      for (i in 1:n)
      {
        uh.si = Y.old[-i,1]
        wh.siold = rep(1,p)
        itcv = 1
        repeat
        {
          wh.si = t(X.old[-i,]) %*% uh.si / sum(uh.si^2)
          wh.si = wh.si / sqrt(sum(wh.si^2))
          th.si = X.old[-i,] %*% wh.si
          ch.si = t(Y.old[-i,]) %*% th.si / sum(th.si^2)
          uh.si = Y.old[-i,] %*% ch.si / sum(ch.si^2)
          wsi.dif = wh.si - wh.siold
          wh.siold = wh.si
          if (sum(wsi.dif^2)<1e-06 || itcv==100) break
          itcv = itcv + 1
        }
        Yhat.si = (X.old[i,] %*% wh.si) %*% t(ch.si)
        press[i,] = (Y.old[i,] - Yhat.si)^2
      }
    }
    PRESS[h,] = colSums(press)
    Q2[h,] = 1 - PRESS[h,]/RSS[h,]
    X.old = X.old - (t.new %*% t(p.new))
    Y.old = Y.old - (t.new %*% t(c.new))
    Wh[,h] = w.new
    Uh[,h] = u.new
    Th[,h] = t.new
    Ch[,h] = c.new
    Ph[,h] = p.new
    bh[h] = t(u.new) %*% t.new
  } # finish PLS algorithm

  ## selection of PLS components
  # Q2 global
  Q2G = 1 - rowSums(PRESS)/rowSums(RSS[-nc,])
  # automatic selection of PLS components?
  ncs = nc
  if (autosel) {
    # Rule 1: Q2G >= 0.05 (Perez & Tenenhaus, 2003)
    selcom = which(Q2G >= 0.05)
    # Rule 2: at least one Q2hk >= 0.095
    #aux = apply(Q2, 1, function(x) sum(x>=0.0975))
    #selcom = which(aux > 0)
    ncs = length(selcom)
    # selecting elements
    Wh = Wh[,selcom]
    Uh = Uh[,selcom]
    Ph = Ph[,selcom]
    Th = Th[,selcom]
    Ch = Ch[,selcom]
  }

  ## PLS results
  # weights
  Ws = Wh %*% solve(t(Ph)%*%Wh)
  # standardized regression coefficients
  Bs = Ws %*% t(Ch)
  # regression coeffs non-standardized
  Br = diag(1/apply(X[learn,],2,sd)) %*% Bs %*% diag(apply(Y,2,sd))
  cte = as.vector((apply(Y,2,mean) - apply(X[learn,],2,mean)%*%Br))
  # Q2 global accumulated
  Q2T = cbind(Q2, Q2G)
  q2 = c(paste(rep("Q2",q),colnames(Y),sep="."),"Q2.global")
  # correlations and redundancies
  cor_tx = cor(X[learn,], Th)
  cor_ty = cor(Y, Th)
  R2x = cor(X[learn,], Th)^2  # R2 coefficients
  R2y = cor(Y, Th)^2  # R2 coefficients
  Rdx = colMeans(R2x)
  Rdy = colMeans(R2y)
  R2 = cbind(Rdx, cumsum(Rdx), Rdy, cumsum(Rdy))
  Rd.mat = matrix(0, ncs, ncs)
  for (j in 1:ncs)
    Rd.mat[1:j,j] = Rdy[1:j]
  # variable importance
  VIP = sqrt((Wh^2) %*% Rd.mat %*% diag(p/cumsum(Rdy), ncs, ncs))

  ## adding names
  ### added Ws and Ch for loadings and Y.loadings respectively
  dimnames(Ws) = list(colnames(X), paste(rep("w*",ncs),1:ncs,sep=""))
  dimnames(Ch) = list(colnames(Y), paste(rep("c",ncs),1:ncs,sep=""))
  dimnames(Th) = list(rownames(X[learn,]), paste(rep("t",ncs),1:ncs,sep=""))
  dimnames(Ph) = list(colnames(X), paste(rep("p",ncs),1:ncs,sep=""))
  dimnames(Bs) = list(colnames(X), colnames(Y))
  dimnames(Br) = list(colnames(X), colnames(Y))
  dimnames(cor_tx) = list(colnames(X), paste(rep("t",ncs),1:ncs,sep=""))
  dimnames(cor_ty) = list(colnames(Y), paste(rep("t",ncs),1:ncs,sep=""))
  dimnames(Q2T) = list(paste(rep("t",nc),1:nc,sep=""), q2)
  dimnames(R2) = list(paste(rep("t",ncs),1:ncs,sep=""),
                      c("R2X","R2Xcum","R2Y","R2Ycum"))
  dimnames(VIP) = list(colnames(X), paste(rep("t",ncs),1:ncs,sep=""))

  ## Discrimination
  coeffs = rbind(INTERCEPT=cte, Br)
  Disc = X[test,] %*% Br + matrix(rep(cte,each=ntest), ntest, q)
  # predicted class
  pred_class = factor(max.col(Disc), levels=seq_along(glevs), labels=glevs)
  # confusion matrix
  conf = table(original=y[test], predicted=pred_class)
  # results
  ### added loadings and y.loadings
  res = list(coeffs=coeffs, conf=conf, Disc=Disc, pred_class=pred_class,
             components=Th, loadings=round(Ws,4), y.loadings=round(Ch,4),
             Q2T=Q2T, R2=R2, VIP=VIP, cor_tx=cor_tx, cor_ty=cor_ty)
  res
}
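## Sketch (not run): how the "LKO" folds above partition the n learning rows.
## split() over a shuffled index yields k roughly equal, disjoint folds:
# n = 20; k = 7
# fold = split(sample(1:n), rep(1:k, length=n))
# length(fold)         # k folds
# sort(unlist(fold))   # 1:n, each row held out exactly once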
c("R2X","R2Xcum","R2Y","R2Ycum")) dimnames(VIP) = list(colnames(X), paste(rep("t",ncs),1:ncs,sep="")) ## Discrimination coeffs = rbind(INTERCEPT=cte, Br) Disc = X[test,] %*% Br + matrix(rep(cte,each=ntest), ntest, q) # predicted class pred_class = factor(max.col(Disc), levels=seq_along(glevs), labels=glevs) # confusion matrix conf = table(original=y[test], predicted=pred_class) # results ### added loadings and y.loadings res = list(coeffs=coeffs, conf=conf, Disc=Disc, pred_class=pred_class, components=Th, loadings=round(Ws,4), y.loadings=round(Ch,4), Q2T=Q2T, R2=R2, VIP=VIP, cor_tx=cor_tx, cor_ty=cor_ty) res } DiscriMiner/R/print.quada.R0000644000176200001440000000116512160375430015240 0ustar liggesusers#' @S3method print quada print.quada <- function(x, ...) { cat("\nQuadratic Discriminant Analysis\n") cat(rep("-",37), sep="") cat("\n$confusion ", "confusion matrix") cat("\n$scores ", "discriminant values") cat("\n$classification ", "assigned class") cat("\n$error_rate ", "error rate\n") cat(rep("-",37), sep="") cat("\n\n$confusion\n") print(x$confusion, print.gap=2) cat("\n\n$error_rate\n") print(x$error_rate) cat("\n\n$scores\n") print(head(x$scores), print.gap=2) cat("...\n") cat("\n$classification\n") print(head(x$classification)) cat("...\n") invisible(x) } DiscriMiner/R/DiscriMiner-package.R0000644000176200001440000001220312160254536016611 0ustar liggesusers#' Bordeaux Wines Dataset #' #' Quality measures of wines from Bordeaux, France #' #' #' @name bordeaux #' @docType data #' @format A data frame with 34 observations on the following 6 variables. #' \tabular{ll}{ \code{year} \tab year of harvest\cr \code{temperature} \tab #' sum of daily average temperatures (in celsius degrees)\cr \code{sun} \tab #' duration of insolation (in hours)\cr \code{heat} \tab number of super-hot #' days\cr \code{rain} \tab rain level (in millimeters)\cr \code{quality} \tab #' wine quality: a factor with levels \code{bad}, \code{good}, and #' \code{medium}\cr } #' @references Chapter 10: Analyse Discriminante, page 353. \cr Tenenhaus M. #' (2007) \emph{Statistique}. Dunod, Paris. #' @keywords datasets #' @examples #' #' \dontrun{ #' # load data #' data(bordeaux) #' #' # structure of data #' str(bordeaux) #' } #' NULL #' Tools of the Trade for Discriminant Analysis #' #' DiscriMiner contains several functions for Discriminant Analysis and #' Classification purposes covering various methods such as descriptive, #' geometric, linear, quadratic, PLS, as well as qualitative discriminant #' analyses. #' #' \tabular{ll}{ Package: \tab DiscriMiner\cr Type: \tab Package\cr Version: #' \tab 0.1-23\cr Date: \tab 2012-12-20\cr License: \tab GPL-3\cr } #' #' @name DiscriMiner-package #' @docType package #' @author Gaston Sanchez #' #' Maintainer: Gaston Sanchez #' @references \url{http://www.gastonsanchez.com/discriminer} #' #' Lebart L., Piron M., Morineau A. (2006) \emph{Statistique exploratoire #' multidimensionnelle}. Dunod, Paris. #' #' Nakache J-P., Confais J. (2003) \emph{Statistique explicative appliquee}. #' Editions Technip, Paris. #' #' Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}. #' Editions Technip, Paris. #' #' Tenenhaus M. (1998) \emph{La Regression PLS}. Editions Technip, Paris. #' #' Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris. #' #' Tuffery S. (2008) \emph{Data Mining et Statistique Decisionnelle}. Editions #' Technip, Paris. #' #' Tuffery S. (2011) \emph{Data Mining and Statistics for Decision Making}. #' Wiley, Chichester. 
#'
#' \emph{Multiple Correspondence Analysis and Related Methods}. (2006) Edited
#' by Michael Greenacre and Jorg Blasius. Chapman and Hall/CRC
#' @keywords package
NULL

#' Infarctus dataset
#'
#' Infarctus dataset from Saporta (2006)
#'
#'
#' @name infarctus
#' @docType data
#' @format A data frame with 101 observations on the following 8 variables.
#' \tabular{ll}{ \code{FRCAR} \tab Frequence Cardiaque (i.e. heart rate)\cr
#' \code{INCAR} \tab Index Cardique (cardiac index)\cr \code{INSYS} \tab Index
#' Systolique (systolic index)\cr \code{PRDIA} \tab Pression Diastolique
#' (diastolic pressure)\cr \code{PAPUL} \tab Pression Arterielle Pulmonaire
#' (pulmonary artery pressure)\cr \code{PVENT} \tab Pression Ventriculaire
#' (ventricular pressure)\cr \code{REPUL} \tab Resistance Pulmonaire (pulmonary
#' resistance)\cr \code{PRONO} \tab Pronostic (prognosis): a factor with levels
#' \code{dead} and \code{survive}\cr }
#' @references Chapter 18: Analyse discriminante et regression logistique, pp
#' 453-454 \cr Saporta G. (2006) \emph{Probabilites, analyse des donnees et
#' statistique}. Editions Technip, Paris.
#' @keywords datasets
#' @examples
#'
#' \dontrun{
#' # load data
#' data(infarctus)
#'
#' # summary of variables
#' summary(infarctus)
#' }
#'
NULL

#' Insurance Dataset
#'
#' Dataset of car-insurance customers from Belgium in 1992
#'
#' Dataset for DISQUAL method
#'
#' @name insurance
#' @docType data
#' @format
#'
#' A data frame with 1106 observations on the following 10 variables.
#' \tabular{ll}{ \code{Claims} \tab Group variable. A factor with levels
#' \code{bad} and \code{good}\cr \code{Use} \tab Type of Use. A factor with
#' levels \code{private} and \code{professional}\cr \code{Type} \tab Insurance
#' Type. A factor with levels \code{companies}, \code{female}, and
#' \code{male}\cr \code{Language} \tab Language. A factor with levels
#' \code{flemish} and \code{french}\cr \code{BirthCohort} \tab Birth Cohort. A
#' factor with levels \code{BD_1890_1949}, \code{BD_1950_1973}, and
#' \code{BD_unknown}\cr \code{Region} \tab Geographic Region. A factor with
#' levels \code{Brussels} and \code{Other_regions}\cr \code{BonusMalus} \tab
#' Level of bonus-malus. A factor with levels \code{BM_minus} and
#' \code{BM_plus}\cr \code{YearSuscrip} \tab Year of Subscription. A factor
#' with levels \code{YS<86} and \code{YS>=86}\cr \code{Horsepower} \tab
#' Horsepower. A factor with levels \code{HP<=39} and \code{HP>=40}\cr
#' \code{YearConstruc} \tab Year of vehicle construction. A factor with levels
#' \code{YC_33_89} and \code{YC_90_91}\cr }
#' @seealso \code{\link{disqual}}
#' @references Saporta G., Niang N. (2006) Correspondence Analysis and
#' Classification. In \emph{Multiple Correspondence Analysis and Related
#' Methods}, M. Greenacre and J. Blasius, Eds., pp 371-392. Chapman & Hall/CRC,
#' Boca Raton, Florida, USA.
#' @keywords datasets
#' @examples
#'
#' \dontrun{
#' # load data
#' data(insurance)
#'
#' # structure
#' str(insurance)
#' }
#'
NULL
DiscriMiner/R/totalCov.R
#' Total Covariance Matrix
#'
#' Calculates total covariance matrix
#'
#' When \code{div_by_n=TRUE} the covariance matrices are divided by n (number
#' of observations), otherwise they are divided by n-1
#'
#' @param variables matrix or data frame with explanatory variables (No missing
#' values are allowed)
#' @param div_by_n logical indicating division by number of observations
#' @author Gaston Sanchez
#' @seealso \code{\link{totalSS}}, \code{\link{betweenCov}},
#' \code{\link{withinCov}}
#' @export
#' @examples
#'
#' \dontrun{
#' # load iris dataset
#' data(iris)
#'
#' # total covariance matrix (dividing by n-1)
#' totalCov(iris[,1:4])
#'
#' # total covariance matrix (dividing by n)
#' totalCov(iris[,1:4], div_by_n=TRUE)
#' }
#'
totalCov <- function(variables, div_by_n=FALSE)
{
  # Total covariance matrix
  # variables: matrix or data frame with explanatory variables
  # div_by_n: logical indicating division by num of observations

  # X matrix or data.frame
  if (!is.matrix(variables) && !is.data.frame(variables))
    stop("\nSorry, 'variables' must be a matrix")
  if (is.null(dim(variables)))
    stop("'variables' is not a matrix")
  # enforce variables as matrix
  if (!is.matrix(variables)) variables = as.matrix(variables)
  # no missing values allowed
  if (any(!is.finite(variables)))
    stop("infinite, NA or NaN values in 'variables'")
  # only numeric values
  if (!is.numeric(variables))
    stop("\nSorry, 'variables' must contain only numeric values")

  n = nrow(variables)
  Total = var(variables)
  if (div_by_n) {
    Total = ((n-1)/n) * Total
  }
  # add names
  if (is.null(colnames(variables))) {
    var_names = paste("X", 1:ncol(variables), sep="")
    dimnames(Total) = list(var_names, var_names)
  } else {
    dimnames(Total) = list(colnames(variables), colnames(variables))
  }
  # result
  Total
}

DiscriMiner/R/my_linDA.R
my_linDA <- function(X, y, learn, test, prior, prob)
{
  # Perform a linear discriminant analysis
  # X: matrix or data.frame with explanatory variables
  # y: vector or factor with group membership
  # learn: vector of learning observations
  # test: vector of testing observations
  # prior: vector of prior proportions
  # prob: logical indicating results in probability terms

  # how many observations
  n = nrow(X[learn,])
  ntest = length(test)
  # how many groups
  ng = nlevels(y[learn])
  glevs = levels(y[learn])
  # how many obs in each group
  nobs_group = as.vector(table(y[learn]))

  # group means
  GM = my_groupMeans(X[learn,], y[learn])
  # within-class covariance matrix
  W = my_withinCov(X[learn,], y[learn])
  # inverse of Within cov matrix
  W_inv = solve(W)

  # constant terms of classification functions
  cons = rep(0, ng)
  # coefficients of classification functions
  Betas = matrix(0, nrow(W_inv), ng)
  for (k in 1:ng)
  {
    cons[k] = -(1/2) * GM[k,] %*% W_inv %*% GM[k,] + log(prior[k])
    Betas[,k] = t(GM[k,]) %*% W_inv
  }
  # Fisher's Discriminant Functions
  FDF = rbind(cons, Betas)
  rownames(FDF) = c("constant", colnames(X))
  colnames(FDF) = glevs

  # matrix of constant terms
  A = matrix(rep(cons,ntest), ntest, ng, byrow=TRUE)
  # apply discrim functions
  Disc = X[test,] %*% Betas + A

  # probability values
  if (prob) {
    # exponential
    Disc <- 1 - exp( -(Disc - apply(Disc, 1, min, na.rm=TRUE)))
    # predicting classes
    pred = Disc / drop(Disc %*% rep(1, ng))
    # predicted class
    pred_class = factor(max.col(pred), levels=seq_along(glevs), labels=glevs)
  } else {
    # predicted class
    pred = apply(Disc, 1, function(u) which(u == max(u)))
    names(pred) = NULL
    # assign class values
    pred_class = factor(pred, levels=seq_along(glevs), labels=glevs)
  }
  dimnames(Disc) = list(rownames(X[test,]), glevs)
  # confusion matrix
  conf = table(original=y[test], predicted=pred_class)
  # results
  res = list(FDF=FDF, conf=conf, Disc=Disc, pred_class=pred_class)
  res
}
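## Sketch (not run): the score computed above for group k is the classical
## Fisher linear classification rule,
##   d_k(x) = x' W^{-1} m_k - (1/2) m_k' W^{-1} m_k + log(prior_k),
## and an observation is assigned to the group with the largest d_k(x).
## For a given matrix X, factor y, group index k and prior vector prior:
# W_inv = solve(my_withinCov(X, y))
# m_k   = my_groupMeans(X, y)[k, ]
# d_k   = X %*% (W_inv %*% m_k) - 0.5 * c(m_k %*% W_inv %*% m_k) + log(prior[k])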
DiscriMiner/R/print.disqual.R
#' @S3method print disqual
print.disqual <- function(x, ...)
{
  cat("\nDiscriminant Analysis on Qualitative Variables\n")
  cat(rep("-",46), sep="")
  cat("\n$raw_coefs       ", "raw coefficients")
  cat("\n$norm_coefs      ", "normalized coefficients")
  cat("\n$confusion       ", "confusion matrix")
  cat("\n$scores          ", "scores")
  cat("\n$classification  ", "assigned class")
  cat("\n$error_rate      ", "error rate\n")
  cat(rep("-",46), sep="")
  cat("\n\n$raw_coefs\n")
  print(format(x$raw_coefs, scientific=FALSE, digits=3),
        print.gap=2, quote=FALSE)
  cat("\n\n$norm_coefs\n")
  print(format(x$norm_coefs, scientific=FALSE, digits=3),
        print.gap=2, quote=FALSE)
  cat("\n\n$confusion\n")
  print(x$confusion, print.gap=2)
  cat("\n\n$error_rate\n")
  print(x$error_rate)
  cat("\n\n$scores\n")
  print(head(x$scores), print.gap=2)
  cat("...\n")
  cat("\n$classification\n")
  print(head(x$classification))
  cat("...\n")
  invisible(x)
}

DiscriMiner/R/print.linda.R
#' @S3method print linda
print.linda <- function(x, ...)
{
  cat("\nLinear Discriminant Analysis\n")
  cat(rep("-",43), sep="")
  cat("\n$functions       ", "discrimination functions")
  cat("\n$confusion       ", "confusion matrix")
  cat("\n$scores          ", "discriminant scores")
  cat("\n$classification  ", "assigned class")
  cat("\n$error_rate      ", "error rate\n")
  cat(rep("-",43), sep="")
  cat("\n\n$functions\n")
  print(format(x$functions, scientific=FALSE, digits=3),
        print.gap=2, quote=FALSE)
  cat("\n\n$confusion\n")
  print(x$confusion, print.gap=2)
  cat("\n\n$error_rate\n")
  print(x$error_rate)
  cat("\n\n$scores\n")
  print(head(x$scores), print.gap=2)
  cat("...\n")
  cat("\n$classification\n")
  print(head(x$classification))
  cat("...\n")
  invisible(x)
}
DiscriMiner/R/plsDA.R
#' @title PLS Discriminant Analysis
#'
#' @description Performs a Partial Least Squares (PLS) Discriminant Analysis
#' with the option of a random leave-k-fold-out cross-validation
#'
#' @details When \code{validation=NULL} leave-one-out (loo) cross-validation is
#' performed. \cr When \code{validation="learntest"} validation is performed by
#' providing a learn-set and a test-set of observations. \cr
#'
#' @param variables matrix or data frame with explanatory variables
#' @param group vector or factor with group memberships
#' @param autosel logical indicating automatic selection of PLS components by
#' cross-validation. Default \code{autosel=TRUE}
#' @param comps integer greater than one indicating the number of PLS
#' components to retain. Used only when \code{autosel=FALSE}
#' @param validation type of validation, either \code{NULL} or
#' \code{"learntest"}. Default \code{NULL}
#' @param learn optional vector of indices for a learn-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @param test optional vector of indices for a test-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @param cv string indicating the type of crossvalidation.
#' Available options are \code{"LOO"} (Leave-One-Out)
#' and \code{"LKO"} (Leave-K fold-Out)
#' @param k number of folds if using LKO (usually 7 or 10)
#' @param retain.models whether to retain lower models (i.e. all lower component
#' results)
#' @return An object of class \code{"plsda"}, basically a list with the
#' following elements:
#' @return \item{functions}{table with discriminant functions}
#' @return \item{confusion}{confusion matrix}
#' @return \item{scores}{discriminant scores for each observation}
#' @return \item{loadings}{loadings}
#' @return \item{y.loadings}{y loadings}
#' @return \item{classification}{assigned class}
#' @return \item{error_rate}{misclassification error rate}
#' @return \item{components}{PLS components}
#' @return \item{Q2}{quality of loo cross-validation}
#' @return \item{R2}{R-squared coefficients}
#' @return \item{VIP}{Variable Importance for Projection}
#' @return \item{comp_vars}{correlations between components and variables}
#' @return \item{comp_group}{correlations between components and groups}
#' @author Charles Determan Jr, Gaston Sanchez
#' @seealso \code{\link{classify}}, \code{\link{geoDA}}, \code{\link{linDA}},
#' \code{\link{quaDA}}
#' @references Tenenhaus M. (1998) \emph{La Regression PLS}. Editions Technip,
#' Paris.
#'
#' Perez-Enciso M., Tenenhaus M. (2003) \emph{Prediction of clinical outcome
#' with microarray data: a partial least squares discriminant analysis (PLS-DA)
#' approach}. Human Genetics 112: 581-592.
#' @export
#' @examples
#'
#' \dontrun{
#' # load iris dataset
#' data(iris)
#'
#' # PLS discriminant analysis specifying number of components = 2
#' my_pls1 = plsDA(iris[,1:4], iris$Species, autosel=FALSE, comps=2)
#' my_pls1$confusion
#' my_pls1$error_rate
#' # plot circle of correlations
#' plot(my_pls1)
#'
#' # PLS discriminant analysis with automatic selection of components
#' my_pls2 = plsDA(iris[,1:4], iris$Species, autosel=TRUE)
#' my_pls2$confusion
#' my_pls2$error_rate
#'
#' # PLS discriminant analysis with learn-test validation
#' learning = c(1:40, 51:90, 101:140)
#' testing = c(41:50, 91:100, 141:150)
#' my_pls3 = plsDA(iris[,1:4], iris$Species, validation="learntest",
#'                 learn=learning, test=testing)
#' my_pls3$confusion
#' my_pls3$error_rate
#' }
#'
plsDA <- function(variables, group, autosel = TRUE, comps = 2,
                  validation = NULL, learn = NULL, test = NULL,
                  cv = "LOO", k = NULL, retain.models = FALSE)
{
  # check inputs
  verify_Xy = my_verify(variables, group, na.rm = FALSE)
  X = verify_Xy$X
  y = verify_Xy$y
  # autosel
  if (!is.logical(autosel)) autosel = TRUE
  # number of components
  if (!autosel) {
    if (mode(comps)!="numeric" || length(comps)!=1 ||
        comps<=1 || (comps%%1)!=0)
      stop("\nInvalid argument 'comps' (number of components)")
  }
  # type of validation
  if (is.null(validation)) {
    validation = "none"
  } else {
    vali = validation %in% c("crossval", "learntest")
    if (!vali)
      stop("\nIncorrect type of validation")
  }
  # how many observations and variables
  n = nrow(X)
  p = ncol(X)
  # how many groups
  ng = nlevels(y)
  # how many obs in each group
  nobs_group = as.vector(table(y))
  # group levels
  glevs = levels(y)

  ## plsDA with no validation
  if (validation %in% c("none", "crossval")) {
    get_plsda = my_plsDA(X, y, 1:n, 1:n, autosel, comps, cv=cv, k=k,
                         retain.models = retain.models)
    err = 1 - sum(diag(get_plsda$conf)) / n
  }

  ## plsDA with learn-test sets validation
  if (validation == "learntest") {
    if (any(learn) <= 0 || any(learn) > n)
      stop("\nsubscript out of bounds in 'learn' set")
    if (any(test) <= 0 || any(test) > n)
      stop("\nsubscript out of bounds in 'test' set")
    # apply plsDA
    get_plsda = my_plsDA(X, y, learn, test, autosel, comps,
                         retain.models = retain.models)
    # misclassification error rate
    err = 1 - sum(diag(get_plsda$conf))/length(test)
  }

  ## specifications
  specs = list(n=n, p=p, ng=ng, glevs=glevs, nobs_group=nobs_group,
               validation=validation)
  ## results
  ### added loadings and y.loadings
  structure(list(functions = get_plsda$coeffs,
                 confusion = get_plsda$conf,
                 scores = get_plsda$Disc,
                 loadings = get_plsda$loadings,
                 y.loadings = get_plsda$y.loadings,
                 classification = get_plsda$pred_class,
                 error_rate = err,
                 components = get_plsda$components,
                 Q2 = get_plsda$Q2T,
                 R2 = get_plsda$R2,
                 VIP = get_plsda$VIP,
                 comp_vars = get_plsda$cor_tx,
                 comp_group = get_plsda$cor_ty,
                 specs = specs),
            class = "plsda")
}
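## Sketch (not run): selecting components by random leave-7-fold-out instead
## of the default leave-one-out (arguments as documented above):
# pls_lko = plsDA(iris[,1:4], iris$Species, autosel=TRUE, cv="LKO", k=7)
# pls_lko$Q2    # per-component cross-validated quality
# pls_lko$VIP   # variable importance for projection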
bounds in 'learn' set") if (any(test) <= 0 || any(test) > n) stop("\nsubscript out of bounds in 'test' set") # apply plsDA get_plsda = my_plsDA(X, y, learn, test, autosel, comps, retain.models = retain.models) # misclassification error rate err = 1 - sum(diag(get_plsda$conf))/length(test) } ## specifications specs = list(n=n, p=p, ng=ng, glevs=glevs, nobs_group=nobs_group, validation=validation) ## results ### added loadings and y.loadings structure(list(functions = get_plsda$coeffs, confusion = get_plsda$conf, scores = get_plsda$Disc, loadings = get_plsda$loadings, y.loadings = get_plsda$y.loadings, classification = get_plsda$pred_class, error_rate = err, components = get_plsda$components, Q2 = get_plsda$Q2T, R2 = get_plsda$R2, VIP = get_plsda$VIP, comp_vars = get_plsda$cor_tx, comp_group = get_plsda$cor_ty, specs = specs), class = "plsda") } DiscriMiner/R/my_groupMeans.R0000644000176200001440000000060012157657510015640 0ustar liggesusersmy_groupMeans <- function(X, g) { # X: matrix of explanatory variables # g: factor with group memberships # how many groups ng = nlevels(g) # matrix with group means Means = matrix(0, ng, ncol(X)) for (j in 1:ncol(X)) { Means[,j] = tapply(X[,j], g, FUN=mean) } # add names rownames(Means) = levels(g) colnames(Means) = colnames(X) # results Means } DiscriMiner/R/easyMCA.R0000644000176200001440000000266212160374726014306 0ustar liggesusers#' Multiple Correspondence Analysis #' #' Performs a basic Multiple Correspondence Analysis (MCA) #' #' #' @param variables data frame with categorical variables (coded as factors) #' @return An object of class \code{"qualmca"}, basically a list with the #' following elements: #' @return \item{values}{table with eigenvalues} #' @return \item{coefficients}{coefficients of factorial axes} #' @return \item{components}{factor coordinates} #' @author Gaston Sanchez #' @seealso \code{\link{disqual}}, \code{\link{binarize}} #' @references Lebart L., Piron M., Morineau A. (2006) \emph{Statistique #' Exploratoire Multidimensionnelle}. Dunod, Paris. #' #' Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}. #' Editions Technip, Paris. #' @export #' @examples #' #' \dontrun{ #' # load insurance wines dataset #' data(insurance) #' #' # multiple correspondence analysis #' mca1 = easyMCA(insurance[,-1]) #' mca1 #' } #' easyMCA <- function(variables) { # Perform multiple correspondence analysis # X: data frame with categorical variables as factors # check input fac_check = sapply(variables, class) if (!is.data.frame(variables) && any(fac_check != "factor")) stop("\nA data frame with factors was expected") # check for missing values if (length(complete.cases(variables)) != nrow(variables)) stop("\nSorry: no missing values allowed") # apply MCA res = my_mca(variables) res } DiscriMiner/R/geoDA.R0000644000176200001440000001164312160374751014000 0ustar liggesusers#' Geometric Predictive Discriminant Analysis #' #' Performs a Geometric Predictive Discriminant Analysis #' #' When \code{validation=NULL} there is no validation \cr When #' \code{validation="crossval"} cross-validation is performed by randomly #' separating the observations in ten groups. \cr When #' \code{validation="learntest"} validationi is performed by providing a #' learn-set and a test-set of observations. \cr #' #' @param variables matrix or data frame with explanatory variables #' @param group vector or factor with group memberships #' @param validation type of validation, either \code{"crossval"} or #' \code{"learntest"}. 
DiscriMiner/R/geoDA.R
#' Geometric Predictive Discriminant Analysis
#'
#' Performs a Geometric Predictive Discriminant Analysis
#'
#' When \code{validation=NULL} there is no validation \cr When
#' \code{validation="crossval"} cross-validation is performed by randomly
#' separating the observations in ten groups. \cr When
#' \code{validation="learntest"} validation is performed by providing a
#' learn-set and a test-set of observations. \cr
#'
#' @param variables matrix or data frame with explanatory variables
#' @param group vector or factor with group memberships
#' @param validation type of validation, either \code{"crossval"} or
#' \code{"learntest"}. Default \code{NULL}
#' @param learn optional vector of indices for a learn-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @param test optional vector of indices for a test-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @return An object of class \code{"geoda"}, basically a list with the
#' following elements:
#' @return \item{functions}{table with discriminant functions}
#' @return \item{confusion}{confusion matrix}
#' @return \item{scores}{discriminant scores for each observation}
#' @return \item{classification}{assigned class}
#' @return \item{error_rate}{misclassification error rate}
#' @author Gaston Sanchez
#' @seealso \code{\link{classify}}, \code{\link{desDA}}, \code{\link{linDA}},
#' \code{\link{quaDA}}, \code{\link{plsDA}}
#' @references Lebart L., Piron M., Morineau A. (2006) \emph{Statistique
#' Exploratoire Multidimensionnelle}. Dunod, Paris.
#'
#' Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}.
#' Editions Technip, Paris.
#'
#' Tuffery S. (2011) \emph{Data Mining and Statistics for Decision Making}.
#' Wiley, Chichester.
#' @export
#' @examples
#'
#' \dontrun{
#' # load iris dataset
#' data(iris)
#'
#' # geometric predictive discriminant analysis with no validation
#' my_geo1 = geoDA(iris[,1:4], iris$Species)
#' my_geo1$confusion
#' my_geo1$error_rate
#'
#' # geometric predictive discriminant analysis with cross-validation
#' my_geo2 = geoDA(iris[,1:4], iris$Species, validation="crossval")
#' my_geo2$confusion
#' my_geo2$error_rate
#' }
#'
geoDA <- function(variables, group, validation = NULL,
                  learn = NULL, test = NULL)
{
  # Perform a geometric predictive discriminant analysis
  # variables: matrix or data.frame with explanatory variables
  # group: vector or factor with group membership
  # validation: NULL, "crossval", "learntest"
  # learn: vector of learn-set
  # test: vector of test-set

  # check inputs
  verify_Xy = my_verify(variables, group, na.rm=FALSE)
  X = verify_Xy$X
  y = verify_Xy$y
  # type of validation
  if (is.null(validation)) {
    validation = "none"
  } else {
    vali = validation %in% c("crossval", "learntest")
    if (!vali)
      stop("\nIncorrect type of validation")
  }
  # how many observations
  n = nrow(X)
  # how many variables
  p = ncol(X)
  # how many groups
  ng = nlevels(y)
  glevs = levels(y)
  # how many obs in each group
  nobs_group = as.vector(table(y))
  # proportions
  props = nobs_group / n

  ## geoDA with no validation
  if (validation == "none") {
    get_geoda = my_geoDA(X, y, 1:n, 1:n)
    err = 1 - sum(diag(get_geoda$conf)) / n
  }

  ## geoDA with learn-test sets validation
  if (validation == "learntest") {
    if (any(learn) <= 0 || any(learn) > n)
      stop("\nsubscript out of bounds in 'learn' set")
    if (any(test) <= 0 || any(test) > n)
      stop("\nsubscript out of bounds in 'test' set")
    # apply geoDA
    get_geoda = my_geoDA(X, y, learn, test)
    # misclassification error rate
    err = 1 - sum(diag(get_geoda$conf))/length(test)
  }

  ## geoDA with crossvalidation
  if (validation == "crossval") {
    # geoDA for all observations
    get_geoda = my_geoDA(X, y, 1:n, 1:n)
    # elements in each group
    elems_group = vector("list", ng)
    for (k in 1:ng) {
      elems_group[[k]] = which(group == glevs[k])
    }
    # misclassification error rate
    mer = 0
    # 10 crossvalidation samples
    for (r in 1:10) {
      test = vector("list", ng)
      test_sizes = floor(n * props / 10)
      for (k in 1:ng) {
        test[[k]] = sample(elems_group[[k]], test_sizes[k])
      }
      test = unlist(test)
      learn = (1:n)[-test]
      # apply DA
      geoda_cv = my_geoDA(X, y, learn, test)
      # misclassification error rate
      mer = mer + sum(diag(geoda_cv$conf))/n
    }
    # total misclassification error rate
    err = 1 - mer
  }

  ## specifications
  specs = list(n=n, p=p, ng=ng, glevs=glevs, nobs_group=nobs_group,
               validation=validation)
  ## results
  structure(list(functions = get_geoda$FDF,
                 confusion = get_geoda$conf,
                 scores = get_geoda$Disc,
                 classification = get_geoda$pred_class,
                 error_rate = err,
                 specs = specs),
            class = "geoda")
}
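## Sketch (not run): the "crossval" option above repeats, 10 times, a
## stratified random split in which each group contributes a test sample
## proportional to its size:
# props = table(iris$Species) / nrow(iris)
# test_sizes = floor(nrow(iris) * props / 10)  # 5 rows per species here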
DiscriMiner/R/my_betweenCov.R
my_betweenCov <- function(X, g, div_by_n=FALSE)
{
  # X: matrix of explanatory variables
  # g: factor with group memberships

  # how many observations
  n = nrow(X)
  # how many variables
  p = ncol(X)
  # group levels and number of levels
  glevs = levels(g)
  ng = nlevels(g)
  # global mean
  mean_global = colMeans(X)

  # matrix to store results
  Between = matrix(0, p, p)
  # pooled between-class covariance matrix
  for (k in 1:ng)
  {
    # select obs of k-th group
    tmp <- g == glevs[k]
    # how many obs in group k
    nk = sum(tmp)
    # mean k-th group
    mean_k = colMeans(X[tmp,])
    # mean k-th group - global mean
    dif_k = mean_k - mean_global
    # k-th group between cov matrix
    # between_k = (nk/n) * tcrossprod(dif_k)
    if (div_by_n) {
      between_k = (nk/n) * tcrossprod(dif_k)
    } else {
      between_k = (nk/(n-1)) * tcrossprod(dif_k)
    }
    Between = Between + between_k
  }
  # result
  Between
}

DiscriMiner/R/withinSS.R
#' Within-class Sum of Squares Matrix
#'
#' Calculates within-class sum of squares and cross product matrix (a.k.a.
#' within-class scatter matrix)
#'
#'
#' @param variables matrix or data frame with explanatory variables (No missing
#' values are allowed)
#' @param group vector or factor with group memberships (No missing values are
#' allowed)
#' @author Gaston Sanchez
#' @seealso \code{\link{withinCov}}, \code{\link{betweenSS}},
#' \code{\link{totalSS}}
#' @export
#' @examples
#'
#' \dontrun{
#' # load iris dataset
#' data(iris)
#'
#' # within-class scatter matrix
#' withinSS(iris[,1:4], iris[,5])
#' }
#'
withinSS <- function(variables, group)
{
  # Pooled within-class sum of squares and cross products
  # variables: matrix or data frame with explanatory variables
  # group: vector or factor with group memberships

  # check inputs
  verify_Xy = my_verify(variables, group, na.rm=FALSE)
  X = verify_Xy$X
  y = verify_Xy$y
  # how many observations
  nrx = nrow(X)
  # how many variables
  ncx = ncol(X)
  # group levels and number of levels
  glevs = levels(y)
  ng = nlevels(y)

  # within cov matrix
  Within = matrix(0, ncx, ncx)
  for (k in 1:ng)
  {
    # select obs of k-th group
    tmp <- y == glevs[k]
    # mean k-th group
    mean_k = colMeans(X[tmp,])
    # center k-th group matrix
    Xk = scale(X[tmp,], center=mean_k, scale=FALSE)
    # add k-th intra-class SS matrix
    Within = Within + t(Xk) %*% Xk
  }
  # add names
  if (is.null(colnames(variables))) {
    var_names = paste("X", 1:ncol(X), sep="")
    dimnames(Within) = list(var_names, var_names)
  } else {
    dimnames(Within) = list(colnames(variables), colnames(variables))
  }
  # result
  Within
}
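## Sketch (not run): the classical decomposition of scatter matrices,
## T = W + B, i.e. totalSS = withinSS + betweenSS (betweenSS is exported by
## this package although not shown in this listing); the check should be TRUE:
# T_mat = totalSS(iris[,1:4])
# W_mat = withinSS(iris[,1:4], iris$Species)
# B_mat = betweenSS(iris[,1:4], iris$Species)
# all.equal(T_mat, W_mat + B_mat)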
{ cat("\nGeometric Predictive Discriminant Analysis\n") cat(rep("-",43), sep="") cat("\n$functions ", "discrimination functions") cat("\n$confusion ", "confusion matrix") cat("\n$scores ", "discriminant scores") cat("\n$classification ", "assigned class") cat("\n$error_rate ", "error rate\n") cat(rep("-",43), sep="") cat("\n\n$functions\n") print(format(x$functions, scientific=FALSE, digits=3), print.gap=2, quote=FALSE) cat("\n\n$confusion\n") print(x$confusion, print.gap=2) cat("\n\n$error_rate\n") print(x$error_rate) cat("\n\n$scores\n") print(head(x$scores), print.gap=2) cat("...\n") cat("\n$classification\n") print(head(x$classification)) cat("...\n") invisible(x) } DiscriMiner/R/FRatio.R0000644000176200001440000000232312160374737014204 0ustar liggesusers#' F-Statistic Ratio #' #' Calcualtes the F-statistic between a quantitative variable and a qualitative #' variable #' #' #' @param variable a quantitative variable #' @param group a vector or factor with group memberships (i.e. qualitative #' variable) #' @return F-statistic and its p-value #' @author Gaston Sanchez #' @seealso \code{\link{discPower}}, \code{\link{corRatio}} #' @references Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris. #' @export #' @examples #' #' \dontrun{ #' # load bordeaux wines dataset #' data(bordeaux) #' #' # F-statistic ratio between temperature and quality #' FRatio(bordeaux$temperature, bordeaux$quality) #' } #' FRatio <- function(variable, group) { # F ratio (anova) # variable: vector with explanatory variable # group: vector or factor with group memberships if (!is.numeric(variable)) stop("\nSorry, argument 'x' must be a numeric vector") if (!is.factor(group)) group = as.factor(group) if (nlevels(group) == 1) stop("\nSorry, 'group' has only one category") # correlation ratio (eta) Ftest_temp = oneway.test(variable ~ group, var.equal=TRUE) fstat = Ftest_temp$statistic pval = Ftest_temp$p.value # result c(fstat, p_value=pval) } DiscriMiner/R/print.desda.R0000644000176200001440000000152412160375322015224 0ustar liggesusers#' @S3method print desda print.desda <- function(x, ...) { cat("\nDescriptive Discriminant Analysis\n") cat(rep("-",33), sep="") cat("\n$power ", "discriminant power") cat("\n$values ", "table of eigenvalues") cat("\n$discrivar ", "discriminant variables") cat("\n$discor ", "correlations") cat("\n$scores ", "discriminant scores\n") cat(rep("-",33), sep="") cat("\n\n$power\n") print(format(x$power, scientific=FALSE, digits=4), print.gap=2, quote=FALSE) cat("\n\n$values\n") print(format(x$values, scientific=FALSE, digits=3), print.gap=2, quote=FALSE) cat("\n\n$discrivar\n") print(x$discrivar, print.gap=2, digits=4) cat("\n\n$discor\n") print(head(x$discor), print.gap=2, digits=4) cat("\n\n$scores\n") print(head(x$scores), print.gap=2, digits=4) cat("...\n\n") invisible(x) } DiscriMiner/R/plsDA_old.R0000644000176200001440000001337012160625721014655 0ustar liggesusers#' PLS Discriminant Analysis #' #' Performs a Partial Least Squares (PLS) Discriminant Analysis #' #' When \code{validation=NULL} leave-one-out (loo) cross-validation is #' performed. \cr When \code{validation="learntest"} validation is performed by #' providing a learn-set and a test-set of observations. \cr #' #' @param variables matrix or data frame with explanatory variables #' @param group vector or factor with group memberships #' @param autosel logical indicating automatic selection of PLS components by #' cross-validation. 
DiscriMiner/R/plsDA_old.R
#' PLS Discriminant Analysis
#'
#' Performs a Partial Least Squares (PLS) Discriminant Analysis
#'
#' When \code{validation=NULL} leave-one-out (loo) cross-validation is
#' performed. \cr When \code{validation="learntest"} validation is performed by
#' providing a learn-set and a test-set of observations. \cr
#'
#' @param variables matrix or data frame with explanatory variables
#' @param group vector or factor with group memberships
#' @param autosel logical indicating automatic selection of PLS components by
#' cross-validation. Default \code{autosel=TRUE}
#' @param comps integer greater than one indicating the number of PLS
#' components to retain. Used only when \code{autosel=FALSE}
#' @param validation type of validation, either \code{NULL} or
#' \code{"learntest"}. Default \code{NULL}
#' @param learn optional vector of indices for a learn-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @param test optional vector of indices for a test-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @param scaled logical indicating whether to scale the data (default
#' \code{TRUE})
#' @return An object of class \code{"plsda"}, basically a list with the
#' following elements:
#' @return \item{functions}{table with discriminant functions}
#' @return \item{confusion}{confusion matrix}
#' @return \item{scores}{discriminant scores for each observation}
#' @return \item{classification}{assigned class}
#' @return \item{error_rate}{misclassification error rate}
#' @return \item{components}{PLS components}
#' @return \item{Q2}{quality of loo cross-validation}
#' @return \item{R2}{R-squared coefficients}
#' @return \item{VIP}{Variable Importance for Projection}
#' @return \item{comp_vars}{correlations between components and variables}
#' @return \item{comp_group}{correlations between components and groups}
#' @author Gaston Sanchez
#' @seealso \code{\link{classify}}, \code{\link{geoDA}}, \code{\link{linDA}},
#' \code{\link{quaDA}}
#' @references Tenenhaus M. (1998) \emph{La Regression PLS}. Editions Technip,
#' Paris.
#'
#' Perez-Enciso M., Tenenhaus M. (2003) \emph{Prediction of clinical outcome
#' with microarray data: a partial least squares discriminant analysis (PLS-DA)
#' approach}. Human Genetics 112: 581-592.
#' @keywords internal
#' @note This is a previous version of plsDA. Not used anymore.
#' @examples
#'
#' \dontrun{
#' # load iris dataset
#' data(iris)
#'
#' # PLS discriminant analysis specifying number of components = 2
#' my_pls1 = plsDA(iris[,1:4], iris$Species, autosel=FALSE, comps=2)
#' my_pls1$confusion
#' my_pls1$error_rate
#' # plot circle of correlations
#' plot(my_pls1)
#'
#' # PLS discriminant analysis with automatic selection of components
#' my_pls2 = plsDA(iris[,1:4], iris$Species, autosel=TRUE)
#' my_pls2$confusion
#' my_pls2$error_rate
#'
#' # PLS discriminant analysis with learn-test validation
#' learning = c(1:40, 51:90, 101:140)
#' testing = c(41:50, 91:100, 141:150)
#' my_pls3 = plsDA(iris[,1:4], iris$Species, validation="learntest", learn=learning, test=testing)
#' my_pls3$confusion
#' my_pls3$error_rate
#' }
#'
plsDA_old <- function(variables, group, autosel = TRUE, comps = 2,
                      validation = NULL, learn = NULL, test = NULL,
                      scaled=TRUE)
{
  # Perform a PLS discriminant analysis
  # variables: matrix or data.frame with explanatory variables
  # group: vector or factor with group membership
  # autosel: logical indicating automatic selection of PLS comps
  # comps: number of PLS components (only when autosel=FALSE)
  # validation: NULL, "crossval", "learntest"
  # learn: vector of learn-set
  # test: vector of test-set
  # scaled: logical indicating whether to scale the data

  # check inputs
  verify_Xy = my_verify(variables, group, na.rm=FALSE)
  X = verify_Xy$X
  y = verify_Xy$y
  # autosel
  if (!is.logical(autosel)) autosel = TRUE
  # number of components
  if (!autosel) {
    if (mode(comps)!="numeric" || length(comps)!=1 ||
        comps<=1 || (comps%%1)!=0)
      stop("\nInvalid argument 'comps' (number of components)")
  }
  # type of validation
  if (is.null(validation)) {
    validation = "none"
  } else {
    vali = validation %in% c("crossval", "learntest")
    if (!vali)
      stop("\nIncorrect type of validation")
  }
  # how many observations and variables
  n = nrow(X)
  p = ncol(X)
  # how many groups
  ng = nlevels(y)
  # how many obs in each group
  nobs_group = as.vector(table(y))
  # group levels
  glevs = levels(y)

  ## plsDA with no validation
  if (validation %in% c("none","crossval")) {
    get_plsda = my_plsDA(X, y, 1:n, 1:n, autosel, comps)
    err = 1 - sum(diag(get_plsda$conf)) / n
  }

  ## plsDA with learn-test sets validation
  if (validation == "learntest") {
    if (any(learn) <= 0 || any(learn) > n)
      stop("\nsubscript out of bounds in 'learn' set")
    if (any(test) <= 0 || any(test) > n)
      stop("\nsubscript out of bounds in 'test' set")
    # apply plsDA
    get_plsda = my_plsDA(X, y, learn, test, autosel, comps, scaled)
    # misclassification error rate
    err = 1 - sum(diag(get_plsda$conf))/length(test)
  }

  ## specifications
  specs = list(n=n, p=p, ng=ng, glevs=glevs, nobs_group=nobs_group,
               validation=validation)
  ## results
  structure(list(functions = get_plsda$coeffs,
                 confusion = get_plsda$conf,
                 scores = get_plsda$Disc,
                 classification = get_plsda$pred_class,
                 error_rate = err,
                 components = get_plsda$components,
                 Q2 = get_plsda$Q2T,
                 R2 = get_plsda$R2,
                 VIP = get_plsda$VIP,
                 comp_vars = get_plsda$cor_tx,
                 comp_group = get_plsda$cor_ty,
                 specs = specs),
            class = "plsda")
}
DiscriMiner/R/my_verify.R
my_verify <- function(x, y, qualitative=FALSE, na.rm=na.rm)
{
  # x: matrix or data frame with explanatory variables
  # y: vector or factor with group memberships
  # qualitative: logical indicating verification for disqual
  # na.rm: logical indicating missing values in x

  # x matrix or data.frame
  if (is.null(dim(x)))
    stop("\n'variables' is not a matrix or data.frame")
  # no missing values allowed when na.rm=FALSE
  if (!na.rm) {
    if (sum(complete.cases(x)) != nrow(x))
      stop("\nSorry, no missing values allowed in 'variables'")
  }
  # check lengths of x and y
  if (nrow(x) != length(y))
    stop("\n'variables' and 'group' have different lengths")
  # y vector or factor
  if (!is.vector(y) && !is.factor(y))
    stop("\n'group' must be a factor")
  # make sure y is a factor
  if (!is.factor(y)) y = as.factor(y)
  # no missing values in y
  if (any(!is.finite(y)))
    stop("\nNo missing values allowed in 'group'")
  # quantitative or qualitative variables?
  if (!qualitative) {
    # quantitative data
    # make sure is matrix
    if (!is.matrix(x)) x <- as.matrix(x)
    # only numeric values
    if (!is.numeric(x))
      stop("\n'variables' must contain only numeric values")
  } else {
    # data frame with qualitative data
    # variables as data frame with factors
    fac_check = sapply(x, class)
    if (!is.data.frame(x) && any(fac_check != "factor"))
      stop("\nA data frame with factors was expected")
  }
  # verified inputs
  if (is.null(colnames(x)))
    colnames(x) = paste(rep("X", ncol(x)), seq_len(ncol(x)), sep='')
  if (is.null(rownames(x)))
    rownames(x) = 1L:nrow(x)
  list(X=x, y=y)
}
DiscriMiner/R/my_quaDA.R
my_quaDA <- function(X, y, learn, test, prior, prob)
{
  # Perform a quadratic discriminant analysis
  # X: matrix or data.frame with explanatory variables
  # y: vector or factor with group membership
  # learn: vector of learning observations
  # test: vector of testing observations
  # prior: vector of prior proportions
  # prob: logical indicating results in probability terms

  # how many observations
  n = nrow(X[learn,])
  ntest = length(test)
  # how many variables
  p = ncol(X)
  # how many groups
  ng = nlevels(y[learn])
  glevs = levels(y[learn])
  # how many obs in each group
  gobs = as.vector(table(y[learn]))
  names(gobs) = glevs
  if (any(gobs < p+1))
    stop("\nsome group category is too small for quaDA")

  # group means
  GM = my_groupMeans(X[learn,], y[learn])
  # within matrices based on QR decomposition
  WMqr = as.list(1:ng)
  # object to store log-determinants
  ldet = numeric(ng)
  # calculate ingredients
  for (k in 1:ng)
  {
    nk = gobs[k] - 1
    # center data
    Xcen = scale(X[unclass(y[learn])==k,], center=GM[k,], scale=FALSE)
    # QR decomposition
    qx = qr(Xcen / sqrt(nk))
    if (qx$rank < p)
      stop("rank deficiency in group ", glevs[k])
    qx = qx$qr
    WMqr[[k]] = backsolve(qx[1:p, ], diag(p))
    ldet[k] = 2*sum(log(abs(diag(qx))))
  }

  ## classification
  # discrimination matrix to store results
  Disc <- matrix(0, nrow = ntest, ncol = ng)
  # calculate distances (the lower, the better)
  for (k in 1:ng)
  {
    Xk = matrix(GM[k,], ntest, p, byrow = TRUE)
    dev = (X[test,] - Xk) %*% WMqr[[k]]
    Disc[,k] = 0.5 * rowSums(dev^2) + 0.5 * ldet[k] - log(prior[k])
  }

  # assignment in terms of probability?
  if (prob) {
    # Disc in terms of probability
    Disc <- exp( -(Disc - apply(Disc, 1, min, na.rm=TRUE)))
    # predicting classes
    pred = Disc / drop(Disc %*% rep(1, ng))
    # predicted class
    pred_class = factor(max.col(pred), levels=seq_along(glevs), labels=glevs)
  } else {
    # predicted class
    pred = apply(Disc, 1, function(u) which(u == min(u)))
    names(pred) = NULL
    pred_class = factor(pred, levels=seq_along(glevs), labels=glevs)
  }
  dimnames(Disc) = list(rownames(X[test,]), glevs)
  # confusion matrix
  conf = table(original=y[test], predicted=pred_class)

  ## results
  res = list(WMqr = WMqr, GM = GM, ldet = ldet, prior = prior,
             Disc = Disc, pred_class = pred_class, conf = conf)
  res
}
DiscriMiner/R/my_tdc.R
my_tdc <- function(X)
{
  # Tableau Disjonctive Complete
  # (aka Complete Disjunctive Table)
  # X: data frame with categorical variables as factors

  # how many observations
  nobs = nrow(X)
  # how many variables
  nvars = ncol(X)
  # number of categories per variable
  cats_per_var = sapply(X, nlevels)
  # total number of categories
  ncats = sum(cats_per_var)

  # build super-indicator matrix Z
  Z = matrix(0, nobs, ncats)
  ini = cumsum(cats_per_var) - cats_per_var + 1
  fin = cumsum(cats_per_var)
  for (j in 1:nvars)
  {
    aux_lev = levels(X[,j])
    aux_mat = matrix(0, nobs, cats_per_var[j])
    for (k in 1:cats_per_var[j])
    {
      tmp <- X[,j] == aux_lev[k]
      aux_mat[tmp,k] = 1
    }
    Z[,ini[j]:fin[j]] = aux_mat
  }
  colnames(Z) = unlist(lapply(X, levels))
  Z
}
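## Sketch (not run): what my_tdc builds -- one dummy column per category,
## with factor levels in alphabetical order (hypothetical toy data):
# df = data.frame(color = factor(c("red","blue","red")),
#                 size  = factor(c("S","L","L")))
# my_tdc(df)
# #      blue red L S
# # [1,]    0   1 0 1
# # [2,]    1   0 1 0
# # [3,]    0   1 1 0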
vector or factor with group memberships # na.rm: logical indicating whether missing values should be removed # check inputs verify_Xy = my_verify(variables, group, na.rm=na.rm) X = verify_Xy$X y = verify_Xy$y # how many groups ng = nlevels(y) # matrix with group std deviations Stds = matrix(0, ncol(X), ng) for (j in 1:ncol(X)) { Stds[j,] = tapply(X[,j], y, FUN=sd, na.rm=na.rm) } # add names if (is.null(colnames(X))) { rownames(Stds) = paste("X", 1:ncol(X), sep="") } else { rownames(Stds) = colnames(X) } colnames(Stds) = levels(y) # results Stds } DiscriMiner/R/groupMeans.R0000644000176200001440000000264712160374771015147 0ustar liggesusers#' Group Means #' #' Calculates means for each group #' #' #' @param variables matrix or data frame with explanatory variables (may #' contain missing values) #' @param group vector or factor with group memberships #' @param na.rm logical indicating whether missing values should be removed #' @return matrix of group means (with variables in the rows, and groups in the #' columns) #' @author Gaston Sanchez #' @seealso \code{\link{groupVars}}, \code{\link{groupStds}}, #' \code{\link{groupMedians}}, \code{\link{groupQuants}} #' @export #' @examples #' #' \dontrun{ #' # dataset iris #' data(iris) #' #' # group means #' groupMeans(iris[,1:4], iris[,5]) #' } #' groupMeans <- function(variables, group, na.rm=FALSE) { # Calculate means by group # variables: matrix or data frame with explanatory variables # group: vector or factor with group memberships # na.rm: logical indicating whether missing values should be removed # check inputs verify_Xy = my_verify(variables, group, na.rm=na.rm) X = verify_Xy$X y = verify_Xy$y # how many groups ng = nlevels(y) # matrix with group means Means = matrix(0, ncol(X), ng) for (j in 1:ncol(X)) { Means[j,] = tapply(X[,j], y, FUN=mean, na.rm=na.rm) } # add names if (is.null(colnames(X))) { rownames(Means) = paste("X", 1:ncol(X), sep="") } else { rownames(Means) = colnames(X) } colnames(Means) = levels(y) # results Means } DiscriMiner/R/binarize.R0000644000176200001440000000253612160374641014623 0ustar liggesusers#' Binarize a data frame into a super-indicator matrix #' #' Convert a data frame with factors into a super-indicator matrix (a.k.a. #' complete disjunctive table from the french \emph{tableau disjonctive #' complete}) #' #' #' @param variables data frame with categorical variables (coded as factors) #' @return A super-indicator matrix of binary data #' @author Gaston Sanchez #' @seealso \code{\link{easyMCA}} #' @references Lebart L., Piron M., Morineau A. (2006) \emph{Statistique #' Exploratoire Multidimensionnelle}. Dunod, Paris. 
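#' @details A single factor is expanded into one dummy column per level. A
#' minimal sketch of the expected behavior (the toy data frame \code{df} is
#' illustrative only, not part of the package):
#' \preformatted{
#' df = data.frame(size = factor(c("low", "high", "low"),
#'                               levels = c("low", "high")))
#' binarize(df)
#' #      low high
#' # [1,]   1    0
#' # [2,]   0    1
#' # [3,]   1    0
#' }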
#' @export
#' @examples
#'
#' \dontrun{
#' # load insurance cars dataset
#' data(insurance)
#'
#' # super-indicator matrix of binary data
#' bin_insure = binarize(insurance[,-1])
#' head(bin_insure)
#' }
#'
binarize <- function(variables)
{
  # binary super-indicator matrix (aka Complete Disjunctive Table)
  # variables: matrix or data.frame with explanatory variables

  # make sure variables is a data frame with factors
  fac_check = sapply(variables, class)
  if (!is.data.frame(variables) || any(fac_check != "factor"))
    stop("\n'variables' must be a data frame with factors")
  # no missing values allowed
  # (complete.cases returns a logical of length nrow, hence the sum)
  if (sum(complete.cases(variables)) != nrow(variables))
    stop("\nSorry, no missing values allowed in 'variables'")

  # build super-indicator matrix Z
  Z = my_tdc(variables)
  Z
}
DiscriMiner/R/totalSS.R0000644000176200001440000000265312160375144014410 0ustar liggesusers#' Total Sum of Squares Matrix
#'
#' Calculates the total sum of squares and cross product matrix (a.k.a. total
#' scatter matrix)
#'
#'
#' @param variables matrix or data frame with explanatory variables
#' @author Gaston Sanchez
#' @seealso \code{\link{totalCov}}, \code{\link{betweenSS}},
#' \code{\link{withinSS}}
#' @export
#' @examples
#'
#' \dontrun{
#' # load iris dataset
#' data(iris)
#'
#' # total scatter matrix
#' totalSS(iris[,1:4])
#' }
#'
totalSS <- function(variables)
{
  # Total sum of squares matrix
  # variables: matrix or data frame with explanatory variables

  # X matrix or data.frame
  if (!is.matrix(variables) && !is.data.frame(variables))
    stop("\nSorry, 'variables' must be a matrix or a data frame")
  if (is.null(dim(variables)))
    stop("'variables' is not a matrix")
  # enforce X as matrix
  if (!is.matrix(variables)) variables = as.matrix(variables)
  # no missing values allowed
  if (any(!is.finite(variables)))
    stop("infinite, NA or NaN values in 'variables'")
  # only numeric values
  if (!is.numeric(variables))
    stop("\nSorry, 'variables' must contain only numeric values")

  X = scale(variables, scale=FALSE)
  Total = t(X) %*% X
  # add names
  if (is.null(colnames(variables))) {
    var_names = paste("X", 1:ncol(X), sep="")
    dimnames(Total) = list(var_names, var_names)
  } else {
    dimnames(Total) = list(colnames(variables), colnames(variables))
  }
  # result
  Total
}
DiscriMiner/R/my_discFunctions.R0000644000176200001440000000143712157657510016344 0ustar liggesusersmy_discFunctions <- function(X, g, group_means, within)
{
  # X: explanatory variables
  # g: factor with group memberships
  # group_means: group means matrix
  # within: pooled within-class covariance matrix

  # group means
  GM = group_means
  # how many groups
  ng = nrow(GM)
  # inverse of Within Cov Matrix
  W_inv = solve(within)
  # constant terms of fisher's discriminant linear functions
  alphas = rep(0, ng)
  # coefficients of fisher's discriminant linear functions
  betas = matrix(0, nrow(W_inv), ng)
  for (k in 1:ng) {
    alphas[k] = -(1/2) * GM[k,] %*% W_inv %*% GM[k,]
    betas[,k] = t(GM[k,]) %*% W_inv
  }
  # Fisher's Discriminant Functions
  FDF = rbind(alphas, betas)
  rownames(FDF) = c("constant", colnames(X))
  colnames(FDF) = c(levels(g))
  # result
  FDF
}
DiscriMiner/R/my_mca.R0000644000176200001440000000241212157657510014263 0ustar liggesusersmy_mca <- function(X)
{
  # Perform multiple correspondence analysis
  # X: data frame with categorical variables as factors

  # how many observations
  nobs = nrow(X)
  # how many variables
  nvars = ncol(X)
  # number of categories per variable
  cats_per_var = sapply(X, nlevels)
  # total number of categories
  ncats = sum(cats_per_var)
  # number of factor coordinates (for MCA)
  nfacs = ncats - nvars

  # build super-indicator matrix Z
  Z = my_tdc(X)
  # number of obs per category
  nopc = colSums(Z)
  # normalizing Z
  Znorm = sweep(Z, 2, sqrt(nvars*nopc), FUN="/")
  # apply svd
  Zsvd = svd(Znorm)
  # sequence with indices of components
  sec <- 1 + (1L:nfacs)
  # eigenvalues
  eigs = Zsvd$d[sec]^2
  values = cbind(eigs, 100*eigs/sum(eigs), 100*cumsum(eigs)/sum(eigs))
  colnames(values) = c("eigenvalues", "proportion", "accumulated")
  rownames(values) = 1:nfacs
  # U-coefficients
  #U = diag(1/sqrt(nvars*nopc)) %*% Zsvd$v[,sec]/nvars
  U = diag(sqrt(nobs/(nvars*nopc))) %*% Zsvd$v[,sec]
  dimnames(U) = list(colnames(Z), paste("U",1:nfacs,sep=''))
  # row coordinates
  Fs = Z %*% U
  # add names
  dimnames(Fs) <- list(rownames(X), paste("F",1:nfacs,sep=''))

  structure(list(values=values, coefficients=U, components=Fs),
            class="qualmca")
}
DiscriMiner/R/groupVars.R0000644000176200001440000000267512160375030015005 0ustar liggesusers#' Group Variances
#'
#' Calculates the variances for each group
#'
#'
#' @param variables matrix or data frame with explanatory variables (may
#' contain missing values)
#' @param group vector or factor with group memberships
#' @param na.rm logical indicating whether missing values should be removed
#' @return matrix of group variances (with variables in the rows, and groups in
#' the columns)
#' @author Gaston Sanchez
#' @seealso \code{\link{groupMeans}}, \code{\link{groupStds}},
#' \code{\link{groupMedians}}, \code{\link{groupQuants}}
#' @export
#' @examples
#'
#' \dontrun{
#' # dataset iris
#' data(iris)
#'
#' # group variances
#' groupVars(iris[,1:4], iris[,5])
#' }
#'
groupVars <- function(variables, group, na.rm=FALSE)
{
  # Calculate variances by group
  # variables: matrix or data frame with explanatory variables
  # group: vector or factor with group memberships
  # na.rm: logical indicating whether missing values should be removed

  # check inputs
  verify_Xy = my_verify(variables, group, na.rm=na.rm)
  X = verify_Xy$X
  y = verify_Xy$y
  # how many groups
  ng = nlevels(y)

  # matrix with group variances
  Vars = matrix(0, ncol(X), ng)
  for (j in 1:ncol(X)) {
    Vars[j,] = tapply(X[,j], y, FUN=var, na.rm=na.rm)
  }
  # add names
  if (is.null(colnames(X))) {
    rownames(Vars) = paste("X", 1:ncol(X), sep="")
  } else {
    rownames(Vars) = colnames(X)
  }
  colnames(Vars) = levels(y)
  # results
  Vars
}
DiscriMiner/R/corRatio.R0000644000176200001440000000216512222021424014565 0ustar liggesusers#' Correlation Ratio
#'
#' Calculates the correlation ratio between a quantitative variable and a
#' qualitative variable
#'
#' No missing values are allowed
#'
#' @param variable a single quantitative variable
#' @param group vector or factor with group memberships (qualitative variable)
#' @author Gaston Sanchez
#' @seealso \code{\link{FRatio}}, \code{\link{discPower}}
#' @references Tenenhaus, M. (2007) \emph{Statistique}. Dunod, Paris.
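#' @details The correlation ratio (eta-squared) is the between-group sum of
#' squares divided by the total sum of squares; for a one-way layout this is
#' exactly the R-squared of \code{lm(variable ~ group)}, which is what this
#' function computes. A quick check by hand (a sketch, not extra
#' functionality):
#' \preformatted{
#' m = tapply(iris$Petal.Length, iris$Species, mean)
#' n = table(iris$Species)
#' ssb = sum(n * (m - mean(iris$Petal.Length))^2)
#' sst = sum((iris$Petal.Length - mean(iris$Petal.Length))^2)
#' ssb / sst  # same value as corRatio(iris$Petal.Length, iris$Species)
#' }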
#' @export #' @examples #' #' \dontrun{ #' # iris dataset #' data(iris) #' #' # correlation ratio between Petal-Length and Species #' corRatio(iris$Petal.Length, iris$Species) #' } #' corRatio <- function(variable, group) { # Correlation ratio # variable: vector with explanatory variable # group: vector or factor with group memberships if (!is.numeric(variable)) stop("\nSorry, 'variable' must be a numeric vector") if (!is.factor(group)) group = as.factor(group) if (nlevels(group) == 1) stop("\nSorry, 'group' has only one category") # correlation ratio (eta) lm_temp = lm(variable ~ group) eta = summary(lm_temp)$r.squared # result eta } DiscriMiner/R/plot.plsda.R0000644000176200001440000000207712160375262015100 0ustar liggesusers#' @S3method plot plsda plot.plsda <- function(x, ...) { ## Plot circle of correlations between variables # get correlations cor_tx = x$comp_vars cor_ty = x$comp_group # points for generating circle z = seq(0, 2*pi, l=100) # open plot plot(cos(z), sin(z), type="l", main=expression(bold("Circle of Correlations on ") * bold(list(t[1],t[2]))), ylim=c(-1.1,1.1), xlim=c(-1.2,1.2), xlab=expression("PLS-component " * t[1]), ylab=expression("PLS-component " * t[2]), cex.main=1, cex.axis=.8, col="grey") # adding lines abline(h=seq(-1,1,.25), v=seq(-1,1,.25), col="grey", lty=3) abline(h=0, v=0, col="grey", lwd=2) # variables points(cor_tx[,1], cor_tx[,2], pch=20, col=rep("blue",nrow(cor_tx))) text(cor_tx[,1], cor_tx[,2], labels=rownames(cor_tx), pos=2, col=rep("blue",nrow(cor_tx)), cex=.8) # groups points(cor_ty[,1], cor_ty[,2], pch=17, cex=.8, col=rep("red",nrow(cor_ty))) text(cor_ty[,1], cor_ty[,2], labels=rownames(cor_ty), pos=2, col=rep("red",nrow(cor_ty)), cex=.8) # invisible(x) } DiscriMiner/R/groupMedians.R0000644000176200001440000000266512160375000015446 0ustar liggesusers#' Group Medians #' #' Calculates the medians for each group #' #' #' @param variables matrix or data frame with explanatory variables (may #' contain missing values) #' @param group vector or factor with group memberships #' @param na.rm logical indicating whether missing values should be removed #' @return matrix of group medians (with variables in the rows, and groups in #' the columns) #' @author Gaston Sanchez #' @seealso \code{\link{groupVars}}, \code{\link{groupStds}}, #' \code{\link{groupMeans}}, \code{\link{groupQuants}} #' @export #' @examples #' #' \dontrun{ #' # dataset iris #' data(iris) #' #' # group means #' groupMedians(iris[,1:4], iris[,5]) #' } #' groupMedians <- function(variables, group, na.rm=FALSE) { # Calculate medians by group # variables: matrix or data frame with explanatory variables # group: vector or factor with group memberships # na.rm: logical indicating whether missing values should be removed # check inputs verify_Xy = my_verify(variables, group, na.rm=na.rm) X = verify_Xy$X y = verify_Xy$y # how many groups ng = nlevels(y) # matrix with group medians Meds = matrix(0, ncol(X), ng) for (j in 1:ncol(X)) { Meds[j,] = tapply(X[,j], y, FUN=median, na.rm=na.rm) } # add names if (is.null(colnames(X))) { rownames(Meds) = paste("X", 1:ncol(X), sep="") } else { rownames(Meds) = colnames(X) } colnames(Meds) = levels(y) # results Meds } DiscriMiner/R/betweenSS.R0000644000176200001440000000316512160374625014720 0ustar liggesusers#' Between-class Sum of Squares Matrix #' #' Calculates between-class sum of squares and cross product matrix (a.k.a. 
#' between-class scatter matrix) #' #' #' @param variables matrix or data frame with explanatory variables (No missing #' values are allowed) #' @param group vector or factor with group membership (No missing values are #' allowed) #' @author Gaston Sanchez #' @seealso \code{\link{betweenCov}}, \code{\link{withinSS}}, #' \code{\link{totalSS}} #' @export #' @examples #' #' \dontrun{ #' # load iris dataset #' data(iris) #' #' # between-class scatter matrix #' betweenSS(iris[,1:4], iris[,5]) #' } #' betweenSS <- function(variables, group) { # Between-class sum of squares and cross products matrix # variables: matrix or data frame with explanatory variables # group: vector or factor with group memberships # check inputs verify_Xy = my_verify(variables, group, na.rm=FALSE) X = verify_Xy$X y = verify_Xy$y # group levels and number of levels glevs = levels(y) ng = nlevels(y) # global mean mean_all = colMeans(X) # matrix to store results Between = matrix(0, ncol(X), ncol(X)) # calculate between Sum of squares for (k in 1:ng) { tmp <- y == glevs[k] nk = sum(tmp) mean_k = colMeans(X[tmp,]) dif_k = mean_k - mean_all between_k = nk * dif_k %*% t(dif_k) Between = Between + between_k } # add names if (is.null(colnames(variables))) { var_names = paste("X", 1:ncol(X), sep="") dimnames(Between) = list(var_names, var_names) } else { dimnames(Between) = list(colnames(variables), colnames(variables)) } # result Between } DiscriMiner/R/discPower.R0000644000176200001440000000373612160374702014760 0ustar liggesusers#' Discriminant Power #' #' Measures Discriminant Power of explanatory variables #' #' No missing values are allowed #' #' @param variables matrix or data frame with explanatory variables #' @param group vector or factor with group membership #' @return A data frame containing the following columns #' @return \item{correl_ratio}{Correlation Ratios} #' @return \item{wilks_lambda}{Wilks Lambda} #' @return \item{F_statistic}{F-statistic} #' @return \item{p_value}{p-value of F-statistic} #' @author Gaston Sanchez #' @seealso \code{\link{corRatio}}, \code{\link{FRatio}} #' @references Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris. 
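#' @details For a single variable the three indices are tied together:
#' \code{correl_ratio} is ssb/sst, the univariate \code{wilks_lambda} is
#' ssw/sst = 1 - correl_ratio, and \code{F_statistic} equals
#' (correl_ratio/(1 - correl_ratio)) * (n-K)/(K-1) with K groups, i.e. the
#' usual one-way ANOVA F-test. A small sanity check (a sketch, not extra
#' functionality):
#' \preformatted{
#' dp = discPower(iris[,1:4], iris$Species)
#' # should be (numerically) zero for all four variables:
#' with(dp, F_statistic - (correl_ratio/(1 - correl_ratio)) * (150-3)/(3-1))
#' }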
#' @export #' @examples #' #' \dontrun{ #' # bordeaux wines dataset #' data(bordeaux) #' #' # discriminant power #' dp = discPower(bordeaux[,2:5], bordeaux$quality) #' dp #' } #' discPower <- function(variables, group) { # measure discriminant power of variables # variables: matrix or data frame with explanatory variables # group: vector or factor with group memberships # check inputs verify_Xy = my_verify(variables, group, na.rm=FALSE) X = verify_Xy$X y = verify_Xy$y # how many observations n = nrow(X) # how many groups ng = nlevels(y) # how many variables p = ncol(X) # group levels and number of levels glevs = levels(y) # between-class covariance matrix B = my_betweenCov(X, y) # within-class covariance matrix W = matrix(0, p, p) for (k in 1:ng) { tmp <- y == glevs[k] nk = sum(tmp) Wk = ((nk-1)/(n-1)) * var(X[tmp,]) W = W + Wk } # total covariance #V = ((n-1)/n) * var(X) V = var(X) ## Discriminant importance of explanatory variables # F-statistics F_statistic = ((n-ng)/(ng-1)) * (diag(B) / diag(W)) p_value = 1 - pf(F_statistic, ng-1, n-ng) # Wilk's lambdas wilks_lambda = diag(W / V) # correlation ratios correl_ratio = diag(B) / diag(V) # table disc_power = data.frame(correl_ratio, wilks_lambda, F_statistic, p_value) disc_power } DiscriMiner/R/groupQuants.R0000644000176200001440000000354212160375012015337 0ustar liggesusers#' Group Quantiles #' #' Calculates the specified quantiles for each group #' #' #' @param variables matrix or data frame with explanatory variables (may #' contain missing values) #' @param group vector or factor with group memberships #' @param prob probability value (numeric value between 0 and 1) #' @param na.rm logical indicating whether missing values should be removed #' @return matrix of group quantiles (with variables in the rows, and groups in #' the columns) #' @author Gaston Sanchez #' @seealso \code{\link{groupMeans}}, \code{\link{groupVars}}, #' \code{\link{groupStds}}, \code{\link{groupMedians}} #' @export #' @examples #' #' \dontrun{ #' # dataset iris #' data(iris) #' #' # group quantile prob=20 #' groupQuants(iris[,1:4], iris[,5], prob=0.20) #' } #' groupQuants <- function(variables, group, prob, na.rm=FALSE) { # Calculate quantile deviations by group # variables: matrix or data frame with explanatory variables # group: vector or factor with group memberships # prob: probability value in [0,1] # na.rm: logical indicating whether missing values should be removed # check inputs verify_Xy = my_verify(variables, group, na.rm=na.rm) X = verify_Xy$X y = verify_Xy$y # quantile if (missing(prob)) { stop("\nOoops, argument 'prob' is missing") } else { if (length(prob) > 1) stop("\nOoops, 'prob' has length > 1") if (prob < 0 || prob > 1) stop("\nOoops, invalid 'prob' value") } # how many groups ng = nlevels(y) # matrix with group quantiles Quants = matrix(0, ncol(X), ng) for (j in 1:ncol(X)) { Quants[j,] = tapply(X[,j], y, FUN=quantile, probs=prob, na.rm=na.rm) } # add names if (is.null(colnames(X))) { rownames(Quants) = paste("X", 1:ncol(X), sep="") } else { rownames(Quants) = colnames(X) } colnames(Quants) = levels(y) # results Quants } DiscriMiner/R/my_plsDA.R0000644000176200001440000002212012226714020014507 0ustar liggesusers#' @title PLS Discriminant Analysis #' #' @description Perform a PLS discriminant analysis #' #' @param X matrix or data.frame with explanatory variables #' @param y vector or factor with group membership #' @param learn vector of learning observations #' @param test vector of testing observations #' @param autosel logical indicating 
automatic selection of PLS comps #' @param comps number of PLS components (only when autosel=FALSE) #' @param cv cross validation method. Options are \code{"LOO"} (Leave-One-Out) #' and \code{"LKO"} (Leave-K fold-Out) #' @param k fold left out if using LKO #' @param retain.models whether to retain lower models (i.e. all lower component #' results) #' @keywords internal my_plsDA <- function(X, y, learn, test, autosel, comps, cv = "LOO", k = NA, retain.models = FALSE) { ## prepare ingredients ntest = length(test) # binarize y Y = my_tdc(data.frame(y[learn])) glevs = levels(y[learn]) # dimensions n = nrow(X[learn,]) p = ncol(X) q = ncol(Y) #added in k for leave-k-out cross validation #k=10 # determine number of PLS components to be computed # taking into account rank of X Xsvd = svd(X[learn,], nu=0, nv=0) rank_X = sum(Xsvd$d > 0.0001) if (rank_X == 0) stop("\nrank = 0: variables are numerically constant") nc = min(n, rank_X) if (!autosel) { if (comps < nc) nc = comps } if (nc == n) nc = n - 1 # standardizing data X.old = scale(X[learn,]) Y.old = scale(Y) # creating matrices to store results Wh = matrix(0, p, nc) Uh = matrix(0, n, nc) Th = matrix(0, n, nc) Ch = matrix(0, q, nc) Ph = matrix(0, p, nc) bh = rep(0, nc) RSS = rbind(rep(n-1,q), matrix(NA, nc, q)) PRESS = matrix(NA, nc, q) Q2 = matrix(NA, nc, q) ## remove random kth out if (cv == "LKO") { fold = split(sample(1:n), rep(1:k, length=n)) } ## PLS2 algorithm for (h in 1:nc) { # "arbitrary" vector (first column of Y.old) u.new = Y.old[,1] w.old = rep(1, p) iter = 1 repeat { w.new = t(X.old) %*% u.new / sum(u.new^2) w.new = w.new / sqrt(sum(w.new^2))# normalize w.old t.new = X.old %*% w.new c.new = t(Y.old) %*% t.new / sum(t.new^2) u.new = Y.old %*% c.new / sum(c.new^2) w.dif = w.new - w.old w.old = w.new if (sum(w.dif^2)<1e-06 || iter==100) break iter = iter + 1 } p.new = t(X.old) %*% t.new / sum(t.new^2) # Cross validation RSS[h+1,] = colSums((Y.old - t.new%*%t(c.new))^2) press = matrix(0, n, q) if (cv != "none") { ### Random leave-k-out if (cv == "LKO") { for (i in 1:k) { #removes row i, only column 1 omit=fold[[i]] uh.si <- Y.old[-omit,1] wh.siold <- rep(1,p) itcv <- 1 repeat { wh.si <- t(X.old[-omit,]) %*% uh.si / sum(uh.si^2) wh.si <- wh.si / sqrt(sum(wh.si^2)) th.si <- X.old[-omit,] %*% wh.si ch.si <- t(Y.old[-omit,]) %*% th.si / sum(th.si^2) uh.si <- Y.old[-omit,] %*% ch.si / sum(ch.si^2) wsi.dif <- wh.si - wh.siold wh.siold <- wh.si if (sum(wsi.dif^2)<1e-06 || itcv==100) break itcv <- itcv + 1 } Yhat.si = (X.old[omit,] %*% wh.si) %*% t(ch.si) press[omit,] = (Y.old[omit,] - Yhat.si)^2 } } # Leave-One-Out if (cv == "LOO") { for (i in 1:n) { uh.si = Y.old[-i,1] wh.siold = rep(1,p) itcv = 1 repeat { wh.si = t(X.old[-i,]) %*% uh.si / sum(uh.si^2) wh.si = wh.si / sqrt(sum(wh.si^2)) th.si = X.old[-i,] %*% wh.si ch.si = t(Y.old[-i,]) %*% th.si / sum(th.si^2) uh.si = Y.old[-i,] %*% ch.si / sum(ch.si^2) wsi.dif = wh.si - wh.siold wh.siold = wh.si if (sum(wsi.dif^2)<1e-06 || itcv==100) break itcv = itcv + 1 } Yhat.si = (X.old[i,] %*% wh.si) %*% t(ch.si) press[i,] = (Y.old[i,] - Yhat.si)^2 } } PRESS[h,] = colSums(press) Q2[h,] = 1 - PRESS[h,]/RSS[h,] } # deflation X.old = X.old - (t.new %*% t(p.new)) Y.old = Y.old - (t.new %*% t(c.new)) # store new elements Wh[,h] = w.new Uh[,h] = u.new Th[,h] = t.new Ch[,h] = c.new Ph[,h] = p.new bh[h] = t(u.new) %*% t.new ## selection of PLS components # Q2 global Q2G = 1 - rowSums(PRESS)/rowSums(RSS[-nc,]) } # finish PLS algorithm # automatic selection of PLS components? 
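  # Components with a global Q2 of at least 0.05 (the cutoff suggested by
  # Perez & Tenenhaus) are retained: Q2 compares the leave-one-out PRESS of
  # a component against the RSS of the model with one component less, so
  # values above the cutoff indicate the component still improves prediction.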
ncs = nc if (autosel) { # Rule 1: Q2G >= 0.05 (Perez & Tenenhaus, 2003) selcom = which(Q2G >= 0.05) # Rule 2: at least one Q2hk >= 0.095 #aux = apply(Q2, 1, function(x) sum(x>=0.0975)) #selcom = which(aux > 0) ncs = length(selcom) # selecting elements Wh = Wh[,selcom] Uh = Uh[,selcom] Ph = Ph[,selcom] Th = Th[,selcom] Ch = Ch[,selcom] } ## PLS results if (retain.models) { mylist.names <- c(paste(seq(nc), "Components", sep=".")) Br <- vector("list", length(mylist.names)) cte <- vector("list", length(mylist.names)) Disc <- vector("list", length(mylist.names)) coeffs <- vector("list", length(mylist.names)) names(Br) <- mylist.names names(cte) <- mylist.names names(Disc) <- mylist.names names(coeffs) <- mylist.names for (i in 1:nc) { # weights Ws = Wh[,1:i] %*% solve(t(Ph[,1:i])%*%Wh[,1:i]) # standardized regression coefficients Bs = Ws[,1:i] %*% t(Ch[,1:i]) # regression coeffs non-standardized Br[[i]] = diag(1/apply(X[learn,],2,sd)) %*% Bs %*% diag(apply(Y,2,sd)) cte[[i]] = as.vector((apply(Y,2,mean) - apply(X[learn,],2,mean)%*%Br[[i]])) Disc[[i]] = X[test,] %*% Br[[i]] + matrix(rep(cte[[i]],each=ntest), ntest, q) coeffs[[i]] = rbind(INTERCEPT=cte[[i]], Br[[i]]) } } else { # weights Ws = Wh %*% solve(t(Ph)%*%Wh) # standardized regression coefficients Bs = Ws %*% t(Ch) # regression coeffs non-standardized Br = diag(1/apply(X[learn,],2,sd)) %*% Bs %*% diag(apply(Y,2,sd)) cte = as.vector((apply(Y,2,mean) - apply(X[learn,],2,mean)%*%Br)) Disc = X[test,] %*% Br + matrix(rep(cte,each=ntest), ntest, q) coeffs = rbind(INTERCEPT=cte, Br) } # Q2 global accumulated Q2T = cbind(Q2, Q2G) q2 = c(paste(rep("Q2",q),colnames(Y),sep="."),"Q2.global") # correlations and redundancies cor_tx = cor(X[learn,], Th) cor_ty = cor(Y, Th) R2x = cor(X[learn,], Th)^2 # R2 coefficients R2y = cor(Y, Th)^2 # R2 coefficients Rdx = colMeans(R2x) Rdy = colMeans(R2y) # Sum of squares Y R2 = cbind(Rdx, cumsum(Rdx), Rdy, cumsum(Rdy)) Rd.mat = matrix(0, ncs, ncs) for (j in 1:ncs) Rd.mat[1:j,j] = Rdy[1:j] # variable importance VIP = sqrt((Wh^2) %*% Rd.mat %*% diag(p/cumsum(Rdy), ncs, ncs)) ### Weighted VIP for entire fitted model weighted.vip <- matrix(0, nrow = p, ncol=ncs) for (i in 1:length(Rdy)){ weighted.vip[,i] <- Rdy[i] * (Wh[,i]^2) } VIP.weighted <- sqrt(rowSums(weighted.vip) * (p/sum(Rdy) )) # combine with individual component VIPs VIP <- cbind(VIP, VIP.weighted) ## adding names ### added Ws and Ch for loadings and Y.loadings respectively dimnames(Ws) = list(colnames(X), paste(rep("w*",ncs),1:ncs,sep="")) dimnames(Ch) = list(colnames(Y), paste(rep("c",ncs),1:ncs,sep="")) dimnames(Th) = list(rownames(X[learn,]), paste(rep("t",ncs),1:ncs,sep="")) dimnames(Ph) = list(colnames(X), paste(rep("p",ncs),1:ncs,sep="")) dimnames(Bs) = list(colnames(X), colnames(Y)) #dimnames(Br) = list(colnames(X), colnames(Y)) dimnames(cor_tx) = list(colnames(X), paste(rep("t",ncs),1:ncs,sep="")) dimnames(cor_ty) = list(colnames(Y), paste(rep("t",ncs),1:ncs,sep="")) dimnames(Q2T) = list(paste(rep("t",nc),1:nc,sep=""), q2) dimnames(R2) = list(paste(rep("t",ncs),1:ncs,sep=""), c("R2X","R2Xcum","R2Y","R2Ycum")) dimnames(VIP) = list(colnames(X), c(paste(rep("Component ",ncs),1:ncs,sep=""),"Model VIP")) #dimnames(VIP.weighted) = list(paste("Model VIP")) # predicted class # confusion matrix if (retain.models) { # for all recursive models pred_class <- vector("list", length(mylist.names)) names(pred_class) <- mylist.names conf <- vector("list", length(mylist.names)) names(conf) <- mylist.names for (i in 1:nc) { pred_class[[i]] <- factor(max.col(Disc[[i]]), 
levels=seq_along(glevs), labels=glevs)
      # store one confusion matrix per retained model
      # (conf was initialized as a list, so each model gets its own slot)
      conf[[i]] = table(original=y[test], predicted=pred_class[[i]])
    }
  } else {
    # for just the highest chosen component
    pred_class = factor(max.col(Disc), levels=seq_along(glevs), labels=glevs)
    conf = table(original=y[test], predicted=pred_class)
  }

  # results
  ### added loadings and y.loadings
  res = list(coeffs = coeffs,
             conf = conf,
             Disc = Disc,
             pred_class = pred_class,
             components = Th,
             loadings = round(Ws,4),
             y.loadings = round(Ch,4),
             Q2T = Q2T,
             R2 = R2,
             VIP = VIP,
             cor_tx = cor_tx,
             cor_ty = cor_ty)
  res
}
DiscriMiner/R/classify.R0000644000176200001440000001153712160374654014632 0ustar liggesusers#' Classification function
#'
#' Classify provided observations based on a given Discriminant object
#'
#' A \code{DA_object} is a discriminant analysis (DA) object obtained from a
#' geometric predictive DA (class \code{"geoda"}), a linear DA (class
#' \code{"linda"}), a quadratic DA (class \code{"quada"}), or a DISQUAL
#' analysis (class \code{"disqual"})
#'
#' @param DA_object discriminant analysis object
#' @param newdata vector or matrix or data frame with variables for which their
#' classes will be calculated
#' @return A list with the following elements
#' @return \item{scores}{discriminant scores for each observation}
#' @return \item{pred_class}{predicted class}
#' @author Gaston Sanchez
#' @seealso \code{\link{geoDA}}, \code{\link{linDA}}, \code{\link{quaDA}},
#' \code{\link{plsDA}}, \code{\link{disqual}}
#' @export
#' @examples
#'
#' \dontrun{
#' # load iris dataset
#' data(iris)
#'
#' # linear discriminant analysis
#' my_lin1 = linDA(iris[,1:4], iris$Species)
#'
#' # select a sample of 15 observations
#' set.seed(111)
#' obs = sample(1:nrow(iris), 15)
#' some_data = iris[obs, 1:4]
#'
#' # classify some_data
#' get_classes = classify(my_lin1, some_data)
#' get_classes
#'
#' # compare the results against original class
#' table(iris$Species[obs], get_classes$pred_class)
#' }
#'
classify <- function(DA_object, newdata)
{
  ## Check arguments
  # make sure DA_object is valid
  DA_classes = c("geoda", "linda", "quada", "plsda", "disqual")
  class_da <- class(DA_object) %in% DA_classes
  if (!class_da)
    stop("\nInvalid 'DA_object'")
  # if vector then convert to matrix
  if (is.vector(newdata)) newdata = t(as.matrix(newdata))
  # newdata matrix or data.frame
  if (is.null(dim(newdata)))
    stop("\nSorry, 'newdata' is not a matrix")
  # no missing values allowed
  # (complete.cases returns a logical of length nrow, hence the sum)
  if (sum(complete.cases(newdata)) != nrow(newdata))
    stop("\nSorry, no missing values allowed in 'newdata'")

  # check compatibility
  if (class(DA_object) != "disqual") {
    if (ncol(newdata) != DA_object$specs$p)
      stop("\n'newdata' is not compatible with 'DA_object'")
  } else {
    # disqual method
    # newdata as binarized matrix (0s and 1s)
    # (every entry must be 0 or 1, hence 'all' rather than 'any')
    if (!all(newdata %in% c(0,1)))
      stop("\n'newdata' must contain only 0's and 1's (binary format)")
    # number of variables
    if (dim(newdata)[1] == 1) {
      if (sum(newdata) != DA_object$specs$p)
        stop("\n'newdata' is not compatible with 'DA_object'")
    } else {
      if (any(rowSums(newdata) != DA_object$specs$p))
        stop("\n'newdata' is not compatible with 'DA_object'")
    }
  }
  # how many observations and variables
  n = nrow(newdata)
  p = DA_object$specs$p
  # how many groups
  ng = DA_object$specs$ng
  glevs = DA_object$specs$glevs

  ## geometric/linear/PLS discriminant analysis
  if (class(DA_object) %in% c("geoda","linda","plsda")) {
    # put newdata in right format
    if (is.data.frame(newdata)) newdata = as.matrix(newdata)
    ## constant terms and coefficients
    cons = DA_object$functions[1,]
    Betas = DA_object$functions[-1,]
    # matrix of constant
terms A = matrix(rep(cons,n), n, ng, byrow=TRUE) # apply discrim functions Disc = newdata %*% Betas + A dimnames(Disc) = list(1:n, glevs) # predicted class pred = apply(Disc, 1, function(u) which(u == max(u))) names(pred) = NULL # assign class values pred_class = factor(pred, levels=seq_along(glevs), labels=glevs) } ## quadratic discriminant analysis if (class(DA_object) == "quada") { # put newdata in right format if (is.data.frame(newdata)) newdata = as.matrix(newdata) # discrimination matrix to store results Disc = matrix(0, n, ncol = ng) # ingredients GM = DA_object$GM WMqr = DA_object$WMqr ldet = DA_object$ldet prior = DA_object$prior # calculate distances (the lower, the better) for (k in 1:ng) { # group means Xk = matrix(GM[k,], n, p, byrow = TRUE) dev = (newdata - Xk) %*% WMqr[[k]] Disc[,k] = 0.5 * rowSums(dev^2) + 0.5 * ldet[k] - log(prior[k]) dimnames(Disc) = list(1:n, glevs) } # Disc in terms of probability Disc <- exp( -(Disc - apply(Disc, 1, min, na.rm=TRUE))) # predicting classes pred = Disc / drop(Disc %*% rep(1, ng)) # predicted class pred_class = factor(max.col(pred), levels=seq_along(glevs), labels=glevs) } ## DISQUAL method if (class(DA_object) == "disqual") { ## coefficients (there is no constant term) Betas = DA_object$norm_coefs # apply discrim functions Disc = newdata %*% Betas dimnames(Disc) = list(1:n, glevs) # predicted class pred = apply(Disc, 1, function(u) which(u == max(u))) names(pred) = NULL # assign class values pred_class = factor(pred, levels=seq_along(glevs), labels=glevs) } ## results res = list(scores=Disc, pred_class=pred_class) } DiscriMiner/R/betweenCov.R0000644000176200001440000000410312160375216015110 0ustar liggesusers#' @title Between-class Covariance Matrix #' #' @description Calculates between-class covariance matrix #' #' @details When \code{div_by_n=TRUE} the covariance matrices are divided by n #' (number of observations), otherwise they are divided by n-1 #' #' @param variables matrix or data frame with explanatory variables (No missing #' values are allowed) #' @param group vector or factor with group memberships (No missing values are #' allowed) #' @param div_by_n logical indicating division by number of observations #' @author Gaston Sanchez #' @seealso \code{\link{getWithin}}, \code{\link{betweenSS}}, #' \code{\link{withinCov}}, \code{\link{totalCov}} #' @export #' @examples #' \dontrun{ #' # load iris dataset #' data(iris) #' #' # between-class covariance matrix (dividing by n-1) #' betweenCov(iris[,1:4], iris[,5]) #' #' # between-class covariance matrix (dividing by n) #' betweenCov(iris[,1:4], iris[,5], div_by_n=TRUE) #' } #' betweenCov <- function(variables, group, div_by_n=FALSE) { # check inputs verify_Xy = my_verify(variables, group, na.rm=FALSE) X = verify_Xy$X y = verify_Xy$y # how many obs and variables n = nrow(X) p = ncol(X) # group levels and number of levels glevs = levels(y) ng = nlevels(y) # global mean mean_global = colMeans(X) # matrix to store results Between = matrix(0, p, p) # pooled between-class covariance matrix for (k in 1:ng) { # select obs of k-th group tmp <- y == glevs[k] # how many obs in group k nk = sum(tmp) # mean k-th group mean_k = colMeans(X[tmp,]) # mean k-th group - global mean dif_k = mean_k - mean_global # k-th group between cov matrix if (div_by_n) { between_k = (nk/n) * tcrossprod(dif_k) } else { between_k = (nk/(n-1)) * tcrossprod(dif_k) } Between = Between + between_k } # add names if (is.null(colnames(variables))) { var_names = paste("X", 1:ncol(X), sep="") dimnames(Between) = 
list(var_names, var_names)
  } else {
    dimnames(Between) = list(colnames(variables), colnames(variables))
  }
  # result
  Between
}
DiscriMiner/R/print.qualmca.R0000644000176200001440000000106612160375446015577 0ustar liggesusers#' @S3method print qualmca
print.qualmca <- function(x, ...)
{
  cat("\nMultiple Correspondence Analysis\n")
  cat(rep("-",43), sep="")
  cat("\n$values        ", "eigenvalues")
  cat("\n$coefficients  ", "coeffs of factorial axes")
  cat("\n$components    ", "factor coordinates\n")
  cat(rep("-",43), sep="")
  cat("\n\n$values\n")
  print(format(x$values, scientific=FALSE, digits=3),
        print.gap=2, quote=FALSE)
  cat("\n\n$coefficients\n")
  aux = min(5, ncol(x$coefficients))
  print(x$coefficients[,1:aux], print.gap=2)
  cat("...\n")
  invisible(x)
}
DiscriMiner/R/disqual.R0000644000176200001440000001344712160374720014463 0ustar liggesusers#' Discriminant Analysis on Qualitative Variables
#'
#' Implementation of the DISQUAL methodology. Disqual performs a Fisher's
#' Discriminant Analysis on components from a Multiple Correspondence Analysis
#'
#' When \code{validation=NULL} there is no validation \cr When
#' \code{validation="crossval"} cross-validation is performed by randomly
#' separating the observations in ten groups. \cr When
#' \code{validation="learntest"} validation is performed by providing a
#' learn-set and a test-set of observations. \cr
#'
#' @param variables data frame with qualitative explanatory variables (coded as
#' factors)
#' @param group vector or factor with group memberships
#' @param validation type of validation, either \code{"crossval"} or
#' \code{"learntest"}. Default \code{NULL}
#' @param learn optional vector of indices for a learn-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @param test optional vector of indices for a test-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @param autosel logical indicating automatic selection of MCA components
#' @param prob probability level for automatic selection of MCA components.
#' Default \code{prob = 0.05}
#' @return An object of class \code{"disqual"}, basically a list with the
#' following elements:
#' @return \item{raw_coefs}{raw coefficients of discriminant functions}
#' @return \item{norm_coefs}{normalized coefficients of discriminant functions,
#' ranging from 0 - 1000}
#' @return \item{confusion}{confusion matrix}
#' @return \item{scores}{discriminant scores for each observation}
#' @return \item{classification}{assigned class}
#' @return \item{error_rate}{misclassification error rate}
#' @author Gaston Sanchez
#' @seealso \code{\link{easyMCA}}, \code{\link{classify}},
#' \code{\link{binarize}}
#' @references Lebart L., Piron M., Morineau A. (2006) \emph{Statistique
#' Exploratoire Multidimensionnelle}. Dunod, Paris.
#'
#' Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}.
#' Editions Technip, Paris.
#'
#' Saporta G., Niang N. (2006) Correspondence Analysis and Classification. In
#' \emph{Multiple Correspondence Analysis and Related Methods}, Eds. Michael
#' Greenacre and Jorg Blasius, 371-392.
#' Chapman and Hall/CRC
#' @export
#' @examples
#'
#' \dontrun{
#' # load insurance dataset
#' data(insurance)
#'
#' # disqual analysis with no validation
#' my_disq1 = disqual(insurance[,-1], insurance[,1], validation=NULL)
#' my_disq1
#'
#' # disqual analysis with cross-validation
#' my_disq2 = disqual(insurance[,-1], insurance[,1], validation="crossval")
#' my_disq2
#' }
#'
disqual <- function(variables, group, validation = NULL, learn = NULL,
                    test = NULL, autosel = TRUE, prob = 0.05)
{
  # Perform discriminant analysis on qualitative variables
  # variables: data frame with categorical explanatory variables
  # group: vector or factor with group membership
  # validation: NULL, "crossval", "learntest"
  # learn: vector of learn-set
  # test: vector of test-set
  # autosel: logical indicating automatic selection of MCA comps
  # prob: probability level for automatic selection

  ## check inputs
  verify_Xy = my_verify(variables, group, qualitative=TRUE, na.rm=FALSE)
  X = verify_Xy$X
  y = verify_Xy$y
  # type of validation
  if (is.null(validation)) {
    validation = "none"
  } else {
    vali = validation %in% c("crossval", "learntest")
    if (!vali)
      stop("\nIncorrect type of validation")
  }
  # probability value
  if (!is.logical(autosel))
    stop("\nargument 'autosel' incorrectly defined")
  if (autosel) {
    if (prob < 0 || prob >= 1)
      stop("\nargument 'prob' must be in the range [0,1)")
  }
  # how many observations and variables
  n = nrow(X)
  p = ncol(X)
  # how many groups
  ng = nlevels(y)
  glevs = levels(y)
  # how many obs in each group
  nobs_group = as.vector(table(y))
  # proportions
  props = nobs_group / n

  ## catDA with no validation
  if (validation == "none") {
    get_catda = my_catDA(X, y, 1:n, 1:n, autosel, prob)
    err = 1 - sum(diag(get_catda$conf)) / n
  }
  ## catDA with learn-test sets validation
  if (validation == "learntest") {
    if (any(learn <= 0) || any(learn > n))
      stop("\nsubscript out of bounds in 'learn' set")
    if (any(test <= 0) || any(test > n))
      stop("\nsubscript out of bounds in 'test' set")
    # apply DA
    get_catda = my_catDA(X, y, learn, test, autosel, prob)
    # misclassification error rate
    err = 1 - sum(diag(get_catda$conf))/length(test)
  }
  ## catDA with crossvalidation
  if (validation == "crossval") {
    # catDA for all observations
    get_catda = my_catDA(X, y, 1:n, 1:n, autosel, prob)
    # elements in each group
    elems_group = vector("list", ng)
    for (k in 1:ng) {
      elems_group[[k]] = which(group == glevs[k])
    }
    # misclassification error rate
    mer = 0
    # 10 crossvalidation samples
    for (r in 1:10) {
      test = vector("list", ng)
      test_sizes = floor(n * props / 10)
      for (k in 1:ng) {
        test[[k]] = sample(elems_group[[k]], test_sizes[k])
      }
      test = unlist(test)
      learn = (1:n)[-test]
      # apply DA
      catda_cv = my_catDA(X, y, learn, test, autosel, prob)
      # misclassification error rate
      mer = mer + sum(diag(catda_cv$conf))/n
    }
    # total misclassification error rate
    err = 1 - mer
  }

  ## specifications
  specs = list(n=n, p=p, ng=ng, glevs=glevs,
               nobs_group=nobs_group, validation=validation)
  # results
  structure(list(raw_coefs = get_catda$Raw,
                 norm_coefs = get_catda$Norm,
                 confusion = get_catda$conf,
                 scores = get_catda$Disc,
                 classification = get_catda$pred_class,
                 error_rate = err,
                 specs = specs),
            class = "disqual")
}
DiscriMiner/R/quaDA.R0000644000176200001440000001330512164637253014014 0ustar liggesusers#' Quadratic Discriminant Analysis
#'
#' Performs a Quadratic Discriminant Analysis
#'
#' When \code{validation=NULL} there is no validation \cr When
#' \code{validation="crossval"} cross-validation is performed by randomly
#' separating the observations in ten groups.
#' \cr When
#' \code{validation="learntest"} validation is performed by providing a
#' learn-set and a test-set of observations. \cr
#'
#' @param variables matrix or data frame with explanatory variables
#' @param group vector or factor with group memberships
#' @param prior optional vector of prior probabilities. Default
#' \code{prior=NULL} implies group proportions
#' @param validation type of validation, either \code{"crossval"} or
#' \code{"learntest"}. Default \code{NULL}
#' @param learn optional vector of indices for a learn-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @param test optional vector of indices for a test-set. Only used when
#' \code{validation="learntest"}. Default \code{NULL}
#' @param prob logical indicating whether the group classification results
#' should be expressed in probability terms
#' @return An object of class \code{"quada"}, basically a list with the
#' following elements:
#' @return \item{confusion}{confusion matrix}
#' @return \item{scores}{discriminant scores for each observation}
#' @return \item{classification}{assigned class}
#' @return \item{error_rate}{misclassification error rate}
#' @author Gaston Sanchez
#' @seealso \code{\link{classify}}, \code{\link{desDA}}, \code{\link{geoDA}},
#' \code{\link{linDA}}, \code{\link{plsDA}}
#' @references Lebart L., Piron M., Morineau A. (2006) \emph{Statistique
#' Exploratoire Multidimensionnelle}. Dunod, Paris.
#'
#' Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris.
#'
#' Tuffery S. (2011) \emph{Data Mining and Statistics for Decision Making}.
#' Wiley, Chichester.
#' @export
#' @examples
#'
#' \dontrun{
#' # load iris dataset
#' data(iris)
#'
#' # quadratic discriminant analysis with no validation
#' my_qua1 = quaDA(iris[,1:4], iris$Species)
#' my_qua1$confusion
#' my_qua1$error_rate
#'
#' # quadratic discriminant analysis with cross-validation
#' my_qua2 = quaDA(iris[,1:4], iris$Species, validation="crossval")
#' my_qua2$confusion
#' my_qua2$error_rate
#'
#' # quadratic discriminant analysis with learn-test validation
#' learning = c(1:40, 51:90, 101:140)
#' testing = c(41:50, 91:100, 141:150)
#' my_qua3 = quaDA(iris[,1:4], iris$Species, validation="learntest",
#' learn=learning, test=testing)
#' my_qua3$confusion
#' my_qua3$error_rate
#' }
#'
quaDA <- function(variables, group, prior = NULL, validation = NULL,
                  learn = NULL, test = NULL, prob = FALSE)
{
  # check inputs
  verify_Xy = my_verify(variables, group, na.rm=FALSE)
  X = verify_Xy$X
  y = verify_Xy$y
  # type of validation
  if (is.null(validation)) {
    validation = "none"
  } else {
    vali = validation %in% c("crossval", "learntest")
    if (!vali)
      stop("\nIncorrect type of validation")
  }
  # how many observations
  n = nrow(X)
  # how many variables
  p = ncol(X)
  # how many groups
  ng = nlevels(y)
  # how many obs in each group
  nobs_group = as.vector(table(y))
  # prior probabilities
  if (!is.null(prior)) {
    if (!is.numeric(prior) || !is.vector(prior))
      stop("\n'prior' probabilities incorrectly defined")
    if (length(prior) != ng)
      stop("\n'prior' probabilities don't match number of groups")
    if (any(prior > 1) || any(prior < 0))
      stop("\n'prior' probabilities must range between [0,1]")
    if (round(sum(prior), 5) != 1)
      stop("\n'prior' probabilities don't add to 1")
  } else {
    # prior as proportions
    prior = nobs_group / n
  }
  # group proportions
  # (defined unconditionally so the cross-validation below also works when
  # a user-supplied 'prior' is given)
  props = nobs_group / n
  # group levels
  glevs = levels(y)

  ## quaDA with no validation
  if (validation == "none") {
    get_quada = my_quaDA(X, y, 1:n, 1:n, prior, prob)
    err = 1 - sum(diag(get_quada$conf)) / n
  }
  ## quaDA with learn-test sets validation
  if (validation == "learntest") {
    if (any(learn <= 0) || any(learn > n))
      stop("\nsubscript out of bounds in 'learn' set")
    if (any(test <= 0) || any(test > n))
      stop("\nsubscript out of bounds in 'test' set")
    # apply quaDA
    get_quada = my_quaDA(X, y, learn, test, prior, prob)
    # misclassification error rate
    err = 1 - sum(diag(get_quada$conf))/length(test)
  }
  ## quaDA with crossvalidation
  if (validation == "crossval") {
    # quaDA for all observations
    get_quada = my_quaDA(X, y, 1:n, 1:n, prior, prob)
    # elements in each group
    elems_group = vector("list", ng)
    for (k in 1:ng) {
      elems_group[[k]] = which(group == glevs[k])
    }
    # misclassification error rate
    mer = 0
    # 10 crossvalidation samples
    for (r in 1:10) {
      test = vector("list", ng)
      test_sizes = floor(n * props / 10)
      for (k in 1:ng) {
        test[[k]] = sample(elems_group[[k]], test_sizes[k])
      }
      test = unlist(test)
      learn = (1:n)[-test]
      # apply quaDA
      quada_cv = my_quaDA(X, y, learn, test, prior, prob)
      # misclassification error rate
      mer = mer + sum(diag(quada_cv$conf))/n
    }
    # total misclassification error rate
    err = 1 - mer
  }

  ## specifications
  specs = list(n=n, p=p, ng=ng, glevs=glevs,
               nobs_group=nobs_group, validation = validation)
  ## results
  structure(list(confusion = get_quada$conf,
                 scores = get_quada$Disc,
                 classification = get_quada$pred_class,
                 error_rate = err,
                 WMqr = get_quada$WMqr,
                 GM = get_quada$GM,
                 ldet = get_quada$ldet,
                 prior = get_quada$prior,
                 specs = specs),
            class = "quada")
}
DiscriMiner/R/my_withinCov.R0000644000176200001440000000131512157657510015476 0ustar liggesusersmy_withinCov <- function(X, g, div_by_n=FALSE)
{
  # X: matrix of explanatory variables
  # g: factor with group memberships
  # div_by_n: logical indicating division by num of observations

  # how many observations
  n = nrow(X)
  # how many variables
  ncx = ncol(X)
  # group levels and number of levels
  glevs = levels(g)
  ng = nlevels(g)

  # within cov matrix
  Within = matrix(0, ncx, ncx)
  for (k in 1:ng) {
    tmp <- g == glevs[k]
    nk = sum(tmp)
    if (div_by_n) {
      Wk = ((nk-1)/n) * var(X[tmp,])
    } else {
      #Wk = ((nk-1)/(n-1)) * var(X[tmp,])
      # divide by degrees of freedom
      Wk = ((nk-1)/(n-ng)) * var(X[tmp,])
    }
    Within = Within + Wk
  }
  # result
  Within
}
DiscriMiner/R/my_geoDA.R0000644000176200001440000000304612157657510014506 0ustar liggesusersmy_geoDA <- function(X, y, learn, test)
{
  # Perform a geometric predictive discriminant analysis
  # X: matrix or data.frame with explanatory variables
  # y: vector or factor with group membership
  # learn: vector of learning observations
  # test: vector of testing observations

  # how many observations
  n = nrow(X[learn,])
  ntest = length(test)
  # how many groups
  ng = nlevels(y[learn])
  glevs = levels(y[learn])

  # group means
  GM = my_groupMeans(X[learn,], y[learn])
  # within-class covariance matrix
  W = my_withinCov(X[learn,], y[learn])
  # inverse of Within cov matrix
  W_inv = solve(W)
  # constant terms of classification functions
  alphas = rep(0, ng)
  # coefficients of classification functions
  Betas = matrix(0, nrow(W_inv), ng)
  for (k in 1:ng) {
    alphas[k] = -(1/2) * GM[k,] %*% W_inv %*% GM[k,]
    Betas[,k] = t(GM[k,]) %*% W_inv
  }
  # Mahalanobis-Fisher Classification Rule
  FDF = rbind(alphas, Betas)
  rownames(FDF) = c("constant", colnames(X))
  colnames(FDF) = glevs
  # matrix of constant terms
  A = matrix(rep(alphas,ntest), ntest, ng, byrow=TRUE)
  # apply discrim functions
  Disc = X[test,] %*% Betas + A
  dimnames(Disc) = list(rownames(X[test,]), glevs)
  # predicted class
  pred = apply(Disc, 1, function(u) which(u == max(u)))
  names(pred) = NULL
  # assign class values
  pred_class = factor(pred, levels=seq_along(glevs),
labels=glevs) # confusion matrix conf = table(original=y[test], predicted=pred_class) # results res = list(FDF=FDF, conf=conf, Disc=Disc, pred_class=pred_class) } DiscriMiner/R/my_catDA.R0000644000176200001440000000516412157657510014506 0ustar liggesusersmy_catDA <- function(X, y, learn, test, autosel, prob) { # Perform a geometric predictive discriminant analysis # X: data frame with categorical explanatory variables # y: vector or factor with group membership # learn: vector of learning observations # test: vector of testing observations # autosel: logical indicating automatic selection of MCA comps # prob: probability level for automatic selection ## main ingredients # how many observations n = nrow(X[learn,]) ntest = length(test) # how many variables p = ncol(X) # how many groups ng = nlevels(y[learn]) glevs = levels(y[learn]) # MCA getmca = my_mca(X[learn,]) # number of factors nf = min(p, ng-1) # selection of components select = 1:ncol(getmca$components) if (autosel) { getpower = discPower(getmca$components, y[learn]) select = which(getpower$p_value <= prob) } # coordinate factors (MCA components) Fs = getmca$components[,select] # mca coefficients U = getmca$coefficients[,select] # super-indicator matrix (TDC) Z = my_tdc(X[learn,]) ## geometric discriminant analysis # group means GM = my_groupMeans(Fs, y[learn]) # within-class covariance matrix W = my_withinCov(Fs, y[learn]) # inverse of Within cov matrix W_inv = solve(W) # constant terms of classification functions alphas = rep(0, ng) # coefficients of classification functions Betas = matrix(0, nrow(W_inv), ng) for (k in 1:ng) { alphas[k] = -(1/2) * GM[k,] %*% W_inv %*% GM[k,] Betas[,k] = t(GM[k,]) %*% W_inv } dimnames(Betas) = list(colnames(getmca$components[,select]), glevs) # raw coeffs of classification functions Raw = U %*% Betas # transforming coefficients 0-1000 cats_per_var = sapply(X[learn,], nlevels) fin = cumsum(cats_per_var) ini = fin - cats_per_var + 1 # normalized coefficients Norm = matrix(0, nrow(Raw), ncol(Raw)) for (k in 1:ng) { for (j in 1:p) { tmp = Raw[ini[j]:fin[j],k] Norm[ini[j]:fin[j],k] = tmp - min(tmp) } Norm[,k] = Norm[,k] * (1000/sum(Norm[,k])) } dimnames(Norm) = dimnames(Raw) ## classification # (there is no constant term, Saporta 2006, page 462) # apply discrim functions Ztest = my_tdc(X[test,]) Disc = Ztest %*% Norm # predicted class pred = apply(Disc, 1, function(x) which(x == max(x))) # assign class values pred_class = factor(pred, levels=seq_along(glevs), labels=glevs) # confusion matrix conf = table(original=y[test], predicted=pred_class) # results res = list(Raw = Raw, Norm = Norm, conf = conf, Disc = Disc, pred_class = pred_class) } DiscriMiner/R/desDA.R0000644000176200001440000001212212160374672013774 0ustar liggesusers#' Descriptive Discriminant Analysis #' #' Performs a Descriptive Discriminant Analysis (a.k.a. Factorial Discriminant #' Analysis from the french \emph{Analyse Factorielle Discriminante}) #' #' When \code{covar="within"} the estimated pooled within-class covariance #' matrix is used in the calculations. \cr When \code{covar="total"} the total #' covariance matrix is used in the calculations. \cr The difference between #' \code{covar="within"} and \code{covar="total"} is in the obtained #' eigenvalues. 
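#' A quick way to see the difference is to fit both versions and compare
#' their \code{$values} tables; a short sketch using the bordeaux data
#' shipped with the package:
#' \preformatted{
#' d1 = desDA(bordeaux[,2:5], bordeaux$quality)                 # within
#' d2 = desDA(bordeaux[,2:5], bordeaux$quality, covar="total")  # total
#' d1$values
#' d2$values
#' }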
#'
#' The estimated pooled within-class covariance matrix is actually the
#' within-class covariance matrix divided by the number of observations minus
#' the number of classes (see \code{\link{getWithin}})
#'
#' @param variables matrix or data frame with explanatory variables
#' @param group vector or factor with group memberships
#' @param covar character string indicating the covariance matrix to be used.
#' Options are \code{"within"} and \code{"total"}
#' @return An object of class \code{"desda"}, basically a list with the
#' following elements
#' @return \item{power}{table with discriminant power of the
#' explanatory variables}
#' @return \item{values}{table of eigenvalues}
#' @return \item{discrivar}{table of discriminant variables,
#' i.e. the coefficients of the linear discriminant functions}
#' @return \item{discor}{table of correlations between the variables and the
#' discriminant axes}
#' @return \item{scores}{table of discriminant scores for each observation}
#' @author Gaston Sanchez
#' @seealso \code{\link{discPower}}
#' @references Lebart L., Piron M., Morineau A. (2006) \emph{Statistique
#' Exploratoire Multidimensionnelle}. Dunod, Paris.
#' @export
#' @examples
#'
#' \dontrun{
#' # load bordeaux wines dataset
#' data(bordeaux)
#'
#' # descriptive discriminant analysis with within covariance matrix
#' my_dda1 = desDA(bordeaux[,2:5], bordeaux$quality)
#' my_dda1
#'
#' # descriptive discriminant analysis with total covariance matrix
#' my_dda2 = desDA(bordeaux[,2:5], bordeaux$quality, covar="total")
#' my_dda2
#'
#' # plot factor coordinates with ggplot
#' library(ggplot2)
#' bordeaux$f1 = my_dda1$scores[,1]
#' bordeaux$f2 = my_dda1$scores[,2]
#' ggplot(data=bordeaux, aes(x=f1, y=f2, colour=quality)) +
#' geom_hline(yintercept=0, colour="gray70") +
#' geom_vline(xintercept=0, colour="gray70") +
#' geom_text(aes(label=year), size=4) +
#' ggtitle("Discriminant Map - Bordeaux Wines (years)")
#' }
#'
desDA <- function(variables, group, covar="within")
{
  # check inputs
  verify_Xy = my_verify(variables, group, na.rm=FALSE)
  check_cov <- covar %in% c("within", "total")
  if (!check_cov)
    warning("\nInvalid covar value; 'covar = within' is used")
  # get ingredients
  X = verify_Xy$X
  y = verify_Xy$y
  # how many obs and variables
  n = nrow(X)
  p = ncol(X)
  # group levels and number of levels
  glevs = levels(y)
  ng = nlevels(y)
  # how many obs in each group
  nk = as.vector(table(y))
  # number of factors
  nf = min(p, ng-1)
  # global mean
  gm = colMeans(X)
  # total covariance
  V = var(X)
  # between-class covariance matrix
  B = my_betweenCov(X, y)
  # estimated within-class covariance matrix
  W = my_withinCov(X, y)
  # within-class covariance matrix for disc-power
  Ww = matrix(0, p, p)
  for (k in 1:ng) {
    tmp <- y == glevs[k]
    # use a local name so the vector of group sizes 'nk' (needed later for
    # the between-class decomposition) is not overwritten by the loop
    nk_g = sum(tmp)
    Wk = ((nk_g-1)/(n-1)) * var(X[tmp,])
    Ww = Ww + Wk
  }

  ## Discriminant importance of explanatory variables
  # F-statistics
  F_statistic = ((n-ng)/(ng-1)) * (diag(B) / diag(Ww))
  p_values = 1 - pf(F_statistic, ng-1, n-ng)
  # Wilks' lambdas
  wilks_lamb = diag(Ww / V)
  # correlation ratio
  cor_ratio = diag(B) / diag(V)
  # table of disc power
  tab1 = cbind(cor_ratio, wilks_lamb, F_statistic, p_values)

  ## Discriminant axes
  # select covariance matrix
  if (covar == "within") Cov = W else Cov = ((n-1)/n) * var(X)
  # group means matrix
  GM = groupMeans(X, y)
  # decomposing between-class matrix: B = CC'
  GM_gm = sweep(GM, 1, gm, FUN="-")
  C = sweep(GM_gm, 2, sqrt(nk/n), FUN="*")
  # eigen-decomposition
  EIG = eigen(t(C) %*% solve(Cov) %*% C)
  # eigenvalues
  lam = EIG$values[1:nf]
  # eigenvectors
  U = solve(V) %*% C
%*% EIG$vectors[,1:nf] # normalizing eigenvectors aux = sqrt(diag(t(U) %*% Cov %*% U)) Unorm = sweep(U, 2, aux, FUN="/") # table of eigenvalues tab2 = cbind(lam, 100*lam/sum(lam), 100*cumsum(lam)/sum(lam)) colnames(tab2) = c("value", "proportion", "accumulated") rownames(tab2) = paste("DF", 1:nf, sep="") # Linear Discriminant Functions alphas = (-1) * gm %*% Unorm tab3 = rbind(alphas, Unorm) rownames(tab3) = c("constant", colnames(X)) colnames(tab3) = paste("DF", 1:nf, sep="") # factor coordinates and correlations with expl variables Fs = X %*% Unorm + matrix(rep(alphas,each=n), n, nf) colnames(Fs) = paste("z", 1:nf, sep="") tab4 = cor(X, Fs) colnames(tab4) = paste("DF", 1:nf, sep="") # results structure(list(power = tab1, values = tab2, discrivar = tab3, discor = tab4, scores = Fs), class = "desda") } DiscriMiner/R/getWithin.R0000644000176200001440000000237612160374762014770 0ustar liggesusers#' Within-class Covariance Matrix #' #' Calculates the estimated within-class covariance matrix #' #' The obtained matrix is the estimated within-class covariance matrix (i.e. #' within-class covariance matrix divided by its degrees of freedom \code{n-k}, #' where \code{n} is the number of observations and \code{k} is the number of #' groups) #' #' @param variables matrix or data frame with explanatory variables (No missing #' values are allowed) #' @param group vector or factor with group memberships (No missing values are #' allowed) #' @author Gaston Sanchez #' @seealso \code{\link{withinCov}} #' @export #' @examples #' #' \dontrun{ #' # load iris dataset #' data(iris) #' #' # estimated within-class covariance matrix (dividing by n-k) #' getWithin(iris[,1:4], iris[,5]) #' #' # compared to the within-class covariance matrix (dividing by n-1) #' withinCov(iris[,1:4], iris[,5]) #' } #' getWithin <- function(variables, group) { # within-class pooled covariance matrix # variables: matrix or data frame with explanatory variables # group: vector or factor with group memberships # check inputs verify_Xy = my_verify(variables, group, na.rm=FALSE) X = verify_Xy$X y = verify_Xy$y Within = my_withinCov(X, y) Within } DiscriMiner/MD50000644000176200001440000000767312241261076013010 0ustar liggesusersa2ac1d3be525155029797ab8d65c030d *DESCRIPTION e3ffca50ce37824eb8dc824e9c593d0a *NAMESPACE 3dd5f2a17e5921beff963cd19a32bf88 *R/DiscriMiner-package.R bf207c0e2523b110f09382fabdf2981d *R/FRatio.R 977e78c7a456c294177081acd1a9d39c *R/betweenCov.R ca0ff9c77ce5d43133a8542664727471 *R/betweenSS.R d524f651eff9410138d94fc225645670 *R/binarize.R ecb50b41676a4e73f6d0b7000a7bf899 *R/classify.R c9eb881ac427954d4d1d8db5bfc44ae7 *R/corRatio.R b0daf90ba5816c84f48e260112e28610 *R/desDA.R 2740565c419da1f9a2cd88eef56b4731 *R/discPower.R 94de4a2022d8026b3183839db1d470ea *R/disqual.R b8166dc19e6a6ae620724b77c7e49659 *R/easyMCA.R 593e076f74b37d4603f4760b12e029db *R/geoDA.R a870a6c5fce1a544b07664d141b1c6e6 *R/getWithin.R 552f6fbb70d1b8c82fce113524ebc7de *R/groupMeans.R d7e18e63901225d1f174d6a25066d9f1 *R/groupMedians.R a1fdd13db65c84e2b4739f170bf03436 *R/groupQuants.R d98a2e7df747732e0a78fa9d9d1f7e26 *R/groupStds.R 7177d312c942f4872d8d9cd841e78d7c *R/groupVars.R 0ac8ab161a008e32e5ff3b7abad7c664 *R/linDA.R a2cf1a389b94808378bcc8e4f068baf7 *R/my_betweenCov.R 0a715f35857239297a5d3b56da5f67aa *R/my_catDA.R 893be5a2efb817766850d5e8a98ef2cd *R/my_discFunctions.R 062c790e410d7432702e90970e0a0f30 *R/my_geoDA.R 2f8eca9948e1df58ecad5109dfee8435 *R/my_groupMeans.R 0ad3fa0cc8c26492acf5778eb3d8aa26 *R/my_linDA.R 570f82a083e2c0672021231c6b829d46 
*R/my_mca.R 950e0d1bdbfa2c98167e0ba1e276e17d *R/my_plsDA.R 2cdf6e6eac6d07363506a8cf252e527d *R/my_plsDA_old.R d6c834aeff8becfb3038b7d2ca268ef9 *R/my_quaDA.R 1037fae898a13cc2f5253283a61ff2c4 *R/my_tdc.R 6ee4fccbeda79044e30eaeaaac86c8d6 *R/my_verify.R 368394e2c2985419e590c47046530c1e *R/my_withinCov.R 716ca03194cbce32098f1147c58ec5a4 *R/plot.plsda.R 8977e919322113b6306cad8ce26e099c *R/plsDA.R 611b680e50a768b73ea5a3261d2fa4b1 *R/plsDA_old.R 6873e78d0f3333af1dbed6e21fe822b7 *R/print.desda.R 95fd92cb1943eed757e860c0da501b2e *R/print.disqual.R 2b6b84c5f243addd81e4c9f9f1a21176 *R/print.geoda.R b643c08e8b76fe6f9d0f20079c7aae15 *R/print.linda.R 23a8110e42b4539e9333c23e4b634d68 *R/print.plsda.R 3fabc05f0d1e563d8edd4473c7933470 *R/print.quada.R 2743aa037338d718595437a6f83fc6e0 *R/print.qualmca.R 7aa1e4a1afe8c0475255133af473c0b3 *R/quaDA.R 5ba4e3afacdfcc26d44cd0e47631baad *R/totalCov.R d9de76756df8f83ae732b758154470d0 *R/totalSS.R 13cc6b0bbb6e0f527307e6c154767e7a *R/withinCov.R c865c03ba1a8165eb7979e62648b91b9 *R/withinSS.R 5c22ce5583874f255fcf98d56d9fb81c *data/bordeaux.rda 11facd372019f172c834b2a443f8f828 *data/infarctus.rda 96703bfbb9ed15b48ae99ea10d1018c3 *data/insurance.rda d35b0bf5f66b95ce78777581d0d40310 *man/DiscriMiner-package.Rd dd9b856adbba4033f5a1317fef9e0a80 *man/FRatio.Rd 36d9b023eea018c69fc44d925ce46337 *man/betweenCov.Rd 476fa64e448b15f8b6fa3a5256d1ea3b *man/betweenSS.Rd b6f7887853a9d14d47f0d6767fac75db *man/binarize.Rd 450d8f6593a1b62eb800b258af60fb9d *man/bordeaux.Rd 3fe44346d6afa011d10de92cf39d1d43 *man/classify.Rd 06bc84d4552ad6d41cb5f5d694dfa61a *man/corRatio.Rd ce4a10c06ed6bba594bc8644f7b43f53 *man/desDA.Rd 7abcbadc0f2c9d1a3a323535b62a4a46 *man/discPower.Rd ee8faab3c23da493d2a3b1d4c5a00a21 *man/disqual.Rd e3feb146913dd8ab12fa82471295954b *man/easyMCA.Rd aca44e974e3f206272d3e72e71a1a07b *man/geoDA.Rd d67ff43c9233a1420594e32df58db070 *man/getWithin.Rd b7256b23c6cd3b471d43d95474094e65 *man/groupMeans.Rd 78fe3bbd4693d1a3eb137e2a135c019a *man/groupMedians.Rd 67a0596127552484cde58b49cbe902b8 *man/groupQuants.Rd 1f49148a576acda42b40c4fca99ea2ed *man/groupStds.Rd dea21d2afd0f8087af52e9571afa7e05 *man/groupVars.Rd 458a5bcadb6740b50f8d4ff0ddd4a33b *man/infarctus.Rd 7c4934b52da56b4529ef7cd1b7f733b8 *man/insurance.Rd c2453af1839bd39025c3a0a20c74cd8a *man/linDA.Rd be29b251e53a40258565c7ea9bd535c7 *man/my_plsDA.Rd 673cc2469d8e8a464c4a668b265d45a2 *man/plsDA.Rd 6d3bd69fdc7c348ea314b1a756c7dcf3 *man/plsDA_old.Rd 3b80ef65dc9173ef7b8cc4b93a068011 *man/quaDA.Rd bc27de329bc7fd008b41807cac869457 *man/totalCov.Rd 9fe81fc5a4ae736d0f473a5335e97b9a *man/totalSS.Rd 5aefa6dea29aca279b7eb6c4440f49e4 *man/withinCov.Rd dbb8b7e044a690b3e34e76db3621ed6b *man/withinSS.Rd DiscriMiner/DESCRIPTION0000644000176200001440000000307512241261076014176 0ustar liggesusersPackage: DiscriMiner Type: Package Title: Tools of the Trade for Discriminant Analysis Version: 0.1-29 Date: 2013-11-14 Authors@R: c( person("Gaston", "Sanchez", email = "gaston.stat@gmail.com", role = c("aut", "cre")), person("Charles", "Determan", role = "ctb")) Depends: R (>= 2.15.0) Suggests: MASS, FactoMineR Description: Functions for Discriminant Analysis and Classification purposes covering various methods such as descriptive, geometric, linear, quadratic, PLS, as well as qualitative discriminant analyses License: GPL-3 URL: http://www.gastonsanchez.com LazyData: yes Collate: 'DiscriMiner-package.R' 'FRatio.R' 'betweenCov.R' 'betweenSS.R' 'binarize.R' 'classify.R' 'corRatio.R' 'desDA.R' 'discPower.R' 'disqual.R' 'easyMCA.R' 'geoDA.R' 'getWithin.R' 
'groupMeans.R' 'groupMedians.R' 'groupQuants.R' 'groupStds.R' 'groupVars.R' 'linDA.R' 'my_betweenCov.R' 'my_catDA.R' 'my_discFunctions.R' 'my_geoDA.R' 'my_groupMeans.R' 'my_linDA.R' 'my_mca.R' 'my_plsDA.R' 'my_quaDA.R' 'my_tdc.R' 'my_verify.R' 'my_withinCov.R' 'plot.plsda.R' 'print.desda.R' 'print.disqual.R' 'print.geoda.R' 'print.linda.R' 'print.plsda.R' 'print.quada.R' 'print.qualmca.R' 'quaDA.R' 'totalCov.R' 'totalSS.R' 'withinCov.R' 'withinSS.R' 'my_plsDA_old.R' 'plsDA_old.R' 'plsDA.R' Packaged: 2013-11-14 19:52:13 UTC; Gaston Author: Gaston Sanchez [aut, cre], Charles Determan [ctb] Maintainer: Gaston Sanchez NeedsCompilation: no Repository: CRAN Date/Publication: 2013-11-15 00:52:30 DiscriMiner/man/0000755000176200001440000000000012160626466013246 5ustar liggesusersDiscriMiner/man/my_plsDA.Rd0000644000176200001440000000153612226714217015245 0ustar liggesusers\name{my_plsDA} \alias{my_plsDA} \title{PLS Discriminant Analysis} \usage{ my_plsDA(X, y, learn, test, autosel, comps, cv = "LOO", k = NA, retain.models = FALSE) } \arguments{ \item{X}{matrix or data.frame with explanatory variables} \item{y}{vector or factor with group membership} \item{learn}{vector of learning observations} \item{test}{vector of testing observations} \item{autosel}{logical indicating automatic selection of PLS comps} \item{comps}{number of PLS components (only when autosel=FALSE)} \item{cv}{cross validation method. Options are \code{"LOO"} (Leave-One-Out) and \code{"LKO"} (Leave-K fold-Out)} \item{k}{fold left out if using LKO} \item{retain.models}{whether to retain lower models (i.e. all lower component results)} } \description{ Perform a PLS discriminant analysis } \keyword{internal} DiscriMiner/man/withinCov.Rd0000644000176200001440000000174112160236532015501 0ustar liggesusers\name{withinCov} \alias{withinCov} \title{Within-class Covariance Matrix} \usage{ withinCov(variables, group, div_by_n = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (No missing values are allowed)} \item{group}{vector or factor with group memberships (No missing values are allowed)} \item{div_by_n}{logical indicating division by number of observations} } \description{ Calculates the within-class covariance matrix } \details{ When \code{div_by_n=TRUE} the covariance matrices are divided by n (number of observations), otherwise they are divided by n-1 } \examples{ \dontrun{ # load iris dataset data(iris) # within-class covariance matrix (dividing by n-1) withinCov(iris[,1:4], iris[,5]) # within-class covariance matrix (dividing by n) withinCov(iris[,1:4], iris[,5], div_by_n=TRUE) } } \author{ Gaston Sanchez } \seealso{ \code{\link{withinSS}}, \code{\link{betweenCov}}, \code{\link{totalCov}} } DiscriMiner/man/withinSS.Rd0000644000176200001440000000126012160236532015273 0ustar liggesusers\name{withinSS} \alias{withinSS} \title{Within-class Sum of Squares Matrix} \usage{ withinSS(variables, group) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (No missing values are allowed)} \item{group}{vector or factor with group memberships (No missing values are allowed)} } \description{ Calculates within-class sum of squares and cross product matrix (a.k.a. 
within-class scatter matrix) } \examples{ \dontrun{ # load iris dataset data(iris) # within-class scatter matrix withinSS(iris[,1:4], iris[,5]) } } \author{ Gaston Sanchez } \seealso{ \code{\link{withinCov}}, \code{\link{betweenSS}}, \code{\link{totalSS}} } DiscriMiner/man/FRatio.Rd0000644000176200001440000000132512160236532014711 0ustar liggesusers\name{FRatio} \alias{FRatio} \title{F-Statistic Ratio} \usage{ FRatio(variable, group) } \arguments{ \item{variable}{a quantitative variable} \item{group}{a vector or factor with group memberships (i.e. qualitative variable)} } \value{ F-statistic and its p-value } \description{ Calculates the F-statistic between a quantitative variable and a qualitative variable } \examples{ \dontrun{ # load bordeaux wines dataset data(bordeaux) # F-statistic ratio between temperature and quality FRatio(bordeaux$temperature, bordeaux$quality) } } \author{ Gaston Sanchez } \references{ Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris. } \seealso{ \code{\link{discPower}}, \code{\link{corRatio}} } DiscriMiner/man/infarctus.Rd0000644000176200001440000000207312160236532015524 0ustar liggesusers\docType{data} \name{infarctus} \alias{infarctus} \title{Infarctus dataset} \format{A data frame with 101 observations on the following 8 variables. \tabular{ll}{ \code{FRCAR} \tab Frequence Cardiaque (i.e. heart rate)\cr \code{INCAR} \tab Index Cardiaque (cardiac index)\cr \code{INSYS} \tab Index Systolique (systolic index)\cr \code{PRDIA} \tab Pression Diastolique (diastolic pressure)\cr \code{PAPUL} \tab Pression Arterielle Pulmonaire (pulmonary artery pressure)\cr \code{PVENT} \tab Pression Ventriculaire (ventricular pressure)\cr \code{REPUL} \tab Resistance Pulmonaire (pulmonary resistance)\cr \code{PRONO} \tab Pronostic (prognosis): a factor with levels \code{dead} and \code{survive}\cr }} \description{ Infarctus dataset from Saporta (2006) } \examples{ \dontrun{ # load data data(infarctus) # summary of variables summary(infarctus) } } \references{ Chapter 18: Analyse discriminante et regression logistique, pp 453-454 \cr Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}. Editions Technip, Paris. } \keyword{datasets} DiscriMiner/man/discPower.Rd0000644000176200001440000000156212160374124015467 0ustar liggesusers\name{discPower} \alias{discPower} \title{Discriminant Power} \usage{ discPower(variables, group) } \arguments{ \item{variables}{matrix or data frame with explanatory variables} \item{group}{vector or factor with group membership} } \value{ A data frame containing the following columns \item{correl_ratio}{Correlation Ratios} \item{wilks_lambda}{Wilks' Lambda} \item{F_statistic}{F-statistic} \item{p_value}{p-value of F-statistic} } \description{ Measures the Discriminant Power of explanatory variables } \details{ No missing values are allowed } \examples{ \dontrun{ # bordeaux wines dataset data(bordeaux) # discriminant power dp = discPower(bordeaux[,2:5], bordeaux$quality) dp } } \author{ Gaston Sanchez } \references{ Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris.
} \seealso{ \code{\link{corRatio}}, \code{\link{FRatio}} } DiscriMiner/man/groupStds.Rd0000644000176200001440000000146312160236532015522 0ustar liggesusers\name{groupStds} \alias{groupStds} \title{Group Standard Deviations} \usage{ groupStds(variables, group, na.rm = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (may contain missing values)} \item{group}{vector or factor with group memberships} \item{na.rm}{logical indicating whether missing values should be removed} } \value{ matrix of group standard deviations (with variables in the rows, and groups in the columns) } \description{ Calculates the standard deviations for each group } \examples{ \dontrun{ # dataset iris data(iris) # group standard deviations groupStds(iris[,1:4], iris[,5]) } } \author{ Gaston Sanchez } \seealso{ \code{\link{groupMeans}}, \code{\link{groupVars}}, \code{\link{groupMedians}}, \code{\link{groupQuants}} } DiscriMiner/man/groupMedians.Rd0000644000176200001440000000141212160236532016161 0ustar liggesusers\name{groupMedians} \alias{groupMedians} \title{Group Medians} \usage{ groupMedians(variables, group, na.rm = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (may contain missing values)} \item{group}{vector or factor with group memberships} \item{na.rm}{logical indicating whether missing values should be removed} } \value{ matrix of group medians (with variables in the rows, and groups in the columns) } \description{ Calculates the medians for each group } \examples{ \dontrun{ # dataset iris data(iris) # group medians groupMedians(iris[,1:4], iris[,5]) } } \author{ Gaston Sanchez } \seealso{ \code{\link{groupVars}}, \code{\link{groupStds}}, \code{\link{groupMeans}}, \code{\link{groupQuants}} } DiscriMiner/man/desDA.Rd0000644000176200001440000000470012160374124014505 0ustar liggesusers\name{desDA} \alias{desDA} \title{Descriptive Discriminant Analysis} \usage{ desDA(variables, group, covar = "within") } \arguments{ \item{variables}{matrix or data frame with explanatory variables} \item{group}{vector or factor with group memberships} \item{covar}{character string indicating the covariance matrix to be used. Options are \code{"within"} and \code{"total"}} } \value{ An object of class \code{"desda"}, basically a list with the following elements \item{power}{table with discriminant power of the explanatory variables} \item{values}{table of eigenvalues} \item{discrivar}{table of discriminant variables, i.e. the coefficients of the linear discriminant functions} \item{discor}{table of correlations between the variables and the discriminant axes} \item{scores}{table of discriminant scores for each observation} } \description{ Performs a Descriptive Discriminant Analysis (a.k.a. Factorial Discriminant Analysis from the french \emph{Analyse Factorielle Discriminante}) } \details{ When \code{covar="within"} the estimated pooled within-class covariance matrix is used in the calculations. \cr When \code{covar="total"} the total covariance matrix is used in the calculations. \cr The difference between \code{covar="within"} and \code{covar="total"} is in the obtained eigenvalues.
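(As a rough, worked illustration of this difference, offered as a rule of thumb rather than a documented guarantee of the implementation: if \code{lw} denotes an eigenvalue obtained with \code{covar="within"}, the corresponding \code{covar="total"} eigenvalue is approximately \code{lw/(1 + lw)}, so the \code{"total"} eigenvalues are bounded above by 1.)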
The estimated pooled within-class covariance matrix is actually the within-class covariance matrix divided by the number of observations minus the number of classes (see \code{\link{getWithin}}) } \examples{ \dontrun{ # load bordeaux wines dataset data(bordeaux) # descriptive discriminant analysis with within covariance matrix my_dda1 = desDA(bordeaux[,2:5], bordeaux$quality) my_dda1 # descriptive discriminant analysis with total covariance matrix my_dda2 = desDA(bordeaux[,2:5], bordeaux$quality, covar="total") my_dda2 # plot factor coordinates with ggplot library(ggplot2) bordeaux$f1 = my_dda1$scores[,1] bordeaux$f2 = my_dda1$scores[,2] ggplot(data=bordeaux, aes(x=f1, y=f2, colour=quality)) + geom_hline(yintercept=0, colour="gray70") + geom_vline(xintercept=0, colour="gray70") + geom_text(aes(label=year), size=4) + ggtitle("Discriminant Map - Bordeaux Wines (years)") } } \author{ Gaston Sanchez } \references{ Lebart L., Piron M., Morineau A. (2006) \emph{Statistique Exploratoire Multidimensionnelle}. Dunod, Paris. } \seealso{ \code{\link{discPower}} } DiscriMiner/man/betweenSS.Rd0000644000176200001440000000126712160236532015431 0ustar liggesusers\name{betweenSS} \alias{betweenSS} \title{Between-class Sum of Squares Matrix} \usage{ betweenSS(variables, group) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (No missing values are allowed)} \item{group}{vector or factor with group membership (No missing values are allowed)} } \description{ Calculates between-class sum of squares and cross product matrix (a.k.a. between-class scatter matrix) } \examples{ \dontrun{ # load iris dataset data(iris) # between-class scatter matrix betweenSS(iris[,1:4], iris[,5]) } } \author{ Gaston Sanchez } \seealso{ \code{\link{betweenCov}}, \code{\link{withinSS}}, \code{\link{totalSS}} } DiscriMiner/man/totalSS.Rd0000644000176200001440000000101012160236532015115 0ustar liggesusers\name{totalSS} \alias{totalSS} \title{Total Sum of Squares Matrix} \usage{ totalSS(variables) } \arguments{ \item{variables}{matrix or data frame with explanatory variables} } \description{ Calculates the total sum of squares and cross product matrix (a.k.a. total scatter matrix) } \examples{ \dontrun{ # load iris dataset data(iris) # total scatter matrix totalSS(iris[,1:4]) } } \author{ Gaston Sanchez } \seealso{ \code{\link{totalCov}}, \code{\link{betweenSS}}, \code{\link{withinSS}} } DiscriMiner/man/corRatio.Rd0000644000176200001440000000131412160236532015305 0ustar liggesusers\name{corRatio} \alias{corRatio} \title{Correlation Ratio} \usage{ corRatio(variable, group) } \arguments{ \item{variable}{a single quantitative variable} \item{group}{vector or factor with group memberships (qualitative variable)} } \description{ Calculates the correlation ratio between a quantitative variable and a qualitative variable } \details{ No missing values are allowed } \examples{ \dontrun{ # iris dataset data(iris) # correlation ratio between Petal-Length and Species corRatio(iris$Petal.Length, iris$Species) } } \author{ Gaston Sanchez } \references{ Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris.
} \seealso{ \code{\link{FRatio}}, \code{\link{discPower}} } DiscriMiner/man/binarize.Rd0000644000176200001440000000145612160236532015335 0ustar liggesusers\name{binarize} \alias{binarize} \title{Binarize a data frame into a super-indicator matrix} \usage{ binarize(variables) } \arguments{ \item{variables}{data frame with categorical variables (coded as factors)} } \value{ A super-indicator matrix of binary data } \description{ Convert a data frame with factors into a super-indicator matrix (a.k.a. complete disjunctive table from the french \emph{tableau disjonctive complete}) } \examples{ \dontrun{ # load insurance cars dataset data(insurance) # super-indicator matrix of binary data bin_insure = binarize(insurance[,-1]) head(bin_insure) } } \author{ Gaston Sanchez } \references{ Lebart L., Piron M., Morineau A. (2006) \emph{Statistique Exploratoire Multidimensionnelle}. Dunod, Paris. } \seealso{ \code{\link{easyMCA}} } DiscriMiner/man/classify.Rd0000644000176200001440000000253212160374124015343 0ustar liggesusers\name{classify} \alias{classify} \title{Classification function} \usage{ classify(DA_object, newdata) } \arguments{ \item{DA_object}{discriminant analysis object} \item{newdata}{vector or matrix or data frame with variables for which their classes will be calculated} } \value{ A list with the following elements \item{scores}{discriminant scores for each observation} \item{pred_class}{predicted class} } \description{ Classify provided observations based on a given Discriminant object } \details{ A \code{DA_object} is a discriminant analysis (DA) object obtained from a geometric predictive DA (class \code{"geoda"}), a linear DA (class \code{"linda"}), a quadratic DA (class \code{"quada"}), or a DISQUAL analysis (class \code{"disqual"}) } \examples{ \dontrun{ # load iris dataset data(iris) # linear discriminant analysis my_lin1 = linDA(iris[,1:4], iris$Species) # select a sample of 15 observations set.seed(111) obs = sample(1:nrow(iris), 15) some_data = iris[obs, 1:4] # classify some_data get_classes = classify(my_lin1, some_data) get_classes # compare the results against original class table(iris$Species[obs], get_classes$pred_class) } } \author{ Gaston Sanchez } \seealso{ \code{\link{geoDA}}, \code{\link{linDA}}, \code{\link{quaDA}}, \code{\link{plsDA}}, \code{\link{disqual}} } DiscriMiner/man/totalCov.Rd0000644000176200001440000000150612160236532015321 0ustar liggesusers\name{totalCov} \alias{totalCov} \title{Total Covariance Matrix} \usage{ totalCov(variables, div_by_n = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (No missing values are allowed)} \item{div_by_n}{logical indicating division by number of observations} } \description{ Calculates total covariance matrix } \details{ When \code{div_by_n=TRUE} the covariance matrices are divided by n (number of observations), otherwise they are divided by n-1 } \examples{ \dontrun{ # load iris dataset data(iris) # total covariance matrix (dividing by n-1) totalCov(iris[,1:4]) # total covariance matrix (dividing by n) totalCov(iris[,1:4], div_by_n=TRUE) } } \author{ Gaston Sanchez } \seealso{ \code{\link{totalSS}}, \code{\link{betweenCov}}, \code{\link{withinCov}} } DiscriMiner/man/quaDA.Rd0000644000176200001440000000517412220632266014527 0ustar liggesusers\name{quaDA} \alias{quaDA} \title{Quadratic Discriminant Analysis} \usage{ quaDA(variables, group, prior = NULL, validation = NULL, learn = NULL, test = NULL, prob = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory 
variables} \item{group}{vector or factor with group memberships} \item{prior}{optional vector of prior probabilities. Default \code{prior=NULL} implies group proportions} \item{validation}{type of validation, either \code{"crossval"} or \code{"learntest"}. Default \code{NULL}} \item{learn}{optional vector of indices for a learn-set. Only used when \code{validation="learntest"}. Default \code{NULL}} \item{test}{optional vector of indices for a test-set. Only used when \code{validation="learntest"}. Default \code{NULL}} \item{prob}{logical indicating whether the group classification results should be expressed in probability terms} } \value{ An object of class \code{"quada"}, basically a list with the following elements: \item{confusion}{confusion matrix} \item{scores}{discriminant scores for each observation} \item{classification}{assigned class} \item{error_rate}{misclassification error rate} } \description{ Performs a Quadratic Discriminant Analysis } \details{ When \code{validation=NULL} there is no validation \cr When \code{validation="crossval"} cross-validation is performed by randomly separating the observations in ten groups. \cr When \code{validation="learntest"} validation is performed by providing a learn-set and a test-set of observations. \cr } \examples{ \dontrun{ # load iris dataset data(iris) # quadratic discriminant analysis with no validation my_qua1 = quaDA(iris[,1:4], iris$Species) my_qua1$confusion my_qua1$error_rate # quadratic discriminant analysis with cross-validation my_qua2 = quaDA(iris[,1:4], iris$Species, validation="crossval") my_qua2$confusion my_qua2$error_rate # quadratic discriminant analysis with learn-test validation learning = c(1:40, 51:90, 101:140) testing = c(41:50, 91:100, 141:150) my_qua3 = quaDA(iris[,1:4], iris$Species, validation="learntest", learn=learning, test=testing) my_qua3$confusion my_qua3$error_rate } } \author{ Gaston Sanchez } \references{ Lebart L., Piron M., Morineau A. (2006) \emph{Statistique Exploratoire Multidimensionnelle}. Dunod, Paris. Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris. Tuffery S. (2011) \emph{Data Mining and Statistics for Decision Making}. Wiley, Chichester.
} \seealso{ \code{\link{classify}}, \code{\link{desDA}}, \code{\link{geoDA}}, \code{\link{linDA}}, \code{\link{plsDA}} } DiscriMiner/man/betweenCov.Rd0000644000176200001440000000176112160255366015640 0ustar liggesusers\name{betweenCov} \alias{betweenCov} \title{Between-class Covariance Matrix} \usage{ betweenCov(variables, group, div_by_n = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (No missing values are allowed)} \item{group}{vector or factor with group memberships (No missing values are allowed)} \item{div_by_n}{logical indicating division by number of observations} } \description{ Calculates between-class covariance matrix } \details{ When \code{div_by_n=TRUE} the covariance matrices are divided by n (number of observations), otherwise they are divided by n-1 } \examples{ \dontrun{ # load iris dataset data(iris) # between-class covariance matrix (dividing by n-1) betweenCov(iris[,1:4], iris[,5]) # between-class covariance matrix (dividing by n) betweenCov(iris[,1:4], iris[,5], div_by_n=TRUE) } } \author{ Gaston Sanchez } \seealso{ \code{\link{getWithin}}, \code{\link{betweenSS}}, \code{\link{withinCov}}, \code{\link{totalCov}} } DiscriMiner/man/groupQuants.Rd0000644000176200001440000000156712160236532016065 0ustar liggesusers\name{groupQuants} \alias{groupQuants} \title{Group Quantiles} \usage{ groupQuants(variables, group, prob, na.rm = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (may contain missing values)} \item{group}{vector or factor with group memberships} \item{prob}{probability value (numeric value between 0 and 1)} \item{na.rm}{logical indicating whether missing values should be removed} } \value{ matrix of group quantiles (with variables in the rows, and groups in the columns) } \description{ Calculates the specified quantiles for each group } \examples{ \dontrun{ # dataset iris data(iris) # group quantiles with prob=0.20 groupQuants(iris[,1:4], iris[,5], prob=0.20) } } \author{ Gaston Sanchez } \seealso{ \code{\link{groupMeans}}, \code{\link{groupVars}}, \code{\link{groupStds}}, \code{\link{groupMedians}} } DiscriMiner/man/plsDA.Rd0000644000176200001440000000667512226714217014545 0ustar liggesusers\name{plsDA} \alias{plsDA} \title{PLS Discriminant Analysis} \usage{ plsDA(variables, group, autosel = TRUE, comps = 2, validation = NULL, learn = NULL, test = NULL, cv = "LOO", k = NULL, retain.models = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables} \item{group}{vector or factor with group memberships} \item{autosel}{logical indicating automatic selection of PLS components by cross-validation. Default \code{autosel=TRUE}} \item{comps}{integer greater than one indicating the number of PLS components to retain. Used only when \code{autosel=FALSE}} \item{validation}{type of validation, either \code{NULL} or \code{"learntest"}. Default \code{NULL}} \item{learn}{optional vector of indices for a learn-set. Only used when \code{validation="learntest"}. Default \code{NULL}} \item{test}{optional vector of indices for a test-set. Only used when \code{validation="learntest"}. Default \code{NULL}} \item{cv}{string indicating the type of cross-validation. Available options are \code{"LOO"} (Leave-One-Out) and \code{"LKO"} (Leave-K fold-Out)} \item{k}{fold left out if using LKO (usually 7 or 10)} \item{retain.models}{whether to retain lower models (i.e.
all lower component results)} } \value{ An object of class \code{"plsda"}, basically a list with the following elements: \item{functions}{table with discriminant functions} \item{confusion}{confusion matrix} \item{scores}{discriminant scores for each observation} \item{loadings}{loadings} \item{y.loadings}{y loadings} \item{classification}{assigned class} \item{error_rate}{misclassification error rate} \item{components}{PLS components} \item{Q2}{quality of loo cross-validation} \item{R2}{R-squared coefficients} \item{VIP}{Variable Importance for Projection} \item{comp_vars}{correlations between components and variables} \item{comp_group}{correlations between components and groups} } \description{ Performs a Partial Least Squares (PLS) Discriminant Analysis, with the option of a random leave-k-fold-out cross-validation } \details{ When \code{validation=NULL} leave-one-out (loo) cross-validation is performed. \cr When \code{validation="learntest"} validation is performed by providing a learn-set and a test-set of observations. \cr } \examples{ \dontrun{ # load iris dataset data(iris) # PLS discriminant analysis specifying number of components = 2 my_pls1 = plsDA(iris[,1:4], iris$Species, autosel=FALSE, comps=2) my_pls1$confusion my_pls1$error_rate # plot circle of correlations plot(my_pls1) # PLS discriminant analysis with automatic selection of components my_pls2 = plsDA(iris[,1:4], iris$Species, autosel=TRUE) my_pls2$confusion my_pls2$error_rate # PLS discriminant analysis with learn-test validation learning = c(1:40, 51:90, 101:140) testing = c(41:50, 91:100, 141:150) my_pls3 = plsDA(iris[,1:4], iris$Species, validation="learntest", learn=learning, test=testing) my_pls3$confusion my_pls3$error_rate } } \author{ Charles Determan Jr, Gaston Sanchez } \references{ Tenenhaus M. (1998) \emph{La Regression PLS}. Editions Technip, Paris. Perez-Enciso M., Tenenhaus M. (2003) \emph{Prediction of clinical outcome with microarray data: a partial least squares discriminant analysis (PLS-DA) approach}. Human Genetics 112: 581-592. } \seealso{ \code{\link{classify}}, \code{\link{geoDA}}, \code{\link{linDA}}, \code{\link{quaDA}} } DiscriMiner/man/bordeaux.Rd0000644000176200001440000000150512160236532015336 0ustar liggesusers\docType{data} \name{bordeaux} \alias{bordeaux} \title{Bordeaux Wines Dataset} \format{A data frame with 34 observations on the following 6 variables. \tabular{ll}{ \code{year} \tab year of harvest\cr \code{temperature} \tab sum of daily average temperatures (in Celsius degrees)\cr \code{sun} \tab duration of insolation (in hours)\cr \code{heat} \tab number of super-hot days\cr \code{rain} \tab rain level (in millimeters)\cr \code{quality} \tab wine quality: a factor with levels \code{bad}, \code{good}, and \code{medium}\cr }} \description{ Quality measures of wines from Bordeaux, France } \examples{ \dontrun{ # load data data(bordeaux) # structure of data str(bordeaux) } } \references{ Chapter 10: Analyse Discriminante, page 353. \cr Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris. } \keyword{datasets} DiscriMiner/man/DiscriMiner-package.Rd0000644000176200001440000000271412160254542017332 0ustar liggesusers\docType{package} \name{DiscriMiner-package} \alias{DiscriMiner-package} \title{Tools of the Trade for Discriminant Analysis} \description{ DiscriMiner contains several functions for Discriminant Analysis and Classification purposes covering various methods such as descriptive, geometric, linear, quadratic, PLS, as well as qualitative discriminant analyses.
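As a minimal sketch of a typical session (the choice of the iris data and the object names are purely illustrative), one would fit a model with \code{my_lda = linDA(iris[,1:4], iris$Species)} and then assign new observations with \code{classify(my_lda, newdata)}.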
} \details{ \tabular{ll}{ Package: \tab DiscriMiner\cr Type: \tab Package\cr Version: \tab 0.1-29\cr Date: \tab 2013-11-14\cr License: \tab GPL-3\cr } } \author{ Gaston Sanchez Maintainer: Gaston Sanchez } \references{ \url{http://www.gastonsanchez.com/discriminer} Lebart L., Piron M., Morineau A. (2006) \emph{Statistique exploratoire multidimensionnelle}. Dunod, Paris. Nakache J-P., Confais J. (2003) \emph{Statistique explicative appliquee}. Editions Technip, Paris. Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}. Editions Technip, Paris. Tenenhaus M. (1998) \emph{La Regression PLS}. Editions Technip, Paris. Tenenhaus M. (2007) \emph{Statistique}. Dunod, Paris. Tuffery S. (2008) \emph{Data Mining et Statistique Decisionnelle}. Editions Technip, Paris. Tuffery S. (2011) \emph{Data Mining and Statistics for Decision Making}. Wiley, Chichester. \emph{Multiple Correspondence Analysis and Related Methods}. (2006) Edited by Michael Greenacre and Jorg Blasius. Chapman and Hall/CRC } \keyword{package} DiscriMiner/man/linDA.Rd0000644000176200001440000000531512160374124014517 0ustar liggesusers\name{linDA} \alias{linDA} \title{Linear Discriminant Analysis} \usage{ linDA(variables, group, prior = NULL, validation = NULL, learn = NULL, test = NULL, prob = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables} \item{group}{vector or factor with group memberships} \item{prior}{optional vector of prior probabilities. Default \code{prior=NULL} implies group proportions} \item{validation}{type of validation, either \code{"crossval"} or \code{"learntest"}. Default \code{NULL}} \item{learn}{optional vector of indices for a learn-set. Only used when \code{validation="learntest"}. Default \code{NULL}} \item{test}{optional vector of indices for a test-set. Only used when \code{validation="learntest"}. Default \code{NULL}} \item{prob}{logical indicating whether the group classification results should be expressed in probability terms} } \value{ An object of class \code{"linda"}, basically a list with the following elements: \item{functions}{table with discriminant functions} \item{confusion}{confusion matrix} \item{scores}{discriminant scores for each observation} \item{classification}{assigned class} \item{error_rate}{misclassification error rate} } \description{ Performs a Linear Discriminant Analysis } \details{ When \code{validation=NULL} there is no validation \cr When \code{validation="crossval"} cross-validation is performed by randomly separating the observations in ten groups. \cr When \code{validation="learntest"} validation is performed by providing a learn-set and a test-set of observations. \cr } \examples{ \dontrun{ # load iris dataset data(iris) # linear discriminant analysis with no validation my_lin1 = linDA(iris[,1:4], iris$Species) my_lin1$confusion my_lin1$error_rate # linear discriminant analysis with cross-validation my_lin2 = linDA(iris[,1:4], iris$Species, validation="crossval") my_lin2$confusion my_lin2$error_rate # linear discriminant analysis with learn-test validation learning = c(1:40, 51:90, 101:140) testing = c(41:50, 91:100, 141:150) my_lin3 = linDA(iris[,1:4], iris$Species, validation="learntest", learn=learning, test=testing) my_lin3$confusion my_lin3$error_rate } } \author{ Gaston Sanchez } \references{ Lebart L., Piron M., Morineau A. (2006) \emph{Statistique Exploratoire Multidimensionnelle}. Dunod, Paris. Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}. Editions Technip, Paris. Tuffery S.
(2011) \emph{Data Mining and Statistics for Decision Making}. Wiley, Chichester. } \seealso{ \code{\link{classify}}, \code{\link{desDA}}, \code{\link{geoDA}}, \code{\link{quaDA}}, \code{\link{plsDA}} } DiscriMiner/man/groupVars.Rd0000644000176200001440000000141312160236532015513 0ustar liggesusers\name{groupVars} \alias{groupVars} \title{Group Variances} \usage{ groupVars(variables, group, na.rm = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (may contain missing values)} \item{group}{vector or factor with group memberships} \item{na.rm}{logical indicating whether missing values should be removed} } \value{ matrix of group variances (with variables in the rows, and groups in the columns) } \description{ Calculates the variances for each group } \examples{ \dontrun{ # dataset iris data(iris) # group variances groupVars(iris[,1:4], iris[,5]) } } \author{ Gaston Sanchez } \seealso{ \code{\link{groupMeans}}, \code{\link{groupStds}}, \code{\link{groupMedians}}, \code{\link{groupQuants}} } DiscriMiner/man/geoDA.Rd0000644000176200001440000000437112160374124014510 0ustar liggesusers\name{geoDA} \alias{geoDA} \title{Geometric Predictive Discriminant Analysis} \usage{ geoDA(variables, group, validation = NULL, learn = NULL, test = NULL) } \arguments{ \item{variables}{matrix or data frame with explanatory variables} \item{group}{vector or factor with group memberships} \item{validation}{type of validation, either \code{"crossval"} or \code{"learntest"}. Default \code{NULL}} \item{learn}{optional vector of indices for a learn-set. Only used when \code{validation="learntest"}. Default \code{NULL}} \item{test}{optional vector of indices for a test-set. Only used when \code{validation="learntest"}. Default \code{NULL}} } \value{ An object of class \code{"geoda"}, basically a list with the following elements: \item{functions}{table with discriminant functions} \item{confusion}{confusion matrix} \item{scores}{discriminant scores for each observation} \item{classification}{assigned class} \item{error_rate}{misclassification error rate} } \description{ Performs a Geometric Predictive Discriminant Analysis } \details{ When \code{validation=NULL} there is no validation \cr When \code{validation="crossval"} cross-validation is performed by randomly separating the observations in ten groups. \cr When \code{validation="learntest"} validation is performed by providing a learn-set and a test-set of observations. \cr } \examples{ \dontrun{ # load iris dataset data(iris) # geometric predictive discriminant analysis with no validation my_geo1 = geoDA(iris[,1:4], iris$Species) my_geo1$confusion my_geo1$error_rate # geometric predictive discriminant analysis with cross-validation my_geo2 = geoDA(iris[,1:4], iris$Species, validation="crossval") my_geo2$confusion my_geo2$error_rate } } \author{ Gaston Sanchez } \references{ Lebart L., Piron M., Morineau A. (2006) \emph{Statistique Exploratoire Multidimensionnelle}. Dunod, Paris. Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}. Editions Technip, Paris. Tuffery S. (2011) \emph{Data Mining and Statistics for Decision Making}. Wiley, Chichester.
} \seealso{ \code{\link{classify}}, \code{\link{desDA}}, \code{\link{linDA}}, \code{\link{quaDA}}, \code{\link{plsDA}} } DiscriMiner/man/insurance.Rd0000644000176200001440000000323512160236532015516 0ustar liggesusers\docType{data} \name{insurance} \alias{insurance} \title{Insurance Dataset} \format{A data frame with 1106 observations on the following 10 variables. \tabular{ll}{ \code{Claims} \tab Group variable. A factor with levels \code{bad} and \code{good}\cr \code{Use} \tab Type of Use. A factor with levels \code{private} and \code{professional}\cr \code{Type} \tab Insurance Type. A factor with levels \code{companies}, \code{female}, and \code{male}\cr \code{Language} \tab Language. A factor with levels \code{flemish} and \code{french}\cr \code{BirthCohort} \tab Birth Cohort. A factor with levels \code{BD_1890_1949}, \code{BD_1950_1973}, and \code{BD_unknown}\cr \code{Region} \tab Geographic Region. A factor with levels \code{Brussels} and \code{Other_regions}\cr \code{BonusMalus} \tab Level of bonus-malus. A factor with levels \code{BM_minus} and \code{BM_plus}\cr \code{YearSuscrip} \tab Year of Subscription. A factor with levels \code{YS<86} and \code{YS>=86}\cr \code{Horsepower} \tab Horsepower. A factor with levels \code{HP<=39} and \code{HP>=40}\cr \code{YearConstruc} \tab Year of vehicle construction. A factor with levels \code{YC_33_89} and \code{YC_90_91}\cr }} \description{ Dataset of car-insurance customers from Belgium in 1992 } \details{ Dataset for DISQUAL method } \examples{ \dontrun{ # load data data(insurance) # structure str(insurance) } } \references{ Saporta G., Niang N. (2006) Correspondence Analysis and Classification. In \emph{Multiple Correspondence Analysis and Related Methods}, M. Greenacre and J. Blasius, Eds., pp 371-392. Chapman & Hall/CRC, Boca Raton, Florida, USA. } \seealso{ \code{\link{disqual}} } \keyword{datasets} DiscriMiner/man/groupMeans.Rd0000644000176200001440000000137212160236532015647 0ustar liggesusers\name{groupMeans} \alias{groupMeans} \title{Group Means} \usage{ groupMeans(variables, group, na.rm = FALSE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (may contain missing values)} \item{group}{vector or factor with group memberships} \item{na.rm}{logical indicating whether missing values should be removed} } \value{ matrix of group means (with variables in the rows, and groups in the columns) } \description{ Calculates means for each group } \examples{ \dontrun{ # dataset iris data(iris) # group means groupMeans(iris[,1:4], iris[,5]) } } \author{ Gaston Sanchez } \seealso{ \code{\link{groupVars}}, \code{\link{groupStds}}, \code{\link{groupMedians}}, \code{\link{groupQuants}} } DiscriMiner/man/disqual.Rd0000644000176200001440000000527512160374124015177 0ustar liggesusers\name{disqual} \alias{disqual} \title{Discriminant Analysis on Qualitative Variables} \usage{ disqual(variables, group, validation = NULL, learn = NULL, test = NULL, autosel = TRUE, prob = 0.05) } \arguments{ \item{variables}{data frame with qualitative explanatory variables (coded as factors)} \item{group}{vector or factor with group memberships} \item{validation}{type of validation, either \code{"crossval"} or \code{"learntest"}. Default \code{NULL}} \item{learn}{optional vector of indices for a learn-set. Only used when \code{validation="learntest"}. Default \code{NULL}} \item{test}{optional vector of indices for a test-set. Only used when \code{validation="learntest"}. 
Default \code{NULL}} \item{autosel}{logical indicating automatic selection of MCA components} \item{prob}{probability level for automatic selection of MCA components. Default \code{prob = 0.05}} } \value{ An object of class \code{"disqual"}, basically a list with the following elements: \item{raw_coefs}{raw coefficients of discriminant functions} \item{norm_coefs}{normalized coefficients of discriminant functions, ranging from 0 to 1000} \item{confusion}{confusion matrix} \item{scores}{discriminant scores for each observation} \item{classification}{assigned class} \item{error_rate}{misclassification error rate} } \description{ Implementation of the DISQUAL methodology. Disqual performs Fisher's Discriminant Analysis on components from a Multiple Correspondence Analysis } \details{ When \code{validation=NULL} there is no validation \cr When \code{validation="crossval"} cross-validation is performed by randomly separating the observations in ten groups. \cr When \code{validation="learntest"} validation is performed by providing a learn-set and a test-set of observations. \cr } \examples{ \dontrun{ # load insurance dataset data(insurance) # disqual analysis with no validation my_disq1 = disqual(insurance[,-1], insurance[,1], validation=NULL) my_disq1 # disqual analysis with cross-validation my_disq2 = disqual(insurance[,-1], insurance[,1], validation="crossval") my_disq2 } } \author{ Gaston Sanchez } \references{ Lebart L., Piron M., Morineau A. (2006) \emph{Statistique Exploratoire Multidimensionnelle}. Dunod, Paris. Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}. Editions Technip, Paris. Saporta G., Niang N. (2006) Correspondence Analysis and Classification. In \emph{Multiple Correspondence Analysis and Related Methods}, Eds. Michael Greenacre and Jorg Blasius, 371-392. Chapman and Hall/CRC } \seealso{ \code{\link{easyMCA}}, \code{\link{classify}}, \code{\link{binarize}} } DiscriMiner/man/plsDA_old.Rd0000644000176200001440000000625612160626332015377 0ustar liggesusers\name{plsDA_old} \alias{plsDA_old} \title{PLS Discriminant Analysis} \usage{ plsDA_old(variables, group, autosel = TRUE, comps = 2, validation = NULL, learn = NULL, test = NULL, scaled = TRUE) } \arguments{ \item{variables}{matrix or data frame with explanatory variables} \item{group}{vector or factor with group memberships} \item{autosel}{logical indicating automatic selection of PLS components by cross-validation. Default \code{autosel=TRUE}} \item{comps}{integer greater than one indicating the number of PLS components to retain. Used only when \code{autosel=FALSE}} \item{validation}{type of validation, either \code{NULL} or \code{"learntest"}. Default \code{NULL}} \item{learn}{optional vector of indices for a learn-set. Only used when \code{validation="learntest"}. Default \code{NULL}} \item{test}{optional vector of indices for a test-set. Only used when \code{validation="learntest"}.
Default \code{NULL}} \item{scaled}{logical indicating whether to scale the data (default \code{TRUE})} } \value{ An object of class \code{"plsda"}, basically a list with the following elements: \item{functions}{table with discriminant functions} \item{confusion}{confusion matrix} \item{scores}{discriminant scores for each observation} \item{classification}{assigned class} \item{error_rate}{misclassification error rate} \item{components}{PLS components} \item{Q2}{quality of loo cross-validation} \item{R2}{R-squared coefficients} \item{VIP}{Variable Importance for Projection} \item{comp_vars}{correlations between components and variables} \item{comp_group}{correlations between components and groups} } \description{ Performs a Partial Least Squares (PLS) Discriminant Analysis } \details{ When \code{validation=NULL} leave-one-out (loo) cross-validation is performed. \cr When \code{validation="learntest"} validation is performed by providing a learn-set and a test-set of observations. \cr } \note{ This is a previous version of plsDA. Not used anymore. } \examples{ \dontrun{ # load iris dataset data(iris) # PLS discriminant analysis specifying number of components = 2 my_pls1 = plsDA(iris[,1:4], iris$Species, autosel=FALSE, comps=2) my_pls1$confusion my_pls1$error_rate # plot circle of correlations plot(my_pls1) # PLS discriminant analysis with automatic selection of components my_pls2 = plsDA(iris[,1:4], iris$Species, autosel=TRUE) my_pls2$confusion my_pls2$error_rate # PLS discriminant analysis with learn-test validation learning = c(1:40, 51:90, 101:140) testing = c(41:50, 91:100, 141:150) my_pls3 = plsDA(iris[,1:4], iris$Species, validation="learntest", learn=learning, test=testing) my_pls3$confusion my_pls3$error_rate } } \author{ Gaston Sanchez } \references{ Tenenhaus M. (1998) \emph{La Regression PLS}. Editions Technip, Paris. Perez-Enciso M., Tenenhaus M. (2003) \emph{Prediction of clinical outcome with microarray data: a partial least squares discriminant analysis (PLS-DA) approach}. Human Genetics 112: 581-592. } \seealso{ \code{\link{classify}}, \code{\link{geoDA}}, \code{\link{linDA}}, \code{\link{quaDA}} } \keyword{internal} DiscriMiner/man/easyMCA.Rd0000644000176200001440000000172412160374124015012 0ustar liggesusers\name{easyMCA} \alias{easyMCA} \title{Multiple Correspondence Analysis} \usage{ easyMCA(variables) } \arguments{ \item{variables}{data frame with categorical variables (coded as factors)} } \value{ An object of class \code{"qualmca"}, basically a list with the following elements: \item{values}{table with eigenvalues} \item{coefficients}{coefficients of factorial axes} \item{components}{factor coordinates} } \description{ Performs a basic Multiple Correspondence Analysis (MCA) } \examples{ \dontrun{ # load insurance dataset data(insurance) # multiple correspondence analysis mca1 = easyMCA(insurance[,-1]) mca1 } } \author{ Gaston Sanchez } \references{ Lebart L., Piron M., Morineau A. (2006) \emph{Statistique Exploratoire Multidimensionnelle}. Dunod, Paris. Saporta G. (2006) \emph{Probabilites, analyse des donnees et statistique}. Editions Technip, Paris.
} \seealso{ \code{\link{disqual}}, \code{\link{binarize}} } DiscriMiner/man/getWithin.Rd0000644000176200001440000000172612160236532015474 0ustar liggesusers\name{getWithin} \alias{getWithin} \title{Within-class Covariance Matrix} \usage{ getWithin(variables, group) } \arguments{ \item{variables}{matrix or data frame with explanatory variables (No missing values are allowed)} \item{group}{vector or factor with group memberships (No missing values are allowed)} } \description{ Calculates the estimated within-class covariance matrix } \details{ The obtained matrix is the estimated within-class covariance matrix (i.e. within-class covariance matrix divided by its degrees of freedom \code{n-k}, where \code{n} is the number of observations and \code{k} is the number of groups) } \examples{ \dontrun{ # load iris dataset data(iris) # estimated within-class covariance matrix (dividing by n-k) getWithin(iris[,1:4], iris[,5]) # compared to the within-class covariance matrix (dividing by n-1) withinCov(iris[,1:4], iris[,5]) } } \author{ Gaston Sanchez } \seealso{ \code{\link{withinCov}} }
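A short R sketch of the relationship documented above for getWithin and withinCov. This is a minimal, hedged illustration: the equalities encode expectations derived from the .Rd descriptions (scatter matrix divided by n - k versus n - 1), not guarantees about the internal my_withinCov() helper.

# load the package and the iris data used throughout the examples
library(DiscriMiner)
data(iris)

n <- nrow(iris)             # number of observations
k <- nlevels(iris$Species)  # number of groups

# within-class scatter (sum of squares and cross products) matrix
W <- withinSS(iris[, 1:4], iris$Species)

# assumption: getWithin() divides the scatter matrix by its degrees
# of freedom (n - k), as stated in getWithin.Rd
all.equal(getWithin(iris[, 1:4], iris$Species), W / (n - k))

# assumption: withinCov() with the default div_by_n = FALSE divides
# the scatter matrix by n - 1, as stated in withinCov.Rd
all.equal(withinCov(iris[, 1:4], iris$Species), W / (n - 1))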