randomForest/0000755000175100001440000000000012037262201012726 5ustar hornikusersrandomForest/MD50000644000175100001440000000543712037262201013247 0ustar hornikusers393a5ca445f6965873eca0259a17f833 *COPYING e92ce0a08930756cf4b8ace34ec6d5aa *DESCRIPTION 1af1bc3aead1dde855db3123db9ac38c *NAMESPACE 164fc34443e28a0a0940abd99cd175ca *R/MDSplot.R eae0c348160c0e1f5c5140169c7610be *R/classCenter.R 18674a0600679868a42a4ce672f0b43e *R/combine.R 6589f2426e6f9a2a961cc1fe1afbb0fa *R/getTree.R 2cbf5555f37b34e8b13b48075dce0837 *R/grow.R d8c96e25b254dc8ca3d82914496e03f9 *R/importance.R 8aa62dd033ba9178cb95b91a247da748 *R/margin.R 9534d1b1bd2f070a12c0663be4575ca2 *R/na.roughfix.R 79adf4b734d2f4a72ccebaf26257123b *R/outlier.R aeb6b6722cd382522b7cda751c98dd09 *R/partialPlot.R 8ed1742b270d6b2406fea238ca062f46 *R/plot.randomForest.R ff293fe1c4e56366812e5aaf4a5e3843 *R/predict.randomForest.R a34e189949da4530eb9ac7efc04abe7a *R/print.randomForest.R 9a47b11b9fe7f00849a8d5c076f701be *R/randomForest.R 01ae5d2bdc0d843a723756bd76b0b271 *R/randomForest.default.R a2110f7dd81abb19f7596f4f36b41adf *R/randomForest.formula.R 4d163330cc40560f8241d7b781c15cca *R/rfImpute.R efe14c96f36ac3890b1d8cf194dc5ec6 *R/rfNews.R 7e89d590d18f16c8abd89ac6c19384cf *R/rfcv.R 901baf10af3254e9996b36dc0f6f39f9 *R/treesize.R e695b3cfc07facf54856bf366870e360 *R/tuneRF.R 143dac20c44a22e3bd132cad4385e90e *R/varImpPlot.R 18f94384910772d7babc8354b4ebae17 *R/varUsed.R a54ba14b0cb6cc15e4b7687ead9a2bf9 *R/zzz.R 6d430e97b1da9c1b19eabda685c248b3 *data/imports85.rda 9162881b7d318756c63a28e52b41c2d7 *inst/CITATION ebc223b2558844eafd4cb0ea2ba78e19 *inst/NEWS b9b69f61e9ed079e1d9d84df6360d522 *man/MDSplot.Rd 4fcbc72dd2475ff32f5a348bfa71af41 *man/classCenter.Rd bf3608aff76e654e77632a516980f451 *man/combine.Rd d5f3014c1eb53de779654d021027cbf0 *man/getTree.Rd a93fad813c51f713fd94c6fd3a46ce07 *man/grow.Rd 3a63f7f925c1b4ca1bf805c5c7eca4f0 *man/importance.Rd 3720f27c0c909c615d6d7f99826e1d3b *man/imports85.Rd c901b3cb1b2ad299574e76e5d80859ed *man/margin.Rd e155d763fd8fbf4301e13da896760f3f *man/na.roughfix.Rd 8ecd590e5b87afb8e84058adea3f8415 *man/outlier.Rd 5f3f517b199cd1fb6e2db84e21bf0719 *man/partialPlot.Rd e0493ce4c5109e96c42cf9dec0e2a018 *man/plot.randomForest.Rd 23088ecd37387e89b75598b51b1dc74d *man/predict.randomForest.Rd a13506cd771d2dbe9978dd6dfaa1674c *man/randomForest.Rd 403c7c0cdec2658cba6c7c4e5607b593 *man/rfImpute.Rd c81b1c719dcdc878c80adb4353f831ae *man/rfNews.Rd 54a5446961baae5495b572b084d228c9 *man/rfcv.Rd f5431e1a30968d9fd31f7b964f9384d1 *man/treesize.Rd e7180386828b0d248c172a03915fc187 *man/tuneRF.Rd ef5c9e454eb4478690b4e292f172c94b *man/varImpPlot.Rd 1638c2a6ba3ef89fe83321470c6d9386 *man/varUsed.Rd bbbce11b132256b5d5cf1d2e11ed4c1d *src/classTree.c ae595f53f19e393952f9615615891c41 *src/regTree.c d029c10e358671cbf50b2f51fc36e35f *src/regrf.c 33f9e47d9ddce65d2137ea0fccbe5a44 *src/rf.c 2326f54dbe8953a4da5b83ad40750f0a *src/rf.h 9c9adf5f9a2ec8ea1db5d688166221f9 *src/rfsub.f ed25324cc43d43fb9fd858db97823a1d *src/rfutils.c randomForest/src/0000755000175100001440000000000012037254525013527 5ustar hornikusersrandomForest/src/rfutils.c0000744000175100001440000002176012037254525015372 0ustar hornikusers/******************************************************************* Copyright (C) 2001-2012 Leo Breiman, Adele Cutler and Merck & Co., Inc. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. *******************************************************************/ #include #include "rf.h" void zeroInt(int *x, int length) { memset(x, 0, length * sizeof(int)); } void zeroDouble(double *x, int length) { memset(x, 0, length * sizeof(double)); } void createClass(double *x, int realN, int totalN, int mdim) { /* Create the second class by bootstrapping each variable independently. */ int i, j, k; for (i = realN; i < totalN; ++i) { for (j = 0; j < mdim; ++j) { k = (int) (unif_rand() * realN); x[j + i * mdim] = x[j + k * mdim]; } } } void normClassWt(int *cl, const int nsample, const int nclass, const int useWt, double *classwt, int *classFreq) { int i; double sumwt = 0.0; if (useWt) { /* Normalize user-supplied weights so they sum to one. */ for (i = 0; i < nclass; ++i) sumwt += classwt[i]; for (i = 0; i < nclass; ++i) classwt[i] /= sumwt; } else { for (i = 0; i < nclass; ++i) { classwt[i] = ((double) classFreq[i]) / nsample; } } for (i = 0; i < nclass; ++i) { classwt[i] = classFreq[i] ? classwt[i] * nsample / classFreq[i] : 0.0; } } void makeA(double *x, const int mdim, const int nsample, int *cat, int *a, int *b) { /* makeA() constructs the mdim by nsample integer array a. For each numerical variable with values x(m, n), n=1, ...,nsample, the x-values are sorted from lowest to highest. Denote these by xs(m, n). Then a(m,n) is the case number in which xs(m, n) occurs. The b matrix is also contructed here. If the mth variable is categorical, then a(m, n) is the category of the nth case number. */ int i, j, n1, n2, *index; double *v; v = (double *) Calloc(nsample, double); index = (int *) Calloc(nsample, int); for (i = 0; i < mdim; ++i) { if (cat[i] == 1) { /* numerical predictor */ for (j = 0; j < nsample; ++j) { v[j] = x[i + j * mdim]; index[j] = j + 1; } R_qsort_I(v, index, 1, nsample); /* this sorts the v(n) in ascending order. index(n) is the case number of that v(n) nth from the lowest (assume the original case numbers are 1,2,...). */ for (j = 0; j < nsample-1; ++j) { n1 = index[j]; n2 = index[j + 1]; a[i + j * mdim] = n1; if (j == 0) b[i + (n1-1) * mdim] = 1; b[i + (n2-1) * mdim] = (v[j] < v[j + 1]) ? 
b[i + (n1-1) * mdim] + 1 : b[i + (n1-1) * mdim]; } a[i + (nsample-1) * mdim] = index[nsample-1]; } else { /* categorical predictor */ for (j = 0; j < nsample; ++j) a[i + j*mdim] = (int) x[i + j * mdim]; } } Free(index); Free(v); } void modA(int *a, int *nuse, const int nsample, const int mdim, int *cat, const int maxcat, int *ncase, int *jin) { int i, j, k, m, nt; *nuse = 0; for (i = 0; i < nsample; ++i) if (jin[i]) (*nuse)++; for (i = 0; i < mdim; ++i) { k = 0; nt = 0; if (cat[i] == 1) { for (j = 0; j < nsample; ++j) { if (jin[a[i + k * mdim] - 1]) { a[i + nt * mdim] = a[i + k * mdim]; k++; } else { for (m = 0; m < nsample - k; ++m) { if (jin[a[i + (k + m) * mdim] - 1]) { a[i + nt * mdim] = a[i + (k + m) * mdim]; k += m + 1; break; } } } nt++; if (nt >= *nuse) break; } } } if (maxcat > 1) { k = 0; nt = 0; for (i = 0; i < nsample; ++i) { if (jin[k]) { k++; ncase[nt] = k; } else { for (j = 0; j < nsample - k; ++j) { if (jin[k + j]) { ncase[nt] = k + j + 1; k += j + 1; break; } } } nt++; if (nt >= *nuse) break; } } } void Xtranslate(double *x, int mdim, int nrnodes, int nsample, int *bestvar, int *bestsplit, int *bestsplitnext, double *xbestsplit, int *nodestatus, int *cat, int treeSize) { /* this subroutine takes the splits on numerical variables and translates them back into x-values. It also unpacks each categorical split into a 32-dimensional vector with components of zero or one--a one indicates that the corresponding category goes left in the split. */ int i, m; for (i = 0; i < treeSize; ++i) { if (nodestatus[i] == 1) { m = bestvar[i] - 1; if (cat[m] == 1) { xbestsplit[i] = 0.5 * (x[m + (bestsplit[i] - 1) * mdim] + x[m + (bestsplitnext[i] - 1) * mdim]); } else { xbestsplit[i] = (double) bestsplit[i]; } } } } void permuteOOB(int m, double *x, int *in, int nsample, int mdim) { /* Permute the OOB part of a variable in x. * Argument: * m: the variable to be permuted * x: the data matrix (variables in rows) * in: vector indicating which case is OOB * nsample: number of cases in the data * mdim: number of variables in the data */ double *tp, tmp; int i, last, k, nOOB = 0; tp = (double *) Calloc(nsample, double); for (i = 0; i < nsample; ++i) { /* make a copy of the OOB part of the data into tp (for permuting) */ if (in[i] == 0) { tp[nOOB] = x[m + i*mdim]; nOOB++; } } /* Permute tp */ last = nOOB; for (i = 0; i < nOOB; ++i) { k = (int) last * unif_rand(); tmp = tp[last - 1]; tp[last - 1] = tp[k]; tp[k] = tmp; last--; } /* Copy the permuted OOB data back into x. */ nOOB = 0; for (i = 0; i < nsample; ++i) { if (in[i] == 0) { x[m + i*mdim] = tp[nOOB]; nOOB++; } } Free(tp); } /* Compute proximity. */ void computeProximity(double *prox, int oobprox, int *node, int *inbag, int *oobpair, int n) { /* Accumulate the number of times a pair of points fall in the same node. prox: n x n proximity matrix oobprox: should the accumulation only count OOB cases? (0=no, 1=yes) node: vector of terminal node labels inbag: indicator of whether a case is in-bag oobpair: matrix to accumulate the number of times a pair is OOB together n: total number of cases */ int i, j; for (i = 0; i < n; ++i) { for (j = i+1; j < n; ++j) { if (oobprox) { if (! 
(inbag[i] || inbag[j]) ) { oobpair[j*n + i] ++; oobpair[i*n + j] ++; if (node[i] == node[j]) { prox[j*n + i] += 1.0; prox[i*n + j] += 1.0; } } } else { if (node[i] == node[j]) { prox[j*n + i] += 1.0; prox[i*n + j] += 1.0; } } } } } unsigned int pack(int nBits, int *bits) { int i = nBits; unsigned int pack = 0; while (--i >= 0) pack += bits[i] << i; return(pack); } void unpack(int nBits, unsigned int pack, int *bits) { /* pack is a 4-byte integer. The sub. returns icat, an integer array of zeroes and ones corresponding to the coefficients in the binary expansion of pack. */ int i; for (i = 0; i < nBits; pack >>= 1, ++i) bits[i] = pack & 1; } void F77_NAME(unpack)(int *nBits, unsigned int *pack, int *bits) { unpack(*nBits, *pack, bits); } #ifdef OLD double oldpack(int l, int *icat) { /* icat is a binary integer with ones for categories going left * and zeroes for those going right. The sub returns npack- the integer */ int k; double pack = 0.0; for (k = 0; k < l; ++k) { if (icat[k]) pack += R_pow_di(2.0, k); } return(pack); } void oldunpack(int l, int npack, int *icat) { /* * npack is a long integer. The sub. returns icat, an integer of zeroes and * ones corresponding to the coefficients in the binary expansion of npack. */ int i; zeroInt(icat, 32); icat[0] = npack % 2; for (i = 1; i < l; ++i) { npack = (npack - icat[i-1]) / 2; icat[i] = npack % 2; } } #endif /* OLD */ randomForest/src/rfsub.f0000744000175100001440000003777412037254525015042 0ustar hornikusersc Copyright (C) 2001-7 Leo Breiman and Adele Cutler and Merck & Co, Inc. c This program is free software; you can redistribute it and/or c modify it under the terms of the GNU General Public License c as published by the Free Software Foundation; either version 2 c of the License, or (at your option) any later version. c This program is distributed in the hope that it will be useful, c but WITHOUT ANY WARRANTY; without even the implied warranty of c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the c GNU General Public License for more details. c c Modified by Andy Liaw and Matt Wiener: c The main program is re-written as a C function to be called from R. c All calls to the uniform RNG is replaced with R's RNG. Some subroutines c not called are excluded. Variables and arrays declared as double as c needed. Unused variables are deleted. c c SUBROUTINE BUILDTREE subroutine buildtree(a, b, cl, cat, maxcat, mdim, nsample, 1 nclass, treemap, bestvar, bestsplit, bestsplitnext, tgini, 1 nodestatus,nodepop, nodestart, classpop, tclasspop, 1 tclasscat,ta,nrnodes, idmove, ndsize, ncase, mtry, iv, 1 nodeclass, ndbigtree, win, wr, wl, mred, nuse, mind) c Buildtree consists of repeated calls to two subroutines, Findbestsplit c and Movedata. Findbestsplit does just that--it finds the best split of c the current node. Movedata moves the data in the split node right and c left so that the data corresponding to each child node is contiguous. c The buildtree bookkeeping is different from that in Friedman's original c CART program. ncur is the total number of nodes to date. c nodestatus(k)=1 if the kth node has been split. nodestatus(k)=2 if the c node exists but has not yet been split, and =-1 of the node is terminal. c A node is terminal if its size is below a threshold value, or if it is c all one class, or if all the x-values are equal. If the current node k c is split, then its children are numbered ncur+1 (left), and c ncur+2(right), ncur increases to ncur+2 and the next node to be split is c numbered k+1. 
When no more nodes can be split, buildtree returns to the c main program. implicit double precision(a-h,o-z) integer a(mdim, nsample), cl(nsample), cat(mdim), 1 treemap(2,nrnodes), bestvar(nrnodes), 1 bestsplit(nrnodes), nodestatus(nrnodes), ta(nsample), 1 nodepop(nrnodes), nodestart(nrnodes), 1 bestsplitnext(nrnodes), idmove(nsample), 1 ncase(nsample), b(mdim,nsample), 1 iv(mred), nodeclass(nrnodes), mind(mred) double precision tclasspop(nclass), classpop(nclass, nrnodes), 1 tclasscat(nclass, 32), win(nsample), wr(nclass), 1 wl(nclass), tgini(mdim), xrand integer msplit, ntie msplit = 0 call zerv(nodestatus,nrnodes) call zerv(nodestart,nrnodes) call zerv(nodepop,nrnodes) call zermr(classpop,nclass,nrnodes) do j=1,nclass classpop(j, 1) = tclasspop(j) end do ncur = 1 nodestart(1) = 1 nodepop(1) = nuse nodestatus(1) = 2 c start main loop do 30 kbuild = 1, nrnodes c call intpr("kbuild", 6, kbuild, 1) c call intpr("ncur", 4, ncur, 1) if (kbuild .gt. ncur) goto 50 if (nodestatus(kbuild) .ne. 2) goto 30 c initialize for next call to findbestsplit ndstart = nodestart(kbuild) ndend = ndstart + nodepop(kbuild) - 1 do j = 1, nclass tclasspop(j) = classpop(j,kbuild) end do jstat = 0 call findbestsplit(a,b,cl,mdim,nsample,nclass,cat,maxcat, 1 ndstart, ndend,tclasspop,tclasscat,msplit, decsplit, 1 nbest,ncase, jstat,mtry,win,wr,wl,mred,mind) c call intpr("jstat", 5, jstat, 1) c call intpr("msplit", 6, msplit, 1) c If the node is terminal, move on. Otherwise, split. if (jstat .eq. -1) then nodestatus(kbuild) = -1 goto 30 else bestvar(kbuild) = msplit iv(msplit) = 1 if (decsplit .lt. 0.0) decsplit = 0.0 tgini(msplit) = tgini(msplit) + decsplit if (cat(msplit) .eq. 1) then bestsplit(kbuild) = a(msplit,nbest) bestsplitnext(kbuild) = a(msplit,nbest+1) else bestsplit(kbuild) = nbest bestsplitnext(kbuild) = 0 endif endif call movedata(a,ta,mdim,nsample,ndstart,ndend,idmove,ncase, 1 msplit,cat,nbest,ndendl) c call intpr("ndend", 5, ndend, 1) c call intpr("ndendl", 6, ndendl, 1) c leftnode no.= ncur+1, rightnode no. = ncur+2. nodepop(ncur+1) = ndendl - ndstart + 1 nodepop(ncur+2) = ndend - ndendl nodestart(ncur+1) = ndstart nodestart(ncur+2) = ndendl + 1 c find class populations in both nodes do n = ndstart, ndendl nc = ncase(n) j=cl(nc) classpop(j,ncur+1) = classpop(j,ncur+1) + win(nc) end do do n = ndendl+1, ndend nc = ncase(n) j = cl(nc) classpop(j,ncur+2) = classpop(j,ncur+2) + win(nc) end do c call intpr("nL", 2, nodepop(ncur+1), 1) c call intpr("nR", 2, nodepop(ncur+2), 1) c check on nodestatus nodestatus(ncur+1) = 2 nodestatus(ncur+2) = 2 if (nodepop(ncur+1).le.ndsize) nodestatus(ncur+1) = -1 if (nodepop(ncur+2).le.ndsize) nodestatus(ncur+2) = -1 popt1 = 0 popt2 = 0 do j = 1, nclass popt1 = popt1 + classpop(j,ncur+1) popt2 = popt2 + classpop(j,ncur+2) end do do j=1,nclass if (classpop(j,ncur+1).eq.popt1) nodestatus(ncur+1) = -1 if (classpop(j,ncur+2).eq.popt2) nodestatus(ncur+2) = -1 end do treemap(1,kbuild) = ncur + 1 treemap(2,kbuild) = ncur + 2 nodestatus(kbuild) = 1 ncur = ncur+2 if (ncur.ge.nrnodes) goto 50 30 continue 50 continue ndbigtree = nrnodes do k=nrnodes, 1, -1 if (nodestatus(k) .eq. 0) ndbigtree = ndbigtree - 1 if (nodestatus(k) .eq. 2) nodestatus(k) = -1 end do c form prediction in terminal nodes do kn = 1, ndbigtree if (nodestatus(kn) .eq. -1) then pp = 0 ntie = 1 do j = 1, nclass if (classpop(j,kn) .gt. pp) then nodeclass(kn) = j pp = classpop(j,kn) ntie = 1 end if c Break ties at random: if (classpop(j,kn) .eq. pp) then call rrand(xrand) if (xrand .lt. 
1.0 / ntie) then nodeclass(kn)=j pp=classpop(j,kn) end if ntie = ntie + 1 end if end do end if c call intpr("node", 4, kn, 1) c call intpr("status", 6, nodestatus(kn), 1) c call intpr("pred", 4, nodeclass(kn), 1) c call dblepr("pop1", 4, classpop(1, kn), 1) c call dblepr("pop2", 4, classpop(2, kn), 1) end do return end c SUBROUTINE FINDBESTSPLIT c For the best split, msplit is the variable split on. decsplit is the c dec. in impurity. If msplit is numerical, nsplit is the case number c of value of msplit split on, and nsplitnext is the case number of the c next larger value of msplit. If msplit is categorical, then nsplit is c the coding into an integer of the categories going left. subroutine findbestsplit(a, b, cl, mdim, nsample, nclass, cat, 1 maxcat, ndstart, ndend, tclasspop, tclasscat, msplit, 2 decsplit, nbest, ncase, jstat, mtry, win, wr, wl, 3 mred, mind) implicit double precision(a-h,o-z) integer a(mdim,nsample), cl(nsample), cat(mdim), 1 ncase(nsample), b(mdim,nsample), nn, j double precision tclasspop(nclass), tclasscat(nclass,32), dn(32), 1 win(nsample), wr(nclass), wl(nclass), xrand integer mind(mred), ncmax, ncsplit,nhit, ntie ncmax = 10 ncsplit = 512 c compute initial values of numerator and denominator of Gini pno = 0.0 pdo = 0.0 do j = 1, nclass pno = pno + tclasspop(j) * tclasspop(j) pdo = pdo + tclasspop(j) end do crit0 = pno / pdo jstat = 0 c start main loop through variables to find best split critmax = -1.0e25 do k = 1, mred mind(k) = k end do nn = mred c sampling mtry variables w/o replacement. do mt = 1, mtry call rrand(xrand) j = int(nn * xrand) + 1 mvar = mind(j) mind(j) = mind(nn) mind(nn) = mvar nn = nn - 1 lcat = cat(mvar) if (lcat .eq. 1) then c Split on a numerical predictor. rrn = pno rrd = pdo rln = 0 rld = 0 call zervr(wl, nclass) do j = 1, nclass wr(j) = tclasspop(j) end do ntie = 1 do nsp = ndstart, ndend-1 nc = a(mvar, nsp) u = win(nc) k = cl(nc) rln = rln + u * (2 * wl(k) + u) rrn = rrn + u * (-2 * wr(k) + u) rld = rld + u rrd = rrd - u wl(k) = wl(k) + u wr(k) = wr(k) - u if (b(mvar, nc) .lt. b(mvar, a(mvar, nsp + 1))) then c If neither nodes is empty, check the split. if (dmin1(rrd, rld) .gt. 1.0e-5) then crit = (rln / rld) + (rrn / rrd) if (crit .gt. critmax) then nbest = nsp critmax = crit msplit = mvar ntie = 1 end if c Break ties at random: if (crit .eq. critmax) then call rrand(xrand) if (xrand .lt. 1.0 / ntie) then nbest = nsp critmax = crit msplit = mvar end if ntie = ntie + 1 end if end if end if end do else c Split on a categorical predictor. Compute the decrease in impurity. call zermr(tclasscat, nclass, 32) do nsp = ndstart, ndend nc = ncase(nsp) l = a(mvar, ncase(nsp)) tclasscat(cl(nc), l) = tclasscat(cl(nc), l) + win(nc) end do nnz = 0 do i = 1, lcat su = 0 do j = 1, nclass su = su + tclasscat(j, i) end do dn(i) = su if(su .gt. 0) nnz = nnz + 1 end do nhit = 0 if (nnz .gt. 1) then if (nclass .eq. 2 .and. lcat .gt. ncmax) then call catmaxb(pdo, tclasscat, tclasspop, nclass, & lcat, nbest, critmax, nhit, dn) else call catmax(pdo, tclasscat, tclasspop, nclass, lcat, & nbest, critmax, nhit, maxcat, ncmax, ncsplit) end if if (nhit .eq. 1) msplit = mvar c else c critmax = -1.0e25 end if end if end do if (critmax .lt. -1.0e10 .or. msplit .eq. 0) jstat = -1 decsplit = critmax - crit0 return end C ============================================================== c SUBROUTINE MOVEDATA c This subroutine is the heart of the buildtree construction. 
Based on the c best split the data in the part of the a matrix corresponding to the c current node is moved to the left if it belongs to the left child and c right if it belongs to the right child. subroutine movedata(a,ta,mdim,nsample,ndstart,ndend,idmove, 1 ncase,msplit,cat,nbest,ndendl) implicit double precision(a-h,o-z) integer a(mdim,nsample),ta(nsample),idmove(nsample), 1 ncase(nsample),cat(mdim),icat(32) c compute idmove=indicator of case nos. going left if (cat(msplit).eq.1) then do nsp=ndstart,nbest nc=a(msplit,nsp) idmove(nc)=1 end do do nsp=nbest+1, ndend nc=a(msplit,nsp) idmove(nc)=0 end do ndendl=nbest else ndendl=ndstart-1 l=cat(msplit) call unpack(l,nbest,icat) do nsp=ndstart,ndend nc=ncase(nsp) if (icat(a(msplit,nc)).eq.1) then idmove(nc)=1 ndendl=ndendl + 1 else idmove(nc)=0 endif end do endif c shift case. nos. right and left for numerical variables. do 40 msh=1,mdim if (cat(msh).eq.1) then k=ndstart-1 do 50 n=ndstart,ndend ih=a(msh,n) if (idmove(ih).eq.1) then k=k+1 ta(k)=a(msh,n) endif 50 continue do 60 n=ndstart,ndend ih=a(msh,n) if (idmove(ih).eq.0) then k=k+1 ta(k)=a(msh,n) endif 60 continue do 70 k=ndstart,ndend a(msh,k)=ta(k) 70 continue endif 40 continue ndo=0 if (ndo.eq.1) then do 140 msh = 1, mdim if (cat(msh) .gt. 1) then k = ndstart - 1 do 150 n = ndstart, ndend ih = ncase(n) if (idmove(ih) .eq. 1) then k = k + 1 ta(k) = a(msh, ih) endif 150 continue do 160 n = ndstart, ndend ih = ncase(n) if (idmove(ih) .eq. 0) then k = k + 1 ta(k) = a(msh,ih) endif 160 continue do 170 k = ndstart, ndend a(msh,k) = ta(k) 170 continue endif 140 continue end if c compute case nos. for right and left nodes. if (cat(msplit).eq.1) then do 80 n=ndstart,ndend ncase(n)=a(msplit,n) 80 continue else k=ndstart-1 do 90 n=ndstart, ndend if (idmove(ncase(n)).eq.1) then k=k+1 ta(k)=ncase(n) endif 90 continue do 100 n=ndstart,ndend if (idmove(ncase(n)).eq.0) then k=k+1 ta(k)=ncase(n) endif 100 continue do 110 k=ndstart,ndend ncase(k)=ta(k) 110 continue endif return end c subroutine myunpack(l,npack,icat) c c npack is a long integer. The sub. returns icat, an integer of zeroes and c ones corresponding to the coefficients in the binary expansion of npack. c c integer icat(32),npack c do j=1,32 c icat(j)=0 c end do c n=npack c icat(1)=mod(n,2) c do k=2,l c n=(n-icat(k-1))/2 c icat(k)=mod(n,2) c end do c end subroutine zerv(ix,m1) integer ix(m1) do n=1,m1 ix(n)=0 end do end subroutine zervr(rx,m1) double precision rx(m1) do n=1,m1 rx(n)=0.0d0 end do end subroutine zerm(mx,m1,m2) integer mx(m1,m2) do i=1,m1 do j=1,m2 mx(i,j)=0 end do end do end subroutine zermr(rx,m1,m2) double precision rx(m1,m2) do i=1,m1 do j=1,m2 rx(i,j)=0.0d0 end do end do end subroutine zermd(rx,m1,m2) double precision rx(m1,m2) do i=1,m1 do j=1,m2 rx(i,j)=0.0d0 end do end do end randomForest/src/rf.h0000744000175100001440000001114412037254525014311 0ustar hornikusers/******************************************************************* Copyright (C) 2001-7 Leo Breiman, Adele Cutler and Merck & Co., Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
*******************************************************************/ #ifndef RF_H #define RF_H /* test if the bit at position pos is turned on */ #define isBitOn(x,pos) (((x) & (1 << (pos))) > 0) /* swap two integers */ #define swapInt(a, b) ((a ^= b), (b ^= a), (a ^= b)) /* void classRF(double *x, int *dimx, int *cl, int *ncl, int *cat, int *maxcat, int *sampsize, int *Options, int *ntree, int *nvar, int *ipi, double *pi, double *cut, int *nodesize, int *outcl, int *counttr, double *prox, double *imprt, double *, double *impmat, int *nrnodes, int *ndbigtree, int *nodestatus, int *bestvar, int *treemap, int *nodeclass, double *xbestsplit, double *pid, double *errtr, int *testdat, double *xts, int *clts, int *nts, double *countts, int *outclts, int *labelts, double *proxts, double *errts); */ void normClassWt(int *cl, const int nsample, const int nclass, const int useWt, double *classwt, int *classFreq); void classForest(int *mdim, int *ntest, int *nclass, int *maxcat, int *nrnodes, int *jbt, double *xts, double *xbestsplit, double *pid, double *cutoff, double *countts, int *treemap, int *nodestatus, int *cat, int *nodeclass, int *jts, int *jet, int *bestvar, int *nodexts, int *ndbigtree, int *keepPred, int *prox, double *proxmatrix, int *nodes); void regTree(double *x, double *y, int mdim, int nsample, int *lDaughter, int *rDaughter, double *upper, double *avnode, int *nodestatus, int nrnodes, int *treeSize, int nthsize, int mtry, int *mbest, int *cat, double *tgini, int *varUsed); void findBestSplit(double *x, int *jdex, double *y, int mdim, int nsample, int ndstart, int ndend, int *msplit, double *decsplit, double *ubest, int *ndendl, int *jstat, int mtry, double sumnode, int nodecnt, int *cat); void predictRegTree(double *x, int nsample, int mdim, int *lDaughter, int *rDaughter, int *nodestatus, double *ypred, double *split, double *nodepred, int *splitVar, int treeSize, int *cat, int maxcat, int *nodex); void predictClassTree(double *x, int n, int mdim, int *treemap, int *nodestatus, double *xbestsplit, int *bestvar, int *nodeclass, int ndbigtree, int *cat, int nclass, int *jts, int *nodex, int maxcat); unsigned int pack(int l, int *icat); void unpack(int nBits, unsigned int npack, int *icat); void zeroInt(int *x, int length); void zeroDouble(double *x, int length); void createClass(double *x, int realN, int totalN, int mdim); void prepare(int *cl, const int nsample, const int nclass, const int ipi, double *pi, double *pid, int *nc, double *wtt); void makeA(double *x, const int mdim, const int nsample, int *cat, int *a, int *b); void modA(int *a, int *nuse, const int nsample, const int mdim, int *cat, const int maxcat, int *ncase, int *jin); void Xtranslate(double *x, int mdim, int nrnodes, int nsample, int *bestvar, int *bestsplit, int *bestsplitnext, double *xbestsplit, int *nodestatus, int *cat, int treeSize); void permuteOOB(int m, double *x, int *in, int nsample, int mdim); void computeProximity(double *prox, int oobprox, int *node, int *inbag, int *oobpair, int n); /* Template of Fortran subroutines to be called from the C wrapper */ extern void F77_NAME(buildtree)(int *a, int *b, int *cl, int *cat, int *maxcat, int *mdim, int *nsample, int *nclass, int *treemap, int *bestvar, int *bestsplit, int *bestsplitnext, double *tgini, int *nodestatus, int *nodepop, int *nodestart, double *classpop, double *tclasspop, double *tclasscat, int *ta, int *nrnodes, int *, int *, int *, int *, int *, int *, int *, double *, double *, double *, int *, int *, int *); /* Node status */ #define 
NODE_TERMINAL -1 #define NODE_TOSPLIT -2 #define NODE_INTERIOR -3 #endif /* RF_H */ randomForest/src/rf.c0000744000175100001440000005702012037254525014307 0ustar hornikusers/***************************************************************** Copyright (C) 2001-2012 Leo Breiman, Adele Cutler and Merck & Co., Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. C driver for Breiman & Cutler's random forest code. Re-written from the original main program in Fortran. Andy Liaw Feb. 7, 2002. Modifications to get the forest out Matt Wiener Feb. 26, 2002. *****************************************************************/ #include #include #include "rf.h" void oob(int nsample, int nclass, int *jin, int *cl, int *jtr,int *jerr, int *counttr, int *out, double *errtr, int *jest, double *cutoff); void TestSetError(double *countts, int *jts, int *clts, int *jet, int ntest, int nclass, int nvote, double *errts, int labelts, int *nclts, double *cutoff); /* Define the R RNG for use from Fortran. */ void F77_SUB(rrand)(double *r) { *r = unif_rand(); } void classRF(double *x, int *dimx, int *cl, int *ncl, int *cat, int *maxcat, int *sampsize, int *strata, int *Options, int *ntree, int *nvar, int *ipi, double *classwt, double *cut, int *nodesize, int *outcl, int *counttr, double *prox, double *imprt, double *impsd, double *impmat, int *nrnodes, int *ndbigtree, int *nodestatus, int *bestvar, int *treemap, int *nodeclass, double *xbestsplit, double *errtr, int *testdat, double *xts, int *clts, int *nts, double *countts, int *outclts, int *labelts, double *proxts, double *errts, int *inbag) { /****************************************************************** * C wrapper for random forests: get input from R and drive * the Fortran routines. * * Input: * * x: matrix of predictors (transposed!) * dimx: two integers: number of variables and number of cases * cl: class labels of the data * ncl: number of classes in the response * cat: integer vector of number of classes in the predictor; * 1=continuous * maxcat: maximum of cat * Options: 7 integers: (0=no, 1=yes) * add a second class (for unsupervised RF)? * 1: sampling from product of marginals * 2: sampling from product of uniforms * assess variable importance? * calculate proximity? * calculate proximity based on OOB predictions? * calculate outlying measure? * how often to print output? * keep the forest for future prediction? * ntree: number of trees * nvar: number of predictors to use for each split * ipi: 0=use class proportion as prob.; 1=use supplied priors * pi: double vector of class priors * nodesize: minimum node size: no node with fewer than ndsize * cases will be split * * Output: * * outcl: class predicted by RF * counttr: matrix of votes (transposed!) 
* imprt: matrix of variable importance measures * impmat: matrix of local variable importance measures * prox: matrix of proximity (if iprox=1) ******************************************************************/ int nsample0, mdim, nclass, addClass, mtry, ntest, nsample, ndsize, mimp, nimp, near, nuse, noutall, nrightall, nrightimpall, keepInbag, nstrata; int jb, j, n, m, k, idxByNnode, idxByNsample, imp, localImp, iprox, oobprox, keepf, replace, stratify, trace, *nright, *nrightimp, *nout, *nclts, Ntree; int *out, *bestsplitnext, *bestsplit, *nodepop, *jin, *nodex, *nodexts, *nodestart, *ta, *ncase, *jerr, *varUsed, *jtr, *classFreq, *idmove, *jvr, *at, *a, *b, *mind, *nind, *jts, *oobpair; int **strata_idx, *strata_size, last, ktmp, nEmpty, ntry; double av=0.0, delta=0.0; double *tgini, *tx, *wl, *classpop, *tclasscat, *tclasspop, *win, *tp, *wr; addClass = Options[0]; imp = Options[1]; localImp = Options[2]; iprox = Options[3]; oobprox = Options[4]; trace = Options[5]; keepf = Options[6]; replace = Options[7]; stratify = Options[8]; keepInbag = Options[9]; mdim = dimx[0]; nsample0 = dimx[1]; nclass = (*ncl==1) ? 2 : *ncl; ndsize = *nodesize; Ntree = *ntree; mtry = *nvar; ntest = *nts; nsample = addClass ? (nsample0 + nsample0) : nsample0; mimp = imp ? mdim : 1; nimp = imp ? nsample : 1; near = iprox ? nsample0 : 1; if (trace == 0) trace = Ntree + 1; tgini = (double *) S_alloc(mdim, sizeof(double)); wl = (double *) S_alloc(nclass, sizeof(double)); wr = (double *) S_alloc(nclass, sizeof(double)); classpop = (double *) S_alloc(nclass* *nrnodes, sizeof(double)); tclasscat = (double *) S_alloc(nclass*32, sizeof(double)); tclasspop = (double *) S_alloc(nclass, sizeof(double)); tx = (double *) S_alloc(nsample, sizeof(double)); win = (double *) S_alloc(nsample, sizeof(double)); tp = (double *) S_alloc(nsample, sizeof(double)); out = (int *) S_alloc(nsample, sizeof(int)); bestsplitnext = (int *) S_alloc(*nrnodes, sizeof(int)); bestsplit = (int *) S_alloc(*nrnodes, sizeof(int)); nodepop = (int *) S_alloc(*nrnodes, sizeof(int)); nodestart = (int *) S_alloc(*nrnodes, sizeof(int)); jin = (int *) S_alloc(nsample, sizeof(int)); nodex = (int *) S_alloc(nsample, sizeof(int)); nodexts = (int *) S_alloc(ntest, sizeof(int)); ta = (int *) S_alloc(nsample, sizeof(int)); ncase = (int *) S_alloc(nsample, sizeof(int)); jerr = (int *) S_alloc(nsample, sizeof(int)); varUsed = (int *) S_alloc(mdim, sizeof(int)); jtr = (int *) S_alloc(nsample, sizeof(int)); jvr = (int *) S_alloc(nsample, sizeof(int)); classFreq = (int *) S_alloc(nclass, sizeof(int)); jts = (int *) S_alloc(ntest, sizeof(int)); idmove = (int *) S_alloc(nsample, sizeof(int)); at = (int *) S_alloc(mdim*nsample, sizeof(int)); a = (int *) S_alloc(mdim*nsample, sizeof(int)); b = (int *) S_alloc(mdim*nsample, sizeof(int)); mind = (int *) S_alloc(mdim, sizeof(int)); nright = (int *) S_alloc(nclass, sizeof(int)); nrightimp = (int *) S_alloc(nclass, sizeof(int)); nout = (int *) S_alloc(nclass, sizeof(int)); if (oobprox) { oobpair = (int *) S_alloc(near*near, sizeof(int)); } /* Count number of cases in each class. */ zeroInt(classFreq, nclass); for (n = 0; n < nsample; ++n) classFreq[cl[n] - 1] ++; /* Normalize class weights. */ normClassWt(cl, nsample, nclass, *ipi, classwt, classFreq); if (stratify) { /* Count number of strata and frequency of each stratum. */ nstrata = 0; for (n = 0; n < nsample0; ++n) if (strata[n] > nstrata) nstrata = strata[n]; /* Create the array of pointers, each pointing to a vector of indices of where data of each stratum is. 
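   A hedged sketch of this two-pass bookkeeping as a stand-alone helper
   (the helper name makeStrataIndex is hypothetical; S_alloc is the
   allocator already used in this file, and strata labels are 1-based as
   above): */
static int **makeStrataIndex(const int *strata, int n, int nstrata,
                             int *strata_size) {
    int i, s;
    int **idx = (int **) S_alloc(nstrata, sizeof(int *));
    /* First pass: count how many cases fall in each stratum. */
    for (i = 0; i < nstrata; ++i) strata_size[i] = 0;
    for (i = 0; i < n; ++i) strata_size[strata[i] - 1]++;
    /* One index vector per stratum. */
    for (i = 0; i < nstrata; ++i)
        idx[i] = (int *) S_alloc(strata_size[i], sizeof(int));
    /* Second pass: record which cases belong to each stratum. */
    for (i = 0; i < nstrata; ++i) strata_size[i] = 0;
    for (i = 0; i < n; ++i) {
        s = strata[i] - 1;
        idx[s][strata_size[s]++] = i;
    }
    return idx;
}
/* The same two passes appear inline below: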
*/ strata_size = (int *) S_alloc(nstrata, sizeof(int)); for (n = 0; n < nsample0; ++n) { strata_size[strata[n] - 1] ++; } strata_idx = (int **) S_alloc(nstrata, sizeof(int *)); for (n = 0; n < nstrata; ++n) { strata_idx[n] = (int *) S_alloc(strata_size[n], sizeof(int)); } zeroInt(strata_size, nstrata); for (n = 0; n < nsample0; ++n) { strata_size[strata[n] - 1] ++; strata_idx[strata[n] - 1][strata_size[strata[n] - 1] - 1] = n; } } else { nind = replace ? NULL : (int *) S_alloc(nsample, sizeof(int)); } /* INITIALIZE FOR RUN */ if (*testdat) zeroDouble(countts, ntest * nclass); zeroInt(counttr, nclass * nsample); zeroInt(out, nsample); zeroDouble(tgini, mdim); zeroDouble(errtr, (nclass + 1) * Ntree); if (*labelts) { nclts = (int *) S_alloc(nclass, sizeof(int)); for (n = 0; n < ntest; ++n) nclts[clts[n]-1]++; zeroDouble(errts, (nclass + 1) * Ntree); } if (imp) { zeroDouble(imprt, (nclass+2) * mdim); zeroDouble(impsd, (nclass+1) * mdim); if (localImp) zeroDouble(impmat, nsample * mdim); } if (iprox) { zeroDouble(prox, nsample0 * nsample0); if (*testdat) zeroDouble(proxts, ntest * (ntest + nsample0)); } makeA(x, mdim, nsample, cat, at, b); R_CheckUserInterrupt(); /* Starting the main loop over number of trees. */ GetRNGstate(); if (trace <= Ntree) { /* Print header for running output. */ Rprintf("ntree OOB"); for (n = 1; n <= nclass; ++n) Rprintf("%7i", n); if (*labelts) { Rprintf("| Test"); for (n = 1; n <= nclass; ++n) Rprintf("%7i", n); } Rprintf("\n"); } idxByNnode = 0; idxByNsample = 0; for (jb = 0; jb < Ntree; jb++) { /* Do we need to simulate data for the second class? */ if (addClass) createClass(x, nsample0, nsample, mdim); do { zeroInt(nodestatus + idxByNnode, *nrnodes); zeroInt(treemap + 2*idxByNnode, 2 * *nrnodes); zeroDouble(xbestsplit + idxByNnode, *nrnodes); zeroInt(nodeclass + idxByNnode, *nrnodes); zeroInt(varUsed, mdim); /* TODO: Put all sampling code into a function. 
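   One shape such a helper could take, for the simplest case of weighted
   sampling with replacement (a hedged sketch only: the name
   drawBootstrapSample and its argument list are hypothetical, unif_rand()
   is R's RNG as used throughout this file, and the caller is assumed to
   have zeroed jin, win and tclasspop beforehand, as the inline code
   does): */
static void drawBootstrapSample(int sampsize, int nsample, const int *cl,
                                const double *classwt, int *jin,
                                double *win, double *tclasspop) {
    int n, k;
    for (n = 0; n < sampsize; ++n) {
        k = (int) (unif_rand() * nsample);          /* uniform case index */
        jin[k] = 1;                                 /* mark case k in-bag */
        win[k] += classwt[cl[k] - 1];               /* case weight = its class weight */
        tclasspop[cl[k] - 1] += classwt[cl[k] - 1]; /* in-bag class totals */
    }
}
/* The inline sampling code, stratified or not, with or without
   replacement, follows: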
*/ /* drawSample(sampsize, nsample, ); */ if (stratify) { /* stratified sampling */ zeroInt(jin, nsample); zeroDouble(tclasspop, nclass); zeroDouble(win, nsample); if (replace) { /* with replacement */ for (n = 0; n < nstrata; ++n) { for (j = 0; j < sampsize[n]; ++j) { ktmp = (int) (unif_rand() * strata_size[n]); k = strata_idx[n][ktmp]; tclasspop[cl[k] - 1] += classwt[cl[k] - 1]; win[k] += classwt[cl[k] - 1]; jin[k] = 1; } } } else { /* stratified sampling w/o replacement */ /* re-initialize the index array */ zeroInt(strata_size, nstrata); for (j = 0; j < nsample; ++j) { strata_size[strata[j] - 1] ++; strata_idx[strata[j] - 1][strata_size[strata[j] - 1] - 1] = j; } /* sampling without replacement */ for (n = 0; n < nstrata; ++n) { last = strata_size[n] - 1; for (j = 0; j < sampsize[n]; ++j) { ktmp = (int) (unif_rand() * (last+1)); k = strata_idx[n][ktmp]; swapInt(strata_idx[n][last], strata_idx[n][ktmp]); last--; tclasspop[cl[k] - 1] += classwt[cl[k]-1]; win[k] += classwt[cl[k]-1]; jin[k] = 1; } } } } else { /* unstratified sampling */ ntry = 0; do { nEmpty = 0; zeroInt(jin, nsample); zeroDouble(tclasspop, nclass); zeroDouble(win, nsample); if (replace) { for (n = 0; n < *sampsize; ++n) { k = unif_rand() * nsample; tclasspop[cl[k] - 1] += classwt[cl[k]-1]; win[k] += classwt[cl[k]-1]; jin[k] = 1; } } else { for (n = 0; n < nsample; ++n) nind[n] = n; last = nsample - 1; for (n = 0; n < *sampsize; ++n) { ktmp = (int) (unif_rand() * (last+1)); k = nind[ktmp]; swapInt(nind[ktmp], nind[last]); last--; tclasspop[cl[k] - 1] += classwt[cl[k]-1]; win[k] += classwt[cl[k]-1]; jin[k] = 1; } } /* check if any class is missing in the sample */ for (n = 0; n < nclass; ++n) { if (tclasspop[n] == 0.0) nEmpty++; } ntry++; } while (nclass - nEmpty < 2 && ntry <= 30); /* If there are still fewer than two classes in the data, throw an error. */ if (nclass - nEmpty < 2) error("Still have fewer than two classes in the in-bag sample after 30 attempts."); } /* If need to keep indices of inbag data, do that here. */ if (keepInbag) { for (n = 0; n < nsample0; ++n) { inbag[n + idxByNsample] = jin[n]; } } /* Copy the original a matrix back. */ memcpy(a, at, sizeof(int) * mdim * nsample); modA(a, &nuse, nsample, mdim, cat, *maxcat, ncase, jin); F77_CALL(buildtree)(a, b, cl, cat, maxcat, &mdim, &nsample, &nclass, treemap + 2*idxByNnode, bestvar + idxByNnode, bestsplit, bestsplitnext, tgini, nodestatus + idxByNnode, nodepop, nodestart, classpop, tclasspop, tclasscat, ta, nrnodes, idmove, &ndsize, ncase, &mtry, varUsed, nodeclass + idxByNnode, ndbigtree + jb, win, wr, wl, &mdim, &nuse, mind); /* if the "tree" has only the root node, start over */ } while (ndbigtree[jb] == 1); Xtranslate(x, mdim, *nrnodes, nsample, bestvar + idxByNnode, bestsplit, bestsplitnext, xbestsplit + idxByNnode, nodestatus + idxByNnode, cat, ndbigtree[jb]); /* Get test set error */ if (*testdat) { predictClassTree(xts, ntest, mdim, treemap + 2*idxByNnode, nodestatus + idxByNnode, xbestsplit + idxByNnode, bestvar + idxByNnode, nodeclass + idxByNnode, ndbigtree[jb], cat, nclass, jts, nodexts, *maxcat); TestSetError(countts, jts, clts, outclts, ntest, nclass, jb+1, errts + jb*(nclass+1), *labelts, nclts, cut); } /* Get out-of-bag predictions and errors. 
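   A case is out-of-bag for this tree exactly when jin[n] stayed 0 during
   the draw above.  The draw without replacement relies on a partial
   Fisher-Yates shuffle; a hedged sketch of it in isolation (the helper
   name drawWithoutReplacement is hypothetical, swapInt is the macro from
   rf.h, unif_rand() is R's RNG, and jin is assumed zeroed by the
   caller): */
static void drawWithoutReplacement(int sampsize, int nsample, int *nind,
                                   int *jin) {
    int n, ktmp, k, last = nsample - 1;
    for (n = 0; n < nsample; ++n) nind[n] = n;
    for (n = 0; n < sampsize; ++n) {
        /* Pick a random position among the not-yet-drawn indices and
           retire it to the tail, so no case can be selected twice. */
        ktmp = (int) (unif_rand() * (last + 1));
        k = nind[ktmp];
        swapInt(nind[ktmp], nind[last]);
        last--;
        jin[k] = 1;   /* mark case k as in-bag */
    }
}
/* Every case is run down the tree; votes are then tallied only for the
   cases that were out-of-bag: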
*/ predictClassTree(x, nsample, mdim, treemap + 2*idxByNnode, nodestatus + idxByNnode, xbestsplit + idxByNnode, bestvar + idxByNnode, nodeclass + idxByNnode, ndbigtree[jb], cat, nclass, jtr, nodex, *maxcat); zeroInt(nout, nclass); noutall = 0; for (n = 0; n < nsample; ++n) { if (jin[n] == 0) { /* increment the OOB votes */ counttr[n*nclass + jtr[n] - 1] ++; /* count number of times a case is OOB */ out[n]++; /* count number of OOB cases in the current iteration. nout[n] is the number of OOB cases for the n-th class. noutall is the number of OOB cases overall. */ nout[cl[n] - 1]++; noutall++; } } /* Compute out-of-bag error rate. */ oob(nsample, nclass, jin, cl, jtr, jerr, counttr, out, errtr + jb*(nclass+1), outcl, cut); if ((jb+1) % trace == 0) { Rprintf("%5i: %6.2f%%", jb+1, 100.0*errtr[jb * (nclass+1)]); for (n = 1; n <= nclass; ++n) { Rprintf("%6.2f%%", 100.0 * errtr[n + jb * (nclass+1)]); } if (*labelts) { Rprintf("| "); for (n = 0; n <= nclass; ++n) { Rprintf("%6.2f%%", 100.0 * errts[n + jb * (nclass+1)]); } } Rprintf("\n"); #ifdef WIN32 R_FlushConsole(); R_ProcessEvents(); #endif R_CheckUserInterrupt(); } /* DO PROXIMITIES */ if (iprox) { computeProximity(prox, oobprox, nodex, jin, oobpair, near); /* proximity for test data */ if (*testdat) { computeProximity(proxts, 0, nodexts, jin, oobpair, ntest); /* Compute proximity between testset and training set. */ for (n = 0; n < ntest; ++n) { for (k = 0; k < near; ++k) { if (nodexts[n] == nodex[k]) proxts[n + ntest * (k+ntest)] += 1.0; } } } } /* DO VARIABLE IMPORTANCE */ if (imp) { nrightall = 0; /* Count the number of correct prediction by the current tree among the OOB samples, by class. */ zeroInt(nright, nclass); for (n = 0; n < nsample; ++n) { /* out-of-bag and predicted correctly: */ if (jin[n] == 0 && jtr[n] == cl[n]) { nright[cl[n] - 1]++; nrightall++; } } for (m = 0; m < mdim; ++m) { if (varUsed[m]) { nrightimpall = 0; zeroInt(nrightimp, nclass); for (n = 0; n < nsample; ++n) tx[n] = x[m + n*mdim]; /* Permute the m-th variable. */ permuteOOB(m, x, jin, nsample, mdim); /* Predict the modified data using the current tree. */ predictClassTree(x, nsample, mdim, treemap + 2*idxByNnode, nodestatus + idxByNnode, xbestsplit + idxByNnode, bestvar + idxByNnode, nodeclass + idxByNnode, ndbigtree[jb], cat, nclass, jvr, nodex, *maxcat); /* Count how often correct predictions are made with the modified data. */ for (n = 0; n < nsample; n++) { /* Restore the original data for that variable. */ x[m + n*mdim] = tx[n]; if (jin[n] == 0) { if (jvr[n] == cl[n]) { + nrightimp[cl[n] - 1]++; nrightimpall++; } if (localImp && jvr[n] != jtr[n]) { if (cl[n] == jvr[n]) { impmat[m + n*mdim] -= 1.0; } else { impmat[m + n*mdim] += 1.0; } } } } /* Accumulate decrease in proportions of correct predictions. */ /* class-specific measures first: */ for (n = 0; n < nclass; ++n) { if (nout[n] > 0) { delta = ((double) (nright[n] - nrightimp[n])) / nout[n]; imprt[m + n*mdim] += delta; impsd[m + n*mdim] += delta * delta; } } /* overall measure, across all classes: */ if (noutall > 0) { delta = ((double)(nrightall - nrightimpall)) / noutall; imprt[m + nclass*mdim] += delta; impsd[m + nclass*mdim] += delta * delta; } } } } R_CheckUserInterrupt(); #ifdef WIN32 R_ProcessEvents(); #endif if (keepf) idxByNnode += *nrnodes; if (keepInbag) idxByNsample += nsample0; } PutRNGstate(); /* Final processing of variable importance. 
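   For every variable the loops above accumulated the per-tree drop in OOB
   accuracy caused by permuting that variable (imprt) together with its
   square (impsd).  The finalization below converts those sums into a mean
   drop and a standard error over the Ntree trees; a hedged sketch of that
   arithmetic (the helper name finalizeImportance is hypothetical): */
static void finalizeImportance(double sumDelta, double sumDeltaSq,
                               int ntree, double *mean, double *se) {
    /* Mean per-tree drop, and its standard error treating the per-tree
       drops as (roughly) independent draws. */
    *mean = sumDelta / ntree;
    *se = sqrt((sumDeltaSq / ntree - (*mean) * (*mean)) / ntree);
}
/* The inline version below also averages the Gini decreases (tgini) over
   the trees: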
*/ for (m = 0; m < mdim; m++) tgini[m] /= Ntree; if (imp) { for (m = 0; m < mdim; ++m) { if (localImp) { /* casewise measures */ for (n = 0; n < nsample; ++n) impmat[m + n*mdim] /= out[n]; } /* class-specific measures */ for (k = 0; k < nclass; ++k) { av = imprt[m + k*mdim] / Ntree; impsd[m + k*mdim] = sqrt(((impsd[m + k*mdim] / Ntree) - av*av) / Ntree); imprt[m + k*mdim] = av; /* imprt[m + k*mdim] = (se <= 0.0) ? -1000.0 - av : av / se; */ } /* overall measures */ av = imprt[m + nclass*mdim] / Ntree; impsd[m + nclass*mdim] = sqrt(((impsd[m + nclass*mdim] / Ntree) - av*av) / Ntree); imprt[m + nclass*mdim] = av; imprt[m + (nclass+1)*mdim] = tgini[m]; } } else { for (m = 0; m < mdim; ++m) imprt[m] = tgini[m]; } /* PROXIMITY DATA ++++++++++++++++++++++++++++++++*/ if (iprox) { for (n = 0; n < near; ++n) { for (k = n + 1; k < near; ++k) { prox[near*k + n] /= oobprox ? (oobpair[near*k + n] > 0 ? oobpair[near*k + n] : 1) : Ntree; prox[near*n + k] = prox[near*k + n]; } prox[near*n + n] = 1.0; } if (*testdat) { for (n = 0; n < ntest; ++n) { for (k = 0; k < ntest + nsample; ++k) proxts[ntest*k + n] /= Ntree; proxts[ntest * n + n] = 1.0; } } } } void classForest(int *mdim, int *ntest, int *nclass, int *maxcat, int *nrnodes, int *ntree, double *x, double *xbestsplit, double *pid, double *cutoff, double *countts, int *treemap, int *nodestatus, int *cat, int *nodeclass, int *jts, int *jet, int *bestvar, int *node, int *treeSize, int *keepPred, int *prox, double *proxMat, int *nodes) { int j, n, n1, n2, idxNodes, offset1, offset2, *junk, ntie; double crit, cmax; zeroDouble(countts, *nclass * *ntest); idxNodes = 0; offset1 = 0; offset2 = 0; junk = NULL; for (j = 0; j < *ntree; ++j) { /* predict by the j-th tree */ predictClassTree(x, *ntest, *mdim, treemap + 2*idxNodes, nodestatus + idxNodes, xbestsplit + idxNodes, bestvar + idxNodes, nodeclass + idxNodes, treeSize[j], cat, *nclass, jts + offset1, node + offset2, *maxcat); /* accumulate votes: */ for (n = 0; n < *ntest; ++n) { countts[jts[n + offset1] - 1 + n * *nclass] += 1.0; } /* if desired, do proximities for this round */ if (*prox) computeProximity(proxMat, 0, node + offset2, junk, junk, *ntest); idxNodes += *nrnodes; if (*keepPred) offset1 += *ntest; if (*nodes) offset2 += *ntest; } /* Aggregated prediction is the class with the maximum votes/cutoff */ for (n = 0; n < *ntest; ++n) { cmax = 0.0; ntie = 1; for (j = 0; j < *nclass; ++j) { crit = (countts[j + n * *nclass] / *ntree) / cutoff[j]; if (crit > cmax) { jet[n] = j + 1; cmax = crit; ntie = 1; } /* Break ties at random: */ if (crit == cmax) { if (unif_rand() < 1.0 / ntie) jet[n] = j + 1; ntie++; } } } /* if proximities requested, do the final adjustment (division by number of trees) */ if (*prox) { for (n1 = 0; n1 < *ntest; ++n1) { for (n2 = n1 + 1; n2 < *ntest; ++n2) { proxMat[n1 + n2 * *ntest] /= *ntree; proxMat[n2 + n1 * *ntest] = proxMat[n1 + n2 * *ntest]; } proxMat[n1 + n1 * *ntest] = 1.0; } } } /* Modified by A. Liaw 1/10/2003 (Deal with cutoff) Re-written in C by A. 
Liaw 3/08/2004 */ void oob(int nsample, int nclass, int *jin, int *cl, int *jtr,int *jerr, int *counttr, int *out, double *errtr, int *jest, double *cutoff) { int j, n, noob, *noobcl, ntie; double qq, smax, smaxtr; noobcl = (int *) S_alloc(nclass, sizeof(int)); zeroInt(jerr, nsample); zeroDouble(errtr, nclass+1); noob = 0; for (n = 0; n < nsample; ++n) { if (out[n]) { noob++; noobcl[cl[n]-1]++; smax = 0.0; smaxtr = 0.0; ntie = 1; for (j = 0; j < nclass; ++j) { qq = (((double) counttr[j + n*nclass]) / out[n]) / cutoff[j]; if (j+1 != cl[n]) smax = (qq > smax) ? qq : smax; /* if vote / cutoff is larger than current max, re-set max and change predicted class to the current class */ if (qq > smaxtr) { smaxtr = qq; jest[n] = j+1; ntie = 1; } /* break tie at random */ if (qq == smaxtr) { if (unif_rand() < 1.0 / ntie) { smaxtr = qq; jest[n] = j+1; } ntie++; } } if (jest[n] != cl[n]) { errtr[cl[n]] += 1.0; errtr[0] += 1.0; jerr[n] = 1; } } } errtr[0] /= noob; for (n = 1; n <= nclass; ++n) errtr[n] /= noobcl[n-1]; } void TestSetError(double *countts, int *jts, int *clts, int *jet, int ntest, int nclass, int nvote, double *errts, int labelts, int *nclts, double *cutoff) { int j, n, ntie; double cmax, crit; for (n = 0; n < ntest; ++n) countts[jts[n]-1 + n*nclass] += 1.0; /* Prediction is the class with the maximum votes */ for (n = 0; n < ntest; ++n) { cmax=0.0; ntie = 1; for (j = 0; j < nclass; ++j) { crit = (countts[j + n*nclass] / nvote) / cutoff[j]; if (crit > cmax) { jet[n] = j+1; cmax = crit; ntie = 1; } /* Break ties at random: */ if (crit == cmax) { if (unif_rand() < 1.0 / ntie) { jet[n] = j+1; cmax = crit; } ntie++; } } } if (labelts) { zeroDouble(errts, nclass + 1); for (n = 0; n < ntest; ++n) { if (jet[n] != clts[n]) { errts[0] += 1.0; errts[clts[n]] += 1.0; } } errts[0] /= ntest; for (n = 1; n <= nclass; ++n) errts[n] /= nclts[n-1]; } } randomForest/src/regTree.c0000744000175100001440000002371312037254525015277 0ustar hornikusers/******************************************************************* Copyright (C) 2001-7 Leo Breiman, Adele Cutler and Merck & Co., Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. *******************************************************************/ /****************************************************************** * buildtree and findbestsplit routines translated from Leo's * original Fortran code. * * copyright 1999 by leo Breiman * this is free software and can be used for any purpose. * It comes with no guarantee. 
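   The classification drivers above (classForest, oob, TestSetError) all
   aggregate votes the same way: each class's vote fraction is divided by
   its cutoff, the largest ratio wins, and exact ties are broken uniformly
   at random.  A hedged sketch of that rule in isolation (the helper name
   aggregateVote is hypothetical; unif_rand() is R's RNG): */
static int aggregateVote(const double *votes, const double *cutoff,
                         int nclass, int ntree) {
    int j, winner = 1, ntie = 1;
    double crit, cmax = 0.0;
    for (j = 0; j < nclass; ++j) {
        crit = (votes[j] / ntree) / cutoff[j];
        if (crit > cmax) {
            winner = j + 1;          /* new maximum: this class leads */
            cmax = crit;
            ntie = 1;
        } else if (crit == cmax && unif_rand() < 1.0 / ntie) {
            winner = j + 1;          /* tie: replace leader with prob. 1/ntie */
        }
        if (crit == cmax) ntie++;
    }
    return winner;                   /* 1-based class label */
}
/* The regression-tree routines of regTree.c follow.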
* ******************************************************************/ #include #include #include "rf.h" void regTree(double *x, double *y, int mdim, int nsample, int *lDaughter, int *rDaughter, double *upper, double *avnode, int *nodestatus, int nrnodes, int *treeSize, int nthsize, int mtry, int *mbest, int *cat, double *tgini, int *varUsed) { int i, j, k, m, ncur, *jdex, *nodestart, *nodepop; int ndstart, ndend, ndendl, nodecnt, jstat, msplit; double d, ss, av, decsplit, ubest, sumnode; nodestart = (int *) Calloc(nrnodes, int); nodepop = (int *) Calloc(nrnodes, int); /* initialize some arrays for the tree */ zeroInt(nodestatus, nrnodes); zeroInt(nodestart, nrnodes); zeroInt(nodepop, nrnodes); zeroDouble(avnode, nrnodes); jdex = (int *) Calloc(nsample, int); for (i = 1; i <= nsample; ++i) jdex[i-1] = i; ncur = 0; nodestart[0] = 0; nodepop[0] = nsample; nodestatus[0] = NODE_TOSPLIT; /* compute mean and sum of squares for Y */ av = 0.0; ss = 0.0; for (i = 0; i < nsample; ++i) { d = y[jdex[i] - 1]; ss += i * (av - d) * (av - d) / (i + 1); av = (i * av + d) / (i + 1); } avnode[0] = av; /* start main loop */ for (k = 0; k < nrnodes - 2; ++k) { if (k > ncur || ncur >= nrnodes - 2) break; /* skip if the node is not to be split */ if (nodestatus[k] != NODE_TOSPLIT) continue; /* initialize for next call to findbestsplit */ ndstart = nodestart[k]; ndend = ndstart + nodepop[k] - 1; nodecnt = nodepop[k]; sumnode = nodecnt * avnode[k]; jstat = 0; decsplit = 0.0; findBestSplit(x, jdex, y, mdim, nsample, ndstart, ndend, &msplit, &decsplit, &ubest, &ndendl, &jstat, mtry, sumnode, nodecnt, cat); if (jstat == 1) { /* Node is terminal: Mark it as such and move on to the next. */ nodestatus[k] = NODE_TERMINAL; continue; } /* Found the best split. */ mbest[k] = msplit; varUsed[msplit - 1] = 1; upper[k] = ubest; tgini[msplit - 1] += decsplit; nodestatus[k] = NODE_INTERIOR; /* leftnode no.= ncur+1, rightnode no. = ncur+2. */ nodepop[ncur + 1] = ndendl - ndstart + 1; nodepop[ncur + 2] = ndend - ndendl; nodestart[ncur + 1] = ndstart; nodestart[ncur + 2] = ndendl + 1; /* compute mean and sum of squares for the left daughter node */ av = 0.0; ss = 0.0; for (j = ndstart; j <= ndendl; ++j) { d = y[jdex[j]-1]; m = j - ndstart; ss += m * (av - d) * (av - d) / (m + 1); av = (m * av + d) / (m+1); } avnode[ncur+1] = av; nodestatus[ncur+1] = NODE_TOSPLIT; if (nodepop[ncur + 1] <= nthsize) { nodestatus[ncur + 1] = NODE_TERMINAL; } /* compute mean and sum of squares for the right daughter node */ av = 0.0; ss = 0.0; for (j = ndendl + 1; j <= ndend; ++j) { d = y[jdex[j]-1]; m = j - (ndendl + 1); ss += m * (av - d) * (av - d) / (m + 1); av = (m * av + d) / (m + 1); } avnode[ncur + 2] = av; nodestatus[ncur + 2] = NODE_TOSPLIT; if (nodepop[ncur + 2] <= nthsize) { nodestatus[ncur + 2] = NODE_TERMINAL; } /* map the daughter nodes */ lDaughter[k] = ncur + 1 + 1; rDaughter[k] = ncur + 2 + 1; /* Augment the tree by two nodes. 
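   Throughout this routine the node mean and sum of squares are maintained
   with a one-pass (Welford-style) update: after i cases with running mean
   av and sum of squared deviations ss, a new value d gives
       ss <- ss + i * (av - d)^2 / (i + 1),   av <- (i * av + d) / (i + 1).
   A hedged sketch of that update as a stand-alone helper (the name
   runningMeanSS is hypothetical): */
static void runningMeanSS(double d, int i, double *av, double *ss) {
    /* Incremental update after seeing the (i+1)-th value d. */
    *ss += i * (*av - d) * (*av - d) / (i + 1);
    *av = (i * (*av) + d) / (i + 1);
}
/* The node counter now moves past the two new daughters: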
*/ ncur += 2; } *treeSize = nrnodes; for (k = nrnodes - 1; k >= 0; --k) { if (nodestatus[k] == 0) (*treeSize)--; if (nodestatus[k] == NODE_TOSPLIT) { nodestatus[k] = NODE_TERMINAL; } } Free(nodestart); Free(jdex); Free(nodepop); } /*--------------------------------------------------------------*/ void findBestSplit(double *x, int *jdex, double *y, int mdim, int nsample, int ndstart, int ndend, int *msplit, double *decsplit, double *ubest, int *ndendl, int *jstat, int mtry, double sumnode, int nodecnt, int *cat) { int last, ncat[32], icat[32], lc, nl, nr, npopl, npopr; int i, j, kv, l, *mind, *ncase; double *xt, *ut, *v, *yl, sumcat[32], avcat[32], tavcat[32], ubestt; double crit, critmax, critvar, suml, sumr, d, critParent; ut = (double *) Calloc(nsample, double); xt = (double *) Calloc(nsample, double); v = (double *) Calloc(nsample, double); yl = (double *) Calloc(nsample, double); mind = (int *) Calloc(mdim, int); ncase = (int *) Calloc(nsample, int); zeroDouble(avcat, 32); zeroDouble(tavcat, 32); /* START BIG LOOP */ *msplit = -1; *decsplit = 0.0; critmax = 0.0; ubestt = 0.0; for (i=0; i < mdim; ++i) mind[i] = i; last = mdim - 1; for (i = 0; i < mtry; ++i) { critvar = 0.0; j = (int) (unif_rand() * (last+1)); kv = mind[j]; swapInt(mind[j], mind[last]); last--; lc = cat[kv]; if (lc == 1) { /* numeric variable */ for (j = ndstart; j <= ndend; ++j) { xt[j] = x[kv + (jdex[j] - 1) * mdim]; yl[j] = y[jdex[j] - 1]; } } else { /* categorical variable */ zeroInt(ncat, 32); zeroDouble(sumcat, 32); for (j = ndstart; j <= ndend; ++j) { l = (int) x[kv + (jdex[j] - 1) * mdim]; sumcat[l - 1] += y[jdex[j] - 1]; ncat[l - 1] ++; } /* Compute means of Y by category. */ for (j = 0; j < lc; ++j) { avcat[j] = ncat[j] ? sumcat[j] / ncat[j] : 0.0; } /* Make the category mean the `pseudo' X data. */ for (j = 0; j < nsample; ++j) { xt[j] = avcat[(int) x[kv + (jdex[j] - 1) * mdim] - 1]; yl[j] = y[jdex[j] - 1]; } } /* copy the x data in this node. */ for (j = ndstart; j <= ndend; ++j) v[j] = xt[j]; for (j = 1; j <= nsample; ++j) ncase[j - 1] = j; R_qsort_I(v, ncase, ndstart + 1, ndend + 1); if (v[ndstart] >= v[ndend]) continue; /* ncase(n)=case number of v nth from bottom */ /* Start from the right and search to the left. */ critParent = sumnode * sumnode / nodecnt; suml = 0.0; sumr = sumnode; npopl = 0; npopr = nodecnt; crit = 0.0; /* Search through the "gaps" in the x-variable. */ for (j = ndstart; j <= ndend - 1; ++j) { d = yl[ncase[j] - 1]; suml += d; sumr -= d; npopl++; npopr--; if (v[j] < v[j+1]) { crit = (suml * suml / npopl) + (sumr * sumr / npopr) - critParent; if (crit > critvar) { ubestt = (v[j] + v[j+1]) / 2.0; critvar = crit; } } } if (critvar > critmax) { *ubest = ubestt; *msplit = kv + 1; critmax = critvar; for (j = ndstart; j <= ndend; ++j) { ut[j] = xt[j]; } if (cat[kv] > 1) { for (j = 0; j < cat[kv]; ++j) tavcat[j] = avcat[j]; } } } *decsplit = critmax; /* If best split can not be found, set to terminal node and return. */ if (*msplit != -1) { nl = ndstart; for (j = ndstart; j <= ndend; ++j) { if (ut[j] <= *ubest) { nl++; ncase[nl-1] = jdex[j]; } } *ndendl = imax2(nl - 1, ndstart); nr = *ndendl + 1; for (j = ndstart; j <= ndend; ++j) { if (ut[j] > *ubest) { if (nr >= nsample) break; nr++; ncase[nr - 1] = jdex[j]; } } if (*ndendl >= ndend) *ndendl = ndend - 1; for (j = ndstart; j <= ndend; ++j) jdex[j] = ncase[j]; lc = cat[*msplit - 1]; if (lc > 1) { for (j = 0; j < lc; ++j) { icat[j] = (tavcat[j] < *ubest) ? 
1 : 0; } *ubest = pack(lc, icat); } } else *jstat = 1; Free(ncase); Free(mind); Free(v); Free(yl); Free(xt); Free(ut); } /*====================================================================*/ void predictRegTree(double *x, int nsample, int mdim, int *lDaughter, int *rDaughter, int *nodestatus, double *ypred, double *split, double *nodepred, int *splitVar, int treeSize, int *cat, int maxcat, int *nodex) { int i, j, k, m, *cbestsplit; unsigned int npack; /* decode the categorical splits */ if (maxcat > 1) { cbestsplit = (int *) Calloc(maxcat * treeSize, int); zeroInt(cbestsplit, maxcat * treeSize); for (i = 0; i < treeSize; ++i) { if (nodestatus[i] != NODE_TERMINAL && cat[splitVar[i] - 1] > 1) { npack = (unsigned int) split[i]; /* unpack `npack' into bits */ for (j = 0; npack; npack >>= 1, ++j) { cbestsplit[j + i*maxcat] = npack & 1; } } } } for (i = 0; i < nsample; ++i) { k = 0; while (nodestatus[k] != NODE_TERMINAL) { /* go down the tree */ m = splitVar[k] - 1; if (cat[m] == 1) { k = (x[m + i*mdim] <= split[k]) ? lDaughter[k] - 1 : rDaughter[k] - 1; } else { /* Split by a categorical predictor */ k = cbestsplit[(int) x[m + i * mdim] - 1 + k * maxcat] ? lDaughter[k] - 1 : rDaughter[k] - 1; } } /* terminal node: assign prediction and move on to next */ ypred[i] = nodepred[k]; nodex[i] = k + 1; } if (maxcat > 1) Free(cbestsplit); } randomForest/src/regrf.c0000744000175100001440000003442012037254525015004 0ustar hornikusers/******************************************************************* Copyright (C) 2001-2012 Leo Breiman, Adele Cutler and Merck & Co., Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. *******************************************************************/ #include #include "rf.h" void simpleLinReg(int nsample, double *x, double *y, double *coef, double *mse, int *hasPred); void regRF(double *x, double *y, int *xdim, int *sampsize, int *nthsize, int *nrnodes, int *nTree, int *mtry, int *imp, int *cat, int *maxcat, int *jprint, int *doProx, int *oobprox, int *biasCorr, double *yptr, double *errimp, double *impmat, double *impSD, double *prox, int *treeSize, int *nodestatus, int *lDaughter, int *rDaughter, double *avnode, int *mbest, double *upper, double *mse, int *keepf, int *replace, int *testdat, double *xts, int *nts, double *yts, int *labelts, double *yTestPred, double *proxts, double *msets, double *coef, int *nout, int *inbag) { /************************************************************************* Input: mdim=number of variables in data set nsample=number of cases nthsize=number of cases in a node below which the tree will not split, setting nthsize=5 generally gives good results. nTree=number of trees in run. 200-500 gives pretty good results mtry=number of variables to pick to split on at each node. mdim/3 seems to give genrally good performance, but it can be altered up or down imp=1 turns on variable importance. This is computed for the mth variable as the percent rise in the test set mean sum-of- squared errors when the mth variable is randomly permuted. 
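   Concretely, each tree's out-of-bag (or test-set) predictions are
   recomputed after randomly permuting variable m, and the two mean
   squared errors are compared.  A hedged sketch of that comparison (the
   helper name permImpOneTree and its arguments are hypothetical; ytr and
   ytrPerm stand for the tree's predictions before and after the
   permutation, and in[] flags the in-bag cases): */
static double permImpOneTree(const double *y, const double *ytr,
                             const double *ytrPerm, const int *in, int n) {
    /* Rise in OOB mean squared error caused by the permutation
       (cf. the "percent rise" described above). */
    double errOrig = 0.0, errPerm = 0.0, r;
    int i, nOOB = 0;
    for (i = 0; i < n; ++i) {
        if (in[i] == 0) {                    /* out-of-bag case */
            r = ytr[i] - y[i];      errOrig += r * r;
            r = ytrPerm[i] - y[i];  errPerm += r * r;
            nOOB++;
        }
    }
    return nOOB ? (errPerm - errOrig) / nOOB : 0.0;
}
/* Local variables and workspace for the driver: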
*************************************************************************/ double errts = 0.0, averrb, meanY, meanYts, varY, varYts, r, xrand, errb = 0.0, resid=0.0, ooberr, ooberrperm, delta, *resOOB; double *yb, *xtmp, *xb, *ytr, *ytree, *tgini; int k, m, mr, n, nOOB, j, jout, idx, ntest, last, ktmp, nPerm, nsample, mdim, keepF, keepInbag; int *oobpair, varImp, localImp, *varUsed; int *in, *nind, *nodex, *nodexts; nsample = xdim[0]; mdim = xdim[1]; ntest = *nts; varImp = imp[0]; localImp = imp[1]; nPerm = imp[2]; keepF = keepf[0]; keepInbag = keepf[1]; if (*jprint == 0) *jprint = *nTree + 1; yb = (double *) S_alloc(*sampsize, sizeof(double)); xb = (double *) S_alloc(mdim * *sampsize, sizeof(double)); ytr = (double *) S_alloc(nsample, sizeof(double)); xtmp = (double *) S_alloc(nsample, sizeof(double)); resOOB = (double *) S_alloc(nsample, sizeof(double)); in = (int *) S_alloc(nsample, sizeof(int)); nodex = (int *) S_alloc(nsample, sizeof(int)); varUsed = (int *) S_alloc(mdim, sizeof(int)); nind = *replace ? NULL : (int *) S_alloc(nsample, sizeof(int)); if (*testdat) { ytree = (double *) S_alloc(ntest, sizeof(double)); nodexts = (int *) S_alloc(ntest, sizeof(int)); } oobpair = (*doProx && *oobprox) ? (int *) S_alloc(nsample * nsample, sizeof(int)) : NULL; /* If variable importance is requested, tgini points to the second "column" of errimp, otherwise it's just the same as errimp. */ tgini = varImp ? errimp + mdim : errimp; averrb = 0.0; meanY = 0.0; varY = 0.0; zeroDouble(yptr, nsample); zeroInt(nout, nsample); for (n = 0; n < nsample; ++n) { varY += n * (y[n] - meanY)*(y[n] - meanY) / (n + 1); meanY = (n * meanY + y[n]) / (n + 1); } varY /= nsample; varYts = 0.0; meanYts = 0.0; if (*testdat) { for (n = 0; n < ntest; ++n) { varYts += n * (yts[n] - meanYts)*(yts[n] - meanYts) / (n + 1); meanYts = (n * meanYts + yts[n]) / (n + 1); } varYts /= ntest; } if (*doProx) { zeroDouble(prox, nsample * nsample); if (*testdat) zeroDouble(proxts, ntest * (nsample + ntest)); } if (varImp) { zeroDouble(errimp, mdim * 2); if (localImp) zeroDouble(impmat, nsample * mdim); } else { zeroDouble(errimp, mdim); } if (*labelts) zeroDouble(yTestPred, ntest); /* print header for running output */ if (*jprint <= *nTree) { Rprintf(" | Out-of-bag "); if (*testdat) Rprintf("| Test set "); Rprintf("|\n"); Rprintf("Tree | MSE %%Var(y) "); if (*testdat) Rprintf("| MSE %%Var(y) "); Rprintf("|\n"); } GetRNGstate(); /************************************* * Start the loop over trees. *************************************/ for (j = 0; j < *nTree; ++j) { idx = keepF ? j * *nrnodes : 0; zeroInt(in, nsample); zeroInt(varUsed, mdim); /* Draw a random sample for growing a tree. 
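      The two branches below are the usual bootstrap (sampling with
      replacement, the default) and subsampling without replacement via the
      index-swapping on nind.  In R terms the draw is roughly:

          n <- 100; sampsize <- 63
          bag   <- sample(n, sampsize, replace = TRUE)    # duplicates allowed
          sub   <- sample(n, sampsize, replace = FALSE)   # plain subsample
          inBag <- tabulate(bag, nbins = n) > 0           # the role of in[]

      Cases with inBag == FALSE are the out-of-bag cases used further down
      for the error estimate and for variable importance.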
*/ if (*replace) { /* sampling with replacement */ for (n = 0; n < *sampsize; ++n) { xrand = unif_rand(); k = xrand * nsample; in[k] = 1; yb[n] = y[k]; for(m = 0; m < mdim; ++m) { xb[m + n * mdim] = x[m + k * mdim]; } } } else { /* sampling w/o replacement */ for (n = 0; n < nsample; ++n) nind[n] = n; last = nsample - 1; for (n = 0; n < *sampsize; ++n) { ktmp = (int) (unif_rand() * (last+1)); k = nind[ktmp]; swapInt(nind[ktmp], nind[last]); last--; in[k] = 1; yb[n] = y[k]; for(m = 0; m < mdim; ++m) { xb[m + n * mdim] = x[m + k * mdim]; } } } if (keepInbag) { for (n = 0; n < nsample; ++n) inbag[n + j * nsample] = in[n]; } /* grow the regression tree */ regTree(xb, yb, mdim, *sampsize, lDaughter + idx, rDaughter + idx, upper + idx, avnode + idx, nodestatus + idx, *nrnodes, treeSize + j, *nthsize, *mtry, mbest + idx, cat, tgini, varUsed); /* predict the OOB data with the current tree */ /* ytr is the prediction on OOB data by the current tree */ predictRegTree(x, nsample, mdim, lDaughter + idx, rDaughter + idx, nodestatus + idx, ytr, upper + idx, avnode + idx, mbest + idx, treeSize[j], cat, *maxcat, nodex); /* yptr is the aggregated prediction by all trees grown so far */ errb = 0.0; ooberr = 0.0; jout = 0; /* jout is the number of cases that has been OOB so far */ nOOB = 0; /* nOOB is the number of OOB samples for this tree */ for (n = 0; n < nsample; ++n) { if (in[n] == 0) { nout[n]++; nOOB++; yptr[n] = ((nout[n]-1) * yptr[n] + ytr[n]) / nout[n]; resOOB[n] = ytr[n] - y[n]; ooberr += resOOB[n] * resOOB[n]; } if (nout[n]) { jout++; errb += (y[n] - yptr[n]) * (y[n] - yptr[n]); } } errb /= jout; /* Do simple linear regression of y on yhat for bias correction. */ if (*biasCorr) simpleLinReg(nsample, yptr, y, coef, &errb, nout); /* predict testset data with the current tree */ if (*testdat) { predictRegTree(xts, ntest, mdim, lDaughter + idx, rDaughter + idx, nodestatus + idx, ytree, upper + idx, avnode + idx, mbest + idx, treeSize[j], cat, *maxcat, nodexts); /* ytree is the prediction for test data by the current tree */ /* yTestPred is the average prediction by all trees grown so far */ errts = 0.0; for (n = 0; n < ntest; ++n) { yTestPred[n] = (j * yTestPred[n] + ytree[n]) / (j + 1); } /* compute testset MSE */ if (*labelts) { for (n = 0; n < ntest; ++n) { resid = *biasCorr ? yts[n] - (coef[0] + coef[1]*yTestPred[n]) : yts[n] - yTestPred[n]; errts += resid * resid; } errts /= ntest; } } /* Print running output. */ if ((j + 1) % *jprint == 0) { Rprintf("%4d |", j + 1); Rprintf(" %8.4g %8.2f ", errb, 100 * errb / varY); if(*labelts == 1) Rprintf("| %8.4g %8.2f ", errts, 100.0 * errts / varYts); Rprintf("|\n"); } mse[j] = errb; if (*labelts) msets[j] = errts; /* DO PROXIMITIES */ if (*doProx) { computeProximity(prox, *oobprox, nodex, in, oobpair, nsample); /* proximity for test data */ if (*testdat) { /* In the next call, in and oobpair are not used. 
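      Proximity between two cases is the fraction of trees in which they land
      in the same terminal node.  Given a matrix of terminal-node ids (for
      example the "nodes" attribute returned by predict(..., nodes = TRUE)),
      an illustrative R helper doing the same accumulation is:

          proxFromNodes <- function(nodes) {
              n <- nrow(nodes)
              prox <- matrix(0, n, n)
              for (b in seq_len(ncol(nodes)))
                  prox <- prox + outer(nodes[, b], nodes[, b], "==")
              prox / ncol(nodes)
          }

      When oobprox is set the C code additionally restricts the count to
      pairs that are jointly out-of-bag, which this sketch ignores.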
*/ computeProximity(proxts, 0, nodexts, in, oobpair, ntest); for (n = 0; n < ntest; ++n) { for (k = 0; k < nsample; ++k) { if (nodexts[n] == nodex[k]) { proxts[n + ntest * (k+ntest)] += 1.0; } } } } } /* Variable importance */ if (varImp) { for (mr = 0; mr < mdim; ++mr) { if (varUsed[mr]) { /* Go ahead if the variable is used */ /* make a copy of the m-th variable into xtmp */ for (n = 0; n < nsample; ++n) xtmp[n] = x[mr + n * mdim]; ooberrperm = 0.0; for (k = 0; k < nPerm; ++k) { permuteOOB(mr, x, in, nsample, mdim); predictRegTree(x, nsample, mdim, lDaughter + idx, rDaughter + idx, nodestatus + idx, ytr, upper + idx, avnode + idx, mbest + idx, treeSize[j], cat, *maxcat, nodex); for (n = 0; n < nsample; ++n) { if (in[n] == 0) { r = ytr[n] - y[n]; ooberrperm += r * r; if (localImp) { impmat[mr + n * mdim] += (r*r - resOOB[n]*resOOB[n]) / nPerm; } } } } delta = (ooberrperm / nPerm - ooberr) / nOOB; errimp[mr] += delta; impSD[mr] += delta * delta; /* copy original data back */ for (n = 0; n < nsample; ++n) x[mr + n * mdim] = xtmp[n]; } } } } PutRNGstate(); /* end of tree iterations=======================================*/ if (*biasCorr) { /* bias correction for predicted values */ for (n = 0; n < nsample; ++n) { if (nout[n]) yptr[n] = coef[0] + coef[1] * yptr[n]; } if (*testdat) { for (n = 0; n < ntest; ++n) { yTestPred[n] = coef[0] + coef[1] * yTestPred[n]; } } } if (*doProx) { for (n = 0; n < nsample; ++n) { for (k = n + 1; k < nsample; ++k) { prox[nsample*k + n] /= oobprox ? (oobpair[nsample*k + n] > 0 ? oobpair[nsample*k + n] : 1) : *nTree; prox[nsample * n + k] = prox[nsample * k + n]; } prox[nsample * n + n] = 1.0; } if (*testdat) { for (n = 0; n < ntest; ++n) for (k = 0; k < ntest + nsample; ++k) proxts[ntest*k + n] /= *nTree; } } if (varImp) { for (m = 0; m < mdim; ++m) { errimp[m] = errimp[m] / *nTree; impSD[m] = sqrt( ((impSD[m] / *nTree) - (errimp[m] * errimp[m])) / *nTree ); if (localImp) { for (n = 0; n < nsample; ++n) { impmat[m + n * mdim] /= nout[n]; } } } } for (m = 0; m < mdim; ++m) tgini[m] /= *nTree; } /*----------------------------------------------------------------------*/ void regForest(double *x, double *ypred, int *mdim, int *n, int *ntree, int *lDaughter, int *rDaughter, int *nodestatus, int *nrnodes, double *xsplit, double *avnodes, int *mbest, int *treeSize, int *cat, int *maxcat, int *keepPred, double *allpred, int *doProx, double *proxMat, int *nodes, int *nodex) { int i, j, idx1, idx2, *junk; double *ytree; junk = NULL; ytree = (double *) S_alloc(*n, sizeof(double)); if (*nodes) { zeroInt(nodex, *n * *ntree); } else { zeroInt(nodex, *n); } if (*doProx) zeroDouble(proxMat, *n * *n); if (*keepPred) zeroDouble(allpred, *n * *ntree); idx1 = 0; idx2 = 0; for (i = 0; i < *ntree; ++i) { zeroDouble(ytree, *n); predictRegTree(x, *n, *mdim, lDaughter + idx1, rDaughter + idx1, nodestatus + idx1, ytree, xsplit + idx1, avnodes + idx1, mbest + idx1, treeSize[i], cat, *maxcat, nodex + idx2); for (j = 0; j < *n; ++j) ypred[j] += ytree[j]; if (*keepPred) { for (j = 0; j < *n; ++j) allpred[j + i * *n] = ytree[j]; } /* if desired, do proximities for this round */ if (*doProx) computeProximity(proxMat, 0, nodex + idx2, junk, junk, *n); idx1 += *nrnodes; /* increment the offset */ if (*nodes) idx2 += *n; } for (i = 0; i < *n; ++i) ypred[i] /= *ntree; if (*doProx) { for (i = 0; i < *n; ++i) { for (j = i + 1; j < *n; ++j) { proxMat[i + j * *n] /= *ntree; proxMat[j + i * *n] = proxMat[i + j * *n]; } proxMat[i + i * *n] = 1.0; } } } void simpleLinReg(int nsample, double *x, double *y, 
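/* simpleLinReg() implements the corr.bias option: the observed responses are
   regressed on the out-of-bag predictions (here x = yptr, y = y from regRF)
   and the fitted line is then used to adjust all predictions.  A hedged R
   equivalent, assuming a response vector y and OOB predictions yhatOOB:

       b <- coef(lm(y ~ yhatOOB))         # intercept and slope
       corrected <- b[1] + b[2] * yhatOOB

   which mirrors the coef[0] + coef[1] * yptr[n] adjustment applied after the
   tree loop above. */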
double *coef, double *mse, int *hasPred) { /* Compute simple linear regression of y on x, returning the coefficients, the average squared residual, and the predicted values (overwriting y). */ int i, nout = 0; double sxx=0.0, sxy=0.0, xbar=0.0, ybar=0.0; double dx = 0.0, dy = 0.0, py=0.0; for (i = 0; i < nsample; ++i) { if (hasPred[i]) { nout++; xbar += x[i]; ybar += y[i]; } } xbar /= nout; ybar /= nout; for (i = 0; i < nsample; ++i) { if (hasPred[i]) { dx = x[i] - xbar; dy = y[i] - ybar; sxx += dx * dx; sxy += dx * dy; } } coef[1] = sxy / sxx; coef[0] = ybar - coef[1] * xbar; *mse = 0.0; for (i = 0; i < nsample; ++i) { if (hasPred[i]) { py = coef[0] + coef[1] * x[i]; dy = y[i] - py; *mse += dy * dy; /* y[i] = py; */ } } *mse /= nout; return; } randomForest/src/classTree.c0000744000175100001440000004242312037254525015626 0ustar hornikusers/******************************************************************* Copyright (C) 2001-9 Leo Breiman, Adele Cutler and Merck & Co., Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. *******************************************************************/ #include #include #include "rf.h" #ifdef C_CLASSTREE void classTree(int *a, int *b, int *class, int *cat, int mdim, int nsample, int nclass, int *treemap, int *bestvar, double *bestsplit, double *bestsplitnext, double *tgini, int *nodeStatus, int *nodePop, int *nodeStart, double *tclassPop, int maxNodes, int nodeSize, int *ncase, int *inBag, int mTry, int *varUsed, int *nodeClass, int *treeSize, double *win) { /* Buildtree consists of repeated calls to two subroutines, Findbestsplit and Movedata. Findbestsplit does just that--it finds the best split of the current node. Movedata moves the data in the split node right and left so that the data corresponding to each child node is contiguous. The buildtree bookkeeping is different from that in Friedman's original CART program. ncur is the total number of nodes to date. nodeStatus(k)=1 if the kth node has been split. nodeStatus(k)=2 if the node exists but has not yet been split, and =-1 of the node is terminal. A node is terminal if its size is below a threshold value, or if it is all one class, or if all the x-values are equal. If the current node k is split, then its children are numbered ncur+1 (left), and ncur+2(right), ncur increases to ncur+2 and the next node to be split is numbered k+1. When no more nodes can be split, buildtree returns to the main program. 
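      The flat node arrays maintained by this bookkeeping (status, split
      variable, split point, daughters, prediction) are what getTree()
      exposes on the R side, e.g.

          rf <- randomForest(Species ~ ., data = iris, ntree = 10)
          head(getTree(rf, k = 1, labelVar = TRUE))

      which prints one row per node of the first tree, with terminal nodes
      flagged by status -1.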
*/ /* integer a(mdim,nsample),cl(nsample),cat(mdim), treemap(2,numNodes),bestvar(numNodes), bestsplit(numNodes), nodeStatus(numNodes),ta(nsample), nodePop(numNodes),nodeStart(numNodes), bestsplitnext(numNodes),idmove(nsample), ncase(nsample),parent(numNodes),b(mdim,nsample), jin(nsample),iv(mred),nodeclass(numNodes),mind(mred) double precision tclasspop(nclass),classpop(nclass,numNodes), 1 tclasscat(nclass,32),win(nsample),wr(nclass),wc(nclass), 1 wl(nclass),tgini(mdim), xrand */ int msplit = 0, i, j; zeroInt(nodeStatus, maxNodes); zeroInt(nodeStart, maxNodes); zeroInt(nodePop, maxNodes); zeroDouble(classPop, nclass * maxNodes); for (i = 0; i < nclass; ++i) classPop[i] = tclassPop[i]; ncur = 1; nodeStart[0] = 1; nodePop[0] = *nuse; nodeStatus[0] = NODE_TOSPLIT; /* 2: not split yet, 1: split, -1: terminal */ /* start main loop */ for (i = 0; i < numNodes; ++i) { if (i > ncur - 1) break; if (nodeStatus[i] != NODE_TOSPLIT) continue; /* initialize for next call to findbestsplit */ ndstart = nodeStart[i]; ndend = ndstart + nodePop[i] - 1; for (j = 0; j < nclass; ++j) { tclassPop[j] = classPop[j + i * nclass]; } jstat = 0; F77_CALL(findbestsplit)(a, b, cl, mdim, nsample, nclass, cat, ndstart, ndend, tclassPop, tclasscat, &msplit, &decsplit, &nbest, ncase, &jstat, inBag, mTry, win, wr, wc, wl, mred, i, mind); if (jstat == 1) { nodeStatus[i] = NODE_TERMINAL; continue; } else { bestvar[i] = msplit; varUsed[msplit - 1] = 1; tgini[msplit - 1] += decsplit; if (cat[msplit-1] == 1) { bestsplit[i] = a[msplit - 1 + nbest * mdim]; bestsplitnext[i] = a[msplit - 1 + (nbest + 1) * mdim]; } else { bestsplit[i] = nbest; bestsplitnext[i] = 0; } } F77_CALL(movedata)(a, ta, mdim, nsample, ndstart, ndend, idmove, ncase, msplit, cat, nbest, ndendl); /* leftnode no.= ncur+1, rightnode no. = ncur+2. 
*/ nodePop[ncur+1] = ndendl - ndstart + 1; nodePop[ncur+2] = ndend - ndendl; nodeStart[ncur+1] = ndstart; nodeStart[ncur+2] = ndendl + 1; /* find class populations in both nodes */ for (n = ndstart; n <= ndendl; ++n) { nc = ncase[n]; j = class[nc-1]; classPop[j - 1 + (ncur+1)*mdim] += win[nc - 1]; } for (n = ndendl + 1; n <= ndend; ++n) { nc = ncase[n]; j = cl[nc - 1]; classPop[j - 1 + (ncur+2) * mdim] += win[nc - 1]; } /* check on nodeStatus */ nodeStatus[ncur + 1] = NODE_TOSPLIT; nodeStatus[ncur + 2] = NODE_TOSPLIT; if (nodePop[ncur + 1] <= ndsize) nodeStatus[ncur+1] = NODE_TERMINAL; if (nodePop[ncur + 2] <= ndsize) nodeStatus[ncur+2] = NODE_TERMINAL; popt1 = 0; popt2 = 0; for (j = 0; j < nclass; ++j) { popt1 += classPop[j + (ncur+1) * mdim]; popt2 += classPop[j + (ncur+2) * mdim]; } for (j = 0; j < nclass; ++j) { if (classPop[j + (ncur+1) * mdim] == popt1) nodeStatus[ncur+1] = NODE_TERMINAL; if (classPop[j + (ncur+2) * mdim] == popt2) nodeStatus[ncur+2] = NODE_TERMINAL; } treemap[i * 2] = ncur + 1; treemap[1 + i * 2] = ncur + 2; nodeStatus[i] = NODE_INTERIOR; ncur += 2; if (ncur >= numNodes) break; } ndbigtree = numNodes; for (k = numNodes-1; k >= 0; --k) { if (nodeStatus[k] == 0) ndbigtree--; if (nodeStatus[k] == NODE_TOSPLIT) nodeStatus[k] = NODE_TERMINAL; } for (k = 0; k < ndbigtree; ++k) { if (nodeStatus[k] == NODE_TERMINAL) { pp = 0; ntie = 1; for (j = 0; j < nclass; ++j) { if (classPop[j + k * nclass] > pp) { nodeClass[k] = j; pp = classPop[j + k * nclass]; ntie = 1; } /* Break ties at random: */ if (classPop[j + k * nclass] == pp) { if (unif_rand() < 1.0 / ntie) { nodeClass[k] = j; pp = classPop[j + k * nclass]; } ntie++; } } } } } void findBestSplit(int *a, double *b, int *class, int mDim, int nSample, int nClass, int *nCat, int maxCat, int ndStart, int ndEnd, double *classCount, double *classCatTable, int *splitVar, double *decGini, int *bestSplit, int *ncase, int *splitStatus, int *inBag, int mtry, double *weight, double *wr, double *wc, double *wl, int *currentNode, int *mind) { /* subroutine findbestsplit(a, b, cl, mdim, nsample, nclass, cat, 1 maxcat, ndstart, ndend, tclasspop, tclasscat, msplit, 2 decsplit, nbest, ncase, jstat, jin, mtry, win, wr, wc, wl, 3 mred, kbuild, mind) */ /* For the best split, msplit is the variable split on. decsplit is the dec. in impurity. If msplit is numerical, nsplit is the case number of value of msplit split on, and nsplitnext is the case number of the next larger value of msplit. If msplit is categorical, then nsplit is the coding into an integer of the categories going left. */ integer a(mdim,nsample), cl(nsample), cat(mdim), 1 ncase(nsample), b(mdim,nsample), jin(nsample), nn, j double precision tclasspop(nclass), tclasscat(nclass,32), dn(32), 1 win(nsample), wr(nclass), wc(nclass), wl(nclass), xrand integer mind(mred), ncmax, ncsplit,nhit ncmax = 10; ncsplit = 512; /* compute initial values of numerator and denominator of Gini */ parentNum = 0.0; parentDen = 0.0; for (i = 0; i < nClass; ++i) { parentNum += classCount[i] * classCount[i]; parentDen += classCount[i]; } crit0 = pno / pdo; *splitStatus = 0; critmax = -1.0e25; for (i = 0; i < mDim; ++i) mind[i] = i; /* start main loop through variables to find best split. */ last = mDim - 1; for (i = 0, i < mtry; ++i) { /* sample mtry variables w/o replacement. */ j = (int) (unif_rand() * (last + 1)); mvar = mIndex[j]; swapInt(mIndex[j], mIndex[last]); last--; lcat = nCat[mvar]; if (lcat == 1) { /* Split on a numerical predictor. 
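         The running class-weight sums wl/wr below implement the usual
         decrease-in-Gini criterion: a candidate split scores
         sum_k nL_k^2 / nL + sum_k nR_k^2 / nR, maximised over split points
         and compared with the parent node (crit0).  An unweighted, purely
         illustrative R version of the node score (the C code also carries
         the case weights win) is:

             giniScore <- function(yLeft, yRight)
                 sum(table(yLeft)^2) / length(yLeft) +
                 sum(table(yRight)^2) / length(yRight)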
*/ rightNum = parentNum; rightDen = parentDen; leftNum = 0.0; leftDen = 0.0; zeroDouble(wl, nClass); for (j = 0; j < nClass; ++j) wr[j] = classCount[j]; ntie = 1; for (j = ndstart; j <= ndend - 1; ++j) { nc = a[mvar, j-1]; u = weight[nc]; k = class[nc]; leftNum += u * (2 * wl[k-1] + u); rightNum += u * (-2 * wr[k-1] + u); leftDen += u; rightDen -= u; wl[k-1] += u; wr[k-1] -= u; if (b[mvar, nc] < b[mvar, a[mvar, j]]) { if (fmin2(rightDen, leftDen) > 1.0e-5) { crit = (leftNum / leftDen) + (rightNum / rightDen); if (crit > critmax) { *bestSplit = j; critmax = crit; *splitVar = mvar; ntie = 1; } /* Break ties at random: */ if (crit == critmax) { if (unif_rand() < 1.0 / ntie) { *bestSplit = j; critmax = crit; *splitVar = mvar; } ntie++; } } } } } else { /* Split on a categorical predictor. */ zeroDouble(classCatTable, nClass * 32); for (j = ndstart; j <= ndend; ++j) { nc = ncase[j-1]; l = a[mvar, ncase[j-1]]; classCatTable[class[nc-1], l-1] += weight[nc-1]; } nNotEmpty = 0; for (j = 0; j < lcat; ++j) { catSum = 0; for (k = 0; k < nClass; ++k) { catSum += classCatTable[k, j]; } catCount[j] = su; if (catSum > 0) nNotEmpty ++; } nhit = 0; if (nNotEmpty > 1) { F77_CALL(catmax)(parentden, classcatTable, classCount, &nclass, &lcat, bestSplit, &critmax, &nhit, &maxcat, &ncmax, &ncsplit); } if (nhit) *splitVar = mvar; } } if (critmax < -1.0e10 || msplit == 0) { *splitStatus = -1; } else { *decsplit = critmax - crit0; } } #endif /* C_CLASSTREE */ void F77_NAME(catmax)(double *parentDen, double *tclasscat, double *tclasspop, int *nclass, int *lcat, unsigned int *ncatsp, double *critmax, int *nhit, int *maxcat, int *ncmax, int *ncsplit) { /* This finds the best split of a categorical variable with lcat categories and nclass classes, where tclasscat(j, k) is the number of cases in class j with category value k. The method uses an exhaustive search over all partitions of the category values if the number of categories is 10 or fewer. Otherwise ncsplit randomly selected splits are tested and best used. */ int j, k, n, icat[32], nsplit; double leftNum, leftDen, rightNum, decGini, *leftCatClassCount; leftCatClassCount = (double *) Calloc(*nclass, double); *nhit = 0; nsplit = *lcat > *ncmax ? *ncsplit : (int) pow(2.0, (double) *lcat - 1) - 1; for (n = 0; n < nsplit; ++n) { zeroInt(icat, 32); if (*lcat > *ncmax) { /* Generate random split. TODO: consider changing to generating random bits with more efficient algorithm */ for (j = 0; j < *lcat; ++j) icat[j] = unif_rand() > 0.5 ? 1 : 0; } else { unpack(*lcat, (unsigned int) n + 1, icat); } for (j = 0; j < *nclass; ++j) { leftCatClassCount[j] = 0; for (k = 0; k < *lcat; ++k) { if (icat[k]) { leftCatClassCount[j] += tclasscat[j + k * *nclass]; } } } leftNum = 0.0; leftDen = 0.0; for (j = 0; j < *nclass; ++j) { leftNum += leftCatClassCount[j] * leftCatClassCount[j]; leftDen += leftCatClassCount[j]; } /* If either node is empty, try another split. */ if (leftDen <= 1.0e-8 || *parentDen - leftDen <= 1.0e-5) continue; rightNum = 0.0; for (j = 0; j < *nclass; ++j) { leftCatClassCount[j] = tclasspop[j] - leftCatClassCount[j]; rightNum += leftCatClassCount[j] * leftCatClassCount[j]; } decGini = (leftNum / leftDen) + (rightNum / (*parentDen - leftDen)); if (decGini > *critmax) { *critmax = decGini; *nhit = 1; *ncatsp = *lcat > *ncmax ? 
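/* The chosen split is returned packed into one integer whose bits flag the
   categories sent to the left daughter (see pack()/unpack() in rfutils.c).
   An illustrative R sketch of that encoding, assuming at most 31 categories
   (packCats/unpackCats are stand-ins for the C helpers):

       packCats   <- function(goesLeft) sum(2^(which(goesLeft) - 1))
       unpackCats <- function(code, ncat) bitwAnd(code, 2^(0:(ncat - 1))) > 0
       packCats(c(TRUE, FALSE, TRUE, TRUE))    # categories 1, 3, 4 go left: 13
       unpackCats(13, 4)                       # TRUE FALSE TRUE TRUE
*/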
pack((unsigned int) *lcat, icat) : n + 1; } } Free(leftCatClassCount); } /* Find best split of with categorical variable when there are two classes */ void F77_NAME(catmaxb)(double *totalWt, double *tclasscat, double *classCount, int *nclass, int *nCat, unsigned int *nbest, double *critmax, int *nhit, double *catCount) { double catProportion[32], cp[32], cm[32]; int kcat[32]; int i, j; double bestsplit=0.0, rightDen, leftDen, leftNum, rightNum, crit; *nhit = 0; for (i = 0; i < *nCat; ++i) { catProportion[i] = catCount[i] ? tclasscat[i * *nclass] / catCount[i] : 0.0; kcat[i] = i + 1; } R_qsort_I(catProportion, kcat, 1, *nCat); for (i = 0; i < *nclass; ++i) { cp[i] = 0; cm[i] = classCount[i]; } rightDen = *totalWt; leftDen = 0.0; for (i = 0; i < *nCat - 1; ++i) { leftDen += catCount[kcat[i]-1]; rightDen -= catCount[kcat[i]-1]; leftNum = 0.0; rightNum = 0.0; for (j = 0; j < *nclass; ++j) { cp[j] += tclasscat[j + (kcat[i]-1) * *nclass]; cm[j] -= tclasscat[j + (kcat[i]-1) * *nclass]; leftNum += cp[j] * cp[j]; rightNum += cm[j] * cm[j]; } if (catProportion[i] < catProportion[i + 1]) { /* If neither node is empty, check the split. */ if (rightDen > 1.0e-5 && leftDen > 1.0e-5) { crit = (leftNum / leftDen) + (rightNum / rightDen); if (crit > *critmax) { *critmax = crit; bestsplit = .5 * (catProportion[i] + catProportion[i + 1]); *nhit = 1; } } } } if (*nhit == 1) { zeroInt(kcat, *nCat); for (i = 0; i < *nCat; ++i) { catProportion[i] = catCount[i] ? tclasscat[i * *nclass] / catCount[i] : 0.0; kcat[i] = catProportion[i] < bestsplit ? 1 : 0; /* Rprintf("%i ", kcat[i]); */ } *nbest = pack(*nCat, kcat); /* Rprintf("\nnbest=%u\nnbest=%i\n", *nbest, *nbest); */ } } void predictClassTree(double *x, int n, int mdim, int *treemap, int *nodestatus, double *xbestsplit, int *bestvar, int *nodeclass, int treeSize, int *cat, int nclass, int *jts, int *nodex, int maxcat) { int m, i, j, k, *cbestsplit; unsigned int npack; /* decode the categorical splits */ if (maxcat > 1) { cbestsplit = (int *) Calloc(maxcat * treeSize, int); zeroInt(cbestsplit, maxcat * treeSize); for (i = 0; i < treeSize; ++i) { if (nodestatus[i] != NODE_TERMINAL) { if (cat[bestvar[i] - 1] > 1) { npack = (unsigned int) xbestsplit[i]; /* unpack `npack' into bits */ for (j = 0; npack; npack >>= 1, ++j) { cbestsplit[j + i*maxcat] = npack & 01; } } } } } for (i = 0; i < n; ++i) { k = 0; while (nodestatus[k] != NODE_TERMINAL) { m = bestvar[k] - 1; if (cat[m] == 1) { /* Split by a numerical predictor */ k = (x[m + i * mdim] <= xbestsplit[k]) ? treemap[k * 2] - 1 : treemap[1 + k * 2] - 1; } else { /* Split by a categorical predictor */ k = cbestsplit[(int) x[m + i * mdim] - 1 + k * maxcat] ? 
treemap[k * 2] - 1 : treemap[1 + k * 2] - 1; } } /* Terminal node: assign class label */ jts[i] = nodeclass[k]; nodex[i] = k + 1; } if (maxcat > 1) Free(cbestsplit); } randomForest/R/0000755000175100001440000000000012037254525013141 5ustar hornikusersrandomForest/R/zzz.R0000744000175100001440000000043712037254524014125 0ustar hornikusers.onAttach <- function(libname, pkgname) { RFver <- read.dcf(file=system.file("DESCRIPTION", package=pkgname), fields="Version") packageStartupMessage(paste(pkgname, RFver)) packageStartupMessage("Type rfNews() to see new features/changes/bug fixes.") } randomForest/R/varUsed.R0000744000175100001440000000155312037254524014701 0ustar hornikusersvarUsed <- function(x, by.tree=FALSE, count=TRUE) { if (!inherits(x, "randomForest")) stop(deparse(substitute(x)), "is not a randomForest object") if (is.null(x$forest)) stop(deparse(substitute(x)), "does not contain forest") p <- length(x$forest$ncat) # Total number of variables. if (count) { if (by.tree) { v <- apply(x$forest$bestvar, 2, function(x) { xx <- numeric(p) y <- table(x[x>0]) xx[as.numeric(names(y))] <- y xx }) } else { v <- numeric(p) vv <- table(x$forest$bestvar[x$forest$bestvar > 0]) v[as.numeric(names(vv))] <- vv } } else { v <- apply(x$forest$bestvar, 2, function(x) sort(unique(x[x>0]))) if(!by.tree) v <- sort(unique(unlist(v))) } v } randomForest/R/varImpPlot.R0000744000175100001440000000234512037254524015365 0ustar hornikusersvarImpPlot <- function(x, sort=TRUE, n.var=min(30, nrow(x$importance)), type=NULL, class=NULL, scale=TRUE, main=deparse(substitute(x)), ...) { if (!inherits(x, "randomForest")) stop("This function only works for objects of class `randomForest'") imp <- importance(x, class=class, scale=scale, type=type, ...) ## If there are more than two columns, just use the last two columns. if (ncol(imp) > 2) imp <- imp[, -(1:(ncol(imp) - 2))] nmeas <- ncol(imp) if (nmeas > 1) { op <- par(mfrow=c(1, 2), mar=c(4, 5, 4, 1), mgp=c(2, .8, 0), oma=c(0, 0, 2, 0), no.readonly=TRUE) on.exit(par(op)) } for (i in 1:nmeas) { ord <- if (sort) rev(order(imp[,i], decreasing=TRUE)[1:n.var]) else 1:n.var xmin <- if (colnames(imp)[i] %in% c("IncNodePurity", "MeanDecreaseGini")) 0 else min(imp[ord, i]) dotchart(imp[ord,i], xlab=colnames(imp)[i], ylab="", main=if (nmeas == 1) main else NULL, xlim=c(xmin, max(imp[,i])), ...) } if (nmeas > 1) mtext(outer=TRUE, side=3, text=main, cex=1.2) invisible(imp) } randomForest/R/tuneRF.R0000744000175100001440000000455012037254524014473 0ustar hornikuserstuneRF <- function(x, y, mtryStart=if(is.factor(y)) floor(sqrt(ncol(x))) else floor(ncol(x)/3), ntreeTry=50, stepFactor=2, improve=0.05, trace=TRUE, plot=TRUE, doBest=FALSE, ...) 
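## A small usage sketch with simulated data and arbitrary settings: starting
## from mtryStart, the search repeatedly multiplies/divides mtry by stepFactor
## and stops once the relative drop in OOB error falls below `improve'.
##     set.seed(17)
##     x <- matrix(rnorm(200 * 10), 200, 10)
##     y <- factor(x[, 1] + rnorm(200) > 0)
##     tuneRF(x, y, ntreeTry = 100, stepFactor = 1.5, improve = 0.01)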
{ if (improve < 0) stop ("improve must be non-negative.") classRF <- is.factor(y) errorOld <- if (classRF) { randomForest(x, y, mtry=mtryStart, ntree=ntreeTry, keep.forest=FALSE, ...)$err.rate[ntreeTry,1] } else { randomForest(x, y, mtry=mtryStart, ntree=ntreeTry, keep.forest=FALSE, ...)$mse[ntreeTry] } if (errorOld < 0) stop("Initial setting gave 0 error and no room for improvement.") if (trace) { cat("mtry =", mtryStart, " OOB error =", if (classRF) paste(100*round(errorOld, 4), "%", sep="") else errorOld, "\n") } oobError <- list() oobError[[1]] <- errorOld names(oobError)[1] <- mtryStart for (direction in c("left", "right")) { if (trace) cat("Searching", direction, "...\n") Improve <- 1.1*improve mtryBest <- mtryStart mtryCur <- mtryStart while (Improve >= improve) { mtryOld <- mtryCur mtryCur <- if (direction == "left") { max(1, ceiling(mtryCur / stepFactor)) } else { min(ncol(x), floor(mtryCur * stepFactor)) } if (mtryCur == mtryOld) break errorCur <- if (classRF) { randomForest(x, y, mtry=mtryCur, ntree=ntreeTry, keep.forest=FALSE, ...)$err.rate[ntreeTry,"OOB"] } else { randomForest(x, y, mtry=mtryCur, ntree=ntreeTry, keep.forest=FALSE, ...)$mse[ntreeTry] } if (trace) { cat("mtry =",mtryCur, "\tOOB error =", if (classRF) paste(100*round(errorCur, 4), "%", sep="") else errorCur, "\n") } oobError[[as.character(mtryCur)]] <- errorCur Improve <- 1 - errorCur/errorOld cat(Improve, improve, "\n") if (Improve > improve) { errorOld <- errorCur mtryBest <- mtryCur } } } mtry <- sort(as.numeric(names(oobError))) res <- unlist(oobError[as.character(mtry)]) res <- cbind(mtry=mtry, OOBError=res) if (plot) { plot(res, xlab=expression(m[try]), ylab="OOB Error", type="o", log="x", xaxt="n") axis(1, at=res[,"mtry"]) } if (doBest) res <- randomForest(x, y, mtry=res[which.min(res[,2]), 1], ...) res } randomForest/R/treesize.R0000744000175100001440000000046412037254524015122 0ustar hornikuserstreesize <- function(x, terminal=TRUE) { if(!inherits(x, "randomForest")) stop("This function only works for objects of class `randomForest'") if(is.null(x$forest)) stop("The object must contain the forest component") if(terminal) return((x$forest$ndbigtree+1)/2) else return(x$forest$ndbigtree) } randomForest/R/rfNews.R0000744000175100001440000000017012037254524014526 0ustar hornikusersrfNews <- function() { newsfile <- file.path(system.file(package="randomForest"), "NEWS") file.show(newsfile) } randomForest/R/rfImpute.R0000744000175100001440000000376012037254524015065 0ustar hornikusersrfImpute <- function(x, ...) UseMethod("rfImpute") rfImpute.formula <- function(x, data, ..., subset) { if (!inherits(x, "formula")) stop("method is only for formula objects") call <- match.call() m <- match.call(expand.dots = FALSE) names(m)[2] <- "formula" if (is.matrix(eval(m$data, parent.frame()))) m$data <- as.data.frame(data) m$... <- NULL m$na.action <- as.name("na.pass") m[[1]] <- as.name("model.frame") m <- eval(m, parent.frame()) Terms <- attr(m, "terms") attr(Terms, "intercept") <- 0 y <- model.response(m) if (!is.null(y)) m <- m[,-1] for (i in seq(along=ncol(m))) { if(is.ordered(m[[i]])) m[[i]] <- as.numeric(m[[i]]) } ret <- rfImpute.default(m, y, ...) names(ret)[1] <- deparse(as.list(x)[[2]]) ret } rfImpute.default <- function(x, y, iter=5, ntree=300, ...) 
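## Usage sketch: y must be complete; NAs in x are first filled by
## na.roughfix() and then refined `iter' times using proximity-weighted means
## (proximity-weighted votes for factor columns).  For example, with some
## hypothetical missingness injected into iris:
##     data(iris)
##     iris.na <- iris
##     iris.na[sample(150, 20), 1] <- NA
##     iris.fixed <- rfImpute(iris.na[, -5], iris.na$Species)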
{ if (any(is.na(y))) stop("Can't have NAs in", deparse(substitute(y))) if (!any(is.na(x))) stop("No NAs found in ", deparse(substitute(x))) xf <- na.roughfix(x) hasNA <- which(apply(x, 2, function(x) any(is.na(x)))) if (is.data.frame(x)) { isfac <- sapply(x, is.factor) } else { isfac <- rep(FALSE, ncol(x)) } for (i in 1:iter) { prox <- randomForest(xf, y, ntree=ntree, ..., do.trace=ntree, proximity=TRUE)$proximity for (j in hasNA) { miss <- which(is.na(x[, j])) if (isfac[j]) { lvl <- levels(x[[j]]) catprox <- apply(prox[-miss, miss, drop=FALSE], 2, function(v) lvl[which.max(tapply(v, x[[j]][-miss], mean))]) xf[miss, j] <- catprox } else { sumprox <- colSums(prox[-miss, miss, drop=FALSE]) xf[miss, j] <- (prox[miss, -miss, drop=FALSE] %*% xf[,j][-miss]) / (1e-8 + sumprox) } NULL } } xf <- cbind(y, xf) names(xf)[1] <- deparse(substitute(y)) xf } randomForest/R/rfcv.R0000744000175100001440000000511412037254524014225 0ustar hornikusersrfcv <- function(trainx, trainy, cv.fold=5, scale="log", step=0.5, mtry=function(p) max(1, floor(sqrt(p))), recursive=FALSE, ...) { classRF <- is.factor(trainy) n <- nrow(trainx) p <- ncol(trainx) if (scale == "log") { k <- floor(log(p, base=1/step)) n.var <- round(p * step^(0:(k-1))) same <- diff(n.var) == 0 if (any(same)) n.var <- n.var[-which(same)] if (! 1 %in% n.var) n.var <- c(n.var, 1) } else { n.var <- seq(from=p, to=1, by=step) } k <- length(n.var) cv.pred <- vector(k, mode="list") for (i in 1:k) cv.pred[[i]] <- trainy ## Generate the indices of the splits ## Stratify the classes for classification problem. ## For regression, bin the response into 5 bins and stratify. if(classRF) { f <- trainy } else { ##f <- cut(trainy, c(-Inf, quantile(trainy, 1:4/5), Inf)) f <- factor(rep(1:5, length=length(trainy))[order(order(trainy))]) } nlvl <- table(f) idx <- numeric(n) for (i in 1:length(nlvl)) { idx[which(f == levels(f)[i])] <- sample(rep(1:cv.fold, length=nlvl[i])) } for (i in 1:cv.fold) { ## cat(".") all.rf <- randomForest(trainx[idx != i, , drop=FALSE], trainy[idx != i], trainx[idx == i, , drop=FALSE], trainy[idx == i], mtry=mtry(p), importance=TRUE, ...) cv.pred[[1]][idx == i] <- all.rf$test$predicted impvar <- (1:p)[order(all.rf$importance[,1], decreasing=TRUE)] for (j in 2:k) { imp.idx <- impvar[1:n.var[j]] sub.rf <- randomForest(trainx[idx != i, imp.idx, drop=FALSE], trainy[idx != i], trainx[idx == i, imp.idx, drop=FALSE], trainy[idx == i], mtry=mtry(n.var[j]), importance=recursive, ...) cv.pred[[j]][idx == i] <- sub.rf$test$predicted ## For recursive selection, use importance measures from the sub-model. if (recursive) { impvar <- (1:length(imp.idx))[order(sub.rf$importance[,1], decreasing=TRUE)] } NULL } NULL } ## cat("\n") if(classRF) { error.cv <- sapply(cv.pred, function(x) mean(trainy != x)) } else { error.cv <- sapply(cv.pred, function(x) mean((trainy - x)^2)) } names(error.cv) <- names(cv.pred) <- n.var list(n.var=n.var, error.cv=error.cv, predicted=cv.pred) } randomForest/R/randomForest.R0000744000175100001440000000007712037254524015733 0ustar hornikusers"randomForest" <- function(x, ...) UseMethod("randomForest") randomForest/R/randomForest.formula.R0000744000175100001440000000276512037254524017405 0ustar hornikusers"randomForest.formula" <- function(formula, data = NULL, ..., subset, na.action = na.fail) { ### formula interface for randomForest. ### code gratefully stolen from svm.formula (package e1071). 
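### Typical use of the formula interface (xtest/ytest are only available
### through the default method):
###     randomForest(Species ~ ., data = iris, ntree = 100, importance = TRUE)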
### if (!inherits(formula, "formula")) stop("method is only for formula objects") m <- match.call(expand.dots = FALSE) ## Catch xtest and ytest in arguments. if (any(c("xtest", "ytest") %in% names(m))) stop("xtest/ytest not supported through the formula interface") names(m)[2] <- "formula" if (is.matrix(eval(m$data, parent.frame()))) m$data <- as.data.frame(data) m$... <- NULL m$na.action <- na.action m[[1]] <- as.name("model.frame") m <- eval(m, parent.frame()) y <- model.response(m) Terms <- attr(m, "terms") attr(Terms, "intercept") <- 0 ## Drop any "negative" terms in the formula. ## test with: ## randomForest(Fertility~.-Catholic+I(Catholic<50),data=swiss,mtry=2) m <- model.frame(terms(reformulate(attributes(Terms)$term.labels)), data.frame(m)) ## if (!is.null(y)) m <- m[, -1, drop=FALSE] for (i in seq(along=ncol(m))) { if (is.ordered(m[[i]])) m[[i]] <- as.numeric(m[[i]]) } ret <- randomForest(m, y, ...) cl <- match.call() cl[[1]] <- as.name("randomForest") ret$call <- cl ret$terms <- Terms if (!is.null(attr(m, "na.action"))) ret$na.action <- attr(m, "na.action") class(ret) <- c("randomForest.formula", "randomForest") return(ret) } randomForest/R/randomForest.default.R0000744000175100001440000005172212037254524017361 0ustar hornikusers## mylevels() returns levels if given a factor, otherwise 0. mylevels <- function(x) if (is.factor(x)) levels(x) else 0 "randomForest.default" <- function(x, y=NULL, xtest=NULL, ytest=NULL, ntree=500, mtry=if (!is.null(y) && !is.factor(y)) max(floor(ncol(x)/3), 1) else floor(sqrt(ncol(x))), replace=TRUE, classwt=NULL, cutoff, strata, sampsize = if (replace) nrow(x) else ceiling(.632*nrow(x)), nodesize = if (!is.null(y) && !is.factor(y)) 5 else 1, maxnodes=NULL, importance=FALSE, localImp=FALSE, nPerm=1, proximity, oob.prox=proximity, norm.votes=TRUE, do.trace=FALSE, keep.forest=!is.null(y) && is.null(xtest), corr.bias=FALSE, keep.inbag=FALSE, ...) { addclass <- is.null(y) classRF <- addclass || is.factor(y) if (!classRF && length(unique(y)) <= 5) { warning("The response has five or fewer unique values. Are you sure you want to do regression?") } if (classRF && !addclass && length(unique(y)) < 2) stop("Need at least two classes to do classification.") n <- nrow(x) p <- ncol(x) if (n == 0) stop("data (x) has 0 rows") x.row.names <- rownames(x) x.col.names <- if (is.null(colnames(x))) 1:ncol(x) else colnames(x) ## overcome R's lazy evaluation: keep.forest <- keep.forest testdat <- !is.null(xtest) if (testdat) { if (ncol(x) != ncol(xtest)) stop("x and xtest must have same number of columns") ntest <- nrow(xtest) xts.row.names <- rownames(xtest) } ## Make sure mtry is in reasonable range. if (mtry < 1 || mtry > p) warning("invalid mtry: reset to within valid range") mtry <- max(1, min(p, round(mtry))) if (!is.null(y)) { if (length(y) != n) stop("length of response must be the same as predictors") addclass <- FALSE } else { if (!addclass) addclass <- TRUE y <- factor(c(rep(1, n), rep(2, n))) x <- rbind(x, x) } ## Check for NAs. if (any(is.na(x))) stop("NA not permitted in predictors") if (testdat && any(is.na(xtest))) stop("NA not permitted in xtest") if (any(is.na(y))) stop("NA not permitted in response") if (!is.null(ytest) && any(is.na(ytest))) stop("NA not permitted in ytest") if (is.data.frame(x)) { xlevels <- lapply(x, mylevels) ncat <- sapply(xlevels, length) ## Treat ordered factors as numerics. 
ncat <- ifelse(sapply(x, is.ordered), 1, ncat) x <- data.matrix(x) if(testdat) { if(!is.data.frame(xtest)) stop("xtest must be data frame if x is") xfactor <- which(sapply(xtest, is.factor)) if (length(xfactor) > 0) { for (i in xfactor) { if (any(! levels(xtest[[i]]) %in% xlevels[[i]])) stop("New factor levels in xtest not present in x") xtest[[i]] <- factor(xlevels[[i]][match(xtest[[i]], xlevels[[i]])], levels=xlevels[[i]]) } } xtest <- data.matrix(xtest) } } else { ncat <- rep(1, p) xlevels <- as.list(rep(0, p)) } maxcat <- max(ncat) if (maxcat > 32) stop("Can not handle categorical predictors with more than 32 categories.") if (classRF) { nclass <- length(levels(y)) ## Check for empty classes: if (any(table(y) == 0)) stop("Can't have empty classes in y.") if (!is.null(ytest)) { if (!is.factor(ytest)) stop("ytest must be a factor") if (!all(levels(y) == levels(ytest))) stop("y and ytest must have the same levels") } if (missing(cutoff)) { cutoff <- rep(1 / nclass, nclass) } else { if (sum(cutoff) > 1 || sum(cutoff) < 0 || !all(cutoff > 0) || length(cutoff) != nclass) { stop("Incorrect cutoff specified.") } if (!is.null(names(cutoff))) { if (!all(names(cutoff) %in% levels(y))) { stop("Wrong name(s) for cutoff") } cutoff <- cutoff[levels(y)] } } if (!is.null(classwt)) { if (length(classwt) != nclass) stop("length of classwt not equal to number of classes") ## If classwt has names, match to class labels. if (!is.null(names(classwt))) { if (!all(names(classwt) %in% levels(y))) { stop("Wrong name(s) for classwt") } classwt <- classwt[levels(y)] } if (any(classwt <= 0)) stop("classwt must be positive") ipi <- 1 } else { classwt <- rep(1, nclass) ipi <- 0 } } else addclass <- FALSE if (missing(proximity)) proximity <- addclass if (proximity) { prox <- matrix(0.0, n, n) proxts <- if (testdat) matrix(0, ntest, ntest + n) else double(1) } else { prox <- proxts <- double(1) } if (localImp) { importance <- TRUE impmat <- matrix(0, p, n) } else impmat <- double(1) if (importance) { if (nPerm < 1) nPerm <- as.integer(1) else nPerm <- as.integer(nPerm) if (classRF) { impout <- matrix(0.0, p, nclass + 2) impSD <- matrix(0.0, p, nclass + 1) } else { impout <- matrix(0.0, p, 2) impSD <- double(p) names(impSD) <- x.col.names } } else { impout <- double(p) impSD <- double(1) } nsample <- if (addclass) 2 * n else n Stratify <- length(sampsize) > 1 if ((!Stratify) && sampsize > nrow(x)) stop("sampsize too large") if (Stratify && (!classRF)) stop("sampsize should be of length one") if (classRF) { if (Stratify) { if (missing(strata)) strata <- y if (!is.factor(strata)) strata <- as.factor(strata) nsum <- sum(sampsize) if (length(sampsize) > nlevels(strata)) stop("sampsize has too many elements.") if (any(sampsize <= 0) || nsum == 0) stop("Bad sampsize specification") ## If sampsize has names, match to class labels. if (!is.null(names(sampsize))) { sampsize <- sampsize[levels(strata)] } if (any(sampsize > table(strata))) stop("sampsize can not be larger than class frequency") } else { nsum <- sampsize } nrnodes <- 2 * trunc(nsum / nodesize) + 1 } else { ## For regression trees, need to do this to get maximal trees. nrnodes <- 2 * trunc(sampsize/max(1, nodesize - 4)) + 1 } if (!is.null(maxnodes)) { ## convert # of terminal nodes to total # of nodes maxnodes <- 2 * maxnodes - 1 if (maxnodes > nrnodes) warning("maxnodes exceeds its max value.") nrnodes <- min(c(nrnodes, max(c(maxnodes, 1)))) } ## Compiled code expects variables in rows and observations in columns. 
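    ## After t(x) the matrix is p x n and is passed to C as a double vector
    ## in column-major order, so the C sources address predictor m of case n
    ## (both 0-based) as x[m + n * mdim]; in R terms,
    ##     as.vector(t(x))[m + n * p + 1] == x[n + 1, m + 1]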
x <- t(x) storage.mode(x) <- "double" if (testdat) { xtest <- t(xtest) storage.mode(xtest) <- "double" if (is.null(ytest)) { ytest <- labelts <- 0 } else { labelts <- TRUE } } else { xtest <- double(1) ytest <- double(1) ntest <- 1 labelts <- FALSE } nt <- if (keep.forest) ntree else 1 if (classRF) { cwt <- classwt threshold <- cutoff error.test <- if (labelts) double((nclass+1) * ntree) else double(1) rfout <- .C("classRF", x = x, xdim = as.integer(c(p, n)), y = as.integer(y), nclass = as.integer(nclass), ncat = as.integer(ncat), maxcat = as.integer(maxcat), sampsize = as.integer(sampsize), strata = if (Stratify) as.integer(strata) else integer(1), Options = as.integer(c(addclass, importance, localImp, proximity, oob.prox, do.trace, keep.forest, replace, Stratify, keep.inbag)), ntree = as.integer(ntree), mtry = as.integer(mtry), ipi = as.integer(ipi), classwt = as.double(cwt), cutoff = as.double(threshold), nodesize = as.integer(nodesize), outcl = integer(nsample), counttr = integer(nclass * nsample), prox = prox, impout = impout, impSD = impSD, impmat = impmat, nrnodes = as.integer(nrnodes), ndbigtree = integer(ntree), nodestatus = integer(nt * nrnodes), bestvar = integer(nt * nrnodes), treemap = integer(nt * 2 * nrnodes), nodepred = integer(nt * nrnodes), xbestsplit = double(nt * nrnodes), errtr = double((nclass+1) * ntree), testdat = as.integer(testdat), xts = as.double(xtest), clts = as.integer(ytest), nts = as.integer(ntest), countts = double(nclass * ntest), outclts = as.integer(numeric(ntest)), labelts = as.integer(labelts), proxts = proxts, errts = error.test, inbag = if (keep.inbag) matrix(integer(n * ntree), n) else integer(n), DUP=FALSE, PACKAGE="randomForest")[-1] if (keep.forest) { ## deal with the random forest outputs max.nodes <- max(rfout$ndbigtree) treemap <- aperm(array(rfout$treemap, dim = c(2, nrnodes, ntree)), c(2, 1, 3))[1:max.nodes, , , drop=FALSE] } if (!addclass) { ## Turn the predicted class into a factor like y. 
out.class <- factor(rfout$outcl, levels=1:nclass, labels=levels(y)) names(out.class) <- x.row.names con <- table(observed = y, predicted = out.class)[levels(y), levels(y)] con <- cbind(con, class.error = 1 - diag(con)/rowSums(con)) } out.votes <- t(matrix(rfout$counttr, nclass, nsample))[1:n, ] oob.times <- rowSums(out.votes) if (norm.votes) out.votes <- t(apply(out.votes, 1, function(x) x/sum(x))) dimnames(out.votes) <- list(x.row.names, levels(y)) class(out.votes) <- c(class(out.votes), "votes") if (testdat) { out.class.ts <- factor(rfout$outclts, levels=1:nclass, labels=levels(y)) names(out.class.ts) <- xts.row.names out.votes.ts <- t(matrix(rfout$countts, nclass, ntest)) dimnames(out.votes.ts) <- list(xts.row.names, levels(y)) if (norm.votes) out.votes.ts <- t(apply(out.votes.ts, 1, function(x) x/sum(x))) class(out.votes.ts) <- c(class(out.votes.ts), "votes") if (labelts) { testcon <- table(observed = ytest, predicted = out.class.ts)[levels(y), levels(y)] testcon <- cbind(testcon, class.error = 1 - diag(testcon)/rowSums(testcon)) } } cl <- match.call() cl[[1]] <- as.name("randomForest") out <- list(call = cl, type = if (addclass) "unsupervised" else "classification", predicted = if (addclass) NULL else out.class, err.rate = if (addclass) NULL else t(matrix(rfout$errtr, nclass+1, ntree, dimnames=list(c("OOB", levels(y)), NULL))), confusion = if (addclass) NULL else con, votes = out.votes, oob.times = oob.times, classes = levels(y), importance = if (importance) matrix(rfout$impout, p, nclass+2, dimnames = list(x.col.names, c(levels(y), "MeanDecreaseAccuracy", "MeanDecreaseGini"))) else matrix(rfout$impout, ncol=1, dimnames=list(x.col.names, "MeanDecreaseGini")), importanceSD = if (importance) matrix(rfout$impSD, p, nclass + 1, dimnames = list(x.col.names, c(levels(y), "MeanDecreaseAccuracy"))) else NULL, localImportance = if (localImp) matrix(rfout$impmat, p, n, dimnames = list(x.col.names,x.row.names)) else NULL, proximity = if (proximity) matrix(rfout$prox, n, n, dimnames = list(x.row.names, x.row.names)) else NULL, ntree = ntree, mtry = mtry, forest = if (!keep.forest) NULL else { list(ndbigtree = rfout$ndbigtree, nodestatus = matrix(rfout$nodestatus, ncol = ntree)[1:max.nodes,, drop=FALSE], bestvar = matrix(rfout$bestvar, ncol = ntree)[1:max.nodes,, drop=FALSE], treemap = treemap, nodepred = matrix(rfout$nodepred, ncol = ntree)[1:max.nodes,, drop=FALSE], xbestsplit = matrix(rfout$xbestsplit, ncol = ntree)[1:max.nodes,, drop=FALSE], pid = rfout$classwt, cutoff=cutoff, ncat=ncat, maxcat = maxcat, nrnodes = max.nodes, ntree = ntree, nclass = nclass, xlevels=xlevels) }, y = if (addclass) NULL else y, test = if(!testdat) NULL else list( predicted = out.class.ts, err.rate = if (labelts) t(matrix(rfout$errts, nclass+1, ntree, dimnames=list(c("Test", levels(y)), NULL))) else NULL, confusion = if (labelts) testcon else NULL, votes = out.votes.ts, proximity = if(proximity) matrix(rfout$proxts, nrow=ntest, dimnames = list(xts.row.names, c(xts.row.names, x.row.names))) else NULL), inbag = if (keep.inbag) rfout$inbag else NULL) } else { rfout <- .C("regRF", x, as.double(y), as.integer(c(n, p)), as.integer(sampsize), as.integer(nodesize), as.integer(nrnodes), as.integer(ntree), as.integer(mtry), as.integer(c(importance, localImp, nPerm)), as.integer(ncat), as.integer(maxcat), as.integer(do.trace), as.integer(proximity), as.integer(oob.prox), as.integer(corr.bias), ypred = double(n), impout = impout, impmat = impmat, impSD = impSD, prox = prox, ndbigtree = integer(ntree), nodestatus = 
matrix(integer(nrnodes * nt), ncol=nt), leftDaughter = matrix(integer(nrnodes * nt), ncol=nt), rightDaughter = matrix(integer(nrnodes * nt), ncol=nt), nodepred = matrix(double(nrnodes * nt), ncol=nt), bestvar = matrix(integer(nrnodes * nt), ncol=nt), xbestsplit = matrix(double(nrnodes * nt), ncol=nt), mse = double(ntree), keep = as.integer(c(keep.forest, keep.inbag)), replace = as.integer(replace), testdat = as.integer(testdat), xts = xtest, ntest = as.integer(ntest), yts = as.double(ytest), labelts = as.integer(labelts), ytestpred = double(ntest), proxts = proxts, msets = double(if (labelts) ntree else 1), coef = double(2), oob.times = integer(n), inbag = if (keep.inbag) matrix(integer(n * ntree), n) else integer(1), DUP=FALSE, PACKAGE="randomForest")[c(16:28, 36:41)] ## Format the forest component, if present. if (keep.forest) { max.nodes <- max(rfout$ndbigtree) rfout$nodestatus <- rfout$nodestatus[1:max.nodes, , drop=FALSE] rfout$bestvar <- rfout$bestvar[1:max.nodes, , drop=FALSE] rfout$nodepred <- rfout$nodepred[1:max.nodes, , drop=FALSE] rfout$xbestsplit <- rfout$xbestsplit[1:max.nodes, , drop=FALSE] rfout$leftDaughter <- rfout$leftDaughter[1:max.nodes, , drop=FALSE] rfout$rightDaughter <- rfout$rightDaughter[1:max.nodes, , drop=FALSE] } cl <- match.call() cl[[1]] <- as.name("randomForest") ## Make sure those obs. that have not been OOB get NA as prediction. ypred <- rfout$ypred if (any(rfout$oob.times < 1)) { ypred[rfout$oob.times == 0] <- NA } out <- list(call = cl, type = "regression", predicted = structure(ypred, names=x.row.names), mse = rfout$mse, rsq = 1 - rfout$mse / (var(y) * (n-1) / n), oob.times = rfout$oob.times, importance = if (importance) matrix(rfout$impout, p, 2, dimnames=list(x.col.names, c("%IncMSE","IncNodePurity"))) else matrix(rfout$impout, ncol=1, dimnames=list(x.col.names, "IncNodePurity")), importanceSD=if (importance) rfout$impSD else NULL, localImportance = if (localImp) matrix(rfout$impmat, p, n, dimnames=list(x.col.names, x.row.names)) else NULL, proximity = if (proximity) matrix(rfout$prox, n, n, dimnames = list(x.row.names, x.row.names)) else NULL, ntree = ntree, mtry = mtry, forest = if (keep.forest) c(rfout[c("ndbigtree", "nodestatus", "leftDaughter", "rightDaughter", "nodepred", "bestvar", "xbestsplit")], list(ncat = ncat), list(nrnodes=max.nodes), list(ntree=ntree), list(xlevels=xlevels)) else NULL, coefs = if (corr.bias) rfout$coef else NULL, y = y, test = if(testdat) { list(predicted = structure(rfout$ytestpred, names=xts.row.names), mse = if(labelts) rfout$msets else NULL, rsq = if(labelts) 1 - rfout$msets / (var(ytest) * (n-1) / n) else NULL, proximity = if (proximity) matrix(rfout$proxts / ntree, nrow = ntest, dimnames = list(xts.row.names, c(xts.row.names, x.row.names))) else NULL) } else NULL, inbag = if (keep.inbag) matrix(rfout$inbag, nrow(rfout$inbag), dimnames=list(x.row.names, NULL)) else NULL) } class(out) <- "randomForest" return(out) } randomForest/R/print.randomForest.R0000744000175100001440000000304712037254524017066 0ustar hornikusers"print.randomForest" <- function(x, ...) { cat("\nCall:\n", deparse(x$call), "\n") cat(" Type of random forest: ", x$type, "\n", sep="") cat(" Number of trees: ", x$ntree, "\n",sep="") cat("No. 
of variables tried at each split: ", x$mtry, "\n\n", sep="") if(x$type == "classification") { if(!is.null(x$confusion)) { cat(" OOB estimate of error rate: ", round(x$err.rate[x$ntree, "OOB"]*100, digits=2), "%\n", sep="") cat("Confusion matrix:\n") print(x$confusion) if(!is.null(x$test$err.rate)) { cat(" Test set error rate: ", round(x$test$err.rate[x$ntree, "Test"]*100, digits=2), "%\n", sep="") cat("Confusion matrix:\n") print(x$test$confusion) } } } if(x$type == "regression") { if(!is.null(x$mse)) { cat(" Mean of squared residuals: ", x$mse[length(x$mse)], "\n", sep="") cat(" % Var explained: ", round(100*x$rsq[length(x$rsq)], digits=2), "\n", sep="") if(!is.null(x$test$mse)) { cat(" Test set MSE: ", round(x$test$mse[length(x$test$mse)], digits=2), "\n", sep="") cat(" % Var explained: ", round(100*x$test$rsq[length(x$test$rsq)], digits=2), "\n", sep="") } } if (!is.null(x$coefs)) { cat(" Bias correction applied:\n") cat(" Intercept: ", x$coefs[1], "\n") cat(" Slope: ", x$coefs[2], "\n") } } } randomForest/R/predict.randomForest.R0000744000175100001440000002624512037254524017371 0ustar hornikusers"predict.randomForest" <- function (object, newdata, type = "response", norm.votes = TRUE, predict.all=FALSE, proximity = FALSE, nodes=FALSE, cutoff, ...) { if (!inherits(object, "randomForest")) stop("object not of class randomForest") if (is.null(object$forest)) stop("No forest component in the object") out.type <- charmatch(tolower(type), c("response", "prob", "vote", "class")) if (is.na(out.type)) stop("type must be one of 'response', 'prob', 'vote'") if (out.type == 4) out.type <- 1 if (out.type != 1 && object$type == "regression") stop("'prob' or 'vote' not meaningful for regression") if (out.type == 2) norm.votes <- TRUE if (missing(newdata)) { if (object$type == "regression") return(object$predicted) if (proximity & is.null(object$proximity)) warning("cannot return proximity without new data if random forest object does not already have proximity") if (out.type == 1) { if (proximity) { return(list(pred = object$predicted, proximity = object$proximity)) } else return(object$predicted) } if (norm.votes) { t1 <- t(apply(object$votes, 1, function(x) { x/sum(x) })) class(t1) <- c(class(t1), "votes") if (proximity) return(list(pred = t1, proximity = object$proximity)) else return(t1) } else { if (proximity) return(list(pred = object$votes, proximity = object$proximity)) else return(object$votes) } } if (missing(cutoff)) { cutoff <- object$forest$cutoff } else { if (sum(cutoff) > 1 || sum(cutoff) < 0 || !all(cutoff > 0) || length(cutoff) != length(object$classes)) { stop("Incorrect cutoff specified.") } if (!is.null(names(cutoff))) { if (!all(names(cutoff) %in% object$classes)) { stop("Wrong name(s) for cutoff") } cutoff <- cutoff[object$classes] } } if (object$type == "unsupervised") stop("Can't predict unsupervised forest.") if (inherits(object, "randomForest.formula")) { newdata <- as.data.frame(newdata) rn <- row.names(newdata) Terms <- delete.response(object$terms) x <- model.frame(Terms, newdata, na.action = na.omit) keep <- match(row.names(x), rn) } else { if (is.null(dim(newdata))) dim(newdata) <- c(1, length(newdata)) x <- newdata if (nrow(x) == 0) stop("newdata has 0 rows") if (any(is.na(x))) stop("missing values in newdata") keep <- 1:nrow(x) rn <- rownames(x) if (is.null(rn)) rn <- keep } vname <- if (is.null(dim(object$importance))) { names(object$importance) } else { rownames(object$importance) } if (is.null(colnames(x))) { if (ncol(x) != length(vname)) { stop("number of 
variables in newdata does not match that in the training data") } } else { if (any(! vname %in% colnames(x))) stop("variables in the training data missing in newdata") x <- x[, vname, drop=FALSE] } if (is.data.frame(x)) { xfactor <- which(sapply(x, is.factor)) if (length(xfactor) > 0 && "xlevels" %in% names(object$forest)) { for (i in xfactor) { if (any(! levels(x[[i]]) %in% object$forest$xlevels[[i]])) stop("New factor levels not present in the training data") x[[i]] <- factor(x[[i]], levels=levels(x[[i]])[match(levels(x[[i]]), object$forest$xlevels[[i]])]) } } cat.new <- sapply(x, function(x) if (is.factor(x) && !is.ordered(x)) length(levels(x)) else 1) if (!all(object$forest$ncat == cat.new)) stop("Type of predictors in new data do not match that of the training data.") } mdim <- ncol(x) ntest <- nrow(x) ntree <- object$forest$ntree maxcat <- max(object$forest$ncat) nclass <- object$forest$nclass nrnodes <- object$forest$nrnodes ## get rid of warning: op <- options(warn=-1) on.exit(options(op)) x <- t(data.matrix(x)) if (predict.all) { treepred <- if (object$type == "regression") { matrix(double(ntest * ntree), ncol=ntree) } else { matrix(integer(ntest * ntree), ncol=ntree) } } else { treepred <- numeric(ntest) } proxmatrix <- if (proximity) matrix(0, ntest, ntest) else numeric(1) nodexts <- if (nodes) integer(ntest * ntree) else integer(ntest) if (object$type == "regression") { if (!is.null(object$forest$treemap)) { object$forest$leftDaughter <- object$forest$treemap[,1,, drop=FALSE] object$forest$rightDaughter <- object$forest$treemap[,2,, drop=FALSE] object$forest$treemap <- NULL } keepIndex <- "ypred" if (predict.all) keepIndex <- c(keepIndex, "treepred") if (proximity) keepIndex <- c(keepIndex, "proximity") if (nodes) keepIndex <- c(keepIndex, "nodexts") ## Ensure storage mode is what is expected in C. if (! is.integer(object$forest$leftDaughter)) storage.mode(object$forest$leftDaughter) <- "integer" if (! is.integer(object$forest$rightDaughter)) storage.mode(object$forest$rightDaughter) <- "integer" if (! is.integer(object$forest$nodestatus)) storage.mode(object$forest$nodestatus) <- "integer" if (! is.double(object$forest$xbestsplit)) storage.mode(object$forest$xbestsplit) <- "double" if (! is.double(object$forest$nodepred)) storage.mode(object$forest$nodepred) <- "double" if (! is.integer(object$forest$bestvar)) storage.mode(object$forest$bestvar) <- "integer" if (! is.integer(object$forest$ndbigtree)) storage.mode(object$forest$ndbigtree) <- "integer" if (! is.integer(object$forest$ncat)) storage.mode(object$forest$ncat) <- "integer" ans <- .C("regForest", as.double(x), ypred = double(ntest), as.integer(mdim), as.integer(ntest), as.integer(ntree), object$forest$leftDaughter, object$forest$rightDaughter, object$forest$nodestatus, nrnodes, object$forest$xbestsplit, object$forest$nodepred, object$forest$bestvar, object$forest$ndbigtree, object$forest$ncat, as.integer(maxcat), as.integer(predict.all), treepred = as.double(treepred), as.integer(proximity), proximity = as.double(proxmatrix), nodes = as.integer(nodes), nodexts = as.integer(nodexts), DUP=FALSE, PACKAGE = "randomForest")[keepIndex] ## Apply bias correction if needed. 
yhat <- rep(NA, length(rn)) names(yhat) <- rn if (!is.null(object$coefs)) { yhat[keep] <- object$coefs[1] + object$coefs[2] * ans$ypred } else { yhat[keep] <- ans$ypred } if (predict.all) { treepred <- matrix(NA, length(rn), ntree, dimnames=list(rn, NULL)) treepred[keep,] <- ans$treepred } if (!proximity) { res <- if (predict.all) list(aggregate=yhat, individual=treepred) else yhat } else { res <- list(predicted = yhat, proximity = structure(ans$proximity, dim=c(ntest, ntest), dimnames=list(rn, rn))) } if (nodes) { attr(res, "nodes") <- matrix(ans$nodexts, ntest, ntree, dimnames=list(rn[keep], 1:ntree)) } } else { countts <- matrix(0, ntest, nclass) t1 <- .C("classForest", mdim = as.integer(mdim), ntest = as.integer(ntest), nclass = as.integer(object$forest$nclass), maxcat = as.integer(maxcat), nrnodes = as.integer(nrnodes), jbt = as.integer(ntree), xts = as.double(x), xbestsplit = as.double(object$forest$xbestsplit), pid = object$forest$pid, cutoff = as.double(cutoff), countts = as.double(countts), treemap = as.integer(aperm(object$forest$treemap, c(2, 1, 3))), nodestatus = as.integer(object$forest$nodestatus), cat = as.integer(object$forest$ncat), nodepred = as.integer(object$forest$nodepred), treepred = as.integer(treepred), jet = as.integer(numeric(ntest)), bestvar = as.integer(object$forest$bestvar), nodexts = as.integer(nodexts), ndbigtree = as.integer(object$forest$ndbigtree), predict.all = as.integer(predict.all), prox = as.integer(proximity), proxmatrix = as.double(proxmatrix), nodes = as.integer(nodes), DUP=FALSE, PACKAGE = "randomForest") if (out.type > 1) { out.class.votes <- t(matrix(t1$countts, nrow = nclass, ncol = ntest)) if (norm.votes) out.class.votes <- sweep(out.class.votes, 1, rowSums(out.class.votes), "/") z <- matrix(NA, length(rn), nclass, dimnames=list(rn, object$classes)) z[keep, ] <- out.class.votes class(z) <- c(class(z), "votes") res <- z } else { out.class <- factor(rep(NA, length(rn)), levels=1:length(object$classes), labels=object$classes) out.class[keep] <- object$classes[t1$jet] names(out.class)[keep] <- rn[keep] res <- out.class } if (predict.all) { treepred <- matrix(object$classes[t1$treepred], nrow=length(keep), dimnames=list(rn[keep], NULL)) res <- list(aggregate=res, individual=treepred) } if (proximity) res <- list(predicted = res, proximity = structure(t1$proxmatrix, dim = c(ntest, ntest), dimnames = list(rn[keep], rn[keep]))) if (nodes) attr(res, "nodes") <- matrix(t1$nodexts, ntest, ntree, dimnames=list(rn[keep], 1:ntree)) } res } randomForest/R/plot.randomForest.R0000744000175100001440000000125112037254524016703 0ustar hornikusersplot.randomForest <- function(x, type="l", main=deparse(substitute(x)), ...) { if(x$type == "unsupervised") stop("No plot for unsupervised randomForest.") test <- !(is.null(x$test$mse) || is.null(x$test$err.rate)) if(x$type == "regression") { err <- x$mse if(test) err <- cbind(err, x$test$mse) } else { err <- x$err.rate if(test) err <- cbind(err, x$test$err.rate) } if(test) { colnames(err) <- c("OOB", "Test") matplot(1:x$ntree, err, type = type, xlab="trees", ylab="Error", main=main, ...) } else { matplot(1:x$ntree, err, type = type, xlab="trees", ylab="Error", main=main, ...) } invisible(err) } randomForest/R/partialPlot.R0000744000175100001440000000662112037254524015564 0ustar hornikuserspartialPlot <- function(x, ...) UseMethod("partialPlot") partialPlot.default <- function(x, ...) 
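## The randomForest method below averages the forest's prediction over
## pred.data while the chosen variable is held fixed at a grid of values
## (centred log-odds across classes for classification forests).  Typical use:
##     rf <- randomForest(Species ~ ., data = iris)
##     partialPlot(rf, pred.data = iris, x.var = Petal.Width,
##                 which.class = "versicolor")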
stop("partial dependence plot not implemented for this class of objects.\n") partialPlot.randomForest <- function (x, pred.data, x.var, which.class, w, plot=TRUE, add=FALSE, n.pt = min(length(unique(pred.data[, xname])), 51), rug = TRUE, xlab=deparse(substitute(x.var)), ylab="", main=paste("Partial Dependence on", deparse(substitute(x.var))), ...) { classRF <- x$type != "regression" if (is.null(x$forest)) stop("The randomForest object must contain the forest.\n") x.var <- substitute(x.var) xname <- if (is.character(x.var)) x.var else { if (is.name(x.var)) deparse(x.var) else { eval(x.var) } } xv <- pred.data[, xname] n <- nrow(pred.data) if (missing(w)) w <- rep(1, n) if (classRF) { if (missing(which.class)) { focus <- 1 } else { focus <- charmatch(which.class, colnames(x$votes)) if (is.na(focus)) stop(which.class, "is not one of the class labels.") } } if (is.factor(xv) && !is.ordered(xv)) { x.pt <- levels(xv) y.pt <- numeric(length(x.pt)) for (i in seq(along = x.pt)) { x.data <- pred.data x.data[, xname] <- factor(rep(x.pt[i], n), levels = x.pt) if (classRF) { pr <- predict(x, x.data, type = "prob") y.pt[i] <- weighted.mean(log(ifelse(pr[, focus] > 0, pr[, focus], .Machine$double.eps)) - rowMeans(log(ifelse(pr > 0, pr, .Machine$double.eps))), w, na.rm=TRUE) } else y.pt[i] <- weighted.mean(predict(x, x.data), w, na.rm=TRUE) } if (add) { points(1:length(x.pt), y.pt, type="h", lwd=2, ...) } else { if (plot) barplot(y.pt, width=rep(1, length(y.pt)), col="blue", xlab = xlab, ylab = ylab, main=main, names.arg=x.pt, ...) } } else { if (is.ordered(xv)) xv <- as.numeric(xv) x.pt <- seq(min(xv), max(xv), length = n.pt) y.pt <- numeric(length(x.pt)) for (i in seq(along = x.pt)) { x.data <- pred.data x.data[, xname] <- rep(x.pt[i], n) if (classRF) { pr <- predict(x, x.data, type = "prob") y.pt[i] <- weighted.mean(log(ifelse(pr[, focus] == 0, .Machine$double.eps, pr[, focus])) - rowMeans(log(ifelse(pr == 0, .Machine$double.eps, pr))), w, na.rm=TRUE) } else { y.pt[i] <- weighted.mean(predict(x, x.data), w, na.rm=TRUE) } } if (add) { lines(x.pt, y.pt, ...) } else { if (plot) plot(x.pt, y.pt, type = "l", xlab=xlab, ylab=ylab, main = main, ...) } if (rug && plot) { if (n.pt > 10) { rug(quantile(xv, seq(0.1, 0.9, by = 0.1)), side = 1) } else { rug(unique(xv, side = 1)) } } } invisible(list(x = x.pt, y = y.pt)) } randomForest/R/outlier.R0000744000175100001440000000164612037254524014756 0ustar hornikusersoutlier <- function(x, ...) UseMethod("outlier") outlier.randomForest <- function(x, ...) { if (!inherits(x, "randomForest")) stop("x is not a randomForest object") if (x$type == "regression") stop("no outlier measure for regression") if (is.null(x$proximity)) stop("no proximity measures available") outlier.default(x$proximity, x$y) } outlier.default <- function(x, cls=NULL, ...) { if (nrow(x) != ncol(x)) stop ("x must be a square matrix") n <- nrow(x) if (is.null(cls)) cls <- rep(1, n) cls <- factor(cls) lvl <- levels(cls) cls.n <- table(cls)[lvl] id <- if (is.null(rownames(x))) 1:n else rownames(x) outlier <- structure(rep(NA, n), names=id) for (i in lvl) { iclass <- cls == i out <- rowSums(x[iclass, iclass]^2) out <- n / ifelse(out == 0, 1, out) out <- (out - median(out)) / mad(out) outlier[iclass] <- out } outlier } randomForest/R/na.roughfix.R0000744000175100001440000000225712037254524015522 0ustar hornikusersna.roughfix <- function(object, ...) UseMethod("na.roughfix") na.roughfix.data.frame <- function(object, ...) 
{ isfac <- sapply(object, is.factor) isnum <- sapply(object, is.numeric) if (any(!(isfac | isnum))) stop("na.roughfix only works for numeric or factor") roughfix <- function(x) { if (any(is.na(x))) { if (is.factor(x)) { freq <- table(x) x[is.na(x)] <- names(freq)[which.max(freq)] } else { x[is.na(x)] <- median(x, na.rm=TRUE) } } x } object[] <- lapply(object, roughfix) object } na.roughfix.default <- function(object, ...) { if (!is.atomic(object)) return(object) d <- dim(object) if (length(d) > 2) stop("can't handle objects with more than two dimensions") if (all(!is.na(object))) return(object) if (!is.numeric(object)) stop("roughfix can only deal with numeric data.") if (length(d) == 2) { hasNA <- which(apply(object, 2, function(x) any(is.na(x)))) for (j in hasNA) object[is.na(object[, j]), j] <- median(object[, j], na.rm=TRUE) } else { object[is.na(object)] <- median(object, na.rm=TRUE) } object } randomForest/R/MDSplot.R0000744000175100001440000000147112037254524014611 0ustar hornikusersMDSplot <- function(rf, fac, k=2, palette=NULL, pch=20, ...) { if (!inherits(rf, "randomForest")) stop(deparse(substitute(rf)), " must be a randomForest object") if(is.null(rf$proximity)) stop(deparse(substitute(rf)), " does not contain a proximity matrix") op <- par(pty="s") on.exit(par(op)) rf.mds <- stats:::cmdscale(1 - rf$proximity, eig=TRUE, k=k) colnames(rf.mds$points) <- paste("Dim", 1:k) nlevs <- nlevels(fac) if (is.null(palette)) { palette <- if (require(RColorBrewer) && nlevs < 12) brewer.pal(nlevs, "Set1") else rainbow(nlevs) } if (k <= 2) { plot(rf.mds$points, col=palette[as.numeric(fac)], pch=pch, ...) } else { pairs(rf.mds$points, col=palette[as.numeric(fac)], pch=pch, ...) } invisible(rf.mds) } randomForest/R/margin.R0000744000175100001440000000274312037254524014547 0ustar hornikusersmargin <- function(x, ...) { UseMethod("margin") } margin.randomForest <- function(x, ...) { if (x$type == "regression") { stop("margin not defined for regression Random Forests") } if( is.null(x$votes) ) { stop("margin is only defined if votes are present") } margin(x$votes, x$y, ...) } margin.default <- function(x, observed, ...) { if ( !is.factor(observed) ) { stop(deparse(substitute(observed)), " is not a factor") } if (ncol(x) != nlevels(observed)) stop("number of columns in x must equal the number of levels in observed") if (! all(colnames(x) %in% levels(observed)) || ! all(levels(observed) %in% colnames(x))) stop("column names of x must match levels of observed") ## If the votes are not in fractions, normalize them to fractions. if ( any(x > 1) ) x <- sweep(x, 1, rowSums(x), "/") position <- match(as.character(observed), colnames(x)) margin <- numeric(length(observed)) for (i in seq_along(observed)) { margin[i] <- x[i, position[i]] - max(x[i, -position[i]]) } names(margin) <- observed class(margin) <- "margin" margin } plot.margin <- function(x, sort=TRUE, ...) { if (sort) x <- sort(x) nF <- factor(names(x)) nlevs <- length(levels(nF)) if ( require(RColorBrewer) && nlevs < 12) { pal <- brewer.pal(nlevs,"Set1") } else { pal <- rainbow(nlevs) } plot.default(x, col=pal[as.numeric(nF)], pch=20, ... ) } randomForest/R/importance.R0000744000175100001440000000313712037254524015431 0ustar hornikusersimportance <- function(x, ...) UseMethod("importance") importance.default <- function(x, ...) stop("No method implemented for this class of object") importance.randomForest <- function(x, type=NULL, class=NULL, scale=TRUE, ...) 
{ if (!inherits(x, "randomForest")) stop("x is not of class randomForest") classRF <- x$type != "regression" hasImp <- !is.null(dim(x$importance)) || ncol(x$importance) == 1 hasType <- !is.null(type) if (hasType && type == 1 && !hasImp) stop("That measure has not been computed") allImp <- is.null(type) && hasImp if (hasType) { if (!(type %in% 1:2)) stop("Wrong type specified") if (type == 2 && !is.null(class)) stop("No class-specific measure for that type") } imp <- x$importance if (hasType && type == 2) { if (hasImp) imp <- imp[, ncol(imp), drop=FALSE] } else { if (scale) { SD <- x$importanceSD imp[, -ncol(imp)] <- imp[, -ncol(imp), drop=FALSE] / ifelse(SD < .Machine$double.eps, 1, SD) } if (!allImp) { if (is.null(class)) { ## The average decrease in accuracy measure: imp <- imp[, ncol(imp) - 1, drop=FALSE] } else { whichCol <- if (classRF) match(class, colnames(imp)) else 1 if (is.na(whichCol)) stop(paste("Class", class, "not found.")) imp <- imp[, whichCol, drop=FALSE] } } } imp } randomForest/R/grow.R0000744000175100001440000000036412037254524014245 0ustar hornikusersgrow <- function(x, ...) UseMethod("grow") grow.default <- function(x, ...) stop("grow has not been implemented for this class of object") grow.randomForest <- function(x, how.many, ...) { y <- update(x, ntree=how.many) combine(x, y) } randomForest/R/getTree.R0000744000175100001440000000262212037254524014665 0ustar hornikusersgetTree <- function(rfobj, k=1, labelVar=FALSE) { if (is.null(rfobj$forest)) { stop("No forest component in ", deparse(substitute(rfobj))) } if (k > rfobj$ntree) { stop("There are fewer than ", k, "trees in the forest") } if (rfobj$type == "regression") { tree <- cbind(rfobj$forest$leftDaughter[,k], rfobj$forest$rightDaughter[,k], rfobj$forest$bestvar[,k], rfobj$forest$xbestsplit[,k], rfobj$forest$nodestatus[,k], rfobj$forest$nodepred[,k])[1:rfobj$forest$ndbigtree[k],] } else { tree <- cbind(rfobj$forest$treemap[,,k], rfobj$forest$bestvar[,k], rfobj$forest$xbestsplit[,k], rfobj$forest$nodestatus[,k], rfobj$forest$nodepred[,k])[1:rfobj$forest$ndbigtree[k],] } dimnames(tree) <- list(1:nrow(tree), c("left daughter", "right daughter", "split var", "split point", "status", "prediction")) if (labelVar) { tree <- as.data.frame(tree) v <- tree[[3]] v[v == 0] <- NA tree[[3]] <- factor(rownames(rfobj$importance)[v]) if (rfobj$type == "classification") { v <- tree[[6]] v[! v %in% 1:nlevels(rfobj$y)] <- NA tree[[6]] <- levels(rfobj$y)[v] } } tree } randomForest/R/combine.R0000744000175100001440000001546612037254524014714 0ustar hornikuserscombine <- function(...) { pad0 <- function(x, len) c(x, rep(0, len-length(x))) padm0 <- function(x, len) rbind(x, matrix(0, nrow=len-nrow(x), ncol=ncol(x))) rflist <- list(...) areForest <- sapply(rflist, function(x) inherits(x, "randomForest")) if (any(!areForest)) stop("Argument must be a list of randomForest objects") ## Use the first component as a template rf <- rflist[[1]] classRF <- rf$type == "classification" trees <- sapply(rflist, function(x) x$ntree) ntree <- sum(trees) rf$ntree <- ntree nforest <- length(rflist) haveTest <- ! any(sapply(rflist, function(x) is.null(x$test))) ## Check if predictor variables are identical. vlist <- lapply(rflist, function(x) rownames(importance(x))) numvars <- sapply(vlist, length) if (! all(numvars[1] == numvars[-1])) stop("Unequal number of predictor variables in the randomForest objects.") for (i in seq_along(vlist)) { if (! 
all(vlist[[i]] == vlist[[1]])) stop("Predictor variables are different in the randomForest objects.") } ## Combine the forest component, if any haveForest <- sapply(rflist, function(x) !is.null(x$forest)) if (all(haveForest)) { nrnodes <- max(sapply(rflist, function(x) x$forest$nrnodes)) rf$forest$nrnodes <- nrnodes rf$forest$ndbigtree <- unlist(sapply(rflist, function(x) x$forest$ndbigtree)) rf$forest$nodestatus <- do.call("cbind", lapply(rflist, function(x) padm0(x$forest$nodestatus, nrnodes))) rf$forest $bestvar <- do.call("cbind", lapply(rflist, function(x) padm0(x$forest$bestvar, nrnodes))) rf$forest$xbestsplit <- do.call("cbind", lapply(rflist, function(x) padm0(x$forest$xbestsplit, nrnodes))) rf$forest$nodepred <- do.call("cbind", lapply(rflist, function(x) padm0(x$forest$nodepred, nrnodes))) tree.dim <- dim(rf$forest$treemap) if (classRF) { rf$forest$treemap <- array(unlist(lapply(rflist, function(x) apply(x$forest$treemap, 2:3, pad0, nrnodes))), c(nrnodes, 2, ntree)) } else { rf$forest$leftDaughter <- do.call("cbind", lapply(rflist, function(x) padm0(x$forest$leftDaughter, nrnodes))) rf$forest$rightDaughter <- do.call("cbind", lapply(rflist, function(x) padm0(x$forest$rightDaughter, nrnodes))) } rf$forest$ntree <- ntree if (classRF) rf$forest$cutoff <- rflist[[1]]$forest$cutoff } else { rf$forest <- NULL } if (classRF) { ## Combine the votes matrix: rf$votes <- 0 rf$oob.times <- 0 areVotes <- all(sapply(rflist, function(x) any(x$votes > 1, na.rf=TRUE))) if (areVotes) { for(i in 1:nforest) { rf$oob.times <- rf$oob.times + rflist[[i]]$oob.times rf$votes <- rf$votes + ifelse(is.na(rflist[[i]]$votes), 0, rflist[[i]]$votes) } } else { for(i in 1:nforest) { rf$oob.times <- rf$oob.times + rflist[[i]]$oob.times rf$votes <- rf$votes + ifelse(is.na(rflist[[i]]$votes), 0, rflist[[i]]$votes) * rflist[[i]]$oob.times } rf$votes <- rf$votes / rf$oob.times } rf$predicted <- factor(colnames(rf$votes)[max.col(rf$votes)], levels=levels(rf$predicted)) if(haveTest) { rf$test$votes <- 0 if (any(rf$test$votes > 1)) { for(i in 1:nforest) rf$test$votes <- rf$test$votes + rflist[[i]]$test$votes } else { for (i in 1:nforest) rf$test$votes <- rf$test$votes + rflist[[i]]$test$votes * rflist[[i]]$ntree } rf$test$predicted <- factor(colnames(rf$test$votes)[max.col(rf$test$votes)], levels=levels(rf$test$predicted)) } } else { rf$predicted <- 0 for (i in 1:nforest) rf$predicted <- rf$predicted + rflist[[i]]$predicted * rflist[[i]]$ntree rf$predicted <- rf$predicted / ntree if (haveTest) { rf$test$predicted <- 0 for (i in 1:nforest) rf$test$predicted <- rf$test$predicted + rflist[[i]]$test$predicted * rflist[[i]]$ntree rf$test$predicted <- rf$test$predicted / ntree } } ## If variable importance is in all of them, compute the average ## (weighted by the number of trees in each forest) have.imp <- !any(sapply(rflist, function(x) is.null(x$importance))) if (have.imp) { rf$importance <- rf$importanceSD <- 0 for(i in 1:nforest) { rf$importance <- rf$importance + rflist[[i]]$importance * rflist[[i]]$ntree ## Do the same thing with SD of importance, though that's not ## exactly right... 
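            ## (The pooled SD computed below is the square root of the tree-weighted
            ## average of the squared per-forest SDs, i.e. the forests are combined as
            ## if their importance estimates were independent -- an approximation only.)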
rf$importanceSD <- rf$importanceSD + rflist[[i]]$importanceSD^2 * rflist[[i]]$ntree } rf$importance <- rf$importance / ntree rf$importanceSD <- sqrt(rf$importanceSD / ntree) haveCaseImp <- !any(sapply(rflist, function(x) is.null(x$localImportance))) ## Average casewise importance if (haveCaseImp) { rf$localImportance <- 0 for (i in 1:nforest) { rf$localImportance <- rf$localImportance + rflist[[i]]$localImportance * rflist[[i]]$ntree } rf$localImportance <- rf$localImportance / ntree } } ## If proximity is in all of them, compute the average ## (weighted by the number of trees in each forest) have.prox <- !any(sapply(rflist, function(x) is.null(x$proximity))) if(have.prox) { rf$proximity <- 0 for(i in 1:nforest) rf$proximity <- rf$proximity + rflist[[i]]$proximity * rflist[[i]]$ntree rf$proximity <- rf$proximity / ntree } ## Set confusion matrix and error rates to NULL if(classRF) { rf$confusion <- NULL rf$err.rate <- NULL if(haveTest) { rf$test$confusion <- NULL rf$err.rate <- NULL } } else { rf$mse <- rf$rsq <- NULL if(haveTest) rf$test$mse <- rf$test$rsq <- NULL } rf } randomForest/R/classCenter.R0000744000175100001440000000207112037254524015532 0ustar hornikusersclassCenter <- function(x, label, prox, nNbr = min(table(label))-1) { ## nPrototype=rep(3, length(unique(label))), ...) { label <- as.character(label) clsLabel <- unique(label) ## Find the nearest nNbr neighbors of each case ## (including the case itself). idx <- t(apply(prox, 1, order, decreasing=TRUE)[1:nNbr,]) ## Find the class labels of the neighbors. cls <- label[idx] dim(cls) <- dim(idx) ## Count the number of neighbors in each class for each case. ncls <- sapply(clsLabel, function(x) rowSums(cls == x)) ## For each class, find the case(s) with most neighbors in that class. clsMode <- max.col(t(ncls)) ## Identify the neighbors of the class modes that are of the target class. nbrList <- mapply(function(cls, m) idx[m,][label[idx[m,]] == cls], clsLabel, clsMode, SIMPLIFY=FALSE) ## Get the X data for the neighbors of the class `modes'. xdat <- t(sapply(nbrList, function(i) apply(x[i,,drop=FALSE], 2, median))) xdat } randomForest/NAMESPACE0000744000175100001440000000131412037254524014156 0ustar hornikusersuseDynLib(randomForest) export(combine, getTree, grow, importance, margin, MDSplot, na.roughfix, partialPlot, randomForest, rfImpute, treesize, tuneRF, varImpPlot, varUsed, rfNews, outlier, classCenter, rfcv) S3method(print, randomForest) S3method(predict, randomForest) S3method(plot, randomForest) S3method(plot, margin) S3method(margin, randomForest) S3method(partialPlot, randomForest) S3method(grow, randomForest) S3method(importance, randomForest) S3method(outlier, randomForest) S3method(outlier, default) S3method(randomForest, formula) S3method(randomForest, default) S3method(rfImpute, formula) S3method(rfImpute, default) S3method(na.roughfix, data.frame) S3method(na.roughfix, default) randomForest/man/0000755000175100001440000000000012037254525013513 5ustar hornikusersrandomForest/man/varUsed.Rd0000744000175100001440000000220212037254524015407 0ustar hornikusers\name{varUsed} \alias{varUsed} \title{Variables used in a random forest} \description{ Find out which predictor variables are actually used in the random forest. 
} \usage{ varUsed(x, by.tree=FALSE, count=TRUE) } \arguments{ \item{x}{An object of class \code{randomForest}.} \item{by.tree}{Should the list of variables used be broken down by trees in the forest?} \item{count}{Should the frequencies that variables appear in trees be returned?} } \value{ If \code{count=TRUE} and \code{by.tree=FALSE}, a integer vector containing frequencies that variables are used in the forest. If \code{by.tree=TRUE}, a matrix is returned, breaking down the counts by tree (each column corresponding to one tree and each row to a variable). If \code{count=FALSE} and \code{by.tree=TRUE}, a list of integer indices is returned giving the variables used in the trees, else if \code{by.tree=FALSE}, a vector of integer indices giving the variables used in the entire forest. } \seealso{ \code{\link{randomForest}} } \examples{ data(iris) set.seed(17) varUsed(randomForest(Species~., iris, ntree=100)) } \author{Andy Liaw} \keyword{tree} randomForest/man/varImpPlot.Rd0000744000175100001440000000222712037254524016102 0ustar hornikusers\name{varImpPlot} \alias{varImpPlot} \title{Variable Importance Plot} \description{ Dotchart of variable importance as measured by a Random Forest } \usage{ varImpPlot(x, sort=TRUE, n.var=min(30, nrow(x$importance)), type=NULL, class=NULL, scale=TRUE, main=deparse(substitute(x)), ...) } \arguments{ \item{x}{An object of class \code{randomForest}.} \item{sort}{Should the variables be sorted in decreasing order of importance?} \item{n.var}{How many variables to show? (Ignored if \code{sort=FALSE}.)} \item{type, class, scale}{arguments to be passed on to \code{\link{importance}}} \item{main}{plot title.} \item{...}{Other graphical parameters to be passed on to \code{\link{dotchart}}.} } \value{ Invisibly, the importance of the variables that were plotted. } \seealso{ \code{\link{randomForest}}, \code{\link{importance}} } \examples{ set.seed(4543) data(mtcars) mtcars.rf <- randomForest(mpg ~ ., data=mtcars, ntree=1000, keep.forest=FALSE, importance=TRUE) varImpPlot(mtcars.rf) } \author{Andy Liaw \email{andy\_liaw@merck.com}} \keyword{regression} \keyword{classif} \keyword{tree} randomForest/man/tuneRF.Rd0000744000175100001440000000315112037254524015205 0ustar hornikusers\name{tuneRF} \alias{tuneRF} \title{Tune randomForest for the optimal mtry parameter} \description{ Starting with the default value of mtry, search for the optimal value (with respect to Out-of-Bag error estimate) of mtry for randomForest. } \usage{ tuneRF(x, y, mtryStart, ntreeTry=50, stepFactor=2, improve=0.05, trace=TRUE, plot=TRUE, doBest=FALSE, ...) } \arguments{ \item{x}{matrix or data frame of predictor variables} \item{y}{response vector (factor for classification, numeric for regression)} \item{mtryStart}{starting value of mtry; default is the same as in \code{\link{randomForest}}} \item{ntreeTry}{number of trees used at the tuning step} \item{stepFactor}{at each iteration, mtry is inflated (or deflated) by this value} \item{improve}{the (relative) improvement in OOB error must be by this much for the search to continue} \item{trace}{whether to print the progress of the search} \item{plot}{whether to plot the OOB error as function of mtry} \item{doBest}{whether to run a forest using the optimal mtry found} \item{...}{options to be given to \code{\link{randomForest}}} } \value{ If \code{doBest=FALSE} (default), it returns a matrix whose first column contains the mtry values searched, and the second column the corresponding OOB error. 
If \code{doBest=TRUE}, it returns the \code{\link{randomForest}} object produced with the optimal \code{mtry}. } %\details{ %} %\references{ %} \seealso{ \code{\link{randomForest}} } \examples{ data(fgl, package="MASS") fgl.res <- tuneRF(fgl[,-10], fgl[,10], stepFactor=1.5) } %\author{} \keyword{classif} \keyword{tree} randomForest/man/treesize.Rd0000744000175100001440000000152012037254524015632 0ustar hornikusers\name{treesize} \alias{treesize} \title{Size of trees in an ensemble} \description{ Size of trees (number of nodes) in and ensemble. } \usage{ treesize(x, terminal=TRUE) } \arguments{ \item{x}{an object of class \code{randomForest}, which contains a \code{forest} component.} \item{terminal}{count terminal nodes only (\code{TRUE}) or all nodes (\code{FALSE}} } \value{ A vector containing number of nodes for the trees in the \code{randomForest} object. } \note{ The \code{randomForest} object must contain the \code{forest} component; i.e., created with \code{randomForest(..., keep.forest=TRUE)}. } \seealso{\code{\link{randomForest}}} \author{Andy Liaw \email{andy\_liaw@merck.com}} \examples{ data(iris) iris.rf <- randomForest(Species ~ ., iris) hist(treesize(iris.rf)) } \keyword{regression} \keyword{classif} randomForest/man/rfNews.Rd0000744000175100001440000000027712037254524015254 0ustar hornikusers\name{rfNews} \alias{rfNews} \title{Show the NEWS file} \description{ Show the NEWS file of the randomForest package. } \usage{ rfNews() } %\arguments{ %} \value{ None. } \keyword{classif} randomForest/man/rfImpute.Rd0000744000175100001440000000456512037254524015607 0ustar hornikusers\name{rfImpute} \alias{rfImpute} \alias{rfImpute.formula} \alias{rfImpute.default} \title{Missing Value Imputations by randomForest} \description{ Impute missing values in predictor data using proximity from randomForest. } \usage{ \method{rfImpute}{default}(x, y, iter=5, ntree=300, ...) \method{rfImpute}{formula}(x, data, ..., subset) } \arguments{ \item{x}{A data frame or matrix of predictors, some containing \code{NA}s, or a formula.} \item{y}{Response vector (\code{NA}'s not allowed).} \item{data}{A data frame containing the predictors and response.} \item{iter}{Number of iterations to run the imputation.} \item{ntree}{Number of trees to grow in each iteration of randomForest.} \item{...}{Other arguments to be passed to \code{\link{randomForest}}.} \item{subset}{A logical vector indicating which observations to use.} } \value{ A data frame or matrix containing the completed data matrix, where \code{NA}s are imputed using proximity from randomForest. The first column contains the response. } \details{ The algorithm starts by imputing \code{NA}s using \code{\link{na.roughfix}}. Then \code{\link{randomForest}} is called with the completed data. The proximity matrix from the randomForest is used to update the imputation of the \code{NA}s. For continuous predictors, the imputed value is the weighted average of the non-missing obervations, where the weights are the proximities. For categorical predictors, the imputed value is the category with the largest average proximity. This process is iterated \code{iter} times. Note: Imputation has not (yet) been implemented for the unsupervised case. Also, Breiman (2003) notes that the OOB estimate of error from randomForest tend to be optimistic when run on the data matrix with imputed values. } \references{ Leo Breiman (2003). Manual for Setting Up, Using, and Understanding Random Forest V4.0. 
\url{http://oz.berkeley.edu/users/breiman/Using_random_forests_v4.0.pdf} } \seealso{ \code{\link{na.roughfix}}. } \examples{ data(iris) iris.na <- iris set.seed(111) ## artificially drop some data values. for (i in 1:4) iris.na[sample(150, sample(20)), i] <- NA set.seed(222) iris.imputed <- rfImpute(Species ~ ., iris.na) set.seed(333) iris.rf <- randomForest(Species ~ ., iris.imputed) print(iris.rf) } \author{Andy Liaw} \keyword{regression} \keyword{classif} \keyword{tree} randomForest/man/rfcv.Rd0000744000175100001440000000523212037254524014744 0ustar hornikusers\name{rfcv} \alias{rfcv} \title{Random Forest Cross-Valdidation for feature selection} \description{ This function shows the cross-validated prediction performance of models with sequentially reduced number of predictors (ranked by variable importance) via a nested cross-validation procedure. } \usage{ rfcv(trainx, trainy, cv.fold=5, scale="log", step=0.5, mtry=function(p) max(1, floor(sqrt(p))), recursive=FALSE, ...) } \arguments{ \item{trainx}{matrix or data frame containing columns of predictor variables} \item{trainy}{vector of response, must have length equal to the number of rows in \code{trainx}} \item{cv.fold}{number of folds in the cross-validation} \item{scale}{if \code{"log"}, reduce a fixed proportion (\code{step}) of variables at each step, otherwise reduce \code{step} variables at a time} \item{step}{if \code{log=TRUE}, the fraction of variables to remove at each step, else remove this many variables at a time} \item{mtry}{a function of number of remaining predictor variables to use as the \code{mtry} parameter in the \code{randomForest} call} \item{recursive}{whether variable importance is (re-)assessed at each step of variable reduction} \item{...}{other arguments passed on to \code{randomForest}} } \value{ A list with the following components: list(n.var=n.var, error.cv=error.cv, predicted=cv.pred) \item{n.var}{vector of number of variables used at each step} \item{error.cv}{corresponding vector of error rates or MSEs at each step} \item{predicted}{list of \code{n.var} components, each containing the predicted values from the cross-validation} } %\details{ %} \references{ Svetnik, V., Liaw, A., Tong, C. and Wang, T., ``Application of Breiman's Random Forest to Modeling Structure-Activity Relationships of Pharmaceutical Molecules'', MCS 2004, Roli, F. and Windeatt, T. (Eds.) pp. 334-343. } \seealso{ \code{\link{randomForest}}, \code{\link{importance}} } \examples{ set.seed(647) myiris <- cbind(iris[1:4], matrix(runif(508 * nrow(iris)), nrow(iris), 508)) result <- rfcv(myiris, iris$Species) with(result, plot(n.var, error.cv, log="x", type="o", lwd=2)) ## The following can take a while to run, so if you really want to try ## it, copy and paste the code into R. 
\dontrun{ result <- replicate(5, rfcv(myiris, iris$Species), simplify=FALSE) error.cv <- sapply(result, "[[", "error.cv") matplot(result[[1]]$n.var, cbind(rowMeans(error.cv), error.cv), type="l", lwd=c(2, rep(1, ncol(error.cv))), col=1, lty=1, log="x", xlab="Number of variables", ylab="CV Error") } } \author{Andy Liaw} \keyword{classif} \keyword{regression} randomForest/man/randomForest.Rd0000744000175100001440000003014412037254524016447 0ustar hornikusers\name{randomForest} \alias{randomForest} \alias{randomForest.formula} \alias{randomForest.default} \alias{print.randomForest} \title{Classification and Regression with Random Forest} \description{ \code{randomForest} implements Breiman's random forest algorithm (based on Breiman and Cutler's original Fortran code) for classification and regression. It can also be used in unsupervised mode for assessing proximities among data points. } \usage{ \method{randomForest}{formula}(formula, data=NULL, ..., subset, na.action=na.fail) \method{randomForest}{default}(x, y=NULL, xtest=NULL, ytest=NULL, ntree=500, mtry=if (!is.null(y) && !is.factor(y)) max(floor(ncol(x)/3), 1) else floor(sqrt(ncol(x))), replace=TRUE, classwt=NULL, cutoff, strata, sampsize = if (replace) nrow(x) else ceiling(.632*nrow(x)), nodesize = if (!is.null(y) && !is.factor(y)) 5 else 1, maxnodes = NULL, importance=FALSE, localImp=FALSE, nPerm=1, proximity, oob.prox=proximity, norm.votes=TRUE, do.trace=FALSE, keep.forest=!is.null(y) && is.null(xtest), corr.bias=FALSE, keep.inbag=FALSE, ...) \method{print}{randomForest}(x, ...) } \arguments{ \item{data}{an optional data frame containing the variables in the model. By default the variables are taken from the environment which \code{randomForest} is called from.} \item{subset}{an index vector indicating which rows should be used. (NOTE: If given, this argument must be named.)} \item{na.action}{A function to specify the action to be taken if NAs are found. (NOTE: If given, this argument must be named.)} \item{x, formula}{a data frame or a matrix of predictors, or a formula describing the model to be fitted (for the \code{print} method, an \code{randomForest} object).} \item{y}{A response vector. If a factor, classification is assumed, otherwise regression is assumed. If omitted, \code{randomForest} will run in unsupervised mode.} \item{xtest}{a data frame or matrix (like \code{x}) containing predictors for the test set.} \item{ytest}{response for the test set.} \item{ntree}{Number of trees to grow. This should not be set to too small a number, to ensure that every input row gets predicted at least a few times. } \item{mtry}{Number of variables randomly sampled as candidates at each split. Note that the default values are different for classification (sqrt(p) where p is number of variables in \code{x}) and regression (p/3)} \item{replace}{Should sampling of cases be done with or without replacement?} \item{classwt}{Priors of the classes. Need not add up to one. Ignored for regression.} \item{cutoff}{(Classification only) A vector of length equal to number of classes. The `winning' class for an observation is the one with the maximum ratio of proportion of votes to cutoff. Default is 1/k where k is the number of classes (i.e., majority vote wins).} \item{strata}{A (factor) variable that is used for stratified sampling.} \item{sampsize}{Size(s) of sample to draw. 
For classification, if sampsize is a vector of the length the number of strata, then sampling is stratified by strata, and the elements of sampsize indicate the numbers to be drawn from the strata.} \item{nodesize}{Minimum size of terminal nodes. Setting this number larger causes smaller trees to be grown (and thus take less time). Note that the default values are different for classification (1) and regression (5).} \item{maxnodes}{Maximum number of terminal nodes trees in the forest can have. If not given, trees are grown to the maximum possible (subject to limits by \code{nodesize}). If set larger than maximum possible, a warning is issued.} \item{importance}{Should importance of predictors be assessed? } \item{localImp}{Should casewise importance measure be computed? (Setting this to \code{TRUE} will override \code{importance}.) } \item{nPerm}{Number of times the OOB data are permuted per tree for assessing variable importance. Number larger than 1 gives slightly more stable estimate, but not very effective. Currently only implemented for regression.} \item{proximity}{Should proximity measure among the rows be calculated?} \item{oob.prox}{Should proximity be calculated only on ``out-of-bag'' data?} \item{norm.votes}{If \code{TRUE} (default), the final result of votes are expressed as fractions. If \code{FALSE}, raw vote counts are returned (useful for combining results from different runs). Ignored for regression.} \item{do.trace}{If set to \code{TRUE}, give a more verbose output as \code{randomForest} is run. If set to some integer, then running output is printed for every \code{do.trace} trees.} \item{keep.forest}{If set to \code{FALSE}, the forest will not be retained in the output object. If \code{xtest} is given, defaults to \code{FALSE}.} \item{corr.bias}{perform bias correction for regression? Note: Experimental. Use at your own risk.} \item{keep.inbag}{Should an \code{n} by \code{ntree} matrix be returned that keeps track of which samples are ``in-bag'' in which trees (but not how many times, if sampling with replacement)} \item{...}{optional parameters to be passed to the low level function \code{randomForest.default}.} } \value{ An object of class \code{randomForest}, which is a list with the following components: \item{call}{the original call to \code{randomForest}} \item{type}{one of \code{regression}, \code{classification}, or \code{unsupervised}.} \item{predicted}{the predicted values of the input data based on out-of-bag samples.} \item{importance}{a matrix with \code{nclass} + 2 (for classification) or two (for regression) columns. For classification, the first \code{nclass} columns are the class-specific measures computed as mean descrease in accuracy. The \code{nclass} + 1st column is the mean descrease in accuracy over all classes. The last column is the mean decrease in Gini index. For Regression, the first column is the mean decrease in accuracy and the second the mean decrease in MSE. If \code{importance=FALSE}, the last measure is still returned as a vector.} \item{importanceSD}{The ``standard errors'' of the permutation-based importance measure. For classification, a \code{p} by \code{nclass + 1} matrix corresponding to the first \code{nclass + 1} columns of the importance matrix. For regression, a length \code{p} vector.} \item{localImp}{a p by n matrix containing the casewise importance measures, the [i,j] element of which is the importance of i-th variable on the j-th case. 
\code{NULL} if \code{localImp=FALSE}.} \item{ntree}{number of trees grown.} \item{mtry}{number of predictors sampled for spliting at each node.} \item{forest}{(a list that contains the entire forest; \code{NULL} if \code{randomForest} is run in unsupervised mode or if \code{keep.forest=FALSE}.} \item{err.rate}{(classification only) vector error rates of the prediction on the input data, the i-th element being the (OOB) error rate for all trees up to the i-th.} \item{confusion}{(classification only) the confusion matrix of the prediction (based on OOB data).} \item{votes}{(classification only) a matrix with one row for each input data point and one column for each class, giving the fraction or number of (OOB) `votes' from the random forest.} \item{oob.times}{number of times cases are `out-of-bag' (and thus used in computing OOB error estimate)} \item{proximity}{if \code{proximity=TRUE} when \code{randomForest} is called, a matrix of proximity measures among the input (based on the frequency that pairs of data points are in the same terminal nodes).} \item{mse}{(regression only) vector of mean square errors: sum of squared residuals divided by \code{n}.} \item{rsq}{(regression only) ``pseudo R-squared'': 1 - \code{mse} / Var(y).} \item{test}{if test set is given (through the \code{xtest} or additionally \code{ytest} arguments), this component is a list which contains the corresponding \code{predicted}, \code{err.rate}, \code{confusion}, \code{votes} (for classification) or \code{predicted}, \code{mse} and \code{rsq} (for regression) for the test set. If \code{proximity=TRUE}, there is also a component, \code{proximity}, which contains the proximity among the test set as well as proximity between test and training data.} } \note{ The \code{forest} structure is slightly different between classification and regression. For details on how the trees are stored, see the help page for \code{\link{getTree}}. If \code{xtest} is given, prediction of the test set is done ``in place'' as the trees are grown. If \code{ytest} is also given, and \code{do.trace} is set to some positive integer, then for every \code{do.trace} trees, the test set error is printed. Results for the test set is returned in the \code{test} component of the resulting \code{randomForest} object. For classification, the \code{votes} component (for training or test set data) contain the votes the cases received for the classes. If \code{norm.votes=TRUE}, the fraction is given, which can be taken as predicted probabilities for the classes. For large data sets, especially those with large number of variables, calling \code{randomForest} via the formula interface is not advised: There may be too much overhead in handling the formula. The ``local'' (or casewise) variable importance is computed as follows: For classification, it is the increase in percent of times a case is OOB and misclassified when the variable is permuted. For regression, it is the average increase in squared OOB residuals when the variable is permuted. } \references{ Breiman, L. (2001), \emph{Random Forests}, Machine Learning 45(1), 5-32. Breiman, L (2002), ``Manual On Setting Up, Using, And Understanding Random Forests V3.1'', \url{http://oz.berkeley.edu/users/breiman/Using_random_forests_V3.1.pdf}. 
} \author{Andy Liaw \email{andy\_liaw@merck.com} and Matthew Wiener \email{matthew\_wiener@merck.com}, based on original Fortran code by Leo Breiman and Adele Cutler.} \seealso{\code{\link{predict.randomForest}}, \code{\link{varImpPlot}}} \examples{ ## Classification: ##data(iris) set.seed(71) iris.rf <- randomForest(Species ~ ., data=iris, importance=TRUE, proximity=TRUE) print(iris.rf) ## Look at variable importance: round(importance(iris.rf), 2) ## Do MDS on 1 - proximity: iris.mds <- cmdscale(1 - iris.rf$proximity, eig=TRUE) op <- par(pty="s") pairs(cbind(iris[,1:4], iris.mds$points), cex=0.6, gap=0, col=c("red", "green", "blue")[as.numeric(iris$Species)], main="Iris Data: Predictors and MDS of Proximity Based on RandomForest") par(op) print(iris.mds$GOF) ## The `unsupervised' case: set.seed(17) iris.urf <- randomForest(iris[, -5]) MDSplot(iris.urf, iris$Species) ## stratified sampling: draw 20, 30, and 20 of the species to grow each tree. (iris.rf2 <- randomForest(iris[1:4], iris$Species, sampsize=c(20, 30, 20))) ## Regression: ## data(airquality) set.seed(131) ozone.rf <- randomForest(Ozone ~ ., data=airquality, mtry=3, importance=TRUE, na.action=na.omit) print(ozone.rf) ## Show "importance" of variables: higher value mean more important: round(importance(ozone.rf), 2) ## "x" can be a matrix instead of a data frame: set.seed(17) x <- matrix(runif(5e2), 100) y <- gl(2, 50) (myrf <- randomForest(x, y)) (predict(myrf, x)) ## "complicated" formula: (swiss.rf <- randomForest(sqrt(Fertility) ~ . - Catholic + I(Catholic < 50), data=swiss)) (predict(swiss.rf, swiss)) ## Test use of 32-level factor as a predictor: set.seed(1) x <- data.frame(x1=gl(32, 5), x2=runif(160), y=rnorm(160)) (rf1 <- randomForest(x[-3], x[[3]], ntree=10)) ## Grow no more than 4 nodes per tree: (treesize(randomForest(Species ~ ., data=iris, maxnodes=4, ntree=30))) } \keyword{classif}% at least one, from doc/KEYWORDS \keyword{regression} \keyword{tree} randomForest/man/predict.randomForest.Rd0000744000175100001440000001064212037254524020101 0ustar hornikusers\name{predict.randomForest} \alias{predict.randomForest} \title{predict method for random forest objects} \description{ Prediction of test data using random forest. } \usage{ \method{predict}{randomForest}(object, newdata, type="response", norm.votes=TRUE, predict.all=FALSE, proximity=FALSE, nodes=FALSE, cutoff, ...) } \arguments{ \item{object}{an object of class \code{randomForest}, as that created by the function \code{randomForest}.} \item{newdata}{a data frame or matrix containing new data. (Note: If not given, the out-of-bag prediction in \code{object} is returned.} \item{type}{one of \code{response}, \code{prob}. or \code{votes}, indicating the type of output: predicted values, matrix of class probabilities, or matrix of vote counts. \code{class} is allowed, but automatically converted to "response", for backward compatibility.} \item{norm.votes}{Should the vote counts be normalized (i.e., expressed as fractions)? Ignored if \code{object$type} is \code{regression}.} \item{predict.all}{Should the predictions of all trees be kept?} \item{proximity}{Should proximity measures be computed? An error is issued if \code{object$type} is \code{regression}.} \item{nodes}{Should the terminal node indicators (an n by ntree matrix) be return? If so, it is in the ``nodes'' attribute of the returned object.} \item{cutoff}{(Classification only) A vector of length equal to number of classes. 
The `winning' class for an observation is the one with the maximum ratio of proportion of votes to cutoff. Default is taken from the \code{forest$cutoff} component of \code{object} (i.e., the setting used when running \code{\link{randomForest}}).} \item{...}{not used currently.} } \value{ If \code{object$type} is \code{regression}, a vector of predicted values is returned. If \code{predict.all=TRUE}, then the returned object is a list of two components: \code{aggregate}, which is the vector of predicted values by the forest, and \code{individual}, which is a matrix where each column contains prediction by a tree in the forest. If \code{object$type} is \code{classification}, the object returned depends on the argument \code{type}: \item{response}{predicted classes (the classes with majority vote).} \item{prob}{matrix of class probabilities (one column for each class and one row for each input).} \item{vote}{matrix of vote counts (one column for each class and one row for each new input); either in raw counts or in fractions (if \code{norm.votes=TRUE}).} If \code{predict.all=TRUE}, then the \code{individual} component of the returned object is a character matrix where each column contains the predicted class by a tree in the forest. If \code{proximity=TRUE}, the returned object is a list with two components: \code{pred} is the prediction (as described above) and \code{proximity} is the proximitry matrix. An error is issued if \code{object$type} is \code{regression}. If \code{nodes=TRUE}, the returned object has a ``nodes'' attribute, which is an n by ntree matrix, each column containing the node number that the cases fall in for that tree. NOTE: If the \code{object} inherits from \code{randomForest.formula}, then any data with \code{NA} are silently omitted from the prediction. The returned value will contain \code{NA} correspondingly in the aggregated and individual tree predictions (if requested), but not in the proximity or node matrices. NOTE2: Any ties are broken at random, so if this is undesirable, avoid it by using odd number \code{ntree} in \code{randomForest()}. } \references{ Breiman, L. (2001), \emph{Random Forests}, Machine Learning 45(1), 5-32. } \author{ Andy Liaw \email{andy\_liaw@merck.com} and Matthew Wiener \email{matthew\_wiener@merck.com}, based on original Fortran code by Leo Breiman and Adele Cutler.} \seealso{\code{\link{randomForest}}} \examples{ data(iris) set.seed(111) ind <- sample(2, nrow(iris), replace = TRUE, prob=c(0.8, 0.2)) iris.rf <- randomForest(Species ~ ., data=iris[ind == 1,]) iris.pred <- predict(iris.rf, iris[ind == 2,]) table(observed = iris[ind==2, "Species"], predicted = iris.pred) ## Get prediction for all trees. predict(iris.rf, iris[ind == 2,], predict.all=TRUE) ## Proximities. predict(iris.rf, iris[ind == 2,], proximity=TRUE) ## Nodes matrix. str(attr(predict(iris.rf, iris[ind == 2,], nodes=TRUE), "nodes")) } \keyword{classif}% at least one, from doc/KEYWORDS \keyword{regression} randomForest/man/plot.randomForest.Rd0000744000175100001440000000215712037254524017427 0ustar hornikusers\name{plot.randomForest} \alias{plot.randomForest} \title{Plot method for randomForest objects} \description{ Plot the error rates or MSE of a randomForest object } \usage{ \method{plot}{randomForest}(x, type="l", main=deparse(substitute(x)), ...) 
} \arguments{ \item{x}{an object of class \code{randomForest}.} \item{type}{type of plot.} \item{main}{main title of the plot.} \item{...}{other graphical parameters.} } \value{ Invisibly, the error rates or MSE of the \code{randomForest} object. If the object has a non-null \code{test} component, then the returned object is a matrix where the first column is the out-of-bag estimate of error, and the second column is for the test set. } \note{ This function does not work for \code{randomForest} objects that have \code{type=unsupervised}. If the \code{x} has a non-null \code{test} component, then the test set errors are also plotted. } \seealso{ \code{\link{randomForest}} } \examples{ data(mtcars) plot(randomForest(mpg ~ ., mtcars, keep.forest=FALSE, ntree=100), log="y") } \author{Andy Liaw} \keyword{classif} \keyword{regression} \keyword{tree} randomForest/man/partialPlot.Rd0000744000175100001440000000667112037254524016307 0ustar hornikusers\name{partialPlot} \alias{partialPlot} \alias{partialPlot.default} \alias{partialPlot.randomForest} \title{Partial dependence plot} \description{ Partial dependence plot gives a graphical depiction of the marginal effect of a variable on the class probability (classification) or response (regression). } \usage{ \method{partialPlot}{randomForest}(x, pred.data, x.var, which.class, w, plot = TRUE, add = FALSE, n.pt = min(length(unique(pred.data[, xname])), 51), rug = TRUE, xlab=deparse(substitute(x.var)), ylab="", main=paste("Partial Dependence on", deparse(substitute(x.var))), ...) } \arguments{ \item{x}{an object of class \code{randomForest}, which contains a \code{forest} component.} \item{pred.data}{a data frame used for contructing the plot, usually the training data used to contruct the random forest.} \item{x.var}{name of the variable for which partial dependence is to be examined.} \item{which.class}{For classification data, the class to focus on (default the first class).} \item{w}{weights to be used in averaging; if not supplied, mean is not weighted} \item{plot}{whether the plot should be shown on the graphic device.} \item{add}{whether to add to existing plot (\code{TRUE}).} \item{n.pt}{if \code{x.var} is continuous, the number of points on the grid for evaluating partial dependence.} \item{rug}{whether to draw hash marks at the bottom of the plot indicating the deciles of \code{x.var}.} \item{xlab}{label for the x-axis.} \item{ylab}{label for the y-axis.} \item{main}{main title for the plot.} \item{...}{other graphical parameters to be passed on to \code{plot} or \code{lines}.} } \value{ A list with two components: \code{x} and \code{y}, which are the values used in the plot. } \details{ The function being plotted is defined as: \deqn{ \tilde{f}(x) = \frac{1}{n} \sum_{i=1}^n f(x, x_{iC}), } where \eqn{x} is the variable for which partial dependence is sought, and \eqn{x_{iC}} is the other variables in the data. The summand is the predicted regression function for regression, and logits (i.e., log of fraction of votes) for \code{which.class} for classification: \deqn{ f(x) = \log p_k(x) - \frac{1}{K} \sum_{j=1}^K \log p_j(x),} where \eqn{K} is the number of classes, \eqn{k} is \code{which.class}, and \eqn{p_j} is the proportion of votes for class \eqn{j}. } \note{ The \code{randomForest} object must contain the \code{forest} component; i.e., created with \code{randomForest(..., keep.forest=TRUE)}. This function runs quite slow for large data sets. } \references{ Friedman, J. (2001). 
Greedy function approximation: the gradient boosting machine, \emph{Ann. of Stat.}} \seealso{\code{\link{randomForest}}} \author{Andy Liaw \email{andy\_liaw@merck.com}} \examples{ data(iris) set.seed(543) iris.rf <- randomForest(Species~., iris) partialPlot(iris.rf, iris, Petal.Width, "versicolor") ## Looping over variables ranked by importance: data(airquality) airquality <- na.omit(airquality) set.seed(131) ozone.rf <- randomForest(Ozone ~ ., airquality, importance=TRUE) imp <- importance(ozone.rf) impvar <- rownames(imp)[order(imp[, 1], decreasing=TRUE)] op <- par(mfrow=c(2, 3)) for (i in seq_along(impvar)) { partialPlot(ozone.rf, airquality, impvar[i], xlab=impvar[i], main=paste("Partial Dependence on", impvar[i]), ylim=c(30, 70)) } par(op) } \keyword{classif} \keyword{regression} \keyword{tree} randomForest/man/outlier.Rd0000744000175100001440000000222612037254524015467 0ustar hornikusers\name{outlier} \alias{outlier} \alias{outlier.randomForest} \alias{outlier.default} \title{Compute outlying measures} \description{Compute outlying measures based on a proximity matrix.} \usage{ \method{outlier}{default}(x, cls=NULL, ...) \method{outlier}{randomForest}(x, ...) } \arguments{ \item{x}{a proximity matrix (a square matrix with 1 on the diagonal and values between 0 and 1 in the off-diagonal positions); or an object of class \code{\link{randomForest}}, whose \code{type} is not \code{regression}.} \item{cls}{the classes the rows in the proximity matrix belong to. If not given, all data are assumed to come from the same class.} \item{...}{arguments for other methods.} } \value{ A numeric vector containing the outlying measures. The outlying measure of a case is computed as n / sum(squared proximity), normalized by subtracting the median and divided by the MAD, within each class. } \seealso{ \code{\link{randomForest}} } \examples{ set.seed(1) iris.rf <- randomForest(iris[,-5], iris[,5], proximity=TRUE) plot(outlier(iris.rf), type="h", col=c("red", "green", "blue")[as.numeric(iris$Species)]) } \keyword{classif} randomForest/man/na.roughfix.Rd0000744000175100001440000000216612037254524016237 0ustar hornikusers\name{na.roughfix} \alias{na.roughfix} \alias{na.roughfix.default} \alias{na.roughfix.data.frame} \title{Rough Imputation of Missing Values} \description{ Impute Missing Values by median/mode. } \usage{ na.roughfix(object, ...) } \arguments{ \item{object}{a data frame or numeric matrix.} \item{\dots}{further arguments special methods could require.} } \value{ A completed data matrix or data frame. For numeric variables, \code{NA}s are replaced with column medians. For factor variables, \code{NA}s are replaced with the most frequent levels (breaking ties at random). If \code{object} contains no \code{NA}s, it is returned unaltered. } \note{ This is used as a starting point for imputing missing values by random forest. } %\references{ %} \seealso{ \code{\link{rfImpute}}, \code{\link{randomForest}}. } \examples{ data(iris) iris.na <- iris set.seed(111) ## artificially drop some data values. for (i in 1:4) iris.na[sample(150, sample(20)), i] <- NA iris.roughfix <- na.roughfix(iris.na) iris.narf <- randomForest(Species ~ ., iris.na, na.action=na.roughfix) print(iris.narf) } \author{Andy Liaw} \keyword{NA} randomForest/man/MDSplot.Rd0000744000175100001440000000251312037254524015325 0ustar hornikusers\name{MDSplot} \alias{MDSplot} \title{Multi-dimensional Scaling Plot of Proximity matrix from randomForest} \description{ Plot the scaling coordinates of the proximity matrix from randomForest. 
} \usage{ MDSplot(rf, fac, k=2, palette=NULL, pch=20, ...) } \arguments{ \item{rf}{an object of class \code{\link{randomForest}} that contains the \code{proximity} component.} \item{fac}{a factor that was used as response to train \code{rf}.} \item{k}{number of dimensions for the scaling coordinates.} \item{palette}{colors to use to distinguish the classes; length must be the equal to the number of levels.} \item{pch}{plotting symbols to use.} \item{...}{other graphical parameters.} } \value{ The output of \code{\link[stats]{cmdscale}} on 1 - \code{rf$proximity} is returned invisibly. } \note{ If \code{k > 2}, \code{\link{pairs}} is used to produce the scatterplot matrix of the coordinates. } \seealso{\code{\link{randomForest}}} \examples{ set.seed(1) data(iris) iris.rf <- randomForest(Species ~ ., iris, proximity=TRUE, keep.forest=FALSE) MDSplot(iris.rf, iris$Species) ## Using different symbols for the classes: MDSplot(iris.rf, iris$Species, palette=rep(1, 3), pch=as.numeric(iris$Species)) } \author{Robert Gentleman, with slight modifications by Andy Liaw} \keyword{classif} \keyword{tree} randomForest/man/margin.Rd0000744000175100001440000000305612037254524015263 0ustar hornikusers\name{margin} \alias{margin} \alias{margin.default} \alias{margin.randomForest} \alias{plot.margin} \title{Margins of randomForest Classifier} \description{ Compute or plot the margin of predictions from a randomForest classifier. } \usage{ \method{margin}{randomForest}(x, ...) \method{margin}{default}(x, observed, ...) \method{plot}{margin}(x, sort=TRUE, ...) } \arguments{ \item{x}{an object of class \code{\link{randomForest}}, whose \code{type} is not \code{regression}, or a matrix of predicted probabilities, one column per class and one row per observation. For the \code{plot} method, \code{x} should be an object returned by \code{margin}.} \item{observed}{the true response corresponding to the data in \code{x}.} \item{sort}{Should the data be sorted by their class labels?} \item{...}{other graphical parameters to be passed to \code{plot.default}.} } \value{ For \code{margin}, the \emph{margin} of observations from the \code{\link{randomForest}} classifier (or whatever classifier that produced the predicted probability matrix given to \code{margin}). The margin of a data point is defined as the proportion of votes for the correct class minus maximum proportion of votes for the other classes. Thus under majority votes, positive margin means correct classification, and vice versa. } \seealso{ \code{\link{randomForest}} } \examples{ set.seed(1) data(iris) iris.rf <- randomForest(Species ~ ., iris, keep.forest=FALSE) plot(margin(iris.rf)) } \author{Robert Gentlemen, with slight modifications by Andy Liaw} \keyword{classif} randomForest/man/imports85.Rd0000744000175100001440000000455012037254524015660 0ustar hornikusers\name{imports85} \docType{data} \alias{imports85} \title{The Automobile Data} \description{ This is the `Automobile' data from the UCI Machine Learning Repository. } \usage{ data(imports85) } \format{ \code{imports85} is a data frame with 205 cases (rows) and 26 variables (columns). This data set consists of three types of entities: (a) the specification of an auto in terms of various characteristics, (b) its assigned insurance risk rating, (c) its normalized losses in use as compared to other cars. The second rating corresponds to the degree to which the auto is more risky than its price indicates. Cars are initially assigned a risk factor symbol associated with its price. 
Then, if it is more risky (or less), this symbol is adjusted by moving it up (or down) the scale. Actuarians call this process `symboling'. A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe. The third factor is the relative average loss payment per insured vehicle year. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/speciality, etc...), and represents the average loss per car per year. } \source{ Originally created by Jeffrey C. Schlimmer, from 1985 Model Import Car and Truck Specifications, 1985 Ward's Automotive Yearbook, Personal Auto Manuals, Insurance Services Office, and Insurance Collision Report, Insurance Institute for Highway Safety. The original data is at \url{http://www.ics.uci.edu/~mlearn/MLSummary.html}. } \references{ 1985 Model Import Car and Truck Specifications, 1985 Ward's Automotive Yearbook. Personal Auto Manuals, Insurance Services Office, 160 Water Street, New York, NY 10038 Insurance Collision Report, Insurance Institute for Highway Safety, Watergate 600, Washington, DC 20037 } \seealso{ \code{\link{randomForest}} } \examples{ data(imports85) imp85 <- imports85[,-2] # Too many NAs in normalizedLosses. imp85 <- imp85[complete.cases(imp85), ] ## Drop empty levels for factors. imp85[] <- lapply(imp85, function(x) if (is.factor(x)) x[, drop=TRUE] else x) stopifnot(require(randomForest)) price.rf <- randomForest(price ~ ., imp85, do.trace=10, ntree=100) print(price.rf) numDoors.rf <- randomForest(numOfDoors ~ ., imp85, do.trace=10, ntree=100) print(numDoors.rf) } \author{Andy Liaw} \keyword{datasets}randomForest/man/importance.Rd0000744000175100001440000000420712037254524016146 0ustar hornikusers\name{importance} \alias{importance} \alias{importance.default} \alias{importance.randomForest} \title{Extract variable importance measure} \description{ This is the extractor function for variable importance measures as produced by \code{\link{randomForest}}. } \usage{ \method{importance}{randomForest}(x, type=NULL, class=NULL, scale=TRUE, ...) } \arguments{ \item{x}{an object of class \code{\link{randomForest}}}. \item{type}{either 1 or 2, specifying the type of importance measure (1=mean decrease in accuracy, 2=mean decrease in node impurity).} \item{class}{for classification problem, which class-specific measure to return.} \item{scale}{For permutation based measures, should the measures be divided their ``standard errors''?} \item{...}{not used.} } \value{ A matrix of importance measure, one row for each predictor variable. The column(s) are different importance measures. } \details{ Here are the definitions of the variable importance measures. The first measure is computed from permuting OOB data: For each tree, the prediction error on the out-of-bag portion of the data is recorded (error rate for classification, MSE for regression). Then the same is done after permuting each predictor variable. The difference between the two are then averaged over all trees, and normalized by the standard deviation of the differences. If the standard deviation of the differences is equal to 0 for a variable, the division is not done (but the average is almost always equal to 0 in that case). The second measure is the total decrease in node impurities from splitting on the variable, averaged over all trees. For classification, the node impurity is measured by the Gini index. For regression, it is measured by residual sum of squares. 
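  For the permutation-based measure, the value returned with
  \code{scale=TRUE} is the raw mean decrease divided by its ``standard
  error'' (\code{importanceSD}).  As an illustrative sketch only (assuming
  a regression forest \code{rf} grown with \code{importance=TRUE}), the
  scaled measure is roughly:
\preformatted{
## approximately importance(rf, type=1, scale=TRUE):
rf$importance[, 1] / ifelse(rf$importanceSD < .Machine$double.eps,
                            1, rf$importanceSD)
}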
} %\references{ %} \seealso{ \code{\link{randomForest}}, \code{\link{varImpPlot}} } \examples{ set.seed(4543) data(mtcars) mtcars.rf <- randomForest(mpg ~ ., data=mtcars, ntree=1000, keep.forest=FALSE, importance=TRUE) importance(mtcars.rf) importance(mtcars.rf, type=1) } %\author{} \keyword{regression} \keyword{classif} \keyword{tree} randomForest/man/grow.Rd0000744000175100001440000000204712037254524014763 0ustar hornikusers\name{grow} \alias{grow} \alias{grow.default} \alias{grow.randomForest} \title{Add trees to an ensemble} \description{ Add additional trees to an existing ensemble of trees. } \usage{ \method{grow}{randomForest}(x, how.many, ...) } \arguments{ \item{x}{an object of class \code{randomForest}, which contains a \code{forest} component.} \item{how.many}{number of trees to add to the \code{randomForest} object.} \item{...}{currently ignored.} } \value{ An object of class \code{randomForest}, containing \code{how.many} additional trees. } \note{ The \code{confusion}, \code{err.rate}, \code{mse} and \code{rsq} components (as well as the corresponding components in the \code{test} component, if they exist) of the resulting object will be \code{NULL}. } \seealso{\code{\link{combine}}, \code{\link{randomForest}}} \author{Andy Liaw \email{andy\_liaw@merck.com}} \examples{ data(iris) iris.rf <- randomForest(Species ~ ., iris, ntree=50, norm.votes=FALSE) iris.rf <- grow(iris.rf, 50) print(iris.rf) } \keyword{regression} \keyword{classif} randomForest/man/getTree.Rd0000744000175100001440000000365412037254523015404 0ustar hornikusers\name{getTree} \alias{getTree} \title{Extract a single tree from a forest.} \description{ This function extracts the structure of a tree from a \code{randomForest} object. } \usage{ getTree(rfobj, k=1, labelVar=FALSE) } \arguments{ \item{rfobj}{a \code{\link{randomForest}} object.} \item{k}{which tree to extract?} \item{labelVar}{Should better labels be used for splitting variables and predicted class?} } \value{ A matrix (or data frame, if \code{labelVar=TRUE}) with six columns and number of rows equal to the total number of nodes in the tree. The six columns are: \item{left daughter}{the row where the left daughter node is; 0 if the node is terminal} \item{right daughter}{the row where the right daughter node is; 0 if the node is terminal} \item{split var}{which variable was used to split the node; 0 if the node is terminal} \item{split point}{where the best split is; see Details for categorical predictors} \item{status}{is the node terminal (-1) or not (1)} \item{prediction}{the prediction for the node; 0 if the node is not terminal} } \details{ For numerical predictors, data with values of the variable less than or equal to the splitting point go to the left daughter node. For categorical predictors, the splitting point is represented by an integer, whose binary expansion gives the identities of the categories that go to the left or right. For example, if a predictor has four categories and the split point is 13, the binary expansion of 13 is (1, 0, 1, 1) (because \eqn{13 = 1*2^0 + 0*2^1 + 1*2^2 + 1*2^3}), so cases with categories 1, 3, or 4 in this predictor are sent to the left, and the rest to the right. } %\references{ %} \seealso{ \code{\link{randomForest}} } \examples{ data(iris)
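## Decoding a categorical split point (an illustrative note added to this
## example, not part of getTree() itself): the `split point' column described
## in Details is an integer whose bits mark the categories sent to the left.
## For instance, a hypothetical split point of 13 on a four-category predictor:
sp <- 13
which(as.integer(intToBits(sp)[1:4]) == 1)  # categories 1, 3 and 4 go left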
## Look at the third tree in the forest. getTree(randomForest(iris[,-5], iris[,5], ntree=10), 3, labelVar=TRUE) } \author{Andy Liaw \email{andy\_liaw@merck.com}} \keyword{tree} randomForest/man/combine.Rd0000744000175100001440000000174112037254523015420 0ustar hornikusers\name{combine} \alias{combine} \title{Combine Ensembles of Trees} \description{ Combine two or more ensembles of trees into one. } \usage{ combine(...) } \arguments{ \item{...}{two or more objects of class \code{randomForest}, to be combined into one.} } \value{ An object of class \code{randomForest}. } \note{ The \code{confusion}, \code{err.rate}, \code{mse} and \code{rsq} components (as well as the corresponding components in the \code{test} component, if they exist) of the combined object will be \code{NULL}. } \seealso{\code{\link{randomForest}}, \code{\link{grow}}} \author{Andy Liaw \email{andy\_liaw@merck.com}} \examples{ data(iris) rf1 <- randomForest(Species ~ ., iris, ntree=50, norm.votes=FALSE) rf2 <- randomForest(Species ~ ., iris, ntree=50, norm.votes=FALSE) rf3 <- randomForest(Species ~ ., iris, ntree=50, norm.votes=FALSE) rf.all <- combine(rf1, rf2, rf3) print(rf.all) } \keyword{regression} \keyword{classif} randomForest/man/classCenter.Rd0000744000175100001440000000356612037254523016251 0ustar hornikusers\name{classCenter} \alias{classCenter} \title{Prototypes of groups.} \description{ Prototypes are `representative' cases of a group of data points, given the similarity matrix among the points. They are very similar to medoids. The function is named `classCenter' to avoid conflict with the function \code{prototype} in the \code{methods} package. } \usage{ classCenter(x, label, prox, nNbr = min(table(label))-1) } \arguments{ \item{x}{a matrix or data frame.} \item{label}{group labels of the rows in \code{x}.} \item{prox}{the proximity (or similarity) matrix, assumed to be symmetric with 1 on the diagonal and in [0, 1] off the diagonal (the order of rows/columns must match that of \code{x}).} \item{nNbr}{number of nearest neighbors used to find the prototypes.} } \value{ A data frame containing one prototype in each row. } \details{ This version only computes one prototype per class. For each case in \code{x}, the \code{nNbr} nearest neighbors are found. Then, for each class, the case that has the most neighbors of that class is identified. The prototype for that class is then the medoid of these neighbors (coordinate-wise medians for numerical variables and modes for categorical variables). In the future more prototypes may be computed (by removing the `neighbors' used, then iterating). } %\references{} \seealso{ \code{\link{randomForest}}, \code{\link{MDSplot}} } \examples{ data(iris) iris.rf <- randomForest(iris[,-5], iris[,5], prox=TRUE) iris.p <- classCenter(iris[,-5], iris[,5], iris.rf$prox) plot(iris[,3], iris[,4], pch=21, xlab=names(iris)[3], ylab=names(iris)[4], bg=c("red", "blue", "green")[as.numeric(factor(iris$Species))], main="Iris Data with Prototypes") points(iris.p[,3], iris.p[,4], pch=21, cex=2, bg=c("red", "blue", "green")) } \author{Andy Liaw} \keyword{classif} randomForest/inst/0000755000175100001440000000000012037254525013715 5ustar hornikusersrandomForest/inst/NEWS0000744000175100001440000004127212037254523014421 0ustar hornikusersWishlist (formerly TODO): * There are still errors in the classification mode with predictors having 32 categories. Don't use such predictors for now. I will try my best to fix it in the next version. * Implement the new scheme of handling classwt in classification.
* Allow categorical predictors with more than 32 categories. * Use more compact storage of proximity matrix. * Allow case weights by using the weights in sampling? ======================================================================== Change in 4.6-6: * In randomForest(), proximity matrix is not computed correctly if oob.prox=TRUE. (Thanks to Adele Cutler for the report!) * In rfcv(), changed the sample-splitting for regression to handle cases when there are many ties in the data. * The calculation of importanceSD for classification was missing a divisor. (Thanks to Abhishek Jaiantilal for the report.) The simulated data in unsupervised randomForest() were not in the right range. (Thanks to Abhishek Jaiantilal for the report.) * In rfImpute(), padded 1e-8 to the division by sum of proximities to avoid division by 0. * Some R code clean up to keep R CMD check happy (complete argument names). Change in 4.6-6: * Fixed yet another bug in checking for missing classes in the in-bag sample. Change in 4.6-5: * Minor bug fix for drawing samples in classification that was introduced in 4.6-4. (Thanks to Joran Elias for reporting.) Changes in 4.6-4: * Changed the error condition added in 4.6-3 to the case when there are fewer than two classes present in the in-bag sample. Changes in 4.6-3: * Fixed bugs in the tie-breaking code in various places. (Thanks to Abhishek Jaiantilal and Nathan Longbotham for the report and Abhishek for the fix.) * Throw error if some class has no data after 10 sampling attempts in classification. (Thanks to Abhishek Jaiantilal for the report.) Changes in 4.6-2: * Part of the enhancement in 4.5-37 causes segfault in R-2.12.x. That part has been reverted to the older code for the time being while the problem is investigated further. Changes in 4.6-1: * The package now includes the rfcv() function for feature selection. See the reference in the help page for details. * predict.randomForest() was not retaining names of observations in some cases. Changes in 4.5-37: * Many repeated calls to predict.randomForest() should run faster, thanks to Philip Pham and Rory Martin who pointed out some unnecessary overhead. Changes in 4.5-36: * outlier() now works when the input matrix to randomForest() has no row names. (Reported by Rau Carrio Gaspar.) * na.roughfix() is now much faster on some data (fixed based on idea from Hadley Wickham; problem reported by Mike Williamson.) * Corrected typos in the description of categorical splits in ?getTree. Changes in 4.5-35: * Fixed an error in partialPlot.randomForest(). Now the partial dependence plots for classification data should be more sensible. (Thanks to Adele Cutler for the bug report and patch.) * Re-worded part of the help pages regarding variable importance calculation. Changes in 4.5-34: * Fixed infinite loop when randomForest() is called with non-null maxnodes. * Fixed a bug in margin.default() that gave nonsensical results. Changes in 4.5-33: * Fixed a _long standing_ bug (existed since the original Fortran) in randomForest(): If importance=TRUE and proximity=TRUE, the proximity matrix returned is incorrect. Those computed with importance=FALSE, or with predict.randomForest(..., proximity=TRUE) are correct. Changes in 4.5-32: * Fixed a bug in predict.randomForest(..., predict.all=TRUE) introduced in 4.5-31. Added examples in ?predict.randomForest for the options. Changes in 4.5-31: * Added a new option `maxnodes' in randomForest() that limits the size of trees. * margin() is now generic with a method for randomForest objects. 
* Fixed the help page for getTree() about how data are split on numeric variables (`<=' instead of `<'). * Fixed predict.randomForest() so that if the randomForest object is of type "regression" and built from the formula interface and newdata contains NAs, NAs are returned in the corresponding positions (instead of being dropped altogether). Change in 4.5-30: * In regression, cases that had not been out-of-bag now gets NA as prediction (as in classification). Change in 4.5-29: * Fixed a couple of benign errors in help pages spotted by the new Rd parser. Change in 4.5-28: * randomForest() would segfault if there are 32-level factors among the predictor variables. Changes in 4.5-27: * Fixed handling of ordered factors in predict.randomForest(). Changes in 4.5-26: * Fix formula parsing so use of functions in formula won't trigger errors. * predict.randomForest() did not work when given a matrix without column names as the newdata. Changes in 4.5-25: * In regression, the out-of-bag estimate of MSE and R-squared for the first few trees (for which not all observations have been OOB yet), were computed wrong, leading to gross over-estimates of MSEs for the first few trees. This did not affect the final MSE and R-squared for the whole forest; i.e., the first few elements of the `mse' component (and thus the corresponding elements in `rsq') in the randomForest object are wrong, but others are correct. (Thanks to Ulrike Gromping for pointing out this problem, as well as the one fixed in 4.5-24.) Changes in 4.5-24: * randomForest.formula() did not exclude terms preceded by -. Changes in 4.5-23: * Fixed tuneRF() to work with R version > 2.6.1. * Make predict.randomForest() more backward compatible with randomForest objects created from versions older than 4.5-21. Changes in 4.5-22: * Allow unsupervised randomForest not to produce a proximity matrix (by specifying proximity=FALSE), suggested by Nick Crookston. Changes in 4.5-21: * The added check for factor level consistency in predictor variables in 4.5-20 was not working for predictors given in a matrix (reported by Ramon Diaz-Uriate). Changes in 4.5-20: * Fixed a memory bug in the C code when the test set is given and proximity is requested in regression. (Reported by Clayton Springer.) * Fixed the one-pass random tie-breaking algorithm in various places. * Added code to check consistency of levels for factors in the predictors, as well as allowing missing levels of factors and extraneous variables in predict(..., newdata). (Thanks to Nick Crookston for suggesting a patch.) Changes in 4.5-19: * In classification, if sampsize is small and sampling is not stratified, the actual sample might be larger than specified in some trees. Now fixed. * Fixed combine() to work on regression randomForest objects and for cases when ntree is small. * randomForest.default() for regression was unnecessarily creating a matrix of 0s for localImportance when importance=TRUE but localImp=FALSE. (Thanks to Robert McGehee for reporting these bugs.) * predict.randomForest(..., nodes=TRUE) now works for regression. Changes in 4.5-18: * Added S-PLUS 8 compatibility. Changes in 4.5-17: * Added `w' (for weights) to partialPlot.randomForest(). Changes in 4.5-16: * Fixed some typos in the documentation source files (e.g., \note vs. \notes, etc). Changes in 4.5-15: * Fixed error message call in predict.randomForest(). Changes in 4.5-14: * varImpPlot() was ignoring the `type' argument. * "<" was used instead of ".lt." in Fortran code, which is not F77-compliant. 
Changes in 4.5-13: * Fixed a bug in randomForest() when biasCorr=TRUE for regression. * Fixed bug in predict.randomForest() when newdata is a matrix with no rownames. Changes in 4.5-12: * Added the `strata' argument to randomForest, which, in conjunction with `sampsize', allow sampling (with or without replacement) according to a strata variable (which can be something other than the class variable). Currently only works in classification. Changes in 4.5-11: * Fixed partialPlot.randomForest() so that if x.var is a character, it's taken as the name of the variable. * Clean up code for importance() and varImpPlot() so that if the randomForest object only contains one importance measure, varImpPlot() will work as intended. Changes in 4.5-10: * Renamed the first argument of randomForest.formula() to `formula', to be consistent with other formula interfaces. Changes in 4.5-9: * Fixed a bug with unsupervised randomForest(..., keep.forest=TRUE). * Fixed a bug in regression that caused crash when proximiy=TRUE. * Added `keep.inbag' argument to randomForest(), which, if set to TRUE, cause randomForest() to return a matrix of indicators that indicate which case is included in the bootstrap sample to grow the trees. Changes in 4.5-8: * Added some code in predict.randomForest() so it works with randomForest objects created in older versions of the package. * Fixed randomForest.default() so that getTree() works when the forest contains only one tree. * Added the argument `labelVar' (default FALSE) to getTree() for prettier output. Changes in 4.5-7: * Fixed (another!) bug in splitting on categorical variables, especially impacting data with binary (categorical) variables. Changes in 4.5-6: * Fixed a bug introduced in 4.5-2 that used the wrong default class weights. Changes in 4.5-5: * Fixed a couple of bugs in C/Fortran code for splitting on categorical variables in classification trees, which lead to negative or Inf decrease in the Gini index. Changes in 4.5-4: * Fixed a bug in regression when there are categorical predictors. (The splits can be completely wrong!) Changes in 4.5-3: * Fixed predict.randomForest() so that it uses the class labels stored in the randomForest object (for classification). Changes in 4.5-2: * New argument `cutoff' added to predict.randomForest(). The usage is analogous to the same argument to randomForest(). * Added `palette' and `pch' arguments to MDSplot() to allow more user control. * In randomForest(), allow the forest to be returned in `unsupervised' mode. * Fixed some inaccuracies in help pages. * Fixed the way version number of the package is found at start-up. Changes in 4.5-1: * In classification, split on a categorical predictor with more than 10 categories is made more efficient: For two-class problems, the heuristic presented in Section 4.2.2 of the CART book is used. Otherwise 512 randomly sampled (not necessarily unique) splits are tested, instead of all possible splits. * New function classCenter() has been added. It takes a proximity matrix and a vector of class labels and compute one prototype per class. * Added the `Automobile' data from UCI Machine Learning Repository. * Fixed partialPlot() for categorical predictors (wrong barplot was produced). * Some re-organization and clean-up of internal C/Fortran code is on-going. Changes in 4.4-3: * Added the nPerm argument to randomForest(), which controls the number of times the out-of-bag part of each variable is permuted, per tree, for computing variable importance. (Currently only implemented for regression.) 
* When computing the out-of-bag MSE for each tree for assessing variable importance in regression, the total number of cases was wrongly used as the divisor. * Fixed the default and formula methods of randomForest(), so that the `call' component of the returned object calls the generic. * The `% increase in MSE' measure of variable importance in regression was not being computed correctly (should divide sum of squares by number of out-of-bag samples rather than total number of samples, for each tree). * Fixed a bug in na.roughfix.default() that gave warning with matrix input. Changes in 4.4-2: * Fixed two memory leaks in the regression code (introduced in 4.3-1). * Fixed a bug that sometimes caused crash in regression when nodesize is set to something larger than default (5). * Changed the tree structure in regression slightly: "treemap" is replaced by "leftDaughter" and "rightDaughter". Changes in 4.4-1: * Made slight change in regression code so that it won't split `pure' nodes. Also fixed the `increase in node purity' importance measure in regression. * The outscale option in randomForest() is removed. Use the outlier() function instead. The default outlier() method can be used with other proximity/dissimilarity measures. * More Fortran subroutines migrated to C. Changes in 4.3-3: * Fixed randomForest.formula() so that update() will work. * Fixed up problem in importance(), which was broken in a couple of ways. Changes in 4.3-2: * Fixed a bug that caused crashes in classification if test set data are supplied. Changes in 4.3-1: * Fixed bugs in sampling cases and variables without replacement. * Added the rfNews() function to display the NEWS file. Advertised in the start up banner. * (Not user-visible.) Translated regression tree building code from Fortran to C. One perhaps noticeable change is less memory usage. Changes in 4.3-0: * Thanks to Adele Cutler, there's now casewise variable importance measures in classification. Similar feature is also added for regression. Use the new localImp option in randomForest(). * The `importance' component of randomForest object has been changed: The permutation-based measures are not divided by their `standard errors'. Instead, the `standard errors' are stored in the `importanceSD' component. One should use the importance() extractor function rather than something like rf.obj$importance for extracting the importance measures. * The importance() extractor function has been updated: If the permutation-based measures are available, calling importance() with only a randomForest object returns the matrix of variable importance measures. There is the `scale' argument, which defaults to TRUE. * In predict.randomForest, there is a new argument `nodes' (default to FALSE). For classification, if nodes=TRUE, the returned object has an attribute `nodes', which is an n by ntree matrix of terminal node indicators. This is ignored for regression. Changes in 4.2-1: * There is now a package name space. Only generics are exported. * Some function names have been changed: partial.plot -> partialPlot var.imp.plot -> varImpPlot var.used -> varUsed * There is a new option `replace' in randomForest() (default to TRUE) indicating whether the sampling of cases is with or without replacement. * In randomForest(), the `sampsize' option now works for both classification and regression, and indicate the number of cases to be drawn to grow each tree. For classification, if sampsize is a vector of length the number of classes, then sampling is stratified by class. 
* With the formula interface for randomForest(), the default na.action, na.fail, is effective. I.e., an error is given if there are NAs present in the data. If na.omit is desired, it must be given explicitly. * For classification, the err.rate component of the randomForest object (and the corresponding one for test set) now is a ntree by (nclass + 1) matrix, the first column of which contains the overall error rate, and the remaining columns the class error rates. The running output now also prints class error rates. The plot method for randomForest will plot the class error rates as well. * The predict() method now checks whether the variable names in newdata match those from the training data (if the randomForest object is not created from the formula interface). * partialPlot() and varImpPlot() now have optional arguments xlab, ylab and main for more flexible labelling. Also, if a factor is given as the variable, a real bar plot is produced. * partialPlot() will now remove rows with NAs from the data frame given. * For regression, if proximity=FALSE, an n by n array of integers is erroneously allocated but not used (it's only used for proximity calculation, so not needed otherwise). * Updated combine() to conform to the new randomForest object. * na.roughfix() was not working correctly for matrices, which in turns causes problem in rfImpute(). Changes in 4.1-0: * In randomForest(), if sampsize is given, the sampling is now done without replacement, in addition to stratified by class. Therefore sampsize can not be larger than the class frequencies. * In classification randomForest, checks are added to avoid trees with only the root node. * Fixed a bug in the Fortran code for classification that caused segfault on some system when encountering a tree with only root node. * The help page for predict.randomForest() now states the fact that when newdata is not specified, the OOB predictions from the randomForest object is returned. * plot.randomForest() and print.randomForest() were not checking for existence of performance (err.rate or mse) on test data correctly. randomForest/inst/CITATION0000744000175100001440000000131412037254523015050 0ustar hornikuserscitHeader("To cite randomForest in publications use:") citEntry(entry="Article", title = "Classification and Regression by randomForest", author = personList(person(last="Liaw", first="Andy"), person(last="Wiener", first="Matthew")), journal = "R News", year = "2002", volume = "2", number = "3", pages = "18-22", url = "http://CRAN.R-project.org/doc/Rnews/", textVersion = paste("A. Liaw and M. Wiener (2002). ", "Classification and Regression by randomForest. ", "R News 2(3), 18--22.", sep="")) randomForest/DESCRIPTION0000744000175100001440000000116212037262201014435 0ustar hornikusersPackage: randomForest Title: Breiman and Cutler's random forests for classification and regression Version: 4.6-7 Date: 2012-10-16 Depends: R (>= 2.5.0), stats Suggests: RColorBrewer, MASS Author: Fortran original by Leo Breiman and Adele Cutler, R port by Andy Liaw and Matthew Wiener. Description: Classification and regression based on a forest of trees using random inputs. 
Maintainer: Andy Liaw License: GPL (>= 2) URL: http://stat-www.berkeley.edu/users/breiman/RandomForests Packaged: 2012-10-16 12:55:49 UTC; Liawand Repository: CRAN Date/Publication: 2012-10-16 13:43:29 randomForest/data/0000755000175100001440000000000012037254523013647 5ustar hornikusersrandomForest/data/imports85.rda0000744000175100001440000001376512037254523016226 0ustar hornikusers ] p\yzC1$cG6meVwI+vW2!10$)C2 3u(6d2 GJ&i 44)9ggݵ m'||{?{sBa_ f_a, rBo!, _ H4)0L4|R֤aNGibdG s?,1-Fx~>vZťِcYEu8!_c[ŧUM9;hkg?0}?C,,` 0 )AN%= Ľ Oc*6  nbf8VO#^'Mnƶ} wTb 9Pun.`aϋ]0 #pǛ#h+OVGunTUZy@{1At:9:pT ,pшc4XX'N,񎊋1suJ;Rc)YcQkj*t &Qs3KXЖgFLʌ'2)S0~Yjd*UC}ΛK^o#.~-D;Vg^/W8/9x=o߃vwKc/6[|ޓ;/uvʞO4tx[.D#OV^\C:ۯ{p=yq?raq1QGp֮d]b؏G_QGO}KK7&;/C. 4ړ%#8/8i ?ͻoWȻXUq@lq͗=Ol{O9ٛ?x~:>8qag?8?EW~6#8oi?b=NcJBt/Z8^9%*܏Ə1q"N،9s/1.u܍8 _8{cgFܲ8 b<_j9x > ;΃!zZ~=Yn/Xk۱y[_kX^'bdַ+ -zqڏdm?vvb=7saܿ=3;m4ޜ*S~;m-b|vc91T1Ǹ7|uqyx۲9f<5: W8Ey |ݺW\X7\e_u뙽ב`~ 7-wdqو vpgKv?q?@?x*f7,Go~>#~qބp P%o|~P\|kLqDv|Ӹ#o&qo*b9ƃS7S=onob8_:18㢜!1)[^q:ʫ[)^(OzegbQ̸jLNf|xpeY˾ k-kx9WzegZTc1ql'lߙ~e ] K?m@_ c^uha7ІΦo;:aC ){?; Ƕ4Qa ،3ȗ)y,탷5;/L KO{),cGr+`Z?zW]?x.>߇ x0϶<,p *l-5f|7/on_ChlA5˿P6{@{h){> :~C?*T.^l- @B3NꀸL ρ逭b  eSϴB:.@\:@TǿB?x]Gߢ=Vul翭[/oy"MC}C?wHFzLW<~ Mni4 *iVȗNh>zRgӃ|4Y m8zsaaKoѬ_| ]h'r'[i*o)殤fnd>j/͘cZ|B,%-^r-.ьq !UP\{؋ت 䕾ಣq*s up)[~~f .ٛc=a|"Pd7\_%rojp%5p)^)an_"f5H)s 1M(y'ܥV?Q ;0O\[h]P.˯,U96 '8=4)om+<Կ߃9iUzYd5ouסV[=rQij\kb\<^k֏C݊ӡNu}tRnɗ?g):>)Ɍ.{},Ο nʋ W~bۿ] V KK#8]tUwc [>8rw>b]9?~F{f ׽2xw^UO̯x :m-wU `|8>B SU{[uWόq?|+ yxb:ÙqyIUYs~_^Wt9I}rv5")nnhUu |CbquW/8kƛX}q,r~Qd'G7rn/gC~89n~^G0Ny=AhRQann\UOœ柏ˌ8bZsn~оf1^>J]W2/М}ݗ^gj5Ƚȫ4܋_]Tyb]>%^C{팹kYyĒiv][ v۰|"|-PL-{Ά5VV!x\+قՆD-53EߨC#^U^^n}t,/2ܭWq<QaۇeVn VeȤԐzV`ٹa B_N+*?Ǣmj̖!(fVS fu (ݥhǠNWTW[/,Wƴ|7Xv!V"9Vd! SZתMԸ,Ę\k],Q~ڲD\f9fձf!N2Xfò1*V1taW*+yô* 2l{!ZCKTzRtr-Wu ޱʨKt4_~R|{% 9uzJLTR jk|OU*r,:띪KM^7'`hD %O]py a8{a}!m\^m‰p6;`^-_`3~L}k)\g.8|܅/4:B_@]i5`w}Fm[ DO59em:о [@?qzB=\GƆv`NOl0΃syp]\õs\]77Ҿ:πpYп}4 ~&/0=>fc1;. C+ӛ CM|o}?_!|Z}P bzO`LZ LY}̭s_ǀߊ}l? $wcWAc!^Oװ+ ~+AgŏvZ c [~?Qg 1C+`vu1>igk M` 0V 18W |?:~( ۾ta@b #h=#v?y1v kdw<, LMGJ:p~,71 &Xnٜ/ Rò9v$僩d*.9|Yn܎L!7m5*oN&W`ƪsVH-Ǡ^friU û*JleKϞdLa-aŒ9NIR!/ ,"8rxAM˷ORt4erDgRJId!O#cWL:5[afC(gNU5C%BS"B(%bS"A$&֋THD*,RTL")!!!!!aaaaaQQQQQ11111qqqqq     IIIIIh[^&d2$adT&c2ɄLJ>'ZTj}ROI>'BR-$BR-$BR-$BR-$BR-$R-,R-,R-,R-,R-,"R-""R-""R-""R-""R-"բR-*բR-*բR-*բR-*բR-*bR-&bR-&bR-&bR-&bR-&R-.R-.R-.R-.R-.R-!R-!R-!R-!R-!ՒR-)ՒR-)ՒR-)ՒR-)ՒRM%!^{IH%!zx:lvzrandomForest/COPYING0000744000175100001440000004313112037254523013774 0ustar hornikusers GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. 
You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. 
For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) year name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (which makes passes at compilers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice This General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Library General Public License instead of this License.