==> ROSE/NAMESPACE <==
#exportPattern("^[[:alpha:]]+")
export("roc.curve", "ROSE.eval", "ROSE", "ovun.sample", "accuracy.meas")
S3method(print, accuracy.meas)
S3method(print, roc.curve)
S3method(summary, roc.curve)
S3method(print, summary.roc.curve)
S3method(print, ovun.sample)
S3method(summary, ovun.sample)
S3method(print, summary.ovun.sample)
S3method(print, ROSE)
S3method(summary, ROSE)
S3method(print, summary.ROSE)
S3method(print, ROSE.eval)
S3method(summary, ROSE.eval)
S3method(print, summary.ROSE.eval)
importFrom(utils, head, tail, packageDescription, methods)
importFrom(graphics, abline, lines, plot)
importFrom(stats, as.formula, model.frame, predict, rbinom, rnorm, sd, terms)

==> ROSE/data/hacide.rda <==
[binary R data file (hacide.train, hacide.test); contents omitted]
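The binary payload above cannot be rendered as text. As a rough, hedged sketch only (sample sizes and distributions taken from man/hacide.Rd below; this is not the shipped data set), data of the same form could be re-simulated as:

# illustrative re-simulation of hacide-like data, following man/hacide.Rd
set.seed(1)
x0 <- cbind(x1 = rnorm(980, sd = 1/2), x2 = rnorm(980))   # cls = 0
cand <- matrix(rnorm(10000), ncol = 2)                    # cls = 1 candidates
keep <- rowSums(cand^2) > 4 & cand[, 2] <= 0              # depleted half circle
x1cls <- head(cand[keep, , drop = FALSE], 20)
hacide.sim <- data.frame(cls = factor(rep(c(0, 1), c(980, 20))),
                         rbind(x0, x1cls))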
==> ROSE/man/ovun.sample.Rd <==
\name{ovun.sample}
\alias{ovun.sample}
\title{
Over-sampling, under-sampling, and combination of over- and under-sampling
}
\description{
Creates possibly balanced samples by random over-sampling of minority examples, under-sampling of majority examples, or a combination of over- and under-sampling.
}
\usage{
ovun.sample(formula, data, method="both", N, p=0.5,
            subset=options("subset")$subset,
            na.action=options("na.action")$na.action, seed)
}
\arguments{
	\item{formula}{
An object of class \code{\link{formula}} (or one that can be coerced to that class). See \code{\link{ROSE}} for information about interaction among predictors or their transformations.
}
	\item{data}{
An optional data frame, list or environment (or object coercible to a data frame by \code{as.data.frame}) in which to preferentially interpret ``formula''. If not specified, the variables are taken from ``environment(formula)''.
}
	\item{method}{
One among \code{c("over", "under", "both")} to perform over-sampling of minority examples, under-sampling of majority examples, or a combination of over- and under-sampling, respectively.
}
	\item{N}{
The desired sample size of the resulting data set. If missing and \code{method} is either \code{"over"} or \code{"under"}, the sample size is determined by oversampling or, respectively, undersampling examples so that the minority class occurs approximately in proportion \code{p}. When \code{method = "both"} the default value is given by the length of vectors specified in \code{formula}.
}
	\item{p}{
The probability of resampling from the rare class. If missing and \code{method} is either \code{"over"} or \code{"under"}, this proportion is determined by oversampling or, respectively, undersampling examples so that the sample size is equal to \code{N}. When \code{method = "both"} the default value is 0.5.
}
	\item{subset}{
An optional vector specifying a subset of observations to be used in the sampling process. The default is set by the \code{\link{subset}} setting of \code{\link{options}}.
}
	\item{na.action}{
A function which indicates what should happen when the data contain 'NA's. The default is set by the \code{\link{na.action}} setting of \code{\link{options}}.
}
	\item{seed}{
A single value, interpreted as an integer, recommended to specify seeds and keep track of the sample.
}
}
\value{
The value is an object of class \code{ovun.sample} which has components
\item{Call}{The matched call.}
\item{method}{The method used to balance the sample. Possible choices are \cr \code{c("over", "under", "both")}.}
\item{data}{The resulting new data set.}
}
\seealso{
\code{\link{ROSE}}.
}
\examples{
# 2-dimensional example
# loading data
data(hacide)

# imbalance on training set
table(hacide.train$cls)

# balanced data set with both over and under sampling
data.balanced.ou <- ovun.sample(cls~., data=hacide.train,
                                N=nrow(hacide.train), p=0.5,
                                seed=1, method="both")$data
table(data.balanced.ou$cls)

# balanced data set with over-sampling
data.balanced.over <- ovun.sample(cls~., data=hacide.train,
                                  p=0.5, seed=1,
                                  method="over")$data
table(data.balanced.over$cls)
}

==> ROSE/man/accuracy.meas.Rd <==
\name{accuracy.meas}
\alias{accuracy.meas}
\title{
Metrics to evaluate the accuracy of a classifier in imbalanced learning
}
\description{
This function computes precision, recall and the F measure of a prediction.
}
\usage{
accuracy.meas(response, predicted, threshold = 0.5)
}
\arguments{
	\item{response}{
A vector of responses containing two classes to be used to evaluate prediction accuracy. It can be of class \code{"factor"}, \code{"numeric"} or \code{"character"}.
}
	\item{predicted}{
A vector containing a prediction for each observation. This can be of class \code{"factor"} or \code{"character"} if the predicted label classes are provided or \code{"numeric"} for the probabilities of the rare class (or a monotonic function of them).
}
	\item{threshold}{
When \code{predicted} is of class \code{numeric}, it defines the probability threshold to classify an example as positive. The default value is meant for predicted probabilities and is set to 0.5. See further details below. Ignored if \code{predicted} is of class \code{factor}.}
}
\details{
Prediction of positive or negative labels depends on the classification threshold, here defined as the value such that observations with predicted value greater than the threshold are assigned to the positive class. Some caution is due in setting the threshold, as well as in using the default setting, both because the default value is meant for predicted probabilities and because the default 0.5 is not necessarily the optimal choice for imbalanced learning. Smaller values of the threshold correspond to assigning a larger misclassification cost to the rare class, which is usually the case.
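
As a minimal numerical illustration (the predicted probabilities below are hypothetical), lowering the threshold labels more examples as positive, which typically raises recall at the expense of precision:
\preformatted{
resp <- c(0, 0, 0, 1, 1)
pred <- c(0.10, 0.30, 0.45, 0.40, 0.70)
accuracy.meas(resp, pred, threshold = 0.5)   # precision 1.00, recall 0.50
accuracy.meas(resp, pred, threshold = 0.35)  # precision 0.67, recall 1.00
}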
Precision is defined as follows:
\deqn{\frac{\mbox{true positives}}{\mbox{true positives + false positives}}}

Recall is defined as:
\deqn{\frac{\mbox{true positives}}{\mbox{true positives + false negatives}}}

The F measure is the harmonic mean of precision and recall:
\deqn{2 \cdot \frac{\mbox{precision} \cdot \mbox{recall}}{\mbox{precision+recall}}}
}
\value{
The value is an object of class \code{accuracy.meas} which has components
\item{Call}{The matched call.}
\item{threshold}{The selected threshold.}
\item{precision}{A vector of length one giving the precision of the prediction.}
\item{recall}{A vector of length one giving the recall of the prediction.}
\item{F}{A vector of length one giving the F measure.}
}
\references{
Fawcett, T. (2006). An introduction to ROC analysis. \emph{Pattern Recognition Letters}, 27 (8), 861--875.
}
\seealso{
\code{\link{roc.curve}}
}
\examples{
# 2-dimensional example
# loading data
data(hacide)

# imbalance on training set
table(hacide.train$cls)

# model estimation using logistic regression
fit.hacide  <- glm(cls~., data=hacide.train, family="binomial")

# prediction on training set
pred.hacide.train <- predict(fit.hacide, newdata=hacide.train,
                             type="response")

# compute accuracy measures (training set)
accuracy.meas(hacide.train$cls, pred.hacide.train, threshold = 0.02)

# imbalance on test set
table(hacide.test$cls)

# prediction on test set
pred.hacide.test <- predict(fit.hacide, newdata=hacide.test,
                            type="response")

# compute accuracy measures (test set)
accuracy.meas(hacide.test$cls, pred.hacide.test, threshold = 0.02)
}
\keyword{ supervised classification }

==> ROSE/man/ROSE.eval.Rd <==
\name{ROSE.eval}
\alias{ROSE.eval}
\title{
Evaluation of learner accuracy by ROSE
}
\description{
Given a classifier and a set of data, this function exploits ROSE generation of synthetic samples to provide holdout, bootstrap or leave-K-out cross-validation estimates of a specified accuracy measure.
}
\usage{
ROSE.eval(formula, data, learner, acc.measure="auc", extr.pred=NULL,
          method.assess="holdout", K=1, B=100, control.rose=list(),
          control.learner=list(), control.predict=list(),
          control.accuracy=list(), trace=FALSE,
          subset=options("subset")$subset,
          na.action=options("na.action")$na.action, seed)}
\arguments{
	\item{formula}{
An object of class \code{\link{formula}} (or one that can be coerced to that class). The specification of the formula must be suited for the selected classifier. See \code{\link{ROSE}} and the ``Note'' below for information about interaction among predictors or their transformations.}
	\item{data}{
An optional data frame, list or environment (or object coercible to a data frame by \code{as.data.frame}) in which to preferentially interpret ``formula''. If not specified, the variables are taken from ``environment(formula)''.
}
	\item{learner}{
Either a built-in \pkg{R} function or a user-defined function that fits a classifier and returns a vector of predicted values. See ``Details'' below.
}
	\item{acc.measure}{
One among \code{c("auc", "precision", "recall", "F")}, it defines the accuracy measure to be estimated. Function \code{\link{roc.curve}} is internally called when \code{acc.measure="auc"}, while the other options entail an internal call of function \cr \code{\link{accuracy.meas}}. Default value is \code{"auc"}.
}
	\item{extr.pred}{
An optional function that extracts from the output of a \code{predict} function the vector of predicted values.
If not specified, the value returned by ``predict'' is used. See Examples below.
}
	\item{method.assess}{
One among \code{c("holdout", "LKOCV", "BOOT")}, it is the method used for model assessment. When \code{"holdout"} is chosen, the learner is fitted on one ROSE sample and tested on the data provided in \code{formula}. \code{"LKOCV"} stands for ``leave-K-out cross-validation'': the original data set is divided into \eqn{Q} subsets of \code{K} observations; at each round, the specified learner is estimated on a ROSE sample built on all the provided data but one of these groups, and a prediction on the excluded set of observations is then made. At the end of the process, the \eqn{Q} distinct predictions are deployed to compute the selected accuracy measure. \code{"BOOT"} estimates the accuracy measure by fitting a learner on \code{B} ROSE samples and testing each of them on the provided data.
}
	\item{K}{
An integer value indicating the size of the subsets created when \cr\code{method.assess="LKOCV"}. If \code{K} is not a multiple of the sample size \eqn{n}, then \eqn{Q-1} sets of size \code{K} are created and the remaining \eqn{n-(Q-1)K} observations are used to form the last subset. Default value is 1, i.e. leave-1-out cross-validation is performed.
}
	\item{B}{
The number of bootstrap replications to set when \code{method.assess="BOOT"}. Ignored otherwise. Default value is 100.
}
	\item{control.learner}{
Further arguments to be passed to \code{learner}.}
	\item{control.rose}{
Optional arguments to be passed to \code{\link{ROSE}}.
}
	\item{control.predict}{
Further arguments to be passed to \code{\link{predict}}.
}
	\item{control.accuracy}{
Optional arguments to be passed to either \code{\link{roc.curve}} or \code{\link{accuracy.meas}} depending on the selected accuracy measure.
}
	\item{trace}{
Logical, if \code{TRUE} traces information on the progress of model assessment (number of bootstrap or cross-validation iterations performed).
}
	\item{subset}{
An optional vector specifying a subset of observations to be used in the sampling and learning process. The default is set by the \code{\link{subset}} setting of \code{\link{options}}.
}
	\item{na.action}{
A function which indicates what should happen when the data contain 'NA's. The default is set by the \code{\link{na.action}} setting of \code{\link{options}}.
}
	\item{seed}{
A single value, interpreted as an integer, recommended to specify seeds and keep track of the generated ROSE sample(s).
}
}
\details{
This function estimates a measure of accuracy of a classifier specified by the user by using either holdout, cross-validation, or bootstrap estimators. Operationally, the classifier is trained over synthetic data generated by ROSE and then evaluated on the original data.

Whatever accuracy measure and estimator are chosen, the \emph{true} accuracy depends on the probability distribution underlying the training data. This is clearly affected by the imbalance, and its estimation is then regulated by argument \code{control.rose}. The default setting of the arguments (that is, \code{p=0.5}) entails the estimation of the learner accuracy conditional to a balanced training set. In order to estimate the accuracy of a learner fitted on unbalanced data, the user may set argument \code{p} of \code{control.rose} to the proportion of positive examples in the observed sample. See Example 2 below and, for further details, Menardi and Torelli (2014).
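
For instance, a minimal sketch of the latter use (assuming a data frame \code{dat} with class label \code{cls}, as in the examples below):
\preformatted{
p.obs <- mean(dat$cls == 1)   # observed proportion of positives
ROSE.eval(cls ~ ., data = dat, glm,
          control.learner = list(family = binomial),
          control.rose = list(p = p.obs), seed = 1)
}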
For the sake of greater flexibility, \code{ROSE.eval} is not linked to the use of a specific learner and works with virtually any classifier. The current implementation supports the following two types of \code{learner}.

In the first case, \code{learner} has a 'standard' behavior in the sense that it is a function having \code{\link{formula}} as a mandatory argument and returns an object whose class is associated with a \code{\link{predict}} method. A user willing to define her/his own \code{learner} must follow the implicit convention that when a classed object is created, the function name and the class should match (such as \code{lm}, \code{glm}, \code{rpart}, \code{tree}, \code{nnet}, \code{lda}, etc). Furthermore, since \code{predict} returns are very heterogeneous, the user is allowed to define some function \code{extr.pred} which extracts from the output of \code{predict} the desired vector of predicted values.

In the second case, \code{learner} is a wrapper that allows to embed functions that do not meet the aforementioned requirements. The wrapper must have the following mandatory arguments: \code{data} and \code{newdata}, and must return a vector of predicted values. Optional arguments can be passed into the wrapper as well, by including the \code{...} and by specifying them through \code{control.learner}. When argument \code{data} in \code{ROSE.eval} is not missing, \code{data} in \code{learner} receives a data frame structured as the one in input, otherwise it is constructed according to the template provided by \code{formula}. The same rule applies for argument \code{newdata} with the exception that the class label variable is dropped. See ``Examples'' below.
}
\value{
The value is an object of class \code{ROSE.eval} which has components
\item{Call}{The matched call.}
\item{method}{The selected method for model assessment.}
\item{measure}{The selected measure to evaluate accuracy.}
\item{acc }{The vector of the estimated measure of accuracy. It has length \eqn{1} if \cr \code{method.assess="holdout"} or \code{method.assess="LKOCV"}, and length \code{B} if \code{method.assess="BOOT"}, corresponding to the bootstrap distribution of the accuracy estimator.}
}
\references{
Lunardon, N., Menardi, G., and Torelli, N. (2014). ROSE: a Package for Binary Imbalanced Learning. \emph{R Journal}, 6:82--92.

Menardi, G. and Torelli, N. (2014). Training and assessing classification rules with imbalanced data. \emph{Data Mining and Knowledge Discovery}, 28:92--122.
}
\section{Note}{
The function allows the user to include in the formula transformations of predictors or interactions among them. ROSE samples are generated on the original data and transformations or interactions are ignored. These are then retrieved in fitting the classifier, provided that the selected learner function can handle them. See also ``Warning'' in \code{\link{ROSE}}.
}
\seealso{
\code{\link{ROSE}}, \code{\link{roc.curve}}, \code{\link{accuracy.meas}}.
}
\examples{
# 2-dimensional data
# loading data
data(hacide)

# in the following examples
# use of a small subset of observations only --> argument subset
dat <- hacide.train
table(dat$cls)

##Example 1
# classification with logit model
# arguments to glm are passed through control.learner
# leave-one-out cross-validation estimate of auc of classifier
# trained on balanced data
ROSE.eval(cls~., data=dat, glm, subset=c(1:50, 981:1000),
          method.assess="LKOCV", K=5,
          control.learner=list(family=binomial), seed=1)

\dontrun{
##Example 2
# classification with decision tree
# require package rpart
library(rpart)
# function is needed to extract predicted probability of cls 1
f.pred.rpart <- function(x) x[,2]

# holdout estimate of auc of two classifiers
# first classifier trained on ROSE unbalanced sample
# proportion of rare events in original data
p <- (table(dat$cls)/sum(table(dat$cls)))[2]
ROSE.eval(cls~., data=dat, rpart, subset=c(1:50, 981:1000),
          control.rose=list(p = p), extr.pred=f.pred.rpart, seed=1)

# second classifier trained on ROSE balanced sample
# optional arguments to plot the roc.curve are passed through
# control.accuracy
ROSE.eval(cls~., data=dat, rpart, subset=c(1:50, 981:1000),
          control.rose=list(p = 0.5),
          control.accuracy = list(add.roc = TRUE, col = 2),
          extr.pred=f.pred.rpart, seed=1)

##Example 3
# classification with linear discriminant analysis
library(MASS)
# function is needed to extract the predicted values from predict.lda
f.pred.lda <- function(z) z$posterior[,2]

# bootstrap estimate of precision of learner trained on balanced data
prec.distr <- ROSE.eval(cls~., data=dat, lda, subset=c(1:50, 981:1000),
                        extr.pred=f.pred.lda, acc.measure="precision",
                        method.assess="BOOT", B=100, trace=TRUE)
summary(prec.distr)

##Example 4
# compare auc of classification with neural network
# with auc of classification with tree
# require package nnet
# require package tree
library(nnet)
library(tree)

# optional arguments to nnet are passed through control.learner
ROSE.eval(cls~., data=dat, nnet, subset=c(1:50, 981:1000),
          method.assess="holdout", control.learn=list(size=1), seed=1)

# optional arguments to plot the roc.curve are passed through
# control.accuracy
# a function is needed to extract predicted probability of class 1
f.pred.tree <- function(x) x[,2]
ROSE.eval(cls~., data=dat, tree, subset=c(1:50, 981:1000),
          method.assess="holdout", extr.pred=f.pred.tree,
          control.acc=list(add=TRUE, col=2), seed=1)

##Example 5
# A user-defined learner with a standard behavior
# Consider a dummy example for illustrative purposes only
# Note that the function name and the name of the class returned match
DummyStump <- function(formula, ...)
{
	mc <- match.call()
	m <- match(c("formula", "data", "na.action", "subset"), names(mc), 0L)
	mf <- mc[c(1L, m)]
	mf[[1L]] <- as.name("model.frame")
	mf <- eval(mf, parent.frame())
	data.st <- data.frame(mf)
	out <- list(colname=colnames(data.st)[2], threshold=1)
	class(out) <- "DummyStump"
	out
}

# Associate to DummyStump a predict method
# Usual S3 definition: predict.classname
predict.DummyStump <- function(object, newdata)
{
	out <- newdata[,object$colname]>object$threshold
	out
}

ROSE.eval(formula=cls~., data=dat, learner=DummyStump,
          subset=c(1:50, 981:1000), method.assess="holdout", seed=3)

##Example 6
# The use of the wrapper for a function with non-standard behaviour
# Consider knn in package class
# require package class
library(class)

# the wrapper requires two mandatory arguments: data, newdata.
# optional arguments can be passed by including the object '...'
# note that we are going to specify data=data in ROSE.eval
# therefore data in knn.wrap will receive a data set structured
# as dat, as well as newdata but with the class label variable dropped
# note that inside the wrapper we supply to knn
# the needed quantities accordingly
knn.wrap <- function(data, newdata, ...)
{
	knn(train=data[,-1], test=newdata, cl=data[,1], ...)
}

# optional arguments to knn.wrap may be specified in control.learner
ROSE.eval(formula=cls~., data=dat, learner=knn.wrap,
          subset=c(1:50, 981:1000), method.assess="holdout",
          control.learner=list(k=2, prob=T), seed=1)

# if we swap the columns of dat we have to change the wrapper accordingly
dat <- dat[,c("x1","x2","cls")]
# now the class label variable is the last one
knn.wrap <- function(data, newdata, ...)
{
	knn(train=data[,-3], test=newdata, cl=data[,3], ...)
}

ROSE.eval(formula=cls~., data=dat, learner=knn.wrap,
          subset=c(1:50, 981:1000), method.assess="holdout",
          control.learner=list(k=2, prob=T), seed=1)
}
}
\keyword{ bootstrap }

==> ROSE/man/roc.ROSE.Rd <==
\name{roc.curve}
\alias{roc.curve}
\title{
ROC curve
}
\description{
This function returns the ROC curve and computes the area under the curve (AUC) for binary classifiers.
}
\usage{
roc.curve(response, predicted, plotit = TRUE, add.roc = FALSE,
          n.thresholds=100, ...)
}
\arguments{
	\item{response}{A vector of responses containing two classes to be used to compute the ROC curve. It can be of class \code{"factor"}, \code{"numeric"} or \code{"character"}.}
	\item{predicted}{A vector containing a prediction for each observation. This can be of class \code{"factor"} or \code{"character"} if the predicted label classes are provided or \code{"numeric"} for the probabilities of the rare class (or a monotonic function of them).}
	\item{plotit}{Logical, if \code{TRUE} the ROC curve is plotted in a new window. Default value is set to \code{TRUE}.}
	\item{add.roc}{Logical, if \code{TRUE} the ROC curve is added to an existing window. Default value is set to \code{FALSE}.}
	\item{n.thresholds}{Number of \code{thresholds} at which the ROC curve is computed. Default value is the minimum between 100 and the number of elements in \code{response}. A value of \code{n.thresholds} greater than the length of \code{response} is ignored.}
	\item{\dots}{Further arguments to be passed either to \code{plot} or \code{lines}.}
}
\value{
The value is an object of class \code{roc.curve} which has components
\item{Call}{The matched call.}
\item{auc}{The value of the area under the ROC curve.}\cr
\item{false positive rate}{The false positive rate (or equivalently the complement of specificity) of the classifier at the evaluated \code{thresholds}.}
\item{true positive rate}{The true positive rate (or equivalently the sensitivity) of the classifier at the evaluated \code{thresholds}.}
\item{thresholds}{Thresholds at which the ROC curve is evaluated.}
}
\references{
Fawcett, T. (2006). An introduction to ROC analysis. \emph{Pattern Recognition Letters}, 27 (8), 861--875.
}
\seealso{
\code{\link{accuracy.meas}}, \code{\link[pROC:roc]{roc}}.
}
\examples{
# 2-dimensional example
# loading data
data(hacide)

# check imbalance on training set
table(hacide.train$cls)

# model estimation using logistic regression
fit.hacide  <- glm(cls~., data=hacide.train, family="binomial")

# prediction on training set
pred.hacide.train <- predict(fit.hacide, newdata=hacide.train)

# plot the ROC curve (training set)
roc.curve(hacide.train$cls, pred.hacide.train,
          main="ROC curve \n (Half circle depleted data)")

# check imbalance on test set
table(hacide.test$cls)

# prediction using test set
pred.hacide.test <- predict(fit.hacide, newdata=hacide.test)

# add the ROC curve (test set)
roc.curve(hacide.test$cls, pred.hacide.test, add=TRUE, col=2,
          lwd=2, lty=2)
legend("topleft", c("Resubstitution estimate", "Holdout estimate"),
       col=1:2, lty=1:2, lwd=2)
}
\keyword{ supervised classification }

==> ROSE/man/ROSE.Rd <==
\name{ROSE}
\alias{ROSE}
\title{
Generation of synthetic data by Randomly Over Sampling Examples (ROSE)
}
\description{
Creates a sample of synthetic data by enlarging the feature space of minority and majority class examples. Operationally, the new examples are drawn from a conditional kernel density estimate of the two classes, as described in Menardi and Torelli (2014).
}
\usage{
ROSE(formula, data, N, p=0.5, hmult.majo=1, hmult.mino=1,
     subset=options("subset")$subset,
     na.action=options("na.action")$na.action, seed)
}
\arguments{
	\item{formula}{
An object of class \code{\link{formula}} (or one that can be coerced to that class). The left-hand-side (response) should be a vector specifying the class labels. The right-hand-side should be a series of vectors with the predictors. See ``Warning'' for information about interaction among predictors or their transformations.
}
	\item{data}{
An optional data frame, list or environment (or object coercible to a data frame by \code{as.data.frame}) in which to preferentially interpret ``formula''. If not specified, the variables are taken from ``environment(formula)''.
}
	\item{N}{
The desired sample size of the resulting data set generated by ROSE. If missing, it is set equal to the length of the response variable in \code{formula}.
}
	\item{p}{
The probability of the minority class examples in the resulting data set generated by ROSE.
}
	\item{hmult.majo}{
Optional shrink factor to be multiplied by the smoothing parameters to estimate the conditional kernel density of the majority class. See ``References'' and ``Details''.
}
	\item{hmult.mino}{
Optional shrink factor to be multiplied by the smoothing parameters to estimate the conditional kernel density of the minority class. See ``References'' and ``Details''.
}
	\item{subset}{
An optional vector specifying a subset of observations to be used in the sampling process. The default is set by the \code{\link{subset}} setting of \code{\link{options}}.
}
	\item{na.action}{
A function which indicates what should happen when the data contain 'NA's. The default is set by the \code{\link{na.action}} setting of \code{\link{options}}.
}
	\item{seed}{
A single value, interpreted as an integer, recommended to specify seeds and keep track of the generated sample.
}
}
\details{
ROSE (Random Over-Sampling Examples) aids the task of binary classification in the presence of rare classes. It produces a synthetic, possibly balanced, sample of data simulated according to a smoothed-bootstrap approach.
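
As an intuition for the mechanism formalized below, each synthetic example is drawn from a kernel centred at a randomly resampled observation. A one-dimensional sketch of such a smoothed bootstrap (illustrative only, not the internal implementation):
\preformatted{
x <- rnorm(100)                 # an observed feature
h <- bw.nrd0(x)                 # a normal-reference bandwidth
idx <- sample(length(x), 200, replace = TRUE)
x.syn <- rnorm(200, mean = x[idx], sd = h)   # synthetic examples
}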
Denoted by \eqn{y} the binary response and by \eqn{x} a vector of numeric predictors observed on \eqn{n} subjects \eqn{i,} (\eqn{i=1, \ldots, n}), synthetic examples with class label \eqn{k, (k=0, 1)} are generated from a kernel estimate of the conditional density \eqn{f(x|y = k)}. The kernel is a Normal product function centered at each of the \eqn{x_i} with diagonal covariance matrix \eqn{H_k}. Here, \eqn{H_k} is the asymptotically optimal smoothing matrix under the assumption of multivariate normality. See ``References'' below and further references therein.

Essentially, ROSE selects an observation belonging to the class \eqn{k} and generates new examples in its neighbourhood, where the width of the neighbourhood is determined by \eqn{H_k}. The user is allowed to shrink \eqn{H_k} by varying arguments \code{hmult.majo} and \code{hmult.mino}. The degree of balancing is regulated by argument \code{p}, i.e. the probability of generating examples from class \eqn{k=1}.

As they stand, kernel-based methods may be applied to continuous data only. However, as ROSE includes the combination of over- and under-sampling as a special case when \eqn{H_k} tends to zero, the assumption of continuity may be circumvented by using a degenerate kernel distribution to draw synthetic categorical examples. Basically, if the \eqn{j-}th component of \eqn{x_i} is categorical, a synthetic clone of \eqn{x_i} will have as \eqn{j-}th component the same value as the \eqn{j-}th component of \eqn{x_i}.
}
\value{
The value is an object of class \code{ROSE} which has components
\item{Call}{The matched call.}
\item{method}{The method used to balance the sample. The only possible choice is \cr \code{ROSE}.}
\item{data}{An object of class \code{data.frame} containing new examples generated by ROSE.}
}
\references{
Lunardon, N., Menardi, G., and Torelli, N. (2014). ROSE: a Package for Binary Imbalanced Learning. \emph{R Journal}, 6:82--92.

Menardi, G. and Torelli, N. (2014). Training and assessing classification rules with imbalanced data. \emph{Data Mining and Knowledge Discovery}, 28:92--122.
}
\section{Warning}{
The purpose of \code{ROSE} is to generate new synthetic examples in the feature space. The use of \code{formula} is intended solely to distinguish the response variable from the predictors. Hence, \code{formula} must not be confused with the one supplied to fit a classifier, in which the specification of either transformations or interactions among variables may be sensible/necessary. In the current version \code{ROSE} discards possible interactions and transformations of predictors specified in \code{formula} automatically. The automatic parsing of \code{formula} is able to manage virtually all cases on which it has been tested, but the user is warned to use caution in the specification of entangled functions of predictors. Any report about possible malfunctioning of the parsing mechanism is welcome.
}
\seealso{
\code{\link{ovun.sample}}, \code{\link{ROSE.eval}}.
}
\examples{
# 2-dimensional example
# loading data
data(hacide)

# imbalance on training set
table(hacide.train$cls)
# imbalance on test set
table(hacide.test$cls)

# plot unbalanced data highlighting the majority and
# minority class examples.
par(mfrow=c(1,2))
plot(hacide.train[, 2:3], main="Unbalanced data", xlim=c(-4,4),
     ylim=c(-4,4), col=as.numeric(hacide.train$cls), pch=20)
legend("topleft", c("Majority class","Minority class"), pch=20, col=1:2)

# model estimation using logistic regression
fit <- glm(cls~., data=hacide.train, family="binomial")
# prediction using test set
pred <- predict(fit, newdata=hacide.test)
roc.curve(hacide.test$cls, pred,
          main="ROC curve \n (Half circle depleted data)")

# generating data according to ROSE: p=0.5 as default
data.rose <- ROSE(cls~., data=hacide.train, seed=3)$data
table(data.rose$cls)

par(mfrow=c(1,2))
# plot new data generated by ROSE highlighting the
# majority and minority class examples.
plot(data.rose[, 2:3], main="Balanced data by ROSE",
     xlim=c(-6,6), ylim=c(-6,6), col=as.numeric(data.rose$cls), pch=20)
legend("topleft", c("Majority class","Minority class"), pch=20, col=1:2)

fit.rose <- glm(cls~., data=data.rose, family="binomial")
pred.rose <- predict(fit.rose, data=data.rose, type="response")
roc.curve(data.rose$cls, pred.rose,
          main="ROC curve \n (Half circle depleted data balanced by ROSE)")
par(mfrow=c(1,1))
}
\keyword{ supervised classification }
\keyword{ imbalanced classes }
\keyword{ bootstrap }

==> ROSE/man/hacide.Rd <==
\encoding{UTF-8}
\name{hacide}
\alias{hacide.train}
\alias{hacide.test}
\title{Half circle depleted data}
\description{
Simulated training and test set for imbalanced binary classification. The rare class may be described as a depleted half circle, filled in by the prevalent class, which is normally distributed and has elliptical contours.
}
\usage{data(hacide)}
\format{
Data represent 2 real features (denoted as \code{x1, x2}) and a binary label class (denoted as \code{cls}). Positive examples occur in about 2\% of the data.
\describe{
\item{\code{hacide.train}}{Includes 1000 rows and 20 positive examples.}
\item{\code{hacide.test}}{Includes 250 rows and 5 positive examples.}
}
Data have been simulated as follows:
\describe{
\item{-}{if \code{cls} = 0 then \code{(x1, x2)}\eqn{\sim \mathbf{N}_{2} \left(\mathbf{0}_{2}, (1/4, 1) \mathbf{I}_{2}\right)}}
\item{-}{if \code{cls} = 1 then \code{(x1, x2)}\eqn{\sim \mathbf{N}_{2} \left(\mathbf{0}_{2}, \mathbf{I}_{2}\right) \cap \left\|\mathbf{x}\right\|^2>4 \cap x_2 \leq 0}}
}
}
\references{
Lunardon, N., Menardi, G., and Torelli, N. (2014). ROSE: a Package for Binary Imbalanced Learning. \emph{R Journal}, 6:82--92.

Menardi, G. and Torelli, N. (2014). Training and assessing classification rules with imbalanced data. \emph{Data Mining and Knowledge Discovery}, 28:92--122.
}
\examples{
data(hacide)
summary(hacide.train)
summary(hacide.test)
}
\keyword{datasets}
==> ROSE/man/ROSE-package.Rd <==
\name{ROSE-package}
\alias{ROSE-package}
\alias{ROSEpack}
\docType{package}
\title{
ROSE: Random Over-Sampling Examples}
\description{
Functions to deal with binary classification problems in the presence of imbalanced classes. Synthetic balanced samples are generated according to ROSE (Menardi and Torelli, 2014). Functions that implement more traditional remedies to the class imbalance are also provided, as well as different metrics to evaluate a learner accuracy. These are estimated by holdout, bootstrap or cross-validation methods.
}
\details{
The package pivots on function \code{\link{ROSE}} which generates synthetic balanced samples and thus allows one to strengthen the subsequent estimation of any binary classifier. ROSE (Random Over-Sampling Examples) is a bootstrap-based technique which aids the task of binary classification in the presence of rare classes. It handles both continuous and categorical data by generating synthetic examples from a conditional density estimate of the two classes.

Different metrics to evaluate a learner accuracy are supplied by functions \code{\link{roc.curve}} and \code{\link{accuracy.meas}}.

Holdout, bootstrap or cross-validation estimators of these accuracy metrics are computed by means of ROSE and provided by function \code{\link{ROSE.eval}}, to be used in conjunction with virtually any binary classifier. Additionally, function \code{\link{ovun.sample}} implements more traditional remedies to the class imbalance, such as over-sampling the minority class, under-sampling the majority class, or a combination of over- and under-sampling.
}
\author{
Nicola Lunardon, Giovanna Menardi, Nicola Torelli

Maintainer: Nicola Lunardon
}
\references{
Lunardon, N., Menardi, G., and Torelli, N. (2014). ROSE: a Package for Binary Imbalanced Learning. \emph{R Journal}, 6:82--92.

Menardi, G. and Torelli, N. (2014). Training and assessing classification rules with imbalanced data. \emph{Data Mining and Knowledge Discovery}, 28:92--122.
}
\keyword{ package }
\keyword{ machine learning }
\keyword{ imbalanced data }
\seealso{
\code{\link[nnet:nnet]{nnet}}, \code{\link[rpart:rpart]{rpart}}
}
\examples{
# loading data
data(hacide)

# check imbalance
table(hacide.train$cls)

# train logistic regression on imbalanced data
log.reg.imb <- glm(cls ~ ., data=hacide.train, family=binomial)

# use the trained model to predict test data
pred.log.reg.imb <- predict(log.reg.imb, newdata=hacide.test,
                            type="response")

# generate new balanced data by ROSE
hacide.rose <- ROSE(cls ~ ., data=hacide.train, seed=123)$data

# check (im)balance of new data
table(hacide.rose$cls)

# train logistic regression on balanced data
log.reg.bal <- glm(cls ~ ., data=hacide.rose, family=binomial)

# use the trained model to predict test data
pred.log.reg.bal <- predict(log.reg.bal, newdata=hacide.test,
                            type="response")

# check accuracy of the two learners by measuring auc
roc.curve(hacide.test$cls, pred.log.reg.imb)
roc.curve(hacide.test$cls, pred.log.reg.bal, add.roc=TRUE, col=2)

# determine bootstrap distribution of the AUC of logit models
# trained on ROSE balanced samples
# B has been reduced from 100 to 10 solely to save time
boot.auc.bal <- ROSE.eval(cls ~ ., data=hacide.train, learner= glm,
                          method.assess = "BOOT",
                          control.learner=list(family=binomial),
                          trace=TRUE, B=10)
summary(boot.auc.bal)
}

==> ROSE/DESCRIPTION <==
Package: ROSE
Type: Package
Title: Random Over-Sampling Examples
Version: 0.0-4
Date: 2021-06-14
Author: Nicola Lunardon, Giovanna Menardi, Nicola Torelli
Maintainer: Nicola Lunardon
Suggests: MASS, nnet, rpart, tree
Description: Functions to deal with binary classification problems in the
        presence of imbalanced classes. Synthetic balanced samples are
        generated according to ROSE (Menardi and Torelli, 2014). Functions
        that implement more traditional remedies to the class imbalance
        are also provided, as well as different metrics to evaluate a
        learner accuracy. These are estimated by holdout, bootstrap or
        cross-validation methods.
License: GPL-2
Packaged: 2021-06-14 07:29:48 UTC; nicola
NeedsCompilation: no
Repository: CRAN
Date/Publication: 2021-06-14 08:10:09 UTC

==> ROSE/R/estimation_funcs.R <==
#Last modified on 01/28/2014

##Accuracy measures
accuracy.meas <- function (response, predicted, threshold = 0.5)
{
	### checks
	if(length(response) != length(predicted))
		stop("Response and predicted must have the same length.\n")
	if(length(labels <- levels(factor(response))) != 2)
		stop("Response must have two levels.\n")
	if(is.factor(predicted) | is.character(predicted))
	{
		if(length(levels(factor(predicted))) != 2)
			stop("predicted must have two levels.\n")
		predicted <- as.numeric(predicted)
	}
	###
	splitted <- split(predicted, response)
	negatives <- splitted[[as.character(labels[1])]]
	n.negatives <- length(negatives)
	positives <- splitted[[as.character(labels[2])]]
	n.positives <- length(positives)

	TP <- sum(positives >= threshold)
	FP <- sum(negatives >= threshold)
	TN <- sum(negatives < threshold)
	FN <- sum(positives < threshold)

	PRECISION <- TP/(TP+FP)
	RECALL <- TP/(TP+FN)
	# F measure as documented: harmonic mean of precision and recall
	F <- 2*RECALL*PRECISION/(RECALL+PRECISION)

	out <- list(Call=match.call(), threshold=threshold, precision= PRECISION, recall = RECALL, F=F)

	class(out) <- "accuracy.meas"
	out
}

####print method for accuracy measures
print.accuracy.meas <- function(x, ...)
{
	cat("\n")
	cat("Call: \n")
	print(x$Call)
	cat("\n")
	cat("Examples are labelled as positive when predicted is greater than", x$threshold,"\n")
	cat("\n")
	cat( paste("precision: ", sprintf("%.3f",x$precision),"\n", sep="") )
	cat( paste("recall: ", sprintf("%.3f",x$recall),"\n", sep="") )
	cat( paste("F: ", sprintf("%.3f",x$F),"\n", sep="") )
}

######################################################################
##ROC curve and related internal functions
######################################################################
##Roc curve
roc.curve <- function(response, predicted, plotit=TRUE, add.roc=FALSE, n.thresholds=100, ...)
{
	### checks
	if( length(response)!=length(predicted) )
		stop("Response and predicted must have the same length.\n")
	if( length( labels <- levels( factor(response) ) ) != 2 )
		stop("Response must have two levels.\n")
	if( is.factor(predicted) | is.character(predicted) )
	{
		if( length( levels( factor(predicted) ) ) > 2 )
			stop("predicted must have no more than two levels.\n")
		predicted <- as.numeric(factor(predicted))
	}
	###
	thresholds <- sort(unique(predicted))
	ind.thresholds <- round( seq( 1, length(thresholds), len = min(length(thresholds), n.thresholds) ) )
	thresholds <- (c(-Inf, thresholds[ind.thresholds]) + c(thresholds[ind.thresholds], +Inf))*0.5

	splitted <- split(predicted, response)
	negatives <- splitted[[as.character(labels[1])]]
	n.negatives <- length(negatives)
	positives <- splitted[[as.character(labels[2])]]
	n.positives <- length(positives)

	pts <- sapply(thresholds, f.roc, positives=positives, negatives=negatives, n.positives=n.positives, n.negatives=n.negatives)

	# trapezoidal rule on the false/true positive rate pairs
	auc <- -sum( ( pts[2,-1] + pts[2,-ncol(pts)] )*diff(pts[1,]) )*0.5
	if(auc<0.5)
	{
		auc <- 1-auc
		pts[1:2,] <- pts[2:1,]
	}

	if(plotit)
	{
		if(add.roc)
		{
			lines(x=pts[1,], y=pts[2,], ...)
		}
		else
		{
			plot.roc.curve(pts[1,], pts[2,], ...)
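			# reference line: the main diagonal is the ROC of a random classifier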
			abline(0, 1, col="grey70")
		}
	}

	obj.roc.curve <- list(Call=match.call(), auc=auc, false.positive.rate=pts[1,], true.positive.rate=pts[2,], thresholds=thresholds)
	class(obj.roc.curve) <- "roc.curve"
	obj.roc.curve
}

###print method for roc curve
print.roc.curve <- function(x, ...)
{
	if( !is.null(x$auc) )
		cat( paste("Area under the curve (AUC): ", sprintf("%.3f",x$auc),"\n", sep="") )
}

###summary method for roc curve
summary.roc.curve <- function(object, ...)
{
	LST <- list( Call=object$Call, auc=object$auc, false.positive.rate=summary(object$false.positive.rate), true.positive.rate=summary(object$true.positive.rate) )
	class(LST) <- "summary.roc.curve"
	LST
}

###print method for summary roc curve
print.summary.roc.curve <- function(x, ...)
{
	cat("\n")
	cat("Call: \n")
	print(x$Call)
	cat("\n")
	cat("Area under the curve (AUC): \n")
	cat(round(x$auc, digits=3),"\n")
	cat("\n")
	cat("False positive rate for evaluated thresholds: \n")
	print(x$false.positive.rate)
	cat("\n")
	cat("True positive rate for evaluated thresholds: \n")
	print(x$true.positive.rate)
	cat("\n")
}

##compute false positive and true positive rates at a given threshold
f.roc <- function(x, positives, negatives, n.positives, n.negatives)
{
	c( sum( negatives>x )/n.negatives, sum( positives>=x )/n.positives )
}

###plot the ROC curve with some default parameters in plot()
plot.roc.curve <- function(x, y, ...)
{
	plot.roc.inner(x, y, ...)
}

###plot the ROC curve with some default parameters in plot()
plot.roc.inner <- function(x, y, main="ROC curve", xlab="False positive rate", ylab="True positive rate", xlim=c(0,1), ylim=c(0,1), col=1, type="l", lwd=2, ...)
{
	plot(x, y, main=main, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, col=col, type=type, lwd=lwd,...)
}

==> ROSE/R/ROSE_eval.R <==
#Last modified on 01/30/2014
ROSE.eval <- function(formula, data, learner, acc.measure="auc", extr.pred=NULL, method.assess="holdout", K=1, B=100, control.rose=list(), control.learner=list(), control.predict=list(), control.accuracy=list(), trace=FALSE, subset=options("subset")$subset, na.action=options("na.action")$na.action, seed)
{
	#check arguments: formula and learner are mandatory
	if( missing(formula) )
		stop("A formula is required.\n")
	if( missing( learner ) )
		stop("Argument 'learner' is missing, with no default.\n")

	#check if provided learner is "standard" in the sense that it has an associated predict method with arguments "object" and "newdata"
	func.name <- as.character(substitute(learner))

	if( any( methods(class=func.name)==paste("predict.",func.name,sep="") ) )
		flg.learner <- 1
	else
		flg.learner <- 0

	mc <- match.call()
	formula.env <- attr(formula,".Environment")

	varnames.func <- all.vars(formula, functions=TRUE)
	varnames <- all.vars(formula, functions=FALSE)

	###catch the original data.frame/variables in formula.env
	name.data <- NULL
	if(missing(data))
	{
		if( any( varnames.func%in%c("$","[","]")) )
			name.data <- varnames[1]
	}
	else
		name.data <- as.character(mc$data)

	##this is the case for variables not contained in a data frame in formula.env
	if(is.null(name.data))
		name.data <- varnames

	###store original data.frame/variables in formula.env
	data.orig <- sapply(name.data, function(x) get(x, envir=formula.env) )
	cn.order.orig <- attributes(data.orig)$dimnames[[1]]
	###end

	###keep formula unchanged for the learner
	formula.learn <- formula
	###drop transformations etc from formula to provide a nice formula to ROSE
	formula.rose <- adj.formula(formula, data)

	if(missing(data))
		lst.model.frame <- list(formula=formula.rose, data=NULL, subset=subset, na.action=na.action)
	else
		lst.model.frame <- list(formula=formula.rose, data=data, subset=subset, na.action=na.action)

	###create data set for ROSE and prediction
	# mc$formula <- formula.rose
	# m <- match(c("formula", "data", "na.action", "subset"), names(mc), 0L)
	# mf <- mc[c(1L, m)]
	# mf[[1L]] <- as.name("model.frame")
	# mf <- eval(mf, parent.frame())
	mf <- do.call(model.frame,lst.model.frame)

	cn <- rownames( attributes( attributes(mf)$terms )$factors )
	data.st <- data.frame(mf)
	y <- data.st[,1]

	if( any( varnames.func%in%c("$")) )
		colnames(data.st) <- gsub(paste(name.data, ".", sep=""), "", colnames(data.st))

	##create new formula for ROSE with the right environment
	formula.rose <- formula(data.st)
	###end

	#right order of columns as specified in data
	d <- NCOL(data.st)-1
	if( !missing(data) )
		if(d!=1)
			data.st <- data.st[cn.order.orig]

	#check accuracy estimator
	method.assess <- match.arg(method.assess, choices=c("holdout", "LKOCV", "BOOT"))
	if(!method.assess %in% c("holdout", "LKOCV", "BOOT") )
		stop("Method for model assessment must be one among 'holdout', 'BOOT' or 'LKOCV'.\n")

	#check accuracy measure
	acc.measure <- match.arg(acc.measure, choices=c("auc", "precision", "recall", "F"))
	if(!acc.measure %in% c("auc", "precision", "recall", "F") )
		stop("Accuracy measure must be one among 'auc', 'precision', 'recall' or 'F'.\n")

	if(acc.measure=="auc")
	{
		fun.accuracy <- roc.curve
		##by default, do not plot the roc curve in ROSE.eval unless the user asks for it
		if( !"plotit" %in% names(control.accuracy) )
			control.accuracy <- c(control.accuracy, list("plotit"=FALSE))
	}
	else
	{
		fun.accuracy <- accuracy.meas
	}
	pos.accuracy <- match(acc.measure,c("auc","precision", "recall", "F")) + 1

	method.assess.inn <- method.assess

	if(!missing(seed)) set.seed(seed)

	if(trace)
	{
		ind <- ifelse( B<50, 1, ifelse( B<500, 10, 100 ) )
		cat("Iteration:", "\n")
	}

	if( method.assess.inn =="holdout" )
	{
		method.assess.inn <- "BOOT"
		B <- 1
	}

	if( method.assess.inn=="BOOT" )
	{
		if(trace) max.ind <- floor(B/ind)*ind
		acc.vec <- vector(mode="numeric", length=B)
		if( flg.learner )
		{
			#functions with "standard" behaviour
			for(i in 1:B)
			{
				data.rose <- do.call(ROSE, c(list(formula=formula.rose, data=data.st),
control.rose))$data fit <- do.call(learner, c(list(formula=formula.learn, data=data.rose), control.learner)) pred <- do.call(predict, c(list(object=fit, newdata=data.st), control.predict)) if(!is.null(extr.pred)) pred <- extr.pred(pred) acc.vec[i] <- do.call(fun.accuracy, c(list(response=y, predicted=pred), control.accuracy))[[pos.accuracy]] if(trace) if(i %% ind == 0) {if( i!=max.ind ) cat(i, ", ", sep="") else cat(i, "\n", sep="")} } } else { #user defined functions with "non-standard" behaviour for(i in 1:B) { data.rose <- do.call(ROSE, c(list(formula=formula.rose, data=data.st), control.rose))$data pred <- do.call(learner, c(list(data=data.rose, newdata=data.st[,cn[-1]]), control.learner)) acc.vec[i] <- do.call(fun.accuracy, c(list(response=y, predicted=pred), control.accuracy))[[pos.accuracy]] if(trace) if(i %% ind == 0) {if( i!=max.ind ) cat(i, ", ", sep="") else cat(i, "\n", sep="")} } } } else { pred <- y.cp <- numeric(0) if(trace) max.ind <- floor(B/ind)*ind #n.obs to leave out if(K%%1!=0) stop("Leave K out CV: K must be an integer\n") n.g <- K #number of subsets if(length(data.st[,1])%%n.g==0) { K <- length(data.st[,1])/n.g ind.g <- sample( rep(1:K, n.g) ) } else { K <- floor(length(data.st[,1])/n.g) + 1 n.g.remain <- length(data.st[,1])-floor(length(data.st[,1])/n.g)*n.g message(paste("\nLeave K out CV: the sample size is not a multiple of K. \nThe routine has automatically created", K-1, "subsets of size", n.g, "and one subset of size", n.g.remain,".")) ind.g <- sample( c(rep(1:(K-1), n.g), rep(K,n.g.remain) ) ) } B <- K if( flg.learner ) { #functions with "standard" behaviour for(i in 1:B) { data.rose <- do.call(ROSE, c(list(formula=formula.rose, data=data.st[ -which(ind.g==i) ,]), control.rose))$data fit <- do.call(learner, c(list(formula=formula.learn, data=data.rose), control.learner)) predi <- do.call(predict, c(list(object=fit, newdata=data.st[ which(ind.g==i) ,]), control.predict)) if(!is.null(extr.pred)) predi <- extr.pred(predi) pred <- c(pred,predi) if(trace) if(i %% ind == 0) {if( i!=max.ind ) cat(i, ", ", sep="") else cat(i, "\n", sep="")} y.cp <- c(y.cp,y[which(ind.g==i)]) } acc.vec <- do.call(fun.accuracy, c(list(response=y.cp, predicted=pred), control.accuracy))[[pos.accuracy]] } else { #user defined functions with "non-standard" behaviour for(i in 1:B) { data.rose <- do.call(ROSE, c(list(formula=formula.rose, data=data.st[ -which(ind.g==i) ,]), control.rose))$data predi <- do.call(learner, c(list(data=data.rose, newdata=data.st[which(ind.g==i),cn[-1]]), control.learner)) pred <- c(pred,predi) if(trace) if(i %% ind == 0) {if( i!=max.ind ) cat(i, ", ", sep="") else cat(i, "\n", sep="")} y.cp <- c(y.cp,y[which(ind.g==i)]) } acc.vec <- do.call(fun.accuracy, c(list(response=y.cp, predicted=pred), control.accuracy))[[pos.accuracy]] } } # out <- list(Call = match.call(), method=method.assess, measure = acc.measure, acc = acc.vec) out <- list(Call = mc, method=method.assess, measure = acc.measure, acc = acc.vec) class(out) <- "ROSE.eval" out } ##print method for ROSE.eval print.ROSE.eval <- function(x, ...) 
{
	if (x$method =="BOOT") method <- "Bootstrap"
	if (x$method =="LKOCV") method <- "Leave K out cross-validation"
	if (x$method =="holdout") method <- "Holdout"
	cat("\n")
	cat("Call: \n")
	print(x$Call)
	cat("\n")
	if (method == "Bootstrap")
		cat( paste(method, " estimate of ", x$measure, " on ", length(x$acc), " samples: \n ", sep="") )
	else
		cat( paste(method, " estimate of ", x$measure, ": ", sep="") )
	cat(sprintf("%.3f",x$acc),"\n")
}

###summary method for ROSE.eval
summary.ROSE.eval <- function(object, ...)
{
	acc <- object$acc
	if (length(acc) > 1) acc <- summary(acc)
	LST <- list( call=object$Call, method=object$method, measure=object$measure, acc=acc )
	class(LST) <- "summary.ROSE.eval"
	LST
}

###print method for summary
print.summary.ROSE.eval <- function(x, ...)
{
	cat("\n")
	cat("Call: \n")
	print(x$call)
	cat("\n")
	if (x$method =="BOOT") method <- "Bootstrap"
	if (x$method =="LKOCV") method <- "Leave K out cross-validation"
	if (x$method =="holdout") method <- "Holdout"
	if(x$method !="BOOT")
		cat( paste(method, " estimate of ", x$measure, ": ", sprintf("%.3f",x$acc),"\n", sep="") )
	else
	{
		# report the selected measure (not necessarily auc)
		cat( paste("Summary of bootstrap distribution of ", x$measure, ": \n", sep="") )
		print(x$acc)
		cat("\n")
	}
}

==> ROSE/R/data_balancing_funcs.R <==
#Last modified on 01/30/2014
######################################################################
.onAttach <- function(libname,pkgname){
	packageStartupMessage("Loaded ROSE ", as.character(packageDescription("ROSE")[["Version"]]),"\n")
}
######################################################################

######################################################################
#ovun.sample main function
######################################################################
ovun.sample <- function(formula, data, method="both", N, p=0.5, subset=options("subset")$subset, na.action=options("na.action")$na.action, seed)
{
	###checks
	if( missing(formula) )
		stop("formula is required.\n")
	method <- match.arg(method, choices=c("both", "under", "over"))
	if( !method%in%c("both", "over", "under") )
		stop("Method must be 'both', 'over', or 'under'.\n")
	###

	Call <- match.call()
	m <- match(c("formula", "data","method","N", "p", "seed", "subset", "na.action"), names(Call), 0L)
	Call1 <- Call[c(1L, m)]
	Call1[[1L]] <- omnibus.balancing
	res <- eval(Call1)

	out <- list(Call=match.call(), method=method, data=res$data)
	class(out) <- "ovun.sample"
	out
}

##print method for ovun.sample
print.ovun.sample <- function(x, ...)
{
	cat("\n")
	cat("Call: \n")
	print(x$Call)
	Method <- switch(match.arg(x$method, choices=c("both", "under", "over")),
				both="combination of over- and under-sampling",
				under="undersampling",
				over="oversampling"
				)
	cat("\n")
	cat("Data balanced by", Method,"\n")
	cat("\n")
	print(x$data)
}

###summary method for ovun.sample
summary.ovun.sample <- function(object, ...)
{
	out <- list( Call=object$Call, Summary=summary(object$data), method=object$method )
	class(out) <- "summary.ovun.sample"
	out
}

###print method for summary ovun.sample
print.summary.ovun.sample <- function(x, ...)
###print method for summary ovun.sample
print.summary.ovun.sample <- function(x, ...)
{
    cat("\n")
    cat("Call: \n")
    print(x$Call)
    cat("\n")
    Method <- switch(match.arg(x$method, choices=c("both", "under", "over")),
            both="combination of over- and under-sampling",
            under="undersampling",
            over="oversampling"
            )
    cat("Summary of data balanced by", Method, "\n")
    cat("\n")
    print(x$Summary)
}

######################################################################
##function that provides a formula with non-transformed variables only
######################################################################
##this function is NOT exported
adj.formula <- function(formula, data)
{
    if( missing(data) ) frml.env <- environment(formula)
    else frml.env <- data
    formula <- terms(formula, data = frml.env)
    vars <- attr(formula, "variables")
    vars <- sapply(vars, function(x) paste(deparse(x, width.cutoff=500), collapse=' '))[-1L]
    #remove all characters before either ( or /
    vars <- sub("*.*[(/]", "", vars)
    #remove all characters after either ^ or )
    vars <- sub("['^')].*", "", vars)
    vars <- unique(vars)
    formula <- as.formula(paste(vars[1], "~", paste(vars[-1], collapse= "+")))
    attr(formula, "variables") <- vars
    formula
}
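## Illustrative sketch of adj.formula's effect (comment only): transformed
## terms are reduced to the underlying variable names, which is why the
## wrapper below warns when the adjusted formula differs from the original.
## The toy data frame is a made-up example.
# adj.formula(y ~ log(x1) + I(x2^2), data = data.frame(y = 0, x1 = 1, x2 = 2))
# ## expected result: y ~ x1 + x2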
######################################################################
#This function is the wrapper for all the implemented data balancing remedies
######################################################################
##this function is NOT exported
omnibus.balancing <- function(formula, data, method, subset, na.action, N, p=0.5, seed, hmult.majo=1, hmult.mino=1)
{
    if( missing(formula) ) stop("formula is required\n")
    if( missing(method) ) method <- "both"
    if( (method=="under" | method=="over") & !missing(N) & !missing(p) ) stop("Too many arguments. Need to specify either N or p.\n")

    formula.orig <- formula
    formula <- adj.formula(formula, data)

    if( missing(subset) ) subset <- options("subset")$subset
    if( missing(na.action) ) na.action <- options("na.action")$na.action

    flg.data <- 0
    if( !missing(data) )
    {
        lst.model.frame <- list(formula=formula, data=data, subset=subset, na.action=na.action)
        if( is.environment(data) )#| is.list(data) )
            flg.data <- 2
        else
            flg.data <- 1
    }
    else
        lst.model.frame <- list(formula=formula, data=NULL, subset=subset, na.action=na.action)

    if( formula.orig[[3]] != "." & eval(formula) != formula.orig )
        warning("Transformations of variables are not allowed.\n New data have been generated by using non-transformed variables.\n ")

    mf <- do.call(model.frame, lst.model.frame)
    cn <- rownames( attributes( attributes(mf)$terms )$factors )
    data.st <- data.frame(mf)
    y <- data.st[, 1]
    X <- data.frame(data.st[, -1])
    n <- length(y)
    d <- NCOL(X)
    classy <- class(y)
    y <- factor(y)
    T <- table(y)
    classx <- sapply(as.data.frame(X), class)

    ###checks
    if(n < 2) stop("Too few observations.\n")
    if( length(T) > 2 ) stop("The response variable must have 2 levels.\n")
    else if( length(T) == 1 ) stop("The response variable has only one class.\n")
    if( p < 0 | p > 1 ) stop("p must be in the interval 0-1.\n")
    ###

    #identify which is the label associated to the majority and minority classes
    majoY <- levels(y)[which.max(T)]
    minoY <- levels(y)[which.min(T)]
    #identify the majority and minority class examples
    ind.mino <- which( y == minoY )
    ind.majo <- which( y == majoY )

    if( !missing(seed) ) set.seed(seed)

    data.obj <- switch(method,
            both  = ou.sampl(n, N, p, ind.majo, majoY, ind.mino, minoY, classy, X),
            over  = over.sampl(n, N, p, ind.majo, ind.mino, majoY, minoY, y, classy, X),
            under = under.sampl(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X),
            rose  = rose.sampl(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X, classx, d, T, hmult.majo, hmult.mino)
            )

    data.out <- data.obj$data.out
    ynew <- data.obj$ynew
    Xnew <- data.obj$Xnew

    #re-position columns
    if( !missing(data) & flg.data != 0 )
    {
        #put data frame names in the right order
        if(flg.data == 1)
            colnames(data.out) <- colnames(data)[colnames(data) %in% cn]
        else
            colnames(data.out) <- attr(formula, "variables")[attr(formula, "variables") %in% cn]
        #insert y
        indY <- colnames(data.out) == cn[1]
        data.out[, indY] <- ynew
        #see whether the order of the variables in formula is the same as in data.
        #If not, swap columns according to the order in data
        swap.col <- order( pmatch( cn[-1], colnames(data.out)[!indY] ) )
        data.out[, !indY] <- Xnew[, (1:d)[swap.col] ]
    }
    else
    {
        if( length(cn)-1 < d )
            colnames(data.out) <- c(cn[1], colnames(X))
        else
            colnames(data.out) <- cn
    }
    list(data=data.out, call=match.call())
}
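## Tiny self-contained illustration (toy vector, comment only) of the
## majority/minority detection performed by the wrapper above:
# y <- factor(c("neg", "neg", "neg", "neg", "pos"))
# T <- table(y)
# levels(y)[which.max(T)]   # "neg": the majority class label
# levels(y)[which.min(T)]   # "pos": the minority class label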
######################################################################
#Combination of over and under sampling
######################################################################
##this function is NOT exported
ou.sampl <- function(n, N, p, ind.majo, majoY, ind.mino, minoY, classy, X)
{
    if( missing(N) ) N <- n
    #number of new minority class examples
    n.mino.new <- sum(rbinom(N, 1, p))
    #number of new majority class examples
    n.majo.new <- N - n.mino.new
    id.majo.new <- sample(ind.majo, n.majo.new, replace=TRUE)
    id.mino.new <- sample(ind.mino, n.mino.new, replace=TRUE)
    #create X
    Xnew <- data.frame(X[c(id.majo.new, id.mino.new), ])
    #create y
    if( classy %in% c("character", "integer", "numeric") )
        ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
    if( classy == "factor" )
        ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )
    data.out <- data.frame(ynew, Xnew)
    rownames(data.out) <- 1:N
    list(data.out=data.out, ynew=ynew, Xnew=Xnew)
}

######################################################################
#Under sampling
######################################################################
##this function is NOT exported
under.sampl <- function(n, N, p, ind.majo, majoY, ind.mino, minoY, y, classy, X)
{
    n.mino.new <- sum(y == minoY)
    if( missing(N) )
    {
        # Determination of N and n.majo in version 0.0.2
        if( p 0) {
        Xnew[1:n.majo.new, id.num] <- rose.real(X[, id.num], hmult=hmult.majo, n=length(ind.majo), q=d.num, ids.class=ind.majo, ids.generation=id.majo.new)
        Xnew[(n.majo.new+1):N, id.num] <- rose.real(X[, id.num], hmult=hmult.mino, n=length(ind.mino), q=d.num, ids.class=ind.mino, ids.generation=id.mino.new)
    }
    #create y
    if( classy %in% c("character", "integer", "numeric") )
        ynew <- as.vector( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), mode=classy )
    if( classy == "factor" )
        ynew <- factor( c(rep(majoY, n.majo.new), rep(minoY, n.mino.new)), levels=c(majoY, minoY) )
    data.out <- data.frame(ynew, Xnew)
    rownames(data.out) <- 1:N
    list(data.out=data.out, ynew=ynew, Xnew=Xnew)
}

######################################################################
#function to generate synthetic real data
######################################################################
##This function is NOT exported
rose.real <- function(X, hmult=1, n, q = NCOL(X), ids.class, ids.generation)
{
    X <- data.matrix(X)
    n.new <- length(ids.generation)
    cons.kernel <- (4/((q+2)*n))^(1/(q+4))
    if(q != 1)
        H <- hmult*cons.kernel*diag(apply(X[ids.class, ], 2, sd), q)
    else
        H <- hmult*cons.kernel*sd(X[ids.class, ])
    Xnew.num <- matrix(rnorm(n.new*q), n.new, q) %*% H
    Xnew.num <- data.matrix(Xnew.num + X[ids.generation, ])
    Xnew.num
}

######################################################################
#Wrapper for ROSE
######################################################################
ROSE <- function(formula, data, N, p=0.5, hmult.majo=1, hmult.mino=1, subset=options("subset")$subset, na.action=options("na.action")$na.action, seed)
{
    mc <- match.call()
    obj <- omnibus.balancing(formula, data, subset, na.action, N, p, method="rose", seed, hmult.majo, hmult.mino)
    out <- list(Call=mc, method="ROSE", data=obj$data)
    class(out) <- "ROSE"
    out
}
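## Hedged usage sketch (comment only): generating a synthetic, roughly
## balanced sample with the exported ROSE wrapper; hacide.train and 'cls'
## are assumptions based on the package's bundled hacide dataset.
# data(hacide)
# rose.out <- ROSE(cls ~ ., data = hacide.train, seed = 1)
# table(rose.out$data$cls)   # classes roughly balanced (default p = 0.5)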
##print method for ROSE
print.ROSE <- function(x, ...)
{
    cat("\n")
    cat("Call: \n")
    print(x$Call)
    cat("\n")
    cat("Data balanced by", x$method, "\n")
    cat("\n")
    print(x$data)
}

###summary method for ROSE
summary.ROSE <- function(object, ...)
{
    out <- list( Call=object$Call, Summary=summary(object$data) )
    class(out) <- "summary.ROSE"
    out
}

###print method for summary ROSE
print.summary.ROSE <- function(x, ...)
{
    cat("\n")
    cat("Call: \n")
    print(x$Call)
    cat("\n")
    cat("Summary of data balanced by ROSE", "\n")
    cat("\n")
    print(x$Summary)
}
ROSE/MD50000644000176200001440000000136414061607341011340 0ustar liggesusersc39d8f1eb5e0d7bbb3e0c57502a8b68d *DESCRIPTION
8497c6b8fe481b6385edb7a257a5c4fc *NAMESPACE
daf79ce1bd7ceb08cad1ea3521368b03 *R/ROSE_eval.R
1fe07b00fbb38cfd6d9a178081433bf9 *R/data_balancing_funcs.R
0a5c42458f7452313068b202f94ffcea *R/estimation_funcs.R
93e15cd837be1938847cc551904e6277 *data/hacide.rda
460e040863eb80490d63825c3af54133 *inst/CITATION
46bb55904599bf8a107d7de0f64545c7 *inst/ChangeLog
6c144902be99dc3bf8c1814881d5c60b *man/ROSE-package.Rd
e765a9333a21809ba2bd358dc4ca0a58 *man/ROSE.Rd
a2770e21651a6fb63ad7ead6d962a924 *man/ROSE.eval.Rd
8da5ec4b1bc2e72a5c1368e513747f02 *man/accuracy.meas.Rd
41ee84e25e6519500164f151798cda73 *man/hacide.Rd
3e9ede1e3959ce47952bb623e6a62b0b *man/ovun.sample.Rd
33f0810df92b444f1bfc20e31f49f885 *man/roc.ROSE.Rd
ROSE/inst/0000755000176200001440000000000012361171026011776 5ustar liggesusers
ROSE/inst/ChangeLog0000644000176200001440000000247412301113613013547 0ustar liggesusersThe version 0-0.2 completed on September 13 2013 incorporates the following changes

ovun.sample
- whenever method is used, p is the probability of a minority example;
- returns an object of class "ovun.sample", and print and summary methods are associated.

ROSE
- the output includes method="ROSE";
- returns an object of class "ROSE", and print and summary methods are associated;
- corrected typo in which(cls=="numeric" | cls=="interger")

ROSE.eval
- the option "L1OCV" of argument method.assess is now "LKOCV", i.e. it is possible to perform a leave-K-out cross-validation;
- in argument learner it is possible to specify, through a suitable wrapper, R functions that fit a classifier but are not endowed with a predict method;
- when acc.measure="auc", by default the plot of the ROC curve is not displayed.

sefihy
- sefihy has been changed to hacide. Sefihy was 10-dimensional, whereas hacide is 2-dimensional.

The version 0-0.3 completed on January 30 2014 incorporates the following internal change, which does not affect the usability of the package by users

ovun.sample and ROSE
- these functions are now simple wrappers calling an internal function 'omnibus.balancing' that performs the balancing strategies formerly implemented in package ROSE.
ROSE/inst/CITATION0000644000176200001440000000135412361171021013131 0ustar liggesuserscitHeader("To cite the ROSE package in publications use:")

if(!exists("meta") || is.null(meta)) meta <- packageDescription("ROSE")

citEntry(entry = "Article",
    title = "{ROSE}: a {P}ackage for {B}inary {I}mbalanced {L}earning",
    author = personList(as.person("Nicola Lunardon"),
        as.person("Giovanna Menardi"),
        as.person("Nicola Torelli")),
    journal = "{R} Journal",
    year = "2014",
    volume = "6",
    number = "1",
    pages = "82--92",
    textVersion = paste("Nicola Lunardon, Giovanna Menardi, and Nicola Torelli (2014).",
        "ROSE: a Package for Binary Imbalanced Learning.",
        "R Journal, 6(1), 82-92.")
)
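## Note: in an R session this entry is what users retrieve with
## citation("ROSE"), which reads the inst/CITATION file above.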