projpred/NAMESPACE

# Generated by roxygen2: do not edit by hand

S3method(as.matrix,projection)
S3method(get_refmodel,brmsfit)
S3method(get_refmodel,cvsel)
S3method(get_refmodel,refmodel)
S3method(get_refmodel,stanreg)
S3method(get_refmodel,vsel)
S3method(predict,refmodel)
S3method(print,cvsel)
S3method(print,vsel)
export(cv_varsel)
export(cvfolds)
export(cvind)
export(get_refmodel)
export(init_refmodel)
export(proj_linpred)
export(proj_predict)
export(project)
export(suggest_size)
export(varsel)
export(varsel_plot)
export(varsel_stats)
import(ggplot2)
import(stats)
importFrom(Rcpp,sourceCpp)
importFrom(loo,psis)
useDynLib(projpred)

projpred/data/df_binom.rda
[binary R data file; contents omitted]
projpred/data/df_gaussian.rda
[binary R data file; contents omitted]
projpred/data/mesquite.rda
[binary R data file; contents omitted]

projpred/man/mesquite.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{mesquite}
\alias{mesquite}
\title{Mesquite data set.}
\format{The outcome variable is the total weight (in grams) of photosynthetic material as derived from actual harvesting of the bush. The predictor variables are:
\describe{
\item{diam1}{diameter of the canopy (the leafy area of the bush) in meters, measured along the longer axis of the bush.}
\item{diam2}{canopy diameter measured along the shorter axis.}
\item{canopy height}{height of the canopy.}
\item{total height}{total height of the bush.}
\item{density}{plant unit density (number of primary stems per plant unit).}
\item{group}{group of measurements (0 for the first group, 1 for the second group).}
}}
\source{
\url{http://www.stat.columbia.edu/~gelman/arm/examples/}
}
\usage{
mesquite
}
\description{
The mesquite bushes yields data set from Gelman and Hill (2007) (\url{http://www.stat.columbia.edu/~gelman/arm/}).
}
\keyword{datasets}

projpred/man/init_refmodel.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/refmodel.R
\name{init_refmodel}
\alias{init_refmodel}
\title{Custom reference model initialization}
\usage{
init_refmodel(
  z,
  y,
  family,
  x = NULL,
  predfun = NULL,
  dis = NULL,
  offset = NULL,
  wobs = NULL,
  wsample = NULL,
  intercept = TRUE,
  cvfun = NULL,
  cvfits = NULL,
  ...
)
}
\arguments{
\item{z}{Predictor matrix of dimension \code{n}-by-\code{dz} containing the training features for the reference model. Rows denote the observations and columns the different features.}

\item{y}{Vector of length \code{n} giving the target variable values.}

\item{family}{\link{family} object giving the model family.}

\item{x}{Predictor matrix of dimension \code{n}-by-\code{dx} containing the candidate features for selection (i.e. variables from which to select the submodel). Rows denote the observations and columns the different features. Notice that this can be different from \code{z}. If missing, same as \code{z} by default.}

\item{predfun}{Function that takes an \code{nt}-by-\code{dz} test predictor matrix \code{zt} as input (\code{nt} = number of test points, \code{dz} = number of features in the reference model) and outputs an \code{nt}-by-\code{S} matrix of expected values for the target variable \code{y}, each column corresponding to one posterior draw for the parameters in the reference model (the number of draws \code{S} can also be 1). Notice that the output should be computed without any offsets; these are automatically taken into account internally, e.g. in cross-validation. If omitted, then the returned object will be a 'data reference', that is, it can be used to compute penalized maximum likelihood solutions such as the Lasso (see examples below and in the quickstart vignette).}

\item{dis}{Vector of length \code{S} giving the posterior draws for the dispersion parameter in the reference model if there is such a parameter in the model family. For the Gaussian observation model this is the noise standard deviation \code{sigma}.}

\item{offset}{Offset to be added to the linear predictor in the projection. (Same as in function \code{glm}.)}

\item{wobs}{Observation weights. If omitted, equal weights are assumed.}

\item{wsample}{Vector of length \code{S} giving the weights for the posterior draws. If omitted, equal weights are assumed.}

\item{intercept}{Whether to use an intercept. Default is \code{TRUE}.}

\item{cvfun}{Function for performing K-fold cross-validation. The input is an \code{n}-element vector where each value is an integer between 1 and K denoting the fold for each observation. Should return a list with K elements, each of which is a list with fields \code{predfun} and \code{dis} (if the model has a dispersion parameter) which are defined the same way as the arguments \code{predfun} and \code{dis} above but are computed using only the corresponding subset of the data. More precisely, if \code{cvres} denotes the list returned by \code{cvfun}, then \code{cvres[[k]]$predfun} and \code{cvres[[k]]$dis} must be computed using only data from indices \code{folds != k}, where \code{folds} is the \code{n}-element input for \code{cvfun}. Can be omitted but either \code{cvfun} or \code{cvfits} is needed for K-fold cross-validation for genuine reference models.
See example below.}

\item{cvfits}{A list with K elements, that has the same format as the value returned by \code{cvind}, but each element of \code{cvfits} must also contain a field \code{omitted} which indicates the indices that were left out for the corresponding fold. Usually it is easier to specify \code{cvfun}, but this can be useful if you have already computed the cross-validation for the reference model and would like to avoid recomputing it. Can be omitted but either \code{cvfun} or \code{cvfits} is needed for K-fold cross-validation for genuine reference models.}

\item{...}{Currently ignored.}
}
\value{
An object that can be passed to all the functions that take the reference fit as the first argument, such as \link{varsel}, \link{cv_varsel}, \link[=proj-pred]{proj_predict} and \link[=proj-pred]{proj_linpred}.
}
\description{
Initializes a structure that can be used as a reference fit for the projective variable selection. This function is provided to allow construction of the reference fit from arbitrary fitted models, because only limited information is needed for the actual projection and variable selection.
}
\examples{
\donttest{
# generate some toy data
set.seed(1)
n <- 100
d <- 10
x <- matrix(rnorm(n*d), nrow=n, ncol=d)
b <- c(c(1,1),rep(0,d-2)) # first two variables are relevant
y <- x \%*\% b + rnorm(n)

# fit the model (this uses rstanarm for posterior inference,
# but any other tool could also be used)
fit <- stan_glm(y~x, family=gaussian(), data=data.frame(x=I(x),y=y))
draws <- as.matrix(fit)
a <- draws[,1] # intercept
b <- draws[,2:(ncol(draws)-1)] # regression coefficients
sigma <- draws[,ncol(draws)] # noise std

# initialize the reference model structure
predfun <- function(xt) t( b \%*\% t(xt) + a )
ref <- init_refmodel(x,y, gaussian(), predfun=predfun, dis=sigma)

# variable selection based on the reference model
vs <- cv_varsel(ref)
varsel_plot(vs)

# pass in the original data as 'reference'; this allows us to compute
# traditional estimates like Lasso
dref <- init_refmodel(x,y,gaussian())
lasso <- cv_varsel(dref, method='l1') # lasso
varsel_plot(lasso, stat='rmse')
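
# K-fold cross-validation for a genuine reference model requires 'cvfun'.
# A minimal sketch (an assumption, not part of the original example): we
# simply refit the rstanarm model above within each fold via update();
# 'folds' is the integer fold vector described in the 'cvfun' argument.
cvfun <- function(folds) {
  lapply(1:max(folds), function(k) {
    # refit the reference model without the data from fold k
    fit_k <- update(fit, data = data.frame(x=I(x),y=y)[folds != k, ])
    draws_k <- as.matrix(fit_k)
    a_k <- draws_k[,1]
    b_k <- draws_k[,2:(ncol(draws_k)-1)]
    list(predfun = function(zt) t( b_k \%*\% t(zt) + a_k ),
         dis = draws_k[,ncol(draws_k)])
  })
}
ref_cv <- init_refmodel(x, y, gaussian(), predfun=predfun, dis=sigma,
                        cvfun=cvfun)
kfold_vs <- cv_varsel(ref_cv, cv_method='kfold')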
}
}

projpred/man/suggest_size.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/methods.R
\name{suggest_size}
\alias{suggest_size}
\title{Suggest model size}
\usage{
suggest_size(
  object,
  stat = "elpd",
  alpha = 0.32,
  pct = 0,
  type = "upper",
  baseline = NULL,
  warnings = TRUE,
  ...
)
}
\arguments{
\item{object}{The object returned by \link[=varsel]{varsel} or \link[=cv_varsel]{cv_varsel}.}

\item{stat}{Statistic used for the decision. Default is 'elpd'. See \code{varsel_stats} for other possible choices.}

\item{alpha}{A number indicating the desired coverage of the credible intervals based on which the decision is made. E.g. \code{alpha=0.32} corresponds to 68\% probability mass within the intervals (one-standard-error intervals). See details for more information.}

\item{pct}{Number indicating the relative proportion between baseline model and null model utilities one is willing to sacrifice. See details for more information.}

\item{type}{Either 'upper' (default) or 'lower' determining whether the decisions are based on the upper or lower credible bounds. See details for more information.}

\item{baseline}{Either 'ref' or 'best' indicating whether the baseline is the reference model or the best submodel found. Default is 'ref' when the reference model exists, and 'best' otherwise.}

\item{warnings}{Whether to give warnings if the automatic suggestion fails; mainly for internal use. Default is TRUE, and there is usually no reason to set it to FALSE.}

\item{...}{Currently ignored.}
}
\description{
This function can be used for suggesting an appropriate model size based on a certain default rule. Notice that the decision rules are heuristic and should be interpreted as guidelines. It is recommended that the user studies the results via \code{varsel_plot} and/or \code{varsel_stats} and makes the final decision based on what is most appropriate for the given problem.
}
\details{
The suggested model size is the smallest model for which either the lower or upper (depending on argument \code{type}) credible bound of the submodel utility \eqn{u_k} with significance level \code{alpha} falls above
\deqn{u_{base} - pct (u_{base} - u_0).}
Here \eqn{u_{base}} denotes the utility for the baseline model and \eqn{u_0} the null model utility. The baseline is either the reference model or the best submodel found (see argument \code{baseline}). The lower and upper bounds are defined to contain the submodel utility with probability 1-alpha (each tail has mass alpha/2).

By default \code{pct=0}, \code{alpha=0.32} and \code{type='upper'}, which means that we select the smallest model for which the upper tail exceeds the baseline model level, that is, one which is better than the baseline model with probability 0.16 (and consequently, worse with probability 0.84). In other words, the estimated difference between the baseline model and submodel utilities is at most one standard error away from zero, so the two utilities are considered to be close.

NOTE: Loss statistics like RMSE and MSE are converted to utilities by multiplying them by -1, so a call such as \code{suggest_size(object, stat='rmse', type='upper')} should be interpreted as finding the smallest model whose upper credible bound of the \emph{negative} RMSE exceeds the cutoff level (or, equivalently, whose lower credible bound of RMSE is below the cutoff level). This is done to make the interpretation of the argument \code{type} the same regardless of argument \code{stat}.
}
\examples{
\donttest{
### Usage with stanreg objects
fit <- stan_glm(y~x, binomial())
vs <- cv_varsel(fit)
suggest_size(vs)
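
# the decision can also be based on another statistic; e.g. for this
# binomial model, the classification accuracy (a minimal sketch)
suggest_size(vs, stat='acc')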
}
}

projpred/man/get-refmodel.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/refmodel.R
\name{get-refmodel}
\alias{get-refmodel}
\alias{get_refmodel}
\alias{get_refmodel.refmodel}
\alias{get_refmodel.vsel}
\alias{get_refmodel.cvsel}
\alias{get_refmodel.stanreg}
\alias{get_refmodel.brmsfit}
\title{Get reference model structure}
\usage{
get_refmodel(object, ...)

\method{get_refmodel}{refmodel}(object, ...)

\method{get_refmodel}{vsel}(object, ...)

\method{get_refmodel}{cvsel}(object, ...)

\method{get_refmodel}{stanreg}(object, ...)

\method{get_refmodel}{brmsfit}(object, ...)
}
\arguments{
\item{object}{Object based on which the reference model is created. See possible types below.}

\item{...}{Arguments passed to the methods.}
}
\value{
An object of type \code{refmodel} (the same type as returned by \link{init_refmodel}) that can be passed to all the functions that take the reference fit as the first argument, such as \link{varsel}, \link{cv_varsel}, \link{project}, \link[=proj-pred]{proj_predict} and \link[=proj-pred]{proj_linpred}.
}
\description{
Generic function that can be used to create and fetch the reference model structure for all those objects that have this method. All these implementations are wrappers to the \code{\link{init_refmodel}} function, so the returned object has the same type.
}
\examples{
\donttest{
### Usage with stanreg objects
dat <- data.frame(y = rnorm(100), x = rnorm(100))
fit <- stan_glm(y ~ x, family = gaussian(), data = dat)
ref <- get_refmodel(fit)
print(class(ref))

# variable selection, use the already constructed reference model
vs <- varsel(ref)
# this will first construct the reference model and then execute
# exactly the same way as the previous command (the result is identical)
vs <- varsel(fit)
}
}

projpred/man/df_gaussian.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{df_gaussian}
\alias{df_gaussian}
\title{Gaussian toy example.}
\format{A simulated regression dataset containing 100 observations.
\describe{
\item{y}{target, real-valued.}
\item{x}{features, 20 in total. Mean and sd approximately 0 and 1.}
}}
\source{
\url{http://web.stanford.edu/~hastie/glmnet/glmnetData/QSExample.RData}
}
\usage{
df_gaussian
}
\description{
Gaussian toy example.
}
\keyword{datasets}

projpred/man/projpred.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/projpred-package.R
\docType{package}
\name{projpred}
\alias{projpred}
\title{Projection predictive feature selection}
\description{
\pkg{projpred} is an R package to perform projection predictive variable selection for generalized linear models. The package aims to be compatible with \pkg{rstanarm}, but other reference models can also be used (see function \code{\link{init_refmodel}}).

Currently, the supported models (family objects in R) include Gaussian, Binomial and Poisson families, but more will be implemented later. See the \href{https://mc-stan.org/projpred/articles/quickstart.html}{quickstart vignette} for examples.
}
\section{Functions}{
\describe{
\item{\link{varsel}, \link{cv_varsel}, \link{init_refmodel}, \link{suggest_size}}{Perform and cross-validate the variable selection. \link{init_refmodel} can be used to initialize a reference model other than an \pkg{rstanarm} fit.}
\item{\link{project}}{Get the projected posteriors of the reduced models.}
\item{\link{proj_predict}, \link{proj_linpred}}{Make predictions with a reduced number of features.}
\item{\link{varsel_plot}, \link{varsel_stats}}{Visualize and get some key statistics about the variable selection.}
}
}
\section{References}{
Dupuis, J. A. and Robert, C. P. (2003). Variable selection in qualitative models via an entropic explanatory power. \emph{Journal of Statistical Planning and Inference}, 111(1-2):77–94.

Goutis, C. and Robert, C. P. (1998). Model choice in generalised linear models: a Bayesian approach via Kullback–Leibler projections. \emph{Biometrika}, 85(1):29–37.

Juho Piironen and Aki Vehtari (2017). Comparison of Bayesian predictive methods for model selection. \emph{Statistics and Computing}, 27(3):711-735. doi:10.1007/s11222-016-9649-y. (\href{https://link.springer.com/article/10.1007/s11222-016-9649-y}{Online}).
}

projpred/man/print-vsel.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/methods.R
\name{print-vsel}
\alias{print-vsel}
\alias{print.vsel}
\alias{print.cvsel}
\title{Print methods for vsel/cvsel objects}
\usage{
\method{print}{vsel}(x, digits = 2, ...)

\method{print}{cvsel}(x, digits = 2, ...)
}
\arguments{
\item{x}{An object of class vsel/cvsel.}

\item{digits}{Number of decimal places to be reported (2 by default).}

\item{...}{Further arguments passed to \code{\link{varsel_stats}}.}
}
\value{
Returns invisibly the data frame produced by \code{\link{varsel_stats}}.
}
\description{
The \code{print} methods for vsel/cvsel objects created by \code{\link{varsel}} or \code{\link{cv_varsel}} rely on \code{\link{varsel_stats}} to display a summary of the results of the projection predictive variable selection.
}

projpred/man/df_binom.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/data.R
\docType{data}
\name{df_binom}
\alias{df_binom}
\title{Binomial toy example.}
\format{A simulated classification dataset containing 100 observations.
\describe{
\item{y}{target, 0 or 1.}
\item{x}{features, 30 in total.}
}}
\source{
\url{http://web.stanford.edu/~hastie/glmnet/glmnetData/BNExample.RData}
}
\usage{
df_binom
}
\description{
Binomial toy example.
}
\keyword{datasets}

projpred/man/project.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/project.R
\name{project}
\alias{project}
\title{Projection to submodels}
\usage{
project(
  object,
  nv = NULL,
  vind = NULL,
  relax = NULL,
  ns = NULL,
  nc = NULL,
  intercept = NULL,
  seed = NULL,
  regul = 1e-04,
  ...
)
}
\arguments{
\item{object}{Either a \code{refmodel}-type object created by \link[=get_refmodel]{get_refmodel} or \link[=init_refmodel]{init_refmodel}, or an object which can be converted to a reference model using \link[=get_refmodel]{get_refmodel}.}

\item{nv}{Number of variables in the submodel (the variable combination is taken from the \code{varsel} information). If a list, then the projection is performed for each model size. Default is the model size suggested by the variable selection (see function \code{suggest_size}). Ignored if \code{vind} is specified.}

\item{vind}{Variable indices onto which the projection is done. If specified, \code{nv} is ignored.}

\item{relax}{If TRUE, then the projected coefficients after L1-selection are computed without any penalization (or using only the regularization determined by \code{regul}). If FALSE, then the coefficients are the solution from the L1-penalized projection. This option is relevant only if L1-search was used. Default is TRUE for genuine reference models and FALSE if \code{object} is a datafit (see \link[=init_refmodel]{init_refmodel}).}

\item{ns}{Number of samples to be projected. Ignored if \code{nc} is specified. Default is 400.}

\item{nc}{Number of clusters in the clustered projection.}

\item{intercept}{Whether to use an intercept. Default is \code{TRUE}.}

\item{seed}{A seed used in the clustering (if \code{nc!=ns}). Can be used to ensure the same results every time.}

\item{regul}{Amount of regularization in the projection.
Usually there is no need for regularization, but for some models the projection can be ill-behaved and some regularization is needed to avoid numerical problems.}

\item{...}{Currently ignored.}
}
\value{
A list of submodels (or a single submodel if the projection was performed onto a single variable combination), each of which contains the following elements:
\describe{
\item{\code{kl}}{The KL divergence from the reference model to the submodel.}
\item{\code{weights}}{Weights for each draw of the projected model.}
\item{\code{dis}}{Draws from the projected dispersion parameter.}
\item{\code{alpha}}{Draws from the projected intercept.}
\item{\code{beta}}{Draws from the projected weight vector.}
\item{\code{vind}}{The order in which the variables were added to the submodel.}
\item{\code{intercept}}{Whether or not the model contains an intercept.}
\item{\code{family_kl}}{A modified \code{\link{family}} object.}
}
}
\description{
Perform projection onto submodels of selected sizes or a specified feature combination.
}
\examples{
\donttest{
### Usage with stanreg objects
fit <- stan_glm(y~x, binomial())
vs <- varsel(fit)

# project onto the best model with 4 variables
proj4 <- project(vs, nv = 4)

# project onto an arbitrary variable combination (variable indices 3, 4 and 8)
proj <- project(fit, vind=c(3,4,8))
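
# the projected posterior draws can be extracted as a matrix
# (a minimal sketch; as.matrix has a method for projection objects)
draws <- as.matrix(proj)
head(draws)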
}
}

projpred/man/varsel.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/varsel.R
\name{varsel}
\alias{varsel}
\title{Variable selection for generalized linear models}
\usage{
varsel(
  object,
  d_test = NULL,
  method = NULL,
  ns = NULL,
  nc = NULL,
  nspred = NULL,
  ncpred = NULL,
  relax = NULL,
  nv_max = NULL,
  intercept = NULL,
  penalty = NULL,
  verbose = F,
  lambda_min_ratio = 1e-05,
  nlambda = 150,
  thresh = 1e-06,
  regul = 1e-04,
  ...
)
}
\arguments{
\item{object}{Either a \code{refmodel}-type object created by \link[=get_refmodel]{get_refmodel} or \link[=init_refmodel]{init_refmodel}, or an object which can be converted to a reference model using \link[=get_refmodel]{get_refmodel}.}

\item{d_test}{A test dataset, which is used to evaluate model performance. If not provided, training data is used. Currently this argument is for internal use only.}

\item{method}{The method used in the variable selection. Possible options are \code{'L1'} for L1-search and \code{'forward'} for forward selection. Default is 'forward' if the number of variables in the full data is at most 20, and \code{'L1'} otherwise.}

\item{ns}{Number of posterior draws used in the variable selection. Cannot be larger than the number of draws in the reference model. Ignored if \code{nc} is set.}

\item{nc}{Number of clusters to use in the clustered projection. Overrides the \code{ns} argument. Defaults to 1.}

\item{nspred}{Number of samples used for prediction (after selection). Ignored if \code{ncpred} is given.}

\item{ncpred}{Number of clusters used for prediction (after selection). Default is 5.}

\item{relax}{If TRUE, then the projected coefficients after L1-selection are computed without any penalization (or using only the regularization determined by \code{regul}). If FALSE, then the coefficients are the solution from the L1-penalized projection. This option is relevant only if \code{method}='L1'. Default is TRUE for genuine reference models and FALSE if \code{object} is a datafit (see \link[=init_refmodel]{init_refmodel}).}

\item{nv_max}{Maximum number of variables until which the selection is continued. Defaults to min(20, D, floor(0.4*n)) where n is the number of observations and D the number of variables.}

\item{intercept}{Whether to use an intercept in the submodels. Defaults to TRUE.}

\item{penalty}{Vector determining the relative penalties or costs for the variables. Zero means that those variables have no cost and will therefore be selected first, whereas Inf means that those variables will never be selected. Currently works only if method == 'L1'. By default 1 for each variable.}

\item{verbose}{If TRUE, may print out some information during the selection. Defaults to FALSE.}

\item{lambda_min_ratio}{Ratio between the smallest and largest lambda in the L1-penalized search. This parameter essentially determines how long the search is carried out, i.e., how large submodels are explored. No need to change the default value unless the program gives a warning about this.}

\item{nlambda}{Number of values in the lambda grid for the L1-penalized search. No need to change unless the program gives a warning about this.}

\item{thresh}{Convergence threshold when computing the L1-path. Usually no need to change this.}

\item{regul}{Amount of regularization in the projection. Usually there is no need for regularization, but for some models the projection can be ill-behaved and some regularization is needed to avoid numerical problems.}

\item{...}{Additional arguments to be passed to the \code{get_refmodel} function.}
}
\value{
An object of type \code{vsel} that contains information about the feature selection. The fields are not meant to be accessed directly by the user but instead via the helper functions (see the vignettes or type ?projpred to see the main functions in the package).
}
\description{
Perform the projection predictive variable selection for generalized linear models using generic reference models.
}
\examples{
\donttest{
### Usage with stanreg objects
fit <- stan_glm(y~x, binomial())
vs <- varsel(fit)
varsel_plot(vs)
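
# summarize the selection and project onto the suggested model size
# (a minimal sketch using the suggest_size and project helpers)
varsel_stats(vs)
nv <- suggest_size(vs)
proj <- project(vs, nv = nv)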
}
}

projpred/man/cv_varsel.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cv_varsel.R
\name{cv_varsel}
\alias{cv_varsel}
\title{Cross-validate the variable selection (varsel)}
\usage{
cv_varsel(
  fit,
  method = NULL,
  cv_method = NULL,
  ns = NULL,
  nc = NULL,
  nspred = NULL,
  ncpred = NULL,
  relax = NULL,
  nv_max = NULL,
  intercept = NULL,
  penalty = NULL,
  verbose = T,
  nloo = NULL,
  K = NULL,
  lambda_min_ratio = 1e-05,
  nlambda = 150,
  thresh = 1e-06,
  regul = 1e-04,
  validate_search = T,
  seed = NULL,
  ...
)
}
\arguments{
\item{fit}{Same as in \link[=varsel]{varsel}.}

\item{method}{Same as in \link[=varsel]{varsel}.}

\item{cv_method}{The cross-validation method, either 'LOO' or 'kfold'. Default is 'LOO'.}

\item{ns}{Number of samples used for selection. Ignored if \code{nc} is provided or if method='L1'.}

\item{nc}{Number of clusters used for selection. Default is 1 and ignored if method='L1' (L1-search always uses one cluster).}

\item{nspred}{Number of samples used for prediction (after selection). Ignored if \code{ncpred} is given.}

\item{ncpred}{Number of clusters used for prediction (after selection). Default is 5.}

\item{relax}{Same as in \link[=varsel]{varsel}.}

\item{nv_max}{Same as in \link[=varsel]{varsel}.}

\item{intercept}{Same as in \link[=varsel]{varsel}.}

\item{penalty}{Same as in \link[=varsel]{varsel}.}

\item{verbose}{Whether to print out some information during the validation. Default is TRUE.}

\item{nloo}{Number of observations used to compute the LOO validation (anything between 1 and the total number of observations). Smaller values lead to faster computation but higher uncertainty (larger error bars) in the accuracy estimation. Default is to use all observations, but for faster experimentation, one can set this to a small value such as 100. Only applicable if \code{cv_method = 'LOO'}.}

\item{K}{Number of folds in the k-fold cross-validation. Default is 5 for genuine reference models and 10 for datafits (that is, for penalized maximum likelihood estimation).}

\item{lambda_min_ratio}{Same as in \link[=varsel]{varsel}.}

\item{nlambda}{Same as in \link[=varsel]{varsel}.}

\item{thresh}{Same as in \link[=varsel]{varsel}.}

\item{regul}{Amount of regularization in the projection. Usually there is no need for regularization, but for some models the projection can be ill-behaved and some regularization is needed to avoid numerical problems.}

\item{validate_search}{Whether to cross-validate also the selection process, that is, whether to perform the selection separately for each fold. Default is TRUE and we strongly recommend not setting this to FALSE, because this is known to bias the accuracy estimates for the selected submodels. However, setting this to FALSE can sometimes be useful because comparing the results to the case where this parameter is TRUE gives an idea of how strongly the feature selection is (over)fitted to the data (the difference corresponds to the search degrees of freedom or the effective number of parameters introduced by the selection process).}

\item{seed}{Random seed used in the subsampling LOO. By default uses a fixed seed.}

\item{...}{Additional arguments to be passed to the \code{get_refmodel} function.}
}
\value{
An object of type \code{cvsel} that contains information about the feature selection. The fields are not meant to be accessed directly by the user but instead via the helper functions (see the vignettes or type ?projpred to see the main functions in the package).
}
\description{
Perform cross-validation for the projective variable selection for a generalized linear model.
}
\examples{
\donttest{
### Usage with stanreg objects
fit <- stan_glm(y~x, binomial())
cvs <- cv_varsel(fit)
varsel_plot(cvs)
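
# subsampled LOO (nloo smaller than the number of observations) speeds up
# the validation at the cost of larger uncertainty in the accuracy
# estimates (a minimal sketch)
cvs_fast <- cv_varsel(fit, nloo = 100)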
}
}

projpred/man/cv-indices.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/methods.R
\name{cv-indices}
\alias{cv-indices}
\alias{cvfolds}
\alias{cvind}
\title{Create cross-validation indices}
\usage{
cvfolds(n, k, seed = NULL)

cvind(n, k, out = c("foldwise", "indices"), seed = NULL)
}
\arguments{
\item{n}{Number of data points.}

\item{k}{Number of folds. Must be at least 2 and not exceed \code{n}.}

\item{seed}{Random seed so that the same division could be obtained again if needed.}

\item{out}{Format of the output, either 'foldwise' (default) or 'indices'. See below for details.}
}
\value{
\code{cvfolds} returns a vector of length \code{n} such that each element is an integer between 1 and \code{k} denoting which fold the corresponding data point belongs to. The returned value of \code{cvind} depends on the \code{out} argument. If \code{out}='foldwise', the returned value is a list with \code{k} elements, each having fields \code{tr} and \code{ts} which give the training and test indices, respectively, for the corresponding fold. If \code{out}='indices', the returned value is a list with fields \code{tr} and \code{ts}, each of which is a list with \code{k} elements giving the training and test indices for each fold.
}
\description{
Divide indices from 1 to \code{n} into subsets for \code{k}-fold cross-validation. These functions are potentially useful when creating the \code{cvfits} and \code{cvfun} arguments for \link[=init_refmodel]{init_refmodel}. The returned value is different for these two methods; see below for details.
}
\examples{
\donttest{
### compute sample means within each fold
n <- 100
y <- rnorm(n)
cv <- cvind(n, k=5)
cvmeans <- lapply(cv, function(fold) mean(y[fold$tr]))
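
# cvfolds gives the same division as a single fold-index vector
# (a minimal sketch; compare with the foldwise cvind format above)
folds <- cvfolds(n, k=5)
foldmeans <- sapply(1:5, function(k) mean(y[folds != k]))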
}
}

projpred/man/predict.refmodel.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/refmodel.R
\name{predict.refmodel}
\alias{predict.refmodel}
\title{Predict method for reference model objects}
\usage{
\method{predict}{refmodel}(
  object,
  znew,
  ynew = NULL,
  offsetnew = NULL,
  weightsnew = NULL,
  type = c("response", "link"),
  ...
)
}
\arguments{
\item{object}{The object of class \code{refmodel}.}

\item{znew}{Matrix of predictor values used in the prediction.}

\item{ynew}{New (test) target variables. If given, then the log predictive density for the new observations is computed.}

\item{offsetnew}{Offsets for the new observations. By default a vector of zeros.}

\item{weightsnew}{Weights for the new observations. For a binomial model, corresponds to the number of trials per observation. Has effect only if \code{ynew} is specified. By default a vector of ones.}

\item{type}{Scale on which the predictions are returned. Either 'link' (the latent function value, from -inf to inf) or 'response' (the scale on which the target \code{y} is measured, obtained by taking the inverse-link from the latent value).}

\item{...}{Currently ignored.}
}
\value{
Returns either a vector of predictions, or a vector of log predictive densities evaluated at \code{ynew} if \code{ynew} is not \code{NULL}.
}
\description{
Compute the predictions using the reference model, that is, compute the expected value for the next observation, or evaluate the log-predictive density at a given point.
}

projpred/man/proj-pred.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/methods.R
\name{proj-pred}
\alias{proj-pred}
\alias{proj_linpred}
\alias{proj_predict}
\title{Extract draws of the linear predictor and draw from the predictive distribution of the projected submodel}
\usage{
proj_linpred(
  object,
  xnew,
  ynew = NULL,
  offsetnew = NULL,
  weightsnew = NULL,
  nv = NULL,
  transform = FALSE,
  integrated = FALSE,
  ...
)

proj_predict(
  object,
  xnew,
  offsetnew = NULL,
  weightsnew = NULL,
  nv = NULL,
  draws = NULL,
  seed_samp = NULL,
  ...
)
}
\arguments{
\item{object}{Either an object returned by \link[=varsel]{varsel}, \link[=cv_varsel]{cv_varsel} or \link[=init_refmodel]{init_refmodel}, or alternatively any object that can be converted to a reference model.}

\item{xnew}{The predictor values used in the prediction. If \code{vind} is specified, then \code{xnew} should either be a data frame containing column names that correspond to \code{vind} or a matrix with the number and order of columns corresponding to \code{vind}. If \code{vind} is unspecified, then \code{xnew} must either be a data frame containing all the column names as in the original data or a matrix with the same columns at the same positions as in the original data.}

\item{ynew}{New (test) target variables. If given, then the log predictive density for the new observations is computed.}

\item{offsetnew}{Offsets for the new observations. By default a vector of zeros.}

\item{weightsnew}{Weights for the new observations. For a binomial model, corresponds to the number of trials per observation. For \code{proj_linpred}, this argument matters only if \code{ynew} is specified. By default a vector of ones.}

\item{nv}{Number of variables in the submodel (the variable combination is taken from the variable selection information). If a vector with several values, then results for all specified model sizes are returned. Ignored if \code{vind} is specified. By default, the automatically suggested model size is used.}

\item{transform}{Should the linear predictor be transformed using the inverse-link function? Default is \code{FALSE}. For \code{proj_linpred} only.}

\item{integrated}{If \code{TRUE}, the output is averaged over the parameters. Default is \code{FALSE}. For \code{proj_linpred} only.}

\item{...}{Additional arguments passed to \link{project} if \code{object} is an object returned by \link{varsel} or \link{cv_varsel}.}

\item{draws}{Number of draws to return from the predictive distribution of the projection. The default is 1000. For \code{proj_predict} only.}

\item{seed_samp}{An optional seed to use for drawing from the projection. For \code{proj_predict} only.}
}
\value{
If the prediction is done for one submodel only (\code{nv} has length one or \code{vind} is specified) and \code{ynew} is unspecified, a matrix or vector of predictions (depending on the value of \code{integrated}). If \code{ynew} is specified, returns a list with elements pred (predictions) and lpd (log predictive densities). If the predictions are done for several submodel sizes, returns a list with one element for each submodel.
}
\description{
\code{proj_linpred} extracts draws of the linear predictor and \code{proj_predict} draws from the predictive distribution of the projected submodel or submodels. If the projection has not been performed, the functions also perform the projection.
}
\examples{
\donttest{
### Usage with stanreg objects
fit <- stan_glm(y~x, binomial())
vs <- varsel(fit)

# compute predictions with 4 variables at the training points
pred <- proj_linpred(vs, xnew=x, nv = 4)
pred <- proj_predict(vs, xnew=x, nv = 4)
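
# supplying ynew additionally evaluates the log predictive density, so
# the result is a list with elements pred and lpd (a minimal sketch;
# y is the observed response used to fit the model above)
pl <- proj_linpred(vs, xnew=x, ynew=y, nv = 4)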
}
}

projpred/man/varsel-statistics.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/methods.R
\name{varsel-statistics}
\alias{varsel-statistics}
\alias{varsel_plot}
\alias{varsel_stats}
\title{Plot or fetch summary statistics related to variable selection}
\usage{
varsel_plot(
  object,
  nv_max = NULL,
  stats = "elpd",
  deltas = F,
  alpha = 0.32,
  baseline = NULL,
  ...
)

varsel_stats(
  object,
  nv_max = NULL,
  stats = "elpd",
  type = c("mean", "se"),
  deltas = F,
  alpha = 0.32,
  baseline = NULL,
  ...
)
}
\arguments{
\item{object}{The object returned by \link[=varsel]{varsel} or \link[=cv_varsel]{cv_varsel}.}

\item{nv_max}{Maximum submodel size for which the statistics are calculated. For \code{varsel_plot} it must be at least 1.}

\item{stats}{One or several strings determining which statistics to calculate. Available statistics are:
\itemize{
\item{elpd:} {(Expected) sum of log predictive densities.}
\item{mlpd:} {Mean log predictive density, that is, elpd divided by the number of data points.}
\item{mse:} {Mean squared error (gaussian family only).}
\item{rmse:} {Root mean squared error (gaussian family only).}
\item{acc/pctcorr:} {Classification accuracy (binomial family only).}
\item{auc:} {Area under the ROC curve (binomial family only).}
}
Default is elpd.}

\item{deltas}{If \code{TRUE}, the submodel statistics are estimated relative to the baseline model (see argument \code{baseline}) instead of estimating the actual values of the statistics. Defaults to \code{FALSE}.}

\item{alpha}{A number indicating the desired coverage of the credible intervals. For example \code{alpha=0.32} corresponds to 68\% probability mass within the intervals, that is, one-standard-error intervals.}

\item{baseline}{Either 'ref' or 'best' indicating whether the baseline is the reference model or the best submodel found. Default is 'ref' when the reference model exists, and 'best' otherwise.}

\item{...}{Currently ignored.}

\item{type}{One or more items from 'mean', 'se', 'lower' and 'upper' indicating which of these to compute (mean, standard error, and lower and upper credible bounds). The credible bounds are determined so that \code{1-alpha} percent of the mass falls between them.}
}
\description{
\code{varsel_stats} can be used to obtain summary statistics related to variable selection. The same statistics can be plotted with \code{varsel_plot}.
}
\examples{
\donttest{
### Usage with stanreg objects
fit <- stan_glm(y~x, binomial())
vs <- cv_varsel(fit)
varsel_plot(vs)

# print out some stats
varsel_stats(vs, stats=c('acc'), type = c('mean','se'))
}
}

projpred/DESCRIPTION

Package: projpred
Title: Projection Predictive Feature Selection
Version: 1.1.5
Authors@R: c(person("Juho", "Piironen", role = c("cre", "aut"), email = "juho.t.piironen@gmail.com"),
    person("Markus", "Paasiniemi", role = "aut"),
    person("Aki", "Vehtari", role = "aut"),
    person("Jonah", "Gabry", role = "ctb"),
    person("Paul-Christian", "Bürkner", role = "ctb"),
    person("Marco", "Colombo", role = "ctb"))
Maintainer: Juho Piironen <juho.t.piironen@gmail.com>
Description: Performs projection predictive feature selection for generalized linear models (see Piironen, Paasiniemi and Vehtari, 2018). The package is compatible with the 'rstanarm' and 'brms' packages, but other reference models can also be used. See the package vignette for more information and examples.
Depends: R (>= 3.5.0)
Imports: loo (>= 2.0.0), ggplot2, Rcpp, utils
LinkingTo: Rcpp, RcppArmadillo
License: GPL-3
Encoding: UTF-8
LazyData: TRUE
RoxygenNote: 7.0.2
Suggests: rstanarm, brms, testthat, knitr, rmarkdown, glmnet, bayesplot (>= 1.5.0)
URL: https://mc-stan.org/projpred, https://discourse.mc-stan.org/
BugReports: https://github.com/stan-dev/projpred/issues
NeedsCompilation: yes
Packaged: 2020-01-29 17:35:31 UTC; juho
Author: Juho Piironen [cre, aut],
  Markus Paasiniemi [aut],
  Aki Vehtari [aut],
  Jonah Gabry [ctb],
  Paul-Christian Bürkner [ctb],
  Marco Colombo [ctb]
Repository: CRAN
Date/Publication: 2020-01-30 10:10:02 UTC

projpred/tests/testthat/test_glm_ridge.R

context("ridge")

# tests for ridge regression; currently untested combinations:
# - gaussian with inverse-link
# - binomial with log or cloglog-link
# - poisson with sqrt or id-link
# - Gamma with inverse or id-link
# - everything except gaussian with id-link for ridge penalty

set.seed(1235)
n <- 40
nv <- 10
nv_fit <- nv - 5
x <- matrix(rnorm(n*nv, 0, 1), n, nv)
b <- runif(nv) - 0.5
dis <- runif(1, 1, 2)
x_tr <- x[, 1:nv_fit]
weights <- sample(1:4, n, replace = T)
offset <- rnorm(n, 0, 1)

tol <- 1e-03
# some link functions seem to need a higher thresh argument for glm_ridge
# (gaussian-log, binomial-cauchit, Gamma-log)
extra_thresh <- 1e-10

test_that("glmfun: gradients should give the same results as finite differences", {
  fdiffu <- function(f, x, h=1e-3, order=1) {
    # function for computing the derivative of a univariate function f at x
    # using finite differences
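    # (order 1 uses the forward difference (f(x+h) - f(x)) / h; order 2 uses
    # the central second difference, evaluating f at x-h, x and x+h)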
    if (order != 1 && order != 2)
      stop('Order must be either 1 or 2.')
    n <- length(x)
    df <- rep(0, n)
    for (i in 1:n) {
      if (order == 1)
        df[i] <- (f(x[i] + h) - f(x[i])) / h
      else if (order == 2)
        df[i] <- (f(x[i] + h) - 2*f(x[i]) + f(x[i] - h)) / h^2
    }
    return(df)
  }

  fams <- list(kl_helpers(gaussian(link='identity')),
               kl_helpers(gaussian(link='log')),
               kl_helpers(binomial(link='logit')),
               kl_helpers(binomial(link='probit')),
               kl_helpers(binomial(link='cauchit')),
               kl_helpers(poisson(link='log')),
               kl_helpers(Student_t(nu=3, link='identity')),
               kl_helpers(Student_t(nu=4, link='log')),
               kl_helpers(Student_t(nu=7, link='inverse')))
  n <- 10
  weights <- sample(1:4, n, replace = T)
  offset <- rnorm(n, 0, 1)

  for (i in seq_along(fams)) {
    fam <- fams[[i]]
    if (fam$family == 'gaussian' || fam$family == 'Student_t')
      y <- rnorm(n)
    else if (fam$family == 'binomial')
      y <- rbinom(n, 1, 0.6)
    else if (fam$family == 'poisson')
      y <- rpois(n, 1)

    devfun <- function(f) projpred:::pseudo_data(f,y,fam,weights=weights,offset=offset)$loss
    zfun <- function(f) projpred:::pseudo_data(f,y,fam,weights=weights,offset=offset)$z
    wfun <- function(f) projpred:::pseudo_data(f,y,fam,weights=weights,offset=offset)$w
    gradan <- function(f) sum(projpred:::pseudo_data(f,y,fam,weights=weights,offset=offset)$grad) # analytic
    gradfd <- function(f) fdiffu(devfun, f, h=1e-5) # finite difference

    # compare analytic and finite difference gradients
    fval <- seq(-5, 5, len=100)
    gan <- sapply(fval, gradan)
    gfd <- sapply(fval, gradfd)
    expect_equal(gan, gfd, tol=1e-4*max(abs(gan), abs(gfd)))
  }
})

test_that("glm_ridge: gaussian, id-link, intercept, lambda = 0", {
  fam <- kl_helpers(gaussian(link = 'identity'))
  y <- rnorm(n, x%*%b, dis)
  lambda <- 0

  glmfit <- glm(y ~ x_tr, family = fam, weights = weights, offset = offset)
  ridgefit <- glm_ridge(x_tr, y, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = TRUE)

  expect_equal(unname(coef(glmfit)), c(ridgefit$beta0, ridgefit$beta),
               tolerance = tol)
})

test_that("glm_ridge: gaussian, id-link, no intercept, lambda = 0", {
  fam <- kl_helpers(gaussian(link = 'identity'))
  y <- rnorm(n, x%*%b, dis)
  lambda <- 0

  glmfit <- glm(y ~ x_tr - 1, family = fam, weights = weights, offset = offset)
  ridgefit <- glm_ridge(x_tr, y, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = FALSE)

  expect_equal(unname(coef(glmfit)), c(ridgefit$beta), tolerance = tol)
})

test_that("glm_ridge: gaussian, id-link, intercept, lambda = 0.5", {
  fam <- kl_helpers(gaussian(link = 'identity'))
  y <- rnorm(n, x%*%b, dis)
  lambda <- 0.5

  ridgefit <- glm_ridge(x_tr, y, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = TRUE)

  # analytic solution, no penalty on the intercept term
  penalty <- 0.5*diag(c(0, rep(lambda, nv_fit)))
  exp_beta <- c(solve(crossprod(cbind(1, x_tr) * sqrt(weights)) + penalty,
                      crossprod(cbind(1, x_tr) * weights, y - offset)))

  expect_equal(exp_beta, c(ridgefit$beta0, ridgefit$beta), tolerance = tol)
})

test_that("glm_ridge: gaussian, log-link, intercept, lambda = 0", {
  fam <- kl_helpers(gaussian(link = 'log'))
  # intercept of 4 to ensure that y are positive
  y <- rnorm(n, fam$linkinv(x%*%b + 4), dis)
  lambda <- 0

  glmfit <- glm(y ~ x_tr, family = fam, weights = weights, offset = offset)
  ridgefit <- glm_ridge(x_tr, y, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = TRUE,
                        thresh = extra_thresh)

  expect_equal(unname(coef(glmfit)), c(ridgefit$beta0, ridgefit$beta),
               tolerance = tol)
})

test_that("glm_ridge: binomial, logit-link, intercept, lambda = 0", {
  fam <- kl_helpers(binomial(link = 'logit'))
  y <- rbinom(n, weights, fam$linkinv(x%*%b))
  lambda <- 0

  glmfit <- glm(cbind(y, weights-y) ~ x_tr, family = fam, offset = offset)
  ridgefit <- glm_ridge(x_tr, y/weights, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = TRUE)

  expect_equal(unname(coef(glmfit)), c(ridgefit$beta0, ridgefit$beta),
               tolerance = tol)
})

test_that("glm_ridge: binomial, logit-link, no intercept, lambda = 0", {
  fam <- kl_helpers(binomial(link = 'logit'))
  y <- rbinom(n, weights, fam$linkinv(x%*%b))
  lambda <- 0

  glmfit <- glm(cbind(y, weights-y) ~ x_tr - 1, family = fam, offset = offset)
  ridgefit <- glm_ridge(x_tr, y/weights, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = FALSE)

  expect_equal(unname(coef(glmfit)), c(ridgefit$beta), tolerance = tol)
})

test_that("glm_ridge: binomial, probit-link, intercept, lambda = 0", {
  fam <- kl_helpers(binomial(link = 'probit'))
  y <- rbinom(n, weights, fam$linkinv(x%*%b))
  lambda <- 0

  glmfit <- glm(cbind(y, weights-y) ~ x_tr, family = fam, offset = offset)
  ridgefit <- glm_ridge(x_tr, y/weights, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = TRUE,
                        thresh = extra_thresh)

  expect_equal(unname(coef(glmfit)), c(ridgefit$beta0, ridgefit$beta),
               tolerance = tol)
})

test_that("glm_ridge: binomial, cauchit-link, intercept, lambda = 0", {
  fam <- kl_helpers(binomial(link = 'cauchit'))
  y <- rbinom(n, weights, fam$linkinv(x%*%b))
  lambda <- 0

  glmfit <- glm(cbind(y, weights-y) ~ x_tr, family = fam, offset = offset)
  ridgefit <- glm_ridge(x_tr, y/weights, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = TRUE,
                        thresh = extra_thresh)

  expect_equal(unname(coef(glmfit)), c(ridgefit$beta0, ridgefit$beta),
               tolerance = tol)
})

test_that("glm_ridge: poisson, log-link, intercept, lambda = 0", {
  fam <- kl_helpers(poisson(link = 'log'))
  y <- rpois(n, fam$linkinv(x%*%b))
  lambda <- 0

  glmfit <- glm(y ~ x_tr, family = fam, weights = weights, offset = offset)
  ridgefit <- glm_ridge(x_tr, y, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = TRUE)

  expect_equal(unname(coef(glmfit)), c(ridgefit$beta0, ridgefit$beta),
               tolerance = tol)
})

test_that("glm_ridge: poisson, log-link, no intercept, lambda = 0", {
  fam <- kl_helpers(poisson(link = 'log'))
  y <- rpois(n, fam$linkinv(x%*%b))
  lambda <- 0

  glmfit <- glm(y ~ x_tr - 1, family = fam, weights = weights, offset = offset)
  ridgefit <- glm_ridge(x_tr, y, family = fam, lambda = lambda,
                        weights = weights, offset = offset, intercept = FALSE)

  expect_equal(unname(coef(glmfit)), c(ridgefit$beta), tolerance = tol)
})

# test_that("glm_ridge: Gamma, log-link, intercept, lambda = 0", {
#   fam <- kl_helpers(Gamma(link = 'log'))
#   y <- rgamma(n, fam$linkinv(x%*%b + 1))
#   lambda <- 0
#
#   glmfit <- glm(y ~ x_tr, family = fam, weights = weights, offset = offset)
#   ridgefit <- glm_ridge(x_tr, y, family = fam, lambda = lambda,
#                         weights = weights, offset = offset, intercept = TRUE,
#                         thresh = extra_thresh)
#
#   expect_equal(unname(coef(glmfit)), c(ridgefit$beta0, ridgefit$beta),
#                tolerance = tol)
# })

projpred/tests/testthat/test_cvindices.R

context('cv-indices')

# tests for cvfolds and cvind

test_that('k is valid', {
  cvfuns <- c(cvfolds, cvind)
  for (cvfun in cvfuns) {
    expect_error(cvfun(n = 10, k = 1000), 'cannot exceed n')
    expect_error(cvfun(n = 10, k = 1), 'must be at least 2')
    expect_error(cvfun(n = 10, k = c(4, 9)), 'a single integer value')
expect_error(cvfun(n = 10, k = 'a'), 'a single integer value') } }) test_that('cvfolds produces sensible results', { out <- cvfolds(n = 10, k = 3) expect_equal(length(out), 10) expect_equal(min(out), 1) expect_equal(max(out), 3) }) test_that('cvind checks the \'out\' argument', { expect_error(cvind(n = 10, k = 3, out = 'zzz'), '\'arg\' should be one of') expect_error(cvind(n = 10, k = 3, out = c('yyy', 'zzz')), 'must be of length 1') expect_error(cvind(n = 10, k = 3, out = 12), 'must be NULL or a character vector') }) test_that('cvind produces sensible results with out=\'foldwise\'', { out <- cvind(n = 10, k = 3, out = 'foldwise') expect_equal(length(out), 3) expect_named(out[[1]], c("tr", "ts")) }) test_that('cvind produces sensible results with out=\'indices\'', { out <- cvind(n = 10, k = 3, out = 'indices') expect_equal(length(out), 2) expect_named(out, c("tr", "ts")) expect_equal(length(out$tr), 3) expect_equal(length(out$ts), 3) }) projpred/tests/testthat/helpers/0000755000176200001440000000000013345277366016563 5ustar liggesusersprojpred/tests/testthat/helpers/SW.R0000644000176200001440000000007413345277366017240 0ustar liggesusersSW <- function(expr) capture.output(suppressWarnings(expr)) projpred/tests/testthat/test_syntax.R0000644000176200001440000000173713612376163017631 0ustar liggesuserscontext('syntax') # test for simple fits but with varying syntax (e.g. varying formula etc.) if (require(rstanarm)) { # load the mesquite data data('mesquite', package = 'projpred') # fit the model with some transformations on the target variable and the original inputs SW( fit <- stan_glm(log(LeafWt) ~ log(Diam1) + log(Diam2) + log(CanHt) + log(TotHt) + log(Dens) + log(Diam1)*log(Diam2) + Group, data = mesquite, QR=TRUE, refresh=0, chain=2) ) # selection vs <- varsel(fit) SW(cvs <- cv_varsel(fit, verbose=F)) ssize <- suggest_size(cvs) # project onto some model size proj <- project(cvs, nv = 3) test_that('varsel/cv_varsel/project return objects with correct types', { expect_true('vsel' %in% class(vs)) expect_true('cvsel' %in% class(cvs)) expect_true('projection' %in% class(proj)) }) test_that('suggested model size is ok', { expect_true(!is.na(ssize)) }) } projpred/tests/testthat/test_proj_pred.R0000644000176200001440000003316313612376041020260 0ustar liggesuserscontext('proj_linpred') # tests for proj_linpred and proj_predict if (require(rstanarm)) { seed <- 1235 set.seed(seed) n <- 40 nv <- 5 x <- matrix(rnorm(n*nv, 0, 1), n, nv) b <- runif(nv)-0.5 dis <- runif(1, 1, 2) weights <- sample(1:4, n, replace = T) offset <- rnorm(n) chains <- 2 iter <- 500 source(file.path('helpers', 'SW.R')) f_gauss <- gaussian() df_gauss <- data.frame(y = rnorm(n, f_gauss$linkinv(x%*%b), dis), x = x) f_binom <- binomial() df_binom <- data.frame(y = rbinom(n, weights, f_binom$linkinv(x%*%b)), x = x) f_poiss <- poisson() df_poiss <- data.frame(y = rpois(n, f_poiss$linkinv(x%*%b)), x = x) ys <- list() ys[[1]] <- df_gauss$y ys[[2]] <- df_binom$y/weights ys[[3]] <- df_poiss$y SW({ fit_gauss <- stan_glm(y ~ x, family = f_gauss, data = df_gauss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) fit_binom <- stan_glm(cbind(y, weights-y) ~ x, family = f_binom, QR = T, data = df_binom, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) fit_poiss <- stan_glm(y ~ x, family = f_poiss, data = df_poiss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) }) fit_list <- list(gauss = fit_gauss, binom = fit_binom, poiss = fit_poiss) 
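# Shared fixtures for the proj_linpred/proj_predict tests below: variable
# selection is run once per family, and projections are precomputed both for
# a fixed variable subset (vind = c(2,3)) and for every submodel size 0:nv.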
vs_list <- lapply(fit_list, varsel, nv_max = nv, verbose = FALSE) proj_vind_list <- lapply(vs_list, project, vind = c(2,3), seed = seed) proj_all_list <- lapply(vs_list, project, intercept = FALSE, seed = seed, nv=0:nv) test_that("proj_linpred: xnew is specified correctly", { expect_error(proj_linpred(proj_vind_list), 'argument "xnew" is missing, with no default') expect_error(proj_linpred(proj_vind_list, xnew = NULL), 'must be a data.frame or a matrix') expect_error(proj_linpred(proj_vind_list, xnew = x[, 1]), 'must be a data.frame or a matrix') expect_error(proj_linpred(proj_vind_list, xnew = x, vind = 1:1000), 'number of columns in xnew does not match') expect_error(proj_linpred(proj_vind_list, xnew = x[, 1:2]), 'xnew has 2 columns, but vind expects 3 columns') }) test_that("output of proj_linpred is sensible with fit-object as input", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] pl <- proj_linpred(vs_list[[i]], xnew = x, nv = 0:nv) expect_length(pl, nv + 1) for(j in 1:length(pl)) expect_equal(ncol(pl[[j]]), n, info = i_inf) } }) test_that("output of proj_linpred is sensible with project-object as input", { for(i in 1:length(proj_vind_list)) { i_inf <- names(proj_vind_list)[i] pl <- proj_linpred(proj_vind_list[[i]], xnew = x) expect_equal(ncol(pl), n, info = i_inf) } for(i in 1:length(proj_all_list)) { i_inf <- names(proj_all_list)[i] pl <- proj_linpred(proj_all_list[[i]], xnew = x) expect_length(pl, nv + 1) for(j in 1:length(pl)) expect_equal(ncol(pl[[j]]), n, info = i_inf) } }) test_that("proj_linpred: error when varsel has not been performed on the object", { expect_error(proj_linpred(1, xnew = x), 'is not a variable selection object') expect_error(proj_linpred(fit_gauss, xnew = x), 'is not a variable selection object') expect_error(proj_linpred(c(proj_vind_list, list(x)), xnew = x), 'contains objects not created by varsel') }) test_that("proj_linpred: specifying ynew incorrectly produces an error", { expect_error(proj_linpred(vs_list[["gauss"]], xnew = x, ynew = x[, 1:3]), 'y cannot have more than two columns') expect_error(proj_linpred(vs_list[["gauss"]], xnew = x, ynew = factor(ys[[1]])), 'cannot be a factor') expect_error(proj_linpred(vs_list[["poiss"]], xnew = x, ynew = factor(ys[[3]])), 'cannot be a factor') expect_error(proj_linpred(vs_list[["binom"]], xnew = x, ynew = ys[[1]]), 'y values must be 0 <= y <= 1 for the binomial model') expect_error(proj_linpred(vs_list[["binom"]], xnew = x, ynew = factor(ys[[1]])), 'y cannot contain more than two classes') }) test_that("proj_linpred: specifying ynew has an expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] pl <- proj_linpred(vs_list[[i]], xnew = x, ynew = ys[[i]], weightsnew=weights, nv = 0:nv) pl2 <- proj_linpred(vs_list[[i]], xnew = x, weightsnew=weights, nv = 0:nv) for(j in 1:length(pl)) { expect_named(pl[[j]], c('pred', 'lpd')) expect_equal(ncol(pl[[j]]$pred), n, info = i_inf) expect_equal(ncol(pl[[j]]$lpd), n, info = i_inf) expect_equal(ncol(pl2[[j]]), n, info = i_inf) } } }) test_that("proj_linpred: specifying ynew as a factor works in a binomial model", { yfactor <- factor(rbinom(n, 1, 0.5)) pl <- proj_linpred(vs_list[["binom"]], xnew = x, ynew = yfactor) expect_named(pl, c('pred', 'lpd')) expect_equal(ncol(pl$pred), n) expect_equal(ncol(pl$lpd), n) }) test_that("proj_linpred: specifying weights has an expected effect", { for(i in 1:length(proj_vind_list)) { # for binomial models weights have to be specified if (proj_vind_list[[i]]$family_kl$family != 'binomial') { i_inf <- 
names(proj_vind_list)[i] plw <- proj_linpred(proj_vind_list[[i]], xnew = x, ynew = ys[[i]], weightsnew = weights) pl <- proj_linpred(proj_vind_list[[i]], xnew = x, ynew = ys[[i]]) expect_named(plw, c('pred', 'lpd')) expect_equal(ncol(plw$pred), n, info = i_inf) expect_equal(ncol(plw$lpd), n, info = i_inf) expect_true(sum(plw$lpd != pl$lpd) > 0, info = i_inf) } } }) test_that("proj_linpred: specifying offset has an expected effect", { for(i in 1:length(proj_vind_list)) { i_inf <- names(proj_vind_list)[i] plo <- proj_linpred(proj_vind_list[[i]], xnew = x, ynew = ys[[i]], weightsnew=weights, offsetnew = offset) pl <- proj_linpred(proj_vind_list[[i]], xnew = x, ynew = ys[[i]], weightsnew=weights) expect_named(plo, c('pred', 'lpd')) expect_equal(ncol(plo$pred), n, info = i_inf) expect_equal(ncol(plo$lpd), n, info = i_inf) expect_true(sum(plo$lpd != pl$lpd) > 0, info = i_inf) } }) test_that("proj_linpred: specifying transform has an expected effect", { for(i in 1:length(proj_vind_list)) { i_inf <- names(proj_vind_list)[i] plt <- proj_linpred(proj_vind_list[[i]], xnew = x, transform = TRUE) plf <- proj_linpred(proj_vind_list[[i]], xnew = x, transform = FALSE) expect_equal(proj_vind_list[[i]]$family_kl$linkinv(plf), plt, info = i_inf) } }) test_that("proj_linpred: specifying integrated has an expected effect", { for(i in 1:length(proj_vind_list)) { i_inf <- names(proj_vind_list)[i] plt <- proj_linpred(proj_vind_list[[i]], xnew = x, integrated = TRUE) plf <- proj_linpred(proj_vind_list[[i]], xnew = x, integrated = FALSE) expect_equal(drop(proj_vind_list[[i]]$weights%*%plf), plt, info = i_inf) } }) test_that("proj_linpred: adding more regularization has an expected effect", { regul <- c(1e-6, 1e-1, 1e2) for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] norms <- rep(0, length(regul)) for (j in 1:length(regul)) { pred <- proj_linpred(vs_list[[i]], xnew = x, nv = 2, transform = FALSE, integrated = TRUE, regul=regul[j]) norms[j] <- sum(pred^2) } for (j in 1:(length(regul)-1)) expect_true(all(norms[j] >= norms[j+1]), info = i_inf) } }) test_that("proj_linpred: arguments passed to project work accordingly", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] pr <- project(vs_list[[i]], nv = c(2, 4), nc = 2, ns = 20, intercept = FALSE, regul = 1e-8, seed = 12) prl1 <- proj_linpred(pr, xnew = x) prl2 <- proj_linpred(vs_list[[i]], xnew = x, nv = c(2, 4), nc = 2, ns = 20, intercept = FALSE, regul = 1e-8, seed = 12) expect_equal(prl1, prl2, info = i_inf) } }) test_that("proj_linpred: providing xnew as a data frame works as expected", { for(i in 1:length(proj_vind_list)) { i_inf <- names(proj_vind_list)[i] pl <- proj_predict(proj_vind_list[[i]], xnew = setNames(data.frame(x), paste0('x',1:NCOL(x)))) expect_equal(ncol(pl), n, info = i_inf) } SW( fit_form <- stan_glm(mpg~(drat + wt)^2, data = mtcars, QR = T, chains = chains, seed = seed, iter = iter) ) vs_form <- varsel(fit_form) p1 <- proj_linpred(vs_form, xnew = mtcars, nv = 3, seed = 2) p2 <- proj_linpred(vs_form, xnew = get_x(fit_form)[,-1], nv = 3, seed = 2) expect_equal(p1, p2) }) # ------------------------------------------------------------- context('proj_predict') test_that("proj_predict: xnew is specified correctly", { expect_error(proj_predict(proj_vind_list), 'argument "xnew" is missing, with no default') expect_error(proj_predict(proj_vind_list, xnew = NULL), 'must be a data.frame or a matrix') expect_error(proj_predict(proj_vind_list, xnew = x[, 1]), 'must be a data.frame or a matrix') expect_error(proj_predict(proj_vind_list, xnew = 
x, vind = 1:1000), 'number of columns in xnew does not match') expect_error(proj_predict(proj_vind_list, xnew = x[, 1:2]), 'xnew has 2 columns, but vind expects 3 columns') }) test_that("output of proj_predict is sensible with fit-object as input", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] pl <- proj_predict(vs_list[[i]], xnew = x, nv = 0:nv) expect_length(pl, nv + 1) for(j in 1:length(pl)) expect_equal(ncol(pl[[j]]), n, info = i_inf) } }) test_that("output of proj_predict is sensible with project-object as input", { for(i in 1:length(proj_vind_list)) { i_inf <- names(proj_vind_list)[i] pl <- proj_predict(proj_vind_list[[i]], xnew = x) expect_equal(ncol(pl), n, info = i_inf) } for(i in 1:length(proj_all_list)) { i_inf <- names(proj_all_list)[i] pl <- proj_predict(proj_all_list[[i]], xnew = x) expect_length(pl, nv + 1) for(j in 1:length(pl)) expect_equal(ncol(pl[[j]]), n, info = i_inf) } }) test_that("proj_predict: error when varsel has not been performed on the object", { expect_error(proj_predict(1, xnew = x), 'is not a variable selection object') expect_error(proj_predict(fit_gauss, xnew = x), 'is not a variable selection object') expect_error(proj_predict(c(proj_vind_list, list(x)), xnew = x), 'contains objects not created by varsel') }) test_that("proj_predict: specifying ynew has an expected effect", { for (i in 1:length(vs_list)) { pl <- proj_predict(vs_list[[i]], xnew = x, ynew = ys[[i]], nv = 0:3) pl2 <- proj_predict(vs_list[[i]], xnew = x, nv = 0:3) for (j in 1:length(pl)) { expect_equal(dim(pl[[j]]), dim(pl2[[j]])) } } }) test_that("proj_predict: specifying ynew as a factor works in a binomial model", { yfactor <- factor(rbinom(n, 1, 0.5)) pl <- proj_predict(vs_list[["binom"]], xnew = x, ynew = yfactor) expect_equal(ncol(pl), n) expect_true(all(pl %in% c(0, 1))) }) test_that("proj_predict: specifying weightsnew has an expected effect", { pl <- proj_predict(proj_vind_list[['binom']], xnew = x, seed = seed) plw <- proj_predict(proj_vind_list[['binom']], xnew = x, seed = seed, weightsnew = weights) expect_true(sum(pl != plw)>0) }) test_that("proj_predict: specifying offsetnew has an expected effect", { for(i in 1:length(proj_vind_list)) { i_inf <- names(proj_vind_list)[i] pl <- proj_predict(proj_vind_list[[i]], xnew = x, draws = iter, seed = seed) plo <- proj_predict(proj_vind_list[[i]], xnew = x, draws = iter, seed = seed, offsetnew = offset) expect_true(sum(pl != plo) > 0, info = i_inf) } }) test_that("proj_predict: specifying draws has an expected effect", { for(i in 1:length(proj_vind_list)) { i_inf <- names(proj_vind_list)[i] pl <- proj_predict(proj_vind_list[[i]], xnew = x, draws = iter) expect_equal(dim(pl), c(iter, n)) } }) test_that("proj_predict: specifying seed_sam has an expected effect", { for(i in 1:length(proj_vind_list)) { i_inf <- names(proj_vind_list)[i] pl1 <- proj_predict(proj_vind_list[[i]], xnew = x, seed_samp = seed) pl2 <- proj_predict(proj_vind_list[[i]], xnew = x, seed_samp = seed) expect_equal(pl1, pl2, info = i_inf) } }) test_that("proj_predict: arguments passed to project work accordingly", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] pr1 <- project(vs_list[[i]], nv = c(2, 4), nc = 2, ns = 20, intercept = FALSE, regul = 1e-8, seed = 12) prp1 <- proj_predict(pr1, xnew = x, draws = 100, seed_samp = 11) prp2 <- proj_predict(vs_list[[i]], xnew = x, draws = 100, seed_samp = 11, nv = c(2, 4), nc = 2, ns = 20, intercept = FALSE, regul = 1e-8, seed = 12) expect_equal(prp1, prp2, info = i_inf) } }) 
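# A minimal, non-executed sketch (guarded by if (FALSE)) of the equivalence
# exercised in the test above: passing projection arguments through
# proj_predict should agree with projecting first and then predicting from
# the resulting object. The argument values here are illustrative only.
if (FALSE) {
  pr <- project(vs_list[[1]], nv = 2, ns = 20, seed = 12)
  p1 <- proj_predict(pr, xnew = x, draws = 100, seed_samp = 11)
  p2 <- proj_predict(vs_list[[1]], xnew = x, draws = 100, seed_samp = 11,
                     nv = 2, ns = 20, seed = 12)
  stopifnot(isTRUE(all.equal(p1, p2)))
}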
}projpred/tests/testthat/test_refmodel.R0000644000176200001440000000540113612376105020064 0ustar liggesuserscontext('refmodel') # tests for generic reference model if (require(rstanarm)) { seed <- 1235 set.seed(seed) n <- 50 nv <- 5 x <- matrix(rnorm(n*nv, 0, 1), n, nv) b <- runif(nv)-0.5 dis <- runif(1, 1, 2) weights <- sample(1:4, n, replace = T) chains <- 2 iter <- 500 offset <- rnorm(n) source(file.path('helpers', 'SW.R')) f_gauss <- gaussian() df_gauss <- data.frame(y = rnorm(n, f_gauss$linkinv(x%*%b), dis), x = I(x)) f_binom <- binomial() df_binom <- data.frame(y = rbinom(n, weights, f_binom$linkinv(x%*%b)), x = I(x)) SW({ fit_gauss <- stan_glm(y ~ x, family = f_gauss, data = df_gauss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) fit_binom <- stan_glm(cbind(y, weights-y) ~ x, family = f_binom, QR = T, data = df_binom, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) ref_gauss <- get_refmodel(fit_gauss) ref_binom <- get_refmodel(fit_binom) }) test_that('get_refmodel produces sensible results', { expect_s3_class(ref_gauss, "refmodel") expect_s3_class(ref_binom, "refmodel") }) test_that('get_refmode checks for the absence of data', { SW({ fit_nodata <- stan_glm(df_gauss$y ~ x, family = f_gauss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) }) expect_error(get_refmodel(fit_nodata), 'Model was fitted without a \'data\' argument') }) test_that('predict checks the \'type\' argument', { expect_error(predict(ref_gauss, df_gauss, type = 'zzz'), '\'arg\' should be one of') }) test_that('predict produces sensible results for gaussian models', { out.resp <- predict(ref_gauss, df_gauss, type = 'response') expect_vector(out.resp) expect_length(out.resp, nrow(df_gauss)) out.link <- predict(ref_gauss, df_gauss, type = 'link') expect_equal(out.resp, out.link) }) test_that('predict produces sensible results for binomial models', { out.resp <- predict(ref_binom, df_binom, type = 'response') expect_vector(out.resp) expect_length(out.resp, nrow(df_binom)) expect_true(all(out.resp >= 0 & out.resp <= 1)) out.link <- predict(ref_binom, df_binom, type = 'link') expect_length(out.resp, nrow(df_binom)) }) test_that('predict produces sensible results when specifying ynew', { out <- predict(ref_gauss, df_gauss, ynew = df_gauss$y) expect_vector(out) expect_length(out, length(df_gauss$y)) expect_error(predict(ref_gauss, df_gauss, ynew = df_gauss), 'must be a numerical vector') }) }projpred/tests/testthat/test_glm_elnet.R0000644000176200001440000001552113361364246020245 0ustar liggesuserscontext("elnet") # tests for glm_elnet if (!requireNamespace('glmnet', quietly = TRUE)) { stop('glmnet needed for this function to work. Please install it.', call. 
= FALSE) } set.seed(1235) n <- 40 nv <- 10 nv_fit <- nv - 5 x <- matrix(rnorm(n*nv, 0, 1), n, nv) b <- c(seq(0,1,length.out = nv_fit), rep(0, nv-nv_fit)) #runif(nv)-0.5 dis <- runif(1, 0.3,0.5) # runif(1, 1, 2) x_tr <- x[,1:nv_fit] b_tr <- b[1:nv_fit] weights <- sample(1:4, n, replace = T) weights_norm <- weights / sum(weights) * n offset <- 0.1*rnorm(n) #rnorm(n) penalty <- runif(ncol(x_tr))+0.5 penalty <- penalty/sum(penalty)*ncol(x_tr) # must scale the penalties to be comparable to glmnet tol <- 1e-04 extra_thresh <- 1e-10 test_that("glm_elnet: various families and setups, glm_elnet and glmnet should give same result", { fams <- list(gaussian(), binomial(), poisson()) x_list <- lapply(fams, function(fam) x_tr) y_list <- lapply(fams, function(fam) { if (fam$family == 'gaussian') { y <- rnorm(n, x_tr%*%b_tr, 0.1) y_glmnet <- y } else if (fam$family == 'binomial') { y <- rbinom(n, weights, fam$linkinv(x_tr%*%b_tr)) y <- y/weights y_glmnet <- cbind(1-y,y) # different way of specifying binomial y for glmnet } else if (fam$family == 'poisson') { y <- rpois(n, fam$linkinv(x_tr%*%b_tr)) y_glmnet <- y } list(y=y, y_glmnet=y_glmnet) }) for (intercept in c(F,T)) { for (alpha in c(0,1)) { # cannot test 0 < alpha < 1 because it seems glmnet uses the 'unnaive' elastic net for (normalize in c(F)) { # it seems glmnet does the normalization differently so we can't test normalize=T.. for (use_offset in c(F,T)) { for (use_weights in c(F,T)) { for (i in seq_along(fams)) { x <- x_list[[i]] y <- y_list[[i]]$y y_glmnet <- y_list[[i]]$y_glmnet fam <- fams[[i]] nlam <- 500 lambda_min_ratio <- 1e-3 if (use_offset) os <- offset else os <- rep(0,n) if (use_weights) w <- weights else w <- rep(1,n) # compute the whole solution paths fit1 <- glm_elnet(x,y,fam, alpha=alpha, lambda_min_ratio = 0.1*lambda_min_ratio, nlambda=nlam, weights = w, offset = os, normalize = normalize, thresh = 1e-12, intercept = intercept) fit2 <- glmnet::glmnet(x,y_glmnet,family = fam$family, alpha=alpha, lambda.min.ratio = lambda_min_ratio, nlambda=nlam, weights = w, offset = os, standardize = normalize, thresh = 1e-12, intercept = intercept) # check that with a given L1-norm, the coefficient values are the same # (need to check it this way since the lambda values are not comparable between glm_elnet and glmnet) b1 <- rbind(fit1$beta0, fit1$beta) enorm1 <- colSums(abs(b1)) b2 <- as.matrix(rbind(fit2$a0, fit2$beta)) enorm2 <- colSums(abs(b2)) for (j in 1:nrow(b1)) { # loop through each coefficient (including intercept) infostr <- paste0('alpha = ', alpha, ', normalization = ',normalize, ', weights = ',use_weights, ', offset = ', use_offset, ', family = ', fam$family) b1j_interp <- approxfun(enorm1, b1[j,]) magn <- max(abs(b2[j,])+1e-9) max_rel_diff <- max(abs(b1j_interp(enorm2) - b2[j,]) / magn, na.rm=T) expect_true(max_rel_diff < 2*1e-2, info=infostr) } # # plot coefficient path of some variable (useful when debugging so we leave this here commented) # ds <- 0.3 # j <- 1 # ggplot() + # geom_point(aes(x=enorm1, y=b1[j,]), color='black', size=ds) + # geom_line(aes(x=enorm1, y=b1[j,]), color='black') + # geom_point(aes(x=enorm2, y=b2[j,]), color='red', size=ds) + # geom_line(aes(x=enorm2, y=b2[j,]), color='red') #+ xlim(0.5,1) + ylim(0,0.1) } } } } } } }) test_that("glm_elnet: poisson, log-link, normalization should not affect the maximum likelihood solution", { fam <- kl_helpers(poisson(link = 'log')) y <- rpois(n, fam$linkinv(x%*%b)) nlam <- 100 elnetfit1 <- glm_elnet(x_tr, y, family = fam, nlambda=nlam, lambda_min_ratio=1e-7, offset = 
offset, weights = weights_norm, intercept = TRUE, normalize = FALSE) elnetfit2 <- glm_elnet(x_tr, y, family = fam, nlambda=nlam, lambda_min_ratio=1e-7, offset = offset, weights = weights_norm, intercept = TRUE, normalize = TRUE) expect_equal(c(elnetfit1$beta0[nlam], elnetfit1$beta[,nlam]), c(elnetfit2$beta0[nlam], elnetfit2$beta[,nlam]), tolerance = tol) }) test_that("glm_elnet with alpha=0 and glm_ridge give the same result.", { for (famstr in c('gaussian', 'binomial', 'poisson')) { if (famstr == 'gaussian') { fam <- gaussian(link = 'identity') y <- rnorm(n, x_tr%*%b_tr, 0.5) } else if (famstr == 'binomial') { fam <- binomial(link = 'probit') y <- rbinom(n, weights, fam$linkinv(x_tr%*%b_tr)) / weights } else if (famstr == 'poisson') { fam <- poisson(link = 'log') y <- rpois(n, fam$linkinv(x_tr%*%b_tr)) } for (intercept in c(T,F)) { for (normalize in c(T,F)) { # compute the L2-path with glm_elnet elnetfit <- glm_elnet(x_tr, y, family = fam, nlambda=50, alpha = 0, offset = offset, weights = weights, penalty = penalty, intercept = intercept, normalize = normalize, thresh = 1e-15) b1 <- rbind(elnetfit$beta0, elnetfit$beta) # compute the solutions using glm_ridge in the same lambda grid b2 <- array(dim = dim(b1)) for (j in seq_along(elnetfit$lambda)) { lam <- elnetfit$lambda[j] ridgefit <- glm_ridge(x_tr, y, family = fam, lambda = lam, offset = offset, weights = weights, penalty = penalty, intercept = intercept, normalize = normalize, thresh = 1e-15) b2[1,j] <- ridgefit$beta0 b2[2:nrow(b2),j] <- ridgefit$beta } infostr <- paste0('intercept = ', intercept, ', normalize = ', normalize, ', family = ', fam$family) expect_true( max(abs(b1-b2)) < 1e-6, info = infostr) } } } }) projpred/tests/testthat/test_as_matrix.R0000644000176200001440000000510413612375606020263 0ustar liggesuserscontext('as.matrix.projection') # tests for as_matrix if (require(rstanarm)) { set.seed(1235) n <- 40 nv <- 5 x <- matrix(rnorm(n*nv, 0, 1), n, nv) b <- runif(nv)-0.5 dis <- runif(1, 1, 2) weights <- sample(1:4, n, replace = T) offset <- rnorm(n) chains <- 2 seed <- 1235 iter <- 500 source(file.path('helpers', 'SW.R')) f_gauss <- gaussian() df_gauss <- data.frame(y = rnorm(n, f_gauss$linkinv(x%*%b), dis), x = x) f_binom <- binomial() df_binom <- data.frame(y = rbinom(n, weights, f_binom$linkinv(x%*%b)), x = x) SW( fit_gauss <- stan_glm(y ~ x, family = f_gauss, data = df_gauss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) ) SW( fit_binom <- stan_glm(cbind(y, weights-y) ~ x, family = f_binom, QR = T, data = df_binom, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) ) vs_gauss <- varsel(fit_gauss) vs_binom <- varsel(fit_binom) vind <- c(2,3) ns <- 100 p_gauss <- project(vs_gauss, vind = vind, ns = ns) p_binom <- project(vs_binom, vind = vind, ns = ns) test_that("as.matrix.projection returns the relevant variables for gaussian", { m <- as.matrix(p_gauss) expect_equal(colnames(m), c(names(coef(fit_gauss))[c(1, vind + 1)], 'sigma')) expect_equal(dim(m), c(ns, length(vind) + 2)) }) test_that("as.matrix.projection returns the relevant variables for binomial", { m <- as.matrix(p_binom) expect_equal(colnames(m), names(coef(fit_binom))[c(1, vind + 1)]) expect_equal(dim(m), c(ns, length(vind) + 1)) }) test_that("as.matrix.projection works as expected without an intercept", { p_nointercept <- project(vs_gauss, vind = vind, ns = ns, intercept = FALSE) m <- as.matrix(p_nointercept) expect_equal(colnames(m), c(names(coef(fit_gauss))[vind + 1], 'sigma')) 
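# dropping the intercept should remove exactly one column, leaving only the vind coefficients plus sigma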
expect_equal(dim(m), c(ns, length(vind) + 1)) }) test_that("as.matrix.projection works as expected with zero variables", { p_novars <- project(vs_gauss, nv = 0, ns = ns, intercept = F) m <- as.matrix(p_novars) expect_equal(colnames(m), 'sigma') expect_equal(dim(m), c(ns, 1)) }) test_that("as.matrix.projection gives a warning but works with clustering", { nc <- 3 p_clust <- project(vs_gauss, vind = vind, nc = nc) expect_warning(m <- as.matrix(p_clust)) expect_equal(colnames(m), c(names(coef(fit_gauss))[c(1, vind + 1)], 'sigma')) expect_equal(dim(m), c(nc, length(vind) + 2)) }) }projpred/tests/testthat/test_project.R0000644000176200001440000002623013612375755017752 0ustar liggesuserscontext('project') # tests for project if (require(rstanarm)) { seed <- 1235 set.seed(seed) n <- 40 nv <- 5 x <- matrix(rnorm(n*nv, 0, 1), n, nv) b <- runif(nv)-0.5 dis <- runif(1, 1, 2) weights <- sample(1:4, n, replace = T) offset <- rnorm(n) chains <- 2 iter <- 500 source(file.path('helpers', 'SW.R')) f_gauss <- gaussian() df_gauss <- data.frame(y = rnorm(n, f_gauss$linkinv(x%*%b), dis), x = x) f_binom <- binomial() df_binom <- data.frame(y = rbinom(n, weights, f_binom$linkinv(x%*%b)), x = x) f_poiss <- poisson() df_poiss <- data.frame(y = rpois(n, f_poiss$linkinv(x%*%b)), x = x) SW({ fit_gauss <- stan_glm(y ~ x, family = f_gauss, data = df_gauss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) fit_binom <- stan_glm(cbind(y, weights-y) ~ x, family = f_binom, QR = T, data = df_binom, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) fit_poiss <- stan_glm(y ~ x, family = f_poiss, data = df_poiss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) }) fit_list <- list(fit_gauss, fit_binom, fit_poiss) vs_list <- lapply(fit_list, varsel, nv_max = nv, verbose = FALSE) test_that("project: relaxing has the expected effect", { vs_list <- lapply(fit_list, varsel, nv_max = nv, verbose = FALSE, method='l1') for (i in seq_along(vs_list)) { p0 <- project(vs_list[[i]], relax=F, nv=1:nv) p1 <- project(vs_list[[i]], relax=T, nv=1:nv, nc=1, regul=1e-9) for (j in seq_along(p1)) { # L1-penalised coefficients should have smaller L1-norm expect_true( sum(abs(p0[[j]]$beta)) < sum(abs(p1[[j]]$beta)) ) } } }) test_that("object returned by project contains the relevant fields", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] p <- project(vs_list[[i]], nv=0:nv) expect_type(p, "list") expect_length(p, nv + 1) for(j in 1:length(p)) { expect_s3_class(p[[j]], "projection") expect_named(p[[j]], c('kl', 'weights', 'dis', 'alpha', 'beta', 'vind', 'p_type', 'intercept', 'family_kl'), ignore.order = T, info = i_inf) # number of draws should equal to the number of draw weights ns <- length(p[[j]]$weights) expect_length(p[[j]]$alpha, ns) expect_length(p[[j]]$dis, ns) expect_equal(ncol(p[[j]]$beta), ns, info = i_inf) # j:th element should have j-1 variables expect_equal(nrow(p[[j]]$beta), j - 1, info = i_inf) expect_length(p[[j]]$vind, j - 1) # family kl expect_equal(p[[j]]$family_kl, vs_list[[i]]$family_kl, info = i_inf) } # kl should be non-increasing on training data klseq <- sapply(p, function(x) x$kl) expect_equal(klseq, cummin(klseq), info = i_inf) # all submodels should use the same clustering expect_equal(p[[1]]$weights, p[[nv]]$weights, info = i_inf) } }) test_that("project: error when varsel has not been performed for the object", { expect_error(project(1, xnew = x), 'is not a variable selection object') 
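# an unprocessed stanreg fit should be rejected just like any other non-vsel object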
expect_error(project(fit_gauss, xnew = x), 'is not a variable selection object') }) test_that("project: nv is checked", { expect_error(project(vs_list[[1]], nv = 1000), 'Cannot perform the projection with 1000 variables') expect_error(project(vs_list[[1]], nv = -1), 'must contain non-negative values') expect_error(project(vs_list[[1]], nv = 'a'), 'must contain non-negative values') expect_error(project(vs_list[[1]], nv = df_gauss), 'must contain non-negative values') }) test_that("project: setting nv = NULL has the expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] p <- project(vs_list[[i]], nv = NULL) # if only one model size is projected, do not return a list of length one expect_true(length(p) >= 1, info = i_inf) # beta has the correct number of rows expect_equal(nrow(p$beta), vs_list[[i]]$ssize, info = i_inf) expect_length(p$vind, vs_list[[i]]$ssize) } }) test_that("project: setting nv = 0 has an expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] nv <- 0 p <- project(vs_list[[i]], nv = nv) # if only one model size is projected, do not return a list of length one expect_true(length(p) >= 1, info = i_inf) # beta has the correct number of rows expect_equal(nrow(p$beta), nv, info = i_inf) expect_length(p$vind, nv) } }) test_that("project: setting nv = 3 has an expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] nv <- 3 p <- project(vs_list[[i]], nv = nv) # if only one model is projected, do not return a list of length one expect_true(length(p) >= 1, info = i_inf) # beta has the correct number of rows expect_equal(nrow(p$beta), nv, info = i_inf) expect_length(p$vind, nv) } }) test_that("project: setting vind to 4 has an expected effect", { for(i in 1:length(vs_list)) { vind <- 4 p <- project(vs_list[[i]], vind = vind) expect_equivalent(p$vind, vind) expect_equal(nrow(p$beta), 1) exp_ind <- which(vs_list[[i]]$vind == vind) expect_named(p$vind, names(vs_list[[i]]$vind)[exp_ind]) } }) test_that("project: setting vind to 1:2 has an expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] vind <- 1:2 # names(vind) <- names(coef(vs_list[[i]]))[vind+1] p <- project(vs_list[[i]], vind = vind) expect_equivalent(p$vind, vind) expect_equal(nrow(p$beta), length(vind), info = i_inf) exp_ind <- sapply(vind, function(x) which(vs_list[[i]]$vind == x)) expect_named(p$vind, names(vs_list[[i]]$vind)[exp_ind]) } }) test_that("project: setting vind to something nonsensical returns an error", { # variable selection objects expect_error(project(vs_list[[1]], vind = 1:10), 'vind contains an index larger than') expect_error(project(vs_list[[1]], vind = 17), 'vind contains an index larger than') # fit objects expect_error(project(fit_list[[1]], vind = 1:10), 'vind contains an index larger than') expect_error(project(fit_list[[1]], vind = 17), 'vind contains an index larger than') }) test_that("project: setting ns to 1 has an expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] ns <- 1 p <- project(vs_list[[i]], ns = ns, nv = nv) # expected number of draws expect_length(p$weights, ns) expect_length(p$alpha, ns) expect_length(p$dis, ns) expect_equal(ncol(p$beta), ns, info = i_inf) expect_equal(p$weights, 1, info = i_inf) } }) test_that("project: setting ns to 40 has an expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] ns <- 40 p <- project(vs_list[[i]], ns = ns, nv = nv) # expected number of draws expect_length(p$weights, ns) expect_length(p$alpha, ns)
expect_length(p$dis, ns) expect_equal(ncol(p$beta), ns, info = i_inf) # no clustering, so draw weights should be identical expect_true(do.call(all.equal, as.list(p$weights)), info = i_inf) } }) test_that("project: setting nc to 1 has an expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] nc <- 1 p <- project(vs_list[[i]], nc = nc, nv = nv) # expected number of draws expect_length(p$weights, nc) expect_length(p$alpha, nc) expect_length(p$dis, nc) expect_equal(ncol(p$beta), nc, info = i_inf) } }) test_that("project: setting nc to 20 has an expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] nc <- 20 p <- project(vs_list[[i]], nc = nc, nv = nv) # expected number of draws expect_length(p$weights, nc) expect_length(p$alpha, nc) expect_length(p$dis, nc) expect_equal(ncol(p$beta), nc, info = i_inf) } }) test_that("project: setting ns or nc to too big throws an error", { expect_error(project(vs_list[[1]], ns = 400000, nv = nv), 'exceed the number of columns') expect_error(project(vs_list[[1]], nc = 400000, nv = nv), 'exceed the number of columns') expect_error(project(fit_list[[1]], vind = 1:nv, ns = 400000), 'exceed the number of columns') expect_error(project(fit_list[[1]], vind = 1:nv, nc = 400000), 'exceed the number of columns') }) test_that("project: specifying intercept has an expected effect", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] p <- project(vs_list[[i]], nv = nv, intercept = TRUE) expect_true(p$intercept, info = i_inf) p <- project(vs_list[[i]], nv = nv, intercept = FALSE) expect_true(!p$intercept, info = i_inf) expect_true(all(p$alpha==0), info = i_inf) } }) test_that("project: specifying the seed does not cause errors", { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] p <- project(vs_list[[i]], nv = nv, seed = seed) expect_named(p, c('kl', 'weights', 'dis', 'alpha', 'beta', 'vind', 'p_type', 'intercept', 'family_kl'), ignore.order = T, info = i_inf) } }) test_that("project: adding more regularization has an expected effect", { regul <- c(1e-6, 1e-3, 1e-1, 1e1, 1e4) for(i in 1:length(vs_list)) { #i_inf <- names(vs_list)[i] norms <- rep(0, length(regul)) for (j in 1:length(regul)) norms[j] <- sum(project(vs_list[[i]], nv = 3, seed = seed, nc=1, regul=regul[j])$beta^2) for (j in 1:(length(regul)-1)) expect_gt(norms[j],norms[j+1]) } }) test_that("project: projecting full model onto itself does not change results", { tol <- 1e-3 for (i in 1:length(fit_list)) { fit <- fit_list[[i]] draws <- as.data.frame(fit) alpha_ref <- draws$`(Intercept)` beta_ref <- draws[,1+(1:nv),drop=F] S <- nrow(draws) proj <- project(fit, vind = 1:nv, seed = seed, ns=S, regul=0) # test alpha and beta dalpha <- max(abs(proj$alpha - alpha_ref)) dbeta <- max(abs(proj$beta - t(beta_ref))) expect_lt(dalpha, tol) expect_lt(dbeta, tol) if (ncol(draws) > nv+1) { # test dispersion dis_ref <- draws[,ncol(draws)] ddis <- max(abs(proj$dis - draws$sigma)) expect_lt(ddis, tol) } } }) test_that("project: works as expected from a cvsel object", { SW({ cvs <- cv_varsel(fit_gauss, nv_max = 3, verbose = FALSE) p <- project(cvs, nv=3) }) expect_equal(nrow(p$beta), 3) expect_length(p$vind, 3) }) }projpred/tests/testthat/test_varsel.R0000644000176200001440000005704013612375533017575 0ustar liggesuserscontext('varsel') # tests for varsel and cv_varsel if (require(rstanarm)) { seed <- 1235 set.seed(seed) n <- 50 nv <- 5 x <- matrix(rnorm(n*nv, 0, 1), n, nv) b <- runif(nv)-0.5 dis <- runif(1, 1, 2) weights <- sample(1:4, n, replace = T) chains <- 2 iter <- 500 
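# Shared fixture for the varsel/cv_varsel tests: a small simulated design x
# with true coefficients b; the weights, offset and dispersion dis exercise
# the corresponding arguments of the rstanarm fits constructed below.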
offset <- rnorm(n) source(file.path('helpers', 'SW.R')) f_gauss <- gaussian() df_gauss <- data.frame(y = rnorm(n, f_gauss$linkinv(x%*%b), dis), x = I(x)) f_binom <- binomial() df_binom <- data.frame(y = rbinom(n, weights, f_binom$linkinv(x%*%b)), x = I(x)) f_poiss <- poisson() df_poiss <- data.frame(y = rpois(n, f_poiss$linkinv(x%*%b)), x = I(x)) SW({ fit_gauss <- stan_glm(y ~ x, family = f_gauss, data = df_gauss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) fit_binom <- stan_glm(cbind(y, weights-y) ~ x, family = f_binom, QR = T, data = df_binom, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) fit_poiss <- stan_glm(y ~ x, family = f_poiss, data = df_poiss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter) fit_lm <- stan_lm(y ~ x, data = df_gauss, weights = weights, offset = offset, prior = R2(0.3), chains = chains, seed = seed, iter = iter) fit_glmer <- stan_glmer(mpg ~ wt + (1|cyl), data = mtcars, chains = chains, seed = seed, iter = iter) }) fit_list <- list(gauss = fit_gauss, binom = fit_binom, poiss = fit_poiss, lm = fit_lm) vsf <- function(x, m) varsel(x, method = m, nv_max = nv, verbose = FALSE) vs_list <- list(l1 = lapply(fit_list, vsf, 'L1'), fs = lapply(fit_list, vsf, 'forward')) ref_gauss <- init_refmodel(x, df_gauss$y, family = f_gauss) ref_binom <- init_refmodel(x, rbinom(n, 1, f_binom$linkinv(x%*%b)), family = f_binom) ref_list <- list(ref_gauss = ref_gauss, ref_binom = ref_binom) vsref_list <- list(l1 = lapply(ref_list, vsf, 'L1'), fs = lapply(ref_list, vsf, 'forward')) test_that('varsel returns an object of type "vsel"', { for(i in 1:length(vs_list)) { for(j in 1:length(vs_list[[i]])) { expect_s3_class(vs_list[[i]][[j]], 'vsel') } } }) test_that('object returned by varsel contains the relevant fields', { for(i in 1:length(vs_list)) { i_inf <- names(vs_list)[i] for(j in 1:length(vs_list[[i]])) { j_inf <- names(vs_list[[i]])[j] # refmodel seems legit expect_s3_class(vs_list[[i]][[j]]$refmodel, 'refmodel') # vind seems legit expect_length(vs_list[[i]][[j]]$vind, nv) expect_equal(names(coef(fit_gauss)[-1])[vs_list[[i]][[j]]$vind], names(vs_list[[i]][[j]]$vind), info = paste(i_inf, j_inf)) # kl seems legit expect_length(vs_list[[i]][[j]]$kl, nv + 1) # decreasing expect_equal(vs_list[[i]][[j]]$kl, cummin(vs_list[[i]][[j]]$kl), info = paste(i_inf, j_inf)) # d_test seems legit expect_length(vs_list[[i]][[j]]$d_test$y, n) expect_length(vs_list[[i]][[j]]$d_test$weights, n) expect_type(vs_list[[i]][[j]]$d_test$type, 'character') expect_equal(vs_list[[i]][[j]]$d_test$type, 'train', info = paste(i_inf, j_inf)) # summaries seems legit expect_named(vs_list[[i]][[j]]$summaries, c('sub', 'ref'), info = paste(i_inf, j_inf)) expect_length(vs_list[[i]][[j]]$summaries$sub, nv + 1) expect_named(vs_list[[i]][[j]]$summaries$sub[[1]], c('mu', 'lppd'), info = paste(i_inf, j_inf)) expect_named(vs_list[[i]][[j]]$summaries$ref, c('mu', 'lppd'), info = paste(i_inf, j_inf)) # family_kl seems legit expect_equal(vs_list[[i]][[j]]$family$family, vs_list[[i]][[j]]$family_kl$family, info = paste(i_inf, j_inf)) expect_equal(vs_list[[i]][[j]]$family$link, vs_list[[i]][[j]]$family_kl$link, info = paste(i_inf, j_inf)) expect_true(length(vs_list[[i]][[j]]$family_kl) >= length(vs_list[[i]][[j]]$family$family), info = paste(i_inf, j_inf)) } } }) test_that('search method is valid', { expect_error(varsel(fit_gauss, method = 'k-fold'), 'Unknown search method') }) test_that('nv_max has an effect on varsel for 
gaussian models', { vs1 <- varsel(fit_gauss, method = 'forward', nv_max = 3, verbose = FALSE) expect_length(vs1$vind, 3) }) test_that('nv_max has an effect on varsel for non-gaussian models', { vs1 <- varsel(fit_binom, method = 'forward', nv_max = 3, verbose = FALSE) expect_length(vs1$vind, 3) }) test_that('specifying the number of clusters has an expected effect', { vs <- varsel(fit_binom, method = 'forward', nv_max = 3, nc = 10) expect_length(vs$vind, 3) }) test_that('specifying d_test has the expected effect', { vs <- varsel(fit_gauss, d_test = vs_list[[1]][[1]]$refmodel, nv_max = 3) expect_length(vs$vind, 3) }) test_that('Having something else than stan_glm as the fit throws an error', { expect_error(varsel(fit_glmer, verbose = FALSE), regexp = 'not yet supported') expect_error(varsel(rnorm(5), verbose = FALSE), regexp = 'no applicable method') }) test_that("varsel: adding more regularization has an expected effect", { regul <- c(1e-6, 1e-3, 1e-1, 1e1, 1e4) for(i in 1:length(fit_list)) { norms <- rep(0, length(regul)) msize <- 3 for (j in 1:length(regul)) { vsel <- varsel(fit_list[[i]], regul=regul[j]) norms[j] <- sum( fit_list[[i]]$family$linkfun(vsel$summaries$sub[[msize]]$mu)^2 ) } for (j in 1:(length(regul)-1)) expect_gt(norms[j],norms[j+1]) } }) test_that("varsel: length of the penalty vector is checked", { vsf <- function(obj, penalty) varsel(obj, method = 'L1', nv_max = nv, verbose = FALSE, penalty = penalty) expect_error(vsf(fit_list$gauss, rep(1, nv + 1))) expect_error(vsf(fit_list$gauss, 1)) }) test_that("varsel: specifying penalties for variables has an expected effect", { penalty <- rep(1,nv) ind_zeropen <- c(2,4) # a few variables without cost ind_infpen <- c(1) # one variable with infinite penalty, should be selected last penalty[ind_zeropen] <- 0 penalty[ind_infpen] <- Inf vsf <- function(obj) varsel(obj, method = 'L1', nv_max = nv, verbose = FALSE, penalty=penalty) vs_list_pen <- lapply(fit_list, vsf) for (i in seq_along(vs_list_pen)) { # check that the variables with no cost are selected first and the ones with # inf penalty last sdiff <- setdiff(head(vs_list_pen[[i]]$vind, length(ind_zeropen)), ind_zeropen) expect_length(sdiff, 0) sdiff <- setdiff(tail(vs_list_pen[[i]]$vind, length(ind_infpen)), ind_infpen) expect_length(sdiff, 0) } }) # ------------------------------------------------------------- context('cv_varsel') cvsf <- function(x, m, cvm, K = NULL) cv_varsel(x, method = m, cv_method = cvm, nv_max = nv, K = K) SW({ cvs_list <- list(l1 = lapply(fit_list, cvsf, 'L1', 'LOO'), fs = lapply(fit_list, cvsf, 'forward', 'LOO')) # without weights/offset because kfold does not support them currently # test only with one family to make the tests faster # the chains, seed and iter arguments to the rstanarm functions here must be # specified directly rather than through a variable (eg, seed = 1235 instead # of seed = seed), otherwise when the calls are evaluated in refmodel$cvfun() # they may not be found in the evaluation frame of the calling function, # causing the test to fail glm_simp <- stan_glm(y ~ x, family = poisson(), data = df_poiss, QR = T, chains = 2, seed = 1235, iter = 400) lm_simp <- stan_lm(y ~ x, data = df_gauss, prior = R2(0.6), chains = 2, seed = 1235, iter = 400) simp_list = list(glm = glm_simp, lm = lm_simp) cv_kf_list <- list(l1 = lapply(simp_list, cvsf, 'L1', 'kfold', K = 2), fs = lapply(simp_list, cvsf, 'forward', 'kfold', K = 2)) # LOO cannot be performed without a genuine probabilistic model cvsref_list <- list(l1 = lapply(ref_list, cvsf, 'L1', 
'kfold'), fs = lapply(ref_list, cvsf, 'forward', 'kfold')) }) test_that('cv_varsel returns an object of type "cvsel"', { for(i in 1:length(cvs_list)){ for(j in 1:length(cvs_list[[i]])) { expect_s3_class(cvs_list[[i]][[j]], 'cvsel') } } }) test_that('object returned by cv_varsel contains the relevant fields', { for(i in 1:length(cvs_list)) { i_inf <- names(cvs_list)[i] for(j in 1:length(cvs_list[[i]])) { j_inf <- names(cvs_list[[i]])[j] # vind seems legit expect_length(cvs_list[[i]][[j]]$vind, nv) expect_equal(names(coef(fit_gauss)[-1])[cvs_list[[i]][[j]]$vind], names(cvs_list[[i]][[j]]$vind), info = paste(i_inf, j_inf)) # kl seems legit expect_length(cvs_list[[i]][[j]]$kl, nv + 1) # decreasing expect_equal(cvs_list[[i]][[j]]$kl, cummin(cvs_list[[i]][[j]]$kl), info = paste(i_inf, j_inf)) # d_test seems legit expect_length(cvs_list[[i]][[j]]$d_test$y, n) expect_length(cvs_list[[i]][[j]]$d_test$weights, n) expect_type(cvs_list[[i]][[j]]$d_test$type, 'character') expect_equal(cvs_list[[i]][[j]]$d_test$type, 'loo', info = paste(i_inf, j_inf)) # summaries seems legit expect_named(cvs_list[[i]][[j]]$summaries, c('sub', 'ref'), info = paste(i_inf, j_inf)) expect_length(cvs_list[[i]][[j]]$summaries$sub, nv + 1) expect_named(cvs_list[[i]][[j]]$summaries$sub[[1]], c('mu', 'lppd', 'w'), ignore.order = TRUE, info = paste(i_inf, j_inf)) expect_named(cvs_list[[i]][[j]]$summaries$ref, c('mu', 'lppd'), ignore.order = TRUE, info = paste(i_inf, j_inf)) # family_kl seems legit expect_equal(cvs_list[[i]][[j]]$family$family, cvs_list[[i]][[j]]$family_kl$family, info = paste(i_inf, j_inf)) expect_equal(cvs_list[[i]][[j]]$family$link, cvs_list[[i]][[j]]$family_kl$link, info = paste(i_inf, j_inf)) expect_true(length(cvs_list[[i]][[j]]$family_kl) >= length(cvs_list[[i]][[j]]$family$family), info = paste(i_inf, j_inf)) # pctch seems legit #expect_equal(dim(cvs_list[[i]][[j]]$pctch), c(nv, nv + 1), # info = paste(i_inf, j_inf)) #expect_true(all(cvs_list[[i]][[j]]$pctch[,-1] <= 1 & # cvs_list[[i]][[j]]$pctch[,-1] >= 0), # info = paste(i_inf, j_inf)) #expect_equal(cvs_list[[i]][[j]]$pctch[,1], 1:nv, # info = paste(i_inf, j_inf)) #expect_equal(colnames(cvs_list[[i]][[j]]$pctch), # c('size', names(cvs_list[[i]][[j]]$vind)), # info = paste(i_inf, j_inf)) # ssize seems legit expect_true(cvs_list[[i]][[j]]$ssize>=0 || is.na(cvs_list[[i]][[j]]$ssize), info = paste(i_inf, j_inf)) } } }) test_that('nv_max has an effect on cv_varsel for gaussian models', { suppressWarnings( vs1 <- cv_varsel(fit_gauss, method = 'forward', nv_max = 3, verbose = FALSE) ) expect_length(vs1$vind, 3) }) test_that('nv_max has an effect on cv_varsel for non-gaussian models', { suppressWarnings( vs1 <- cv_varsel(fit_binom, method = 'forward', nv_max = 3, verbose = FALSE) ) expect_length(vs1$vind, 3) }) test_that('nloo works as expected', { expect_error(cv_varsel(fit_gauss, cv_method = 'loo', nloo = -1), "must be at least 1") SW({ expect_equal(cv_varsel(fit_gauss, cv_method = 'loo', nv_max = nv, nloo = NULL), cv_varsel(fit_gauss, cv_method = 'loo', nv_max = nv, nloo = 1000)) # nloo less than number of observations out <- cv_varsel(fit_gauss, cv_method = 'loo', nloo = 20, verbose = FALSE) expect_equal(sum(!is.na(out$summaries$sub[[1]]$lppd)), 20) }) }) test_that('the validate_search option works as expected', { SW({ vs1 <- cv_varsel(fit_gauss, validate_search = FALSE) vs2 <- cv_varsel(fit_gauss, validate_search = TRUE) }) expect_true(all(varsel_stats(vs1)$elpd >= varsel_stats(vs2)$elpd)) }) test_that('Having something else than stan_glm as the fit throws 
an error', { expect_error(cv_varsel(fit_glmer, verbose = FALSE), regexp = 'not yet supported') expect_error(cv_varsel(rnorm(5), verbose = FALSE), regexp = 'no applicable method') }) test_that('object returned by cv_varsel, kfold contains the relevant fields', { for(i in 1:length(cv_kf_list)) { i_inf <- names(cv_kf_list)[i] for(j in 1:length(cv_kf_list[[i]])) { j_inf <- names(cv_kf_list[[i]])[j] # vind seems legit expect_length(cv_kf_list[[i]][[j]]$vind, nv) expect_equal(names(coef(fit_gauss)[-1])[cv_kf_list[[i]][[j]]$vind], names(cv_kf_list[[i]][[j]]$vind), info = paste(i_inf, j_inf)) # kl seems legit expect_length(cv_kf_list[[i]][[j]]$kl, nv + 1) # decreasing expect_equal(cv_kf_list[[i]][[j]]$kl, cummin(cv_kf_list[[i]][[j]]$kl), info = paste(i_inf, j_inf)) # d_test seems legit expect_length(cv_kf_list[[i]][[j]]$d_test$y, n) expect_length(cv_kf_list[[i]][[j]]$d_test$weights, n) expect_type(cv_kf_list[[i]][[j]]$d_test$type, 'character') expect_equal(cv_kf_list[[i]][[j]]$d_test$type, 'kfold', info = paste(i_inf, j_inf)) # summaries seems legit expect_named(cv_kf_list[[i]][[j]]$summaries, c('sub', 'ref'), info = paste(i_inf, j_inf)) expect_length(cv_kf_list[[i]][[j]]$summaries$sub, nv + 1) expect_named(cv_kf_list[[i]][[j]]$summaries$sub[[1]], c('mu', 'lppd'), ignore.order = TRUE, info = paste(i_inf, j_inf)) expect_named(cv_kf_list[[i]][[j]]$summaries$ref, c('mu', 'lppd'), ignore.order = TRUE, info = paste(i_inf, j_inf)) # family_kl seems legit expect_equal(cv_kf_list[[i]][[j]]$family$family, cv_kf_list[[i]][[j]]$family_kl$family, info = paste(i_inf, j_inf)) expect_equal(cv_kf_list[[i]][[j]]$family$link, cv_kf_list[[i]][[j]]$family_kl$link, info = paste(i_inf, j_inf)) expect_true(length(cv_kf_list[[i]][[j]]$family_kl) >= length(cv_kf_list[[i]][[j]]$family$family), info = paste(i_inf, j_inf)) # pctch seems legit expect_equal(dim(cv_kf_list[[i]][[j]]$pctch), c(nv, nv + 1), info = paste(i_inf, j_inf)) expect_true(all(cv_kf_list[[i]][[j]]$pctch[,-1] <= 1 & cv_kf_list[[i]][[j]]$pctch[,-1] >= 0), info = paste(i_inf, j_inf)) expect_equal(cv_kf_list[[i]][[j]]$pctch[,1], 1:nv, info = paste(i_inf, j_inf)) expect_equal(colnames(cv_kf_list[[i]][[j]]$pctch), c('size', names(cv_kf_list[[i]][[j]]$vind)), info = paste(i_inf, j_inf)) } } }) test_that('cross-validation method is valid', { expect_error(cv_varsel(fit_gauss, cv_method = 'k-fold'), 'Unknown cross-validation method') }) test_that('K is valid for cv_method=\'kfold\'', { expect_error(cv_varsel(glm_simp, cv_method = 'kfold', K = 1), 'must be at least 2') expect_error(cv_varsel(glm_simp, cv_method = 'kfold', K = 1000), 'cannot exceed n') expect_error(cv_varsel(glm_simp, cv_method = 'kfold', K = c(4, 9)), 'a single integer value') expect_error(cv_varsel(glm_simp, cv_method = 'kfold', K = 'a'), 'a single integer value') expect_error(cv_varsel(glm_simp, cv_method = 'kfold', K = df_poiss), 'a single integer value') }) test_that('omitting the \'data\' argument causes an error', { out <- SW(fit_nodata <- stan_glm(df_gauss$y~df_gauss$x, QR = T, chains = chains, seed = seed, iter = iter)) expect_error(cv_varsel(fit_nodata, cv_method = 'loo'), 'Model was fitted without a \'data\' argument') expect_error(cv_varsel(fit_nodata, cv_method = 'kfold'), 'Model was fitted without a \'data\' argument') }) test_that('providing k_fold works', { out <- SW({ k_fold <- kfold(glm_simp, K = 2, save_fits = TRUE) fit_cv <- cv_varsel(glm_simp, cv_method = 'kfold', k_fold = k_fold) }) expect_false(any(grepl('k_fold not provided', out))) expect_length(fit_cv$vind, nv) # kl seems 
legit expect_length(fit_cv$kl, nv + 1) # decreasing expect_equal(fit_cv$kl, cummin(fit_cv$kl)) # d_test seems legit expect_length(fit_cv$d_test$y, n) expect_length(fit_cv$d_test$weights, n) expect_type(fit_cv$d_test$type, 'character') expect_equal(fit_cv$d_test$type, 'kfold') # summaries seems legit expect_named(fit_cv$summaries, c('sub', 'ref')) expect_length(fit_cv$summaries$sub, nv + 1) expect_named(fit_cv$summaries$sub[[1]], c('mu', 'lppd'), ignore.order = TRUE) expect_named(fit_cv$summaries$ref, c('mu', 'lppd'), ignore.order = TRUE) # family_kl seems legit expect_equal(fit_cv$family$family, fit_cv$family_kl$family) expect_equal(fit_cv$family$link, fit_cv$family_kl$link) expect_true(length(fit_cv$family_kl) >= length(fit_cv$family$family)) # pctch seems legit expect_equal(dim(fit_cv$pctch), c(nv, nv + 1)) expect_true(all(fit_cv$pctch[,-1] <= 1 & fit_cv$pctch[,-1] >= 0)) expect_equal(fit_cv$pctch[,1], 1:nv) expect_equal(colnames(fit_cv$pctch), c('size', names(fit_cv$vind))) }) # ------------------------------------------------------------- context('varsel_stats') valid_stats_all <- c('elpd', 'mlpd') valid_stats_gauss_only <- c('mse', 'rmse') valid_stats_binom_only <- c('acc', 'auc') valid_stats_gauss <- c(valid_stats_all, valid_stats_gauss_only) valid_stats_binom <- c(valid_stats_all, valid_stats_binom_only) vs_funs <- c(varsel_stats, varsel_plot, suggest_size) test_that('invalid objects are rejected', { for (fun in vs_funs) { expect_error(fun(NULL), "is not a variable selection object") expect_error(fun(fit_gauss), "is not a variable selection object") } }) test_that('invalid stats are rejected', { for (fun in vs_funs) { expect_error(fun(vs_list[[1]][["gauss"]], stat = NULL), 'specified as NULL') expect_error(fun(vs_list[[1]][["gauss"]], stat = NA), 'not recognized') expect_error(fun(vs_list[[1]][["gauss"]], stat = 'zzz'), 'not recognized') expect_error(fun(vs_list[[1]][["gauss"]], stat = 'acc'), 'available only for the binomial family') expect_error(fun(vs_list[[1]][["gauss"]], stat = 'auc'), 'available only for the binomial family') } }) test_that('invalid \'baseline\' arguments are rejected', { expect_error(varsel_stats(vs_list[[1]][["gauss"]], baseline = 'zzz'), "Argument 'baseline' must be either 'ref' or 'best'") }) test_that('varsel_stats output seems legit', { for(i in seq_along(cvs_list)) { for(j in seq_along(cvs_list[[i]])) { cvs <- cvs_list[[i]][[j]] if (cvs$family_kl$family == 'gaussian') stats_str <- valid_stats_gauss else if (cvs$family_kl$family == 'binomial') stats_str <- valid_stats_binom else stats_str <- valid_stats_all stats <- varsel_stats(cvs, stats=stats_str, type=c('mean','lower','upper','se')) expect_true(nrow(stats) == nv+1) expect_true(all(c('size','vind', stats_str, paste0(stats_str,'.se'), paste0(stats_str,'.upper'), paste0(stats_str,'.lower')) %in% names(stats))) expect_true(all(stats$mlpd > stats$mlpd.lower)) expect_true(all(stats$mlpd < stats$mlpd.upper)) } } }) test_that('varsel_stats works with reference models', { for (i in seq_along(vsref_list)) { for (j in seq_along(vsref_list[[i]])) { vs <- vsref_list[[i]][[j]] if (vs$family_kl$family == 'gaussian') stats_str <- valid_stats_gauss else stats_str <- valid_stats_binom stats <- varsel_stats(vs, stats=stats_str) expect_true(is.data.frame(stats)) } } }) test_that('print works as expected', { # default rounding expect_output(out <- print(vs_list[[1]][[1]])) expect_equal(out$elpd, round(out$elpd, 2)) expect_output(out <- print(cvs_list[[1]][[1]])) expect_equal(out$elpd, round(out$elpd, 2)) # rounding to 4 
decimal places expect_output(out <- print(vs_list[[1]][[1]], digits = 4)) expect_equal(out$elpd, round(out$elpd, 4)) expect_output(out <- print(cvs_list[[1]][[1]], digits = 4)) expect_equal(out$elpd, round(out$elpd, 4)) # options to varsel_stats expect_output(out <- print(vs_list[[1]][[1]], nv_max = 3, stats = 'mse')) expect_equal(nrow(out) - 1, 3) expect_named(out, c('size', 'vind', 'mse', 'mse.se')) expect_output(out <- print(cvs_list[[1]][[1]], nv_max = 3, stats = 'mse')) expect_equal(nrow(out) - 1, 3) expect_named(out, c('size', 'vind', 'mse', 'mse.se', 'pctch')) }) # ------------------------------------------------------------- context('varsel_plots') test_that('plotting works', { expect_s3_class(varsel_plot(vs_list[[1]][[1]]), 'ggplot') expect_visible(varsel_plot(vs_list[[1]][[1]], nv_max = 3)) }) test_that('invalid \'baseline\' arguments are rejected', { expect_error(varsel_plot(vs_list[[1]][[1]], baseline = 'zzz'), "Argument 'baseline' must be either 'ref' or 'best'") }) test_that('the value of nv_max is valid', { expect_error(varsel_plot(vs_list[[1]][[1]], nv_max = 0), 'nv_max must be at least 1') }) test_that('nv_max is capped to the largest model size', { expect_equal(varsel_plot(vs_list[[1]][[1]]), varsel_plot(vs_list[[1]][[1]], nv_max = 1000)) }) # ------------------------------------------------------------- context('suggest_size') test_that('suggest_size checks the length of stat', { expect_error(suggest_size(vs_list[[1]][["gauss"]], stat = valid_stats_all), 'Only one statistic') }) test_that('suggest_size works on all stats', { for (stat in valid_stats_gauss) { ssize <- suggest_size(vs_list[[1]][["gauss"]], stat = stat) expect_true(!is.na(ssize)) expect_true(ssize >= 0) } for (stat in valid_stats_binom) { ssize <- suggest_size(vs_list[[1]][["binom"]], stat = stat) expect_true(!is.na(ssize)) expect_true(ssize >= 0) } }) } projpred/tests/testthat/test_misc.R0000644000176200001440000000772313612375677017250 0ustar liggesuserscontext('miscellaneous') # miscellaneous tests if (require(rstanarm)) { set.seed(1235) n <- 40 nv <- 5 x <- matrix(rnorm(n*nv, 0, 1), n, nv) b <- runif(nv)-0.5 dis <- runif(1, 1, 2) weights <- sample(1:4, n, replace = T) offset <- rnorm(n) chains <- 2 seed <- 1235 iter <- 500 source(testthat::test_path('helpers', 'SW.R')) f_gauss <- gaussian() df_gauss <- data.frame(y = rnorm(n, f_gauss$linkinv(x%*%b), dis), x = x) f_binom <- binomial() df_binom <- data.frame(y = rbinom(n, weights, f_binom$linkinv(x%*%b)), x = x) f_poiss <- poisson() df_poiss <- data.frame(y = rpois(n, f_poiss$linkinv(x%*%b)), x = x) SW( fit_gauss <- stan_glm(y ~ x, family = f_gauss, data = df_gauss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter, refresh=0) ) SW( fit_binom <- stan_glm(cbind(y, weights-y) ~ x, family = f_binom, QR = T, data = df_binom, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter, refresh=0) ) SW( fit_poiss <- stan_glm(y ~ x, family = f_poiss, data = df_poiss, QR = T, weights = weights, offset = offset, chains = chains, seed = seed, iter = iter, refresh=0) ) fit_list <- list(gauss = fit_gauss, binom = fit_binom, poiss = fit_poiss) test_that("check that the main function calls do not return the same RNG state every time", { s <- 5 for (seed in c(130927, NULL)) { for (i in seq_along(fit_list)) { fit <- fit_list[[i]] # varsel foo <- varsel(fit, seed=seed) r1 <- rnorm(s) foo <- varsel(fit, seed=seed) r2 <- rnorm(s) expect_true(any(r1!=r2)) # cv_varsel SW(foo <- cv_varsel(fit, seed=seed)) r1 <- rnorm(s) 
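# a repeated call with the same seed must not leave the global RNG in an identical state, so the draws r1 and r2 should differ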
SW(foo <- cv_varsel(fit, seed=seed)) r2 <- rnorm(s) expect_true(any(r1!=r2)) # project vind <- c(1,2) foo <- project(fit, vind=vind, ns = 100, seed=seed) r1 <- rnorm(s) foo <- project(fit, vind=vind, ns = 100, seed=seed) r2 <- rnorm(s) expect_true(any(r1!=r2)) # proj_linpred vind <- c(1,3) foo <- proj_linpred(fit, x[,vind], vind=vind, seed=seed) r1 <- rnorm(s) foo <- proj_linpred(fit, x[,vind], vind=vind, seed=seed) r2 <- rnorm(s) expect_true(any(r1!=r2)) # proj_predict vind <- c(1,3) foo <- proj_predict(fit, x[,vind], vind=vind, seed=seed) r1 <- rnorm(s) foo <- proj_predict(fit, x[,vind], vind=vind, seed=seed) r2 <- rnorm(s) expect_true(any(r1!=r2)) } } }) test_that("check that providing seed has the expected effect", { for (seed in c(130927, 1524542)) { for (i in seq_along(fit_list)) { fit <- fit_list[[i]] # varsel foo <- varsel(fit, seed=seed) bar <- varsel(fit, seed=seed) expect_equal(foo, bar) # cv_varsel SW(foo <- cv_varsel(fit, seed=seed)) SW(bar <- cv_varsel(fit, seed=seed)) expect_equal(foo, bar) # project vind <- c(1,2) foo <- project(fit, vind=vind, nc = 10, seed=seed) bar <- project(fit, vind=vind, nc = 10, seed=seed) expect_equal(foo, bar) # proj_linpred vind <- c(1,3) foo <- proj_linpred(fit, x[,vind], vind=vind, seed=seed) bar <- proj_linpred(fit, x[,vind], vind=vind, seed=seed) expect_equal(foo, bar) # proj_predict vind <- c(1,3) foo <- proj_predict(fit, x[,vind], vind=vind, seed=seed) bar <- proj_predict(fit, x[,vind], vind=vind, seed=seed) expect_equal(foo, bar) } } }) } projpred/tests/testthat/test_datafit.R0000644000176200001440000001700113531311224017671 0ustar liggesuserscontext('datafit') suppressWarnings(RNGversion("3.5.0")) # tests for data based estimates (no actual reference model) if (!requireNamespace('glmnet', quietly = TRUE)) { stop('glmnet needed for this function to work. Please install it.', call. 
= FALSE) } set.seed(1235) n <- 40 nv <- 5 x <- matrix(rnorm(n*nv, 0, 1), n, nv) b <- runif(nv)-0.5 dis <- runif(1, 1, 2) weights <- sample(1:4, n, replace = T) offset <- rnorm(n) chains <- 2 seed <- 1235 iter <- 500 source(file.path('helpers', 'SW.R')) f_gauss <- gaussian() df_gauss <- data.frame(y = rnorm(n, f_gauss$linkinv(x%*%b), dis), x = x) f_binom <- binomial() df_binom <- data.frame(y = rbinom(n, weights, f_binom$linkinv(x%*%b)), x = x) f_poiss <- poisson() df_poiss <- data.frame(y = rpois(n, f_poiss$linkinv(x%*%b)), x = x) dref_gauss <- init_refmodel(x,df_gauss$y,gaussian(),offset=offset,wobs=weights) dref_binom <- init_refmodel(x,df_binom$y/weights,binomial(),offset=offset,wobs=weights) dref_poiss <- init_refmodel(x,df_poiss$y,poisson(),offset=offset,wobs=weights) dref_list <- list(gauss = dref_gauss, binom = dref_binom, poiss = dref_poiss) # varsel vsd_list <- lapply(dref_list, varsel, nv_max = nv, verbose = FALSE) # cv_varsel cvvsd_list <- lapply(dref_list, cv_varsel, nv_max = nv, verbose = FALSE) # predd_list <- lapply(vsd_list, proj_linpred, xnew=x, seed = seed, offsetnew=offset, weightsnew=weights, nv=3) test_that('predict fails for \'datafit\' objects', { expect_error(predict(dref_gauss, df_gauss), 'Cannot make predictions with data reference only') }) test_that('output of varsel is sensible with only data provided as reference model', { for(i in seq_along(vsd_list)) { # vind seems legit expect_equal(length(vsd_list[[i]]$vind), nv) # kl seems legit expect_equal(length(vsd_list[[i]]$kl), nv + 1) # kl decreasing expect_equal(vsd_list[[i]]$kl, cummin(vsd_list[[i]]$kl)) # summaries seems legit expect_named(vsd_list[[i]]$summaries, c('sub', 'ref')) expect_equal(length(vsd_list[[i]]$summaries$sub), nv + 1) expect_named(vsd_list[[i]]$summaries$sub[[1]], c('mu', 'lppd')) expect_named(vsd_list[[i]]$summaries$ref, c('mu', 'lppd')) } }) test_that("output of cv_varsel is sensible with only data provided as reference model", { for(i in seq_along(cvvsd_list)) { # vind seems legit expect_equal(length(cvvsd_list[[i]]$vind), nv) # kl seems legit expect_equal(length(cvvsd_list[[i]]$kl), nv + 1) # kl decreasing expect_equal(cvvsd_list[[i]]$kl, cummin(cvvsd_list[[i]]$kl)) # summaries seems legit expect_named(cvvsd_list[[i]]$summaries, c('sub', 'ref')) expect_equal(length(cvvsd_list[[i]]$summaries$sub), nv + 1) expect_named(cvvsd_list[[i]]$summaries$sub[[1]], c('mu', 'lppd')) expect_named(cvvsd_list[[i]]$summaries$ref, c('mu', 'lppd')) } }) test_that('varsel_stats stops if baseline = \'ref\' and deltas = TRUE', { expect_error(varsel_stats(vsd_list[[1]], baseline = 'ref', deltas = TRUE), 'Cannot use deltas = TRUE and baseline = \'ref\' when there is no reference model') }) test_that("output of project is sensible with only data provided as reference model", { for(i in 1:length(vsd_list)) { # length of output of project is legit p <- project(vsd_list[[i]], nv=0:nv) expect_equal(length(p), nv + 1) for(j in 1:length(p)) { expect_named(p[[j]], c('kl', 'weights', 'dis', 'alpha', 'beta', 'vind', 'p_type', 'intercept', 'family_kl'), ignore.order = T) # number of draws should equal to the number of draw weights ns <- length(p[[j]]$weights) expect_equal(length(p[[j]]$alpha), ns) expect_equal(length(p[[j]]$dis), ns) expect_equal(ncol(p[[j]]$beta), ns) # j:th element should have j-1 variables expect_equal(nrow(p[[j]]$beta), j-1) expect_equal(length(p[[j]]$vind), j-1) # family kl expect_equal(p[[j]]$family_kl, vsd_list[[i]]$family_kl) } # kl should be non-increasing on training data klseq <- sapply(p, 
function(e) e$kl) expect_equal(klseq, cummin(klseq)) # all submodels should use the same clustering/subsampling expect_equal(p[[1]]$weights, p[[nv]]$weights) } }) test_that("output of proj_linpred is sensible with only data provided as reference model", { for(i in 1:length(vsd_list)) { # length of output of project is legit pred <- proj_linpred(vsd_list[[i]], xnew=x, seed = seed, offsetnew=offset, weightsnew=weights, nv=3) expect_equal(length(pred), nrow(x)) pred <- proj_linpred(vsd_list[[i]], xnew=x, ynew=dref_list[[i]]$y, seed = seed, offsetnew=offset, weightsnew=weights, nv=3) expect_equal(length(pred$pred), nrow(x)) expect_equal(length(pred$lpd), nrow(x)) } }) # below are some tests that check Lasso solution computed with varsel is the same # as that of glmnet. (notice that glm_ridge and glm_elnet are already tested separately, so # these would only check that the results do not change due to varsel/cv_varsel etc.) set.seed(1235) n <- 100 nv <- 10 x <- matrix(rnorm(n*nv, 0, 1), n, nv) b <- seq(0,1,length.out = nv) dis <- runif(1, 0.3,0.5) weights <- sample(1:4, n, replace = T)# offset <- 0.1*rnorm(n) seed <- 1235 source(file.path('helpers', 'SW.R')) fams <- list(gaussian(), binomial(), poisson()) x_list <- lapply(fams, function(fam) x) y_list <- lapply(fams, function(fam) { if (fam$family == 'gaussian') { y <- rnorm(n, x%*%b, 0.5) y_glmnet <- y } else if (fam$family == 'binomial') { y <- rbinom(n, weights, fam$linkinv(x%*%b)) y <- y/weights y_glmnet <- cbind(1-y,y) # different way of specifying binomial y for glmnet } else if (fam$family == 'poisson') { y <- rpois(n, fam$linkinv(x%*%b)) y_glmnet <- y } list(y=y, y_glmnet=y_glmnet) }) test_that("L1-projection with data reference gives the same results as Lasso from glmnet.", { for (i in seq_along(fams)) { x <- x_list[[i]] y <- y_list[[i]]$y y_glmnet <- y_list[[i]]$y_glmnet fam <- fams[[i]] lambda_min_ratio <- 1e-7 nlambda <- 1500 # Lasso solution with projpred ref <- init_refmodel(x,y,family = fam, wobs = weights, offset = offset) vs <- varsel(ref, method='l1', lambda_min_ratio = lambda_min_ratio, nlambda = nlambda, thresh = 1e-12) pred1 <- proj_linpred(vs, xnew = x, nv=0:nv, transform = F, offsetnew=offset) # compute the results for the Lasso lasso <- glmnet::glmnet(x,y_glmnet,family=fam$family, weights = weights, offset = offset, lambda.min.ratio = lambda_min_ratio, nlambda = nlambda, thresh = 1e-12) vind <- predict(lasso, type='nonzero', s=lasso$lambda) nselected <- sapply(vind, function(e) length(e)) lambdainds <- sapply(unique(nselected), function(nv) max(which(nselected==nv))) lambdaval <- lasso$lambda[lambdainds] pred2 <- predict(lasso, newx=x, type='link', s=lambdaval, newoffset=offset) # check that the predictions agree (up to nv-2 only, because glmnet terminates the coefficient # path computation too early for some reason...) 
for (j in 1:(nv-2)) { expect_true( max(abs(pred1[[j]]-pred2[,j])) < 3*1e-2 ) } # check that the coefficients are similar delta <- abs(vs$spath$beta - lasso$beta[vs$spath$vind,lambdainds])[,1:(nv-2)] expect_true( max(delta) < 1e-2 ) expect_true( abs(vs$spath$alpha[1] - lasso$a0[1]) < 1e-2) # graphical checks; useful for debugging this test # k <- 5 # qplot(pred1[[k]], pred2[,k]) + geom_abline(slope=1) # # plot(lasso) # b <- vs$spath$beta # l1norm <- colSums(abs(b)) # for (j in 1:nrow(b)) # lines(l1norm, b[j,], type = 'p') } }) projpred/tests/testthat.R0000644000176200001440000000007413613746245015240 0ustar liggesuserslibrary(testthat) library(projpred) test_check("projpred") projpred/src/0000755000176200001440000000000013614341143012666 5ustar liggesusersprojpred/src/glmfun.cpp0000644000176200001440000004630613361364246014703 0ustar liggesusers#include #include #include #include #include //[[Rcpp::depends(RcppArmadillo)]] using namespace Rcpp; using namespace arma; /** * Returns the value of the penalty term in the elastic net regularization. */ double elnet_penalty(vec beta, // coefficients double lambda, // regularization parameter double alpha, // elastic net mixing parameter vec penalty) // relative penalties for the variables { double value; uvec fin = find_finite(penalty); value = lambda*sum(penalty.elem(fin) % (0.5*(1-alpha)*square(beta.elem(fin)) + alpha*(abs(beta.elem(fin))) ) ); return(value); } /** Returns the value of the regularized quadratic approximation to the loss function that is to be minimized iteratively: L = 0.5*sum_i{ w_i*(z_i - f_i)^2 } + lambda*{ 0.5*(1-alpha)*||beta||_2^2 + alpha*||beta||_1 } */ double loss_approx(vec& beta, // coefficients vec& f, // latent values vec& z, // locations for pseudo obsevations vec& w, // weights of the pseudo observations (inverse-variances) double lambda, // regularization parameter double alpha, // elastic net mixing parameter vec& penalty) // relative penalties for the variables { double loss; uvec fin = find_finite(penalty); loss = 0.5*sum(w % square(z-f)) + elnet_penalty(beta,lambda,alpha,penalty); return loss; } /** Updates the regression coefficients and the intercept (unless excluded) based on the current quadratic approximation to the loss function. This is done via the 'soft-thresholding' as described by Friedman et. al (2009). Performs either one pass through the specified set of varibles or iterates until convergence. 
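
   In detail, let f^(-j) denote the current fit with the j:th variable's contribution
   removed, and let h_j = sum_i{ w_i*x_ij*(z_i - f_i^(-j)) }. Each coefficient is then
   updated as

       beta_j = S(h_j, penalty_j*lambda*alpha) / ( sum_i{ w_i*x_ij^2 } + penalty_j*lambda*(1-alpha) ),

   where S(h, g) = sign(h)*max(|h| - g, 0) is the soft-thresholding operator. This is
   exactly what the three branches of the coefficient update inside the loop below compute.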
*/ void coord_descent( vec& beta, // regression coefficients double& beta0, // intercept vec& f, // latent values mat& x, // input matrix vec& z, // locations for pseudo obsevations vec& w, // weights of the pseudo observations (inverse-variances) double& lambda, // regularization parameter double& alpha, // elastic net mixing parameter vec& penalty, // relative penalties for the variables bool intercept, // whether to use intercept std::set& varind, // which coefficients are updated std::set& active_set, // active set, may change if some variables enter or leave bool until_convergence, // true = until convergence, false = one pass through varind int& npasses, // counts total passes through the variables double tol, // stop when change in the loss is smaller than this int maxiter = 1000) // maximum number of iterations (passes) through varind { int iter = 0; double loss,loss_old; size_t j; double h; // initial loss loss_old = loss_approx(beta,f,z,w,lambda,alpha,penalty); // auxiliary that will be used later on // double lam_alpha = lambda*alpha; // double lam_oneminus_alpha = lambda*(1-alpha); while (iter < maxiter) { // update the intercept if (intercept) { f = f - beta0; beta0 = sum(w % (z - f)) / sum(w); f = f + beta0; } active_set.clear(); for (std::set::iterator it=varind.begin(); it!=varind.end(); ++it) { // update the regression coefficients via 'soft-thresholding' // varible index j = *it; f = f - beta(j)*x.col(j); h = sum( w % x.col(j) % (z - f) ); // auxiliary variable if (fabs(h) <= penalty(j)*lambda*alpha) { beta(j) = 0.0; } else if (h > 0) { beta(j) = (h - penalty(j)*lambda*alpha) / ( sum(w % square(x.col(j))) + penalty(j)*lambda*(1-alpha) ); active_set.insert(j); } else { beta(j) = (h + penalty(j)*lambda*alpha) / ( sum(w % square(x.col(j))) + penalty(j)*lambda*(1-alpha) ); active_set.insert(j); } f = f + beta(j)*x.col(j); } ++iter; ++npasses; loss = loss_approx(beta,f,z,w,lambda,alpha,penalty); if (until_convergence) { if (loss_old-loss < tol) { break; } else { // continue iterating loss_old = loss; } } else { break; } } if (iter == maxiter) Rcpp::Rcout << "Warning: maximum number of iterations reached in coordinate descent. Results can be inaccurate!\n"; } /** Computes the whole elastic-net regularization path given the grid of values to lambda. Assumes that the lambda grid is selected carefully and utilizes the function pseudo_obs that returns the pseudo-observations corresponding to the quadratic approximation to the loss function for a given vector of latent values (see elnetfun.R). 
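
   Outline of the algorithm: the lambda grid is traversed in the given order,
   warm-starting each solution (beta, beta0 and the fit f) from the previous one.
   For each lambda the code alternates between (1) refreshing the quadratic
   approximation by calling pseudo_obs(f, w), which must return a list with fields
   "z" (the pseudo-observations) and "w" (their weights); and (2) running active-set
   coordinate descent: first iterating within the current active set until
   convergence, then making one full pass over all variables to check whether the
   active set changes. (glm_ridge below additionally uses the fields "grad" and
   "loss" of the pseudo_obs return value.) The iteration for a given lambda stops
   once the change in the approximate loss falls below the tolerance, and the whole
   path computation terminates early when the active set size reaches pmax (unless
   the penalty is pure ridge, i.e. alpha = 0).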
*/ // [[Rcpp::export]] List glm_elnet_c(arma::mat x, // input matrix Function pseudo_obs, // R-function returning the pseudo-data based on the quadratic approximation arma::vec lambda, // grid for the regularization parameter double alpha, // elastic net mixing parameter bool intercept, // whether to use intercept arma::vec penalty, // relative penalties for the variables double thresh, // threshold for determining the convergence int qa_updates_max, // maximum for the total number of quadratic approximation updates int pmax, // stop computation when the active set size is equal or greater than this bool pmax_strict, // if true, then the active set size of the last beta is always at most pmax arma::vec beta, // initial value for the regression coefficients double beta0, // initial value for the intercept arma::vec w0, // initial guess for the weights of the pseudo-gaussian observations (needed for Student-t model) int as_updates_max = 50) // maximum number of active set updates for one quadratic approximation { // for gaussian pseudo data List obs; vec z; // observations vec w; // weights (inverse variances) size_t D = x.n_cols; // number of inputs size_t pmaxu = (size_t) pmax; // converting pmax to unsigned int (avoids some compiler warnings) int nlam = lambda.size(); double lam; // temporary varible for fixed lambda int k; // lambda index int qau; // number of quadratic approximation updates int asu; // number of active set updates (for a given quadratic approximation) // for storing the whole solution path rowvec beta0_path(nlam); mat beta_path(D,nlam); mat w_path(x.n_rows,nlam); beta0_path.zeros(); beta_path.zeros(); int npasses = 0; // counts how many times the coefficient vector is looped through urowvec qa_updates(nlam); qa_updates.zeros(); urowvec as_updates(nlam); as_updates.zeros(); // initialization if (!intercept) beta0 = 0; // ensure intercept is zero when it is not used vec f = x*beta + beta0; std::set active_set; std::set active_set_old; std::set varind_all; // a constant set containing indices of all the variables for (size_t j=0; j(obs["z"]); w = as(obs["w"]); double loss_initial = loss_approx(beta, f, z, w, lambda(0), alpha, penalty); // initial loss double loss_old = loss_initial; // will be updated iteratively double loss; // will be updated iteratively double tol = thresh*fabs(loss_initial); // convergence criterion for coordinate descent // loop over lambda values for (k=0; k(obs["z"]); w = as(obs["w"]); ++qau; // current value of the (approximate) loss function loss_old = loss_approx(beta, f, z, w, lam, alpha, penalty); // loss_old = ((double) obs["loss"]) + elnet_penalty(beta, lam, alpha, penalty); // run the coordinate descent until convergence for the current // quadratic approximation asu = 0; while (asu < as_updates_max) { // iterate within the current active set until convergence (this might update // active_set_old, if some variable goes to zero) coord_descent(beta, beta0, f, x, z, w, lam, alpha, penalty, intercept, active_set, active_set_old, true, npasses, tol); // perfom one pass over all the variables and check if the active set changes // (this might update active_set) coord_descent(beta, beta0, f, x, z, w, lam, alpha, penalty, intercept, varind_all, active_set, false, npasses, tol); ++asu; if (active_set==active_set_old) { // active set did not change so convergence reached // (for the current quadratic approximation to the loss function) break; } } as_updates(k) = as_updates(k) + asu; // the loss after updating the coefficients loss = loss_approx(beta, 
f, z, w, lam, alpha, penalty); // obs = pseudo_obs(f,w); // loss = ((double) obs["loss"]) + elnet_penalty(beta, lam, alpha, penalty); // check if converged if (fabs(loss_old-loss) < tol) { // if (loss_old-loss < tol) { // convergence reached; proceed to the next lambda value break; } } // store the current solution beta0_path(k) = beta0; beta_path.col(k) = beta; w_path.col(k) = w; qa_updates(k) = qau; if (qau == qa_updates_max && qa_updates_max > 1) Rcpp::Rcout << "glm_elnet warning: maximum number of quadratic approximation updates reached. Results can be inaccurate.\n"; if ((alpha > 0.0) && (active_set.size() >= pmaxu)) { // obtained solution with at least pmax variables and penalty is not ridge, so terminate if (pmax_strict) { // return solutions only up to the previous lambda value beta0_path = beta0_path.head(k); beta_path = beta_path.head_cols(k); } else { // return solutions up to the current lambda value beta0_path = beta0_path.head(k+1); beta_path = beta_path.head_cols(k+1); } break; } } return List::create(beta_path, beta0_path, w_path, npasses, qa_updates, as_updates); } /** Internal function that gives the output in c++ way (writes into allocated memory). * See glm_ridge_c for a wrapper that is being called from R. */ void glm_ridge( vec& beta, // output: regression coefficients (contains intercept) double& loss, // output: value of the loss function vec& w, // output: weights of the pseudo-gaussian observations at the optimum (needed for Student-t model) int& qau, // output: number of quadratic approximation updates arma::mat x, Function pseudo_obs, double lambda, bool intercept, arma::vec penalty, // relative penalties for the variables double thresh, int qa_updates_max, int ls_iter_max=50, bool debug=false) { if (intercept) { // add a vector of ones to x and set zero penalty for the intercept x = join_horiz(ones(x.n_rows), x); penalty = join_vert(zeros(1), penalty); } int n = x.n_rows; int D = x.n_cols; int ls_iter; // counts linesearch iterations int j; double t; // step size in line search double a = 0.1; // backtracking line search parameters a and b (see Boyd and Vandenberghe, 2004) double b = 0.5; // initialization vec beta_new(D); beta_new.zeros(); vec dbeta(D); dbeta.zeros(); vec grad(D); grad.zeros(); // gradient of the negative log likelihood w.r.t. the regression coefficients vec grad_f(n); grad_f.zeros(); // pointwise gradient of the negative log likelihood w.r.t. the latent values f vec f = x*beta; mat xw(n,D); // this will be the weighted x mat regmat = lambda*diagmat(penalty);//eye(D,D); // regularization matrix // initial quadratic approximation List obs = pseudo_obs(f,w); vec z = as(obs["z"]); w = as(obs["w"]); grad_f = as(obs["grad"]); double loss_initial = ((double) obs["loss"]) + elnet_penalty(beta, lambda, 0, penalty); double loss_old = loss_initial; // will be updated iteratively loss = loss_initial; // will be updated iteratively double tol = thresh*fabs(loss_initial); // threshold for convergence double decrement = 0; // newton decrement, used to monitor convergence qau = 0; while (qau < qa_updates_max) { // weight the observations for (j=0; j 0) { if (loss < loss_old - a*t*decrement ) break; } else { Rcpp::Rcout << "The search direction is not a descent direction "; Rcpp::Rcout << "(newton decrement = " << decrement << ", should be positive), "; Rcpp::Rcout << ", this is likely a bug. Please report to the developers." 
<< '\n'; } } if (ls_iter == ls_iter_max && ls_iter_max > 1) { // beta.print("beta = "); // dbeta.print("dbeta = "); // grad.print("grad = "); // Rcpp::Rcout << "loss = " << loss << "\n"; // Rcpp::Rcout << "loss_initial = " << loss_initial << "\n"; // Rcpp::Rcout << "tol = " << tol << "\n"; // Rcpp::Rcout << "decrement = " << decrement << "\n"; // Rcpp::Rcout << "\n\n"; Rcpp::Rcout << "glm_ridge warning: maximum number of line search iterations reached. The optimization can be ill-behaved.\n"; break; } // update the solution beta = beta + t*dbeta; z = as(obs["z"]); w = as(obs["w"]); grad_f = as(obs["grad"]); loss_old = loss; ++qau; } if (qau == qa_updates_max && qa_updates_max > 1) { if (decrement/fabs(loss_initial) > 100*tol) { // warn the user if the max number of iterations is reached and we are relatively far // (two orders of magnitude) from the given convergence threshold Rcpp::Rcout << "glm_ridge warning: maximum number of quadratic approximation updates reached, within "; Rcpp::Rcout << decrement << " from optimum (tolerance = " << thresh << ").\n"; } } } /** * Wrapper for glm_ridge that can be called from R. */ // [[Rcpp::export]] List glm_ridge_c( arma::mat x, Function pseudo_obs, double lambda, bool intercept, arma::vec penalty, // relative penalties for the variables arma::vec beta_init, // initial value for the coefficients (containing the intercept as the first element) arma::vec w_init, // initial guess for the weights of the pseudo-gaussian observations (needed for Student-t model) double thresh, int qa_updates_max, int ls_iter_max=100, bool debug=false) { int D = x.n_cols; if (intercept) D++; vec beta = beta_init; vec w = w_init; int qau; double loss; glm_ridge(beta, loss, w, qau, x, pseudo_obs, lambda, intercept, penalty, thresh, qa_updates_max, ls_iter_max, debug); if (intercept) return List::create(vec(beta.tail(D-1)), beta(0), w, loss, qau); // return List::create(vec(beta.tail(D-1)), beta(0), w, qau); else return List::create(beta, 0.0, w, loss, qau); // return List::create(beta, 0.0, w, qau); } /** Forward search for glm. 
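
   At each step the search loops over all candidate variables that have not yet been
   chosen, tentatively adds each one to the current active set, fits the resulting
   submodel with the L2-regularized solver glm_ridge, and permanently adds the
   candidate that yields the smallest loss. This is repeated until pmax variables
   have been added; the intercept, the coefficients and the weights of the gaussian
   pseudo-observations are stored for every step of the search path.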
*/ // [[Rcpp::export]] List glm_forward_c( arma::mat x, // inputs (features) Function pseudo_obs, // R-function returning the pseudo-data based on the quadratic approximation double lambda, // regularization parameter (multiplier for L2-penalty) bool intercept, // whether to use intercept arma::vec penalty, // relative penalties for the variables double thresh, // threshold for stopping the iterative reweighted least squares int qa_updates_max, // max number or quadratic approximation updates int pmax, // maximum number of variables up to which the search is continued arma::vec w0, // initial guess for the weights of the pseudo-gaussian observations (needed for Student-t model) int ls_iter_max=50 ) // max number of line search iterations { mat xp; // x for the current active set mat xp_temp; // x for the current active set + the variable to be added size_t D = x.n_cols; // total number of inputs size_t pmaxu = (size_t) pmax; // converting pmax to unsigned int (avoids some compiler warnings) uvec chosen(D); chosen.zeros(); // keeps track of added variables uvec varorder(pmaxu); varorder.zeros(); // stores the order in which the variables are added to the model mat beta_all(D,pmaxu); beta_all.zeros(); // collects beta from all steps rowvec beta0_all(pmaxu); beta0_all.zeros(); // collects beta0 from all steps mat w_all(x.n_rows,pmaxu); // collects weights of the gaussian pseudo-observations from all steps // declare a few variables that are needed during the iteration vec w = w0; int qau; size_t j,k,jopt=0; uvec varind; uvec step(1); for (k=0; k::infinity(); double loss; // loop through all the candidate variables that could be added next for (j=0; j do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include #include using namespace Rcpp; // glm_elnet_c List glm_elnet_c(arma::mat x, Function pseudo_obs, arma::vec lambda, double alpha, bool intercept, arma::vec penalty, double thresh, int qa_updates_max, int pmax, bool pmax_strict, arma::vec beta, double beta0, arma::vec w0, int as_updates_max); RcppExport SEXP _projpred_glm_elnet_c(SEXP xSEXP, SEXP pseudo_obsSEXP, SEXP lambdaSEXP, SEXP alphaSEXP, SEXP interceptSEXP, SEXP penaltySEXP, SEXP threshSEXP, SEXP qa_updates_maxSEXP, SEXP pmaxSEXP, SEXP pmax_strictSEXP, SEXP betaSEXP, SEXP beta0SEXP, SEXP w0SEXP, SEXP as_updates_maxSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); Rcpp::traits::input_parameter< Function >::type pseudo_obs(pseudo_obsSEXP); Rcpp::traits::input_parameter< arma::vec >::type lambda(lambdaSEXP); Rcpp::traits::input_parameter< double >::type alpha(alphaSEXP); Rcpp::traits::input_parameter< bool >::type intercept(interceptSEXP); Rcpp::traits::input_parameter< arma::vec >::type penalty(penaltySEXP); Rcpp::traits::input_parameter< double >::type thresh(threshSEXP); Rcpp::traits::input_parameter< int >::type qa_updates_max(qa_updates_maxSEXP); Rcpp::traits::input_parameter< int >::type pmax(pmaxSEXP); Rcpp::traits::input_parameter< bool >::type pmax_strict(pmax_strictSEXP); Rcpp::traits::input_parameter< arma::vec >::type beta(betaSEXP); Rcpp::traits::input_parameter< double >::type beta0(beta0SEXP); Rcpp::traits::input_parameter< arma::vec >::type w0(w0SEXP); Rcpp::traits::input_parameter< int >::type as_updates_max(as_updates_maxSEXP); rcpp_result_gen = Rcpp::wrap(glm_elnet_c(x, pseudo_obs, lambda, alpha, intercept, penalty, thresh, qa_updates_max, pmax, pmax_strict, beta, beta0, w0, as_updates_max)); return 
rcpp_result_gen; END_RCPP } // glm_ridge_c List glm_ridge_c(arma::mat x, Function pseudo_obs, double lambda, bool intercept, arma::vec penalty, arma::vec beta_init, arma::vec w_init, double thresh, int qa_updates_max, int ls_iter_max, bool debug); RcppExport SEXP _projpred_glm_ridge_c(SEXP xSEXP, SEXP pseudo_obsSEXP, SEXP lambdaSEXP, SEXP interceptSEXP, SEXP penaltySEXP, SEXP beta_initSEXP, SEXP w_initSEXP, SEXP threshSEXP, SEXP qa_updates_maxSEXP, SEXP ls_iter_maxSEXP, SEXP debugSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); Rcpp::traits::input_parameter< Function >::type pseudo_obs(pseudo_obsSEXP); Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); Rcpp::traits::input_parameter< bool >::type intercept(interceptSEXP); Rcpp::traits::input_parameter< arma::vec >::type penalty(penaltySEXP); Rcpp::traits::input_parameter< arma::vec >::type beta_init(beta_initSEXP); Rcpp::traits::input_parameter< arma::vec >::type w_init(w_initSEXP); Rcpp::traits::input_parameter< double >::type thresh(threshSEXP); Rcpp::traits::input_parameter< int >::type qa_updates_max(qa_updates_maxSEXP); Rcpp::traits::input_parameter< int >::type ls_iter_max(ls_iter_maxSEXP); Rcpp::traits::input_parameter< bool >::type debug(debugSEXP); rcpp_result_gen = Rcpp::wrap(glm_ridge_c(x, pseudo_obs, lambda, intercept, penalty, beta_init, w_init, thresh, qa_updates_max, ls_iter_max, debug)); return rcpp_result_gen; END_RCPP } // glm_forward_c List glm_forward_c(arma::mat x, Function pseudo_obs, double lambda, bool intercept, arma::vec penalty, double thresh, int qa_updates_max, int pmax, arma::vec w0, int ls_iter_max); RcppExport SEXP _projpred_glm_forward_c(SEXP xSEXP, SEXP pseudo_obsSEXP, SEXP lambdaSEXP, SEXP interceptSEXP, SEXP penaltySEXP, SEXP threshSEXP, SEXP qa_updates_maxSEXP, SEXP pmaxSEXP, SEXP w0SEXP, SEXP ls_iter_maxSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< arma::mat >::type x(xSEXP); Rcpp::traits::input_parameter< Function >::type pseudo_obs(pseudo_obsSEXP); Rcpp::traits::input_parameter< double >::type lambda(lambdaSEXP); Rcpp::traits::input_parameter< bool >::type intercept(interceptSEXP); Rcpp::traits::input_parameter< arma::vec >::type penalty(penaltySEXP); Rcpp::traits::input_parameter< double >::type thresh(threshSEXP); Rcpp::traits::input_parameter< int >::type qa_updates_max(qa_updates_maxSEXP); Rcpp::traits::input_parameter< int >::type pmax(pmaxSEXP); Rcpp::traits::input_parameter< arma::vec >::type w0(w0SEXP); Rcpp::traits::input_parameter< int >::type ls_iter_max(ls_iter_maxSEXP); rcpp_result_gen = Rcpp::wrap(glm_forward_c(x, pseudo_obs, lambda, intercept, penalty, thresh, qa_updates_max, pmax, w0, ls_iter_max)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_projpred_glm_elnet_c", (DL_FUNC) &_projpred_glm_elnet_c, 14}, {"_projpred_glm_ridge_c", (DL_FUNC) &_projpred_glm_ridge_c, 11}, {"_projpred_glm_forward_c", (DL_FUNC) &_projpred_glm_forward_c, 10}, {NULL, NULL, 0} }; RcppExport void R_init_projpred(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } projpred/R/0000755000176200001440000000000013614341143012300 5ustar liggesusersprojpred/R/search.R0000644000176200001440000001067313361364246013707 0ustar liggesusers# Search heuristics # search_forward1 <- function(p_ref, d_train, family, intercept, nv_max, verbose, opt) { # 
predictive mean and variance of the reference model (with parameters integrated out) mu <- p_ref$mu v <- p_ref$var if (NCOL(mu) > 1 || NCOL(v) > 1) stop('Internal error: search_forward1 received multiple draws. Please report to the developers.') # forward search search <- glm_forward(d_train$x, mu, family, lambda=opt$regul, offset=d_train$offset, weights=d_train$weights, obsvar=v, intercept=intercept, pmax=nv_max) out <- list(alpha=search$beta0, beta=search$beta[search$varorder,], w=search$w, vind=search$varorder) return(out) } search_forward <- function(p_ref, d_train, family_kl, intercept, nv_max, verbose, opt) { # initialize the forward selection # proj performs the projection over samples projfun <- .get_proj_handle(family_kl, opt$regul) i <- 1 iq <- ceiling(quantile(1:nv_max, 1:10/10)) cols <- 1:ncol(d_train$x) chosen <- NULL # start adding variables one at a time while(i <= nv_max) { notchosen <- setdiff(cols, chosen) cands <- lapply(notchosen, function(x) c(chosen, x)) p_sub <- sapply(cands, projfun, p_ref, d_train, intercept) imin <- which.min(p_sub['kl',]) chosen <- c(chosen, notchosen[imin]) if(verbose && i %in% iq) print(paste0(names(iq)[max(which(i == iq))], " of variables selected.")) i <- i + 1 } chosen } search_L1 <- function(p_ref, d_train, family, intercept, nv_max, penalty, opt) { # predictive mean and variance of the reference model (with parameters integrated out) mu <- p_ref$mu v <- p_ref$var if (NCOL(mu) > 1 || NCOL(v) > 1) stop('Internal error: search_L1 received multiple draws. Please report to the developers.') # L1-penalized projection (projection path). # (Notice: here we use pmax = nv_max+1 so that the computation gets carried until all the way # down to the least regularization also for model size nv_max) search <- glm_elnet(d_train$x, mu, family, lambda_min_ratio=opt$lambda_min_ratio, nlambda=opt$nlambda, pmax=nv_max+1, pmax_strict=FALSE, offset=d_train$offset, weights=d_train$weights, intercept=intercept, obsvar=v, penalty=penalty, thresh=opt$thresh) # sort the variables according to the order in which they enter the model in the L1-path entering_indices <- apply(search$beta!=0, 1, function(num) which(num)[1]) # na for those that did not enter entered_variables <- c(1:NCOL(d_train$x))[!is.na(entering_indices)] # variables that entered at some point notentered_variables <- c(1:NCOL(d_train$x))[is.na(entering_indices)] # variables that did not enter at any point order_of_entered <- sort(entering_indices, index.return=TRUE)$ix order <- c(entered_variables[order_of_entered], notentered_variables) # fetch the coefficients corresponding to those points at the searchpath where new variable enters nvar <- length(order) n <- nrow(p_ref$mu) out <- list(alpha=rep(NA, nv_max+1), beta=matrix(0, nrow=nv_max, ncol=nv_max+1), lambda=rep(NA, nv_max+1), w=matrix(NA, nrow=n, ncol=nv_max+1)) for (k in 0:nv_max) { if (k == 0) { out$alpha[1] <- search$beta0[1] out$lambda[1] <- search$lambda[1] out$w[,1] <- search$w[,1] } else { # find those points in the L1-path where only the k most relevant features can have nonzero # coefficient, and then fetch their coefficients with least regularization ivar <- utils::tail(order, nvar-k) steps_k_var <- which(colSums(search$beta[ivar,,drop=F] != 0) == 0) if (length(steps_k_var) > 0) j <- utils::tail(steps_k_var, 1) else # no steps where all the variables in set ivar would have zero coefficient (could be due # to one or more of these variables having penalty = 0 so they are always in the model) # so set the coefficients to be equal to the 
starting value j <- 1 out$alpha[k+1] <- search$beta0[j] out$beta[1:k,k+1] <- search$beta[order[1:k],j] out$lambda[k+1] <- search$lambda[j] out$w[,k+1] <- search$w[,j] } } if (length(entered_variables) < nv_max) if (length(setdiff(notentered_variables, which(penalty == Inf))) > 0) warning('Less than nv_max variables entered L1-path. Try reducing lambda_min_ratio. ') out$vind <- order[1:nv_max] return(out) } projpred/R/methods.R0000644000176200001440000005706313614333254014105 0ustar liggesusers#' Extract draws of the linear predictor and draw from the predictive #' distribution of the projected submodel #' #' \code{proj_linpred} extracts draws of the linear predictor and #' \code{proj_predict} draws from the predictive distribution of the projected #' submodel or submodels. If the projection has not been performed, the #' functions also perform the projection. #' #' @name proj-pred #' #' @param object Either an object returned by \link[=varsel]{varsel}, \link[=cv_varsel]{cv_varsel} #' or \link[=init_refmodel]{init_refmodel}, or alternatively any object that can be converted to a reference model. #' @param xnew The predictor values used in the prediction. If \code{vind} is #' specified, then \code{xnew} should either be a dataframe containing column names #' that correspond to \code{vind} or a matrix with the number and order of columns #' corresponding to \code{vind}. If \code{vind} is unspecified, then \code{xnew} must #' either be a dataframe containing all the column names as in the original data or a matrix #' with the same columns at the same positions as in the original data. #' @param ynew New (test) target variables. If given, then the log predictive density #' for the new observations is computed. #' @param offsetnew Offsets for the new observations. By default a vector of #' zeros. #' @param weightsnew Weights for the new observations. For binomial model, #' corresponds to the number trials per observation. For \code{proj_linpred}, #' this argument matters only if \code{ynew} is specified. By default a vector #' of ones. #' @param transform Should the linear predictor be transformed using the #' inverse-link function? Default is \code{FALSE}. For \code{proj_linpred} only. #' @param integrated If \code{TRUE}, the output is averaged over the #' parameters. Default is \code{FALSE}. For \code{proj_linpred} only. #' @param nv Number of variables in the submodel (the variable combination is #' taken from the variable selection information). If a vector with several values, #' then results for all specified model sizes are returned. Ignored if \code{vind} is specified. #' By default use the automatically suggested model size. #' @param draws Number of draws to return from the predictive distribution of #' the projection. The default is 1000. #' For \code{proj_predict} only. #' @param seed_samp An optional seed to use for drawing from the projection. #' For \code{proj_predict} only. #' @param ... Additional argument passed to \link{project} if \code{object} #' is an object returned by \link{varsel} or \link{cv_varsel}. #' #' @return If the prediction is done for one submodel only (\code{nv} has length one #' or \code{vind} is specified) and ynew is unspecified, a matrix or vector of #' predictions (depending on the value of \code{integrated}). If \code{ynew} is specified, #' returns a list with elements pred (predictions) and lpd (log predictive densities). #' If the predictions are done for several submodel sizes, returns a list with one element #' for each submodel. 
#' #' @examples #' \donttest{ #' ### Usage with stanreg objects #' fit <- stan_glm(y~x, binomial()) #' vs <- varsel(fit) #' #' # compute predictions with 4 variables at the training points #' pred <- proj_linpred(vs, xnew=x, nv = 4) #' pred <- proj_predict(vs, xnew=x, nv = 4) #' #' } #' NULL # The 'helper' for proj_linpred and proj_predict, ie. does all the # functionality that is common to them. It essentially checks all the arguments # and sets them to their respective defaults and then loops over the # projections. For each projection, it evaluates the fun-function, which # calculates the linear predictor if called from proj_linpred and samples from # the predictive distribution if called from proj_predict. proj_helper <- function(object, xnew, offsetnew, weightsnew, nv, seed_samp, fun, ...) { if (!inherits(xnew, c('data.frame', 'matrix'))) stop('xnew must be a data.frame or a matrix. See ?proj-pred.') if (is.null(offsetnew)) offsetnew <- rep(0, nrow(xnew)) if (is.null(weightsnew)) weightsnew <- rep(1, nrow(xnew)) if ('projection' %in% class(object) || (length(object)>0 && 'projection' %in% class(object[[1]]))) { proj <- object } else { # reference model or varsel object obtained, so run the projection proj <- project(object = object, nv = nv, ...) } if (!.is_proj_list(proj)) { proj <- list(proj) } else { # proj is not a projection object if(any(sapply(proj, function(x) !('family_kl' %in% names(x))))) stop('list contains objects not created by varsel, cv_varsel or project') } projected_sizes <- sapply(proj, function(x) NROW(x$beta)) nv <- list(...)$nv %ORifNULL% projected_sizes if (!all(nv %in% projected_sizes)) stop(paste0('Linear prediction requested for nv = ', paste(nv, collapse = ', '), ', but projection performed only for nv = ', paste(projected_sizes, collapse = ', '), '.')) projs <- Filter(function(x) NROW(x$beta) %in% nv, proj) names(projs) <- nv xnew_df <- is.data.frame(xnew) if (xnew_df) { terms <- unique(unlist(lapply(projs, function(x) names(x$vind)))) xnew <- .df_to_model_mat(xnew, terms) } vind <- list(...)$vind if (!is.null(vind) && NCOL(xnew) != length(vind)) stop(paste('The number of columns in xnew does not match the', 'number of variable indices (vind).')) # set random seed but ensure the old RNG state is restored on exit if (exists('.Random.seed')) { rng_state_old <- .Random.seed on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv)) } set.seed(seed_samp) preds <- lapply(projs, function(proj) { if (xnew_df) { xtemp <- xnew[, min(1, length(proj$vind)):length(proj$vind), drop = F] } else if (!is.null(vind)) { # columns of xnew are assumed to match to the given variable indices xtemp <- xnew } else { # fetch the right columns from the feature matrix if (length(proj$vind) > 0 && max(proj$vind) > ncol(xnew)) stop(paste('xnew has', ncol(xnew), 'columns, but vind expects', max(proj$vind), 'columns.')) xtemp <- xnew[, proj$vind, drop = F] } mu <- proj$family_kl$mu_fun(xtemp, proj$alpha, proj$beta, offsetnew) fun(proj, mu, offsetnew, weightsnew) }) .unlist_proj(preds) } #' @rdname proj-pred #' @export proj_linpred <- function(object, xnew, ynew = NULL, offsetnew = NULL, weightsnew = NULL, nv = NULL, transform = FALSE, integrated = FALSE, ...) 
{ # function to perform to each projected submodel fun <- function(proj, mu, offset, weights) { pred <- t(mu) if (!transform) pred <- proj$family_kl$linkfun(pred) if (integrated) { # average over the parameters pred <- as.vector( proj$weights %*% pred ) } else if (!is.null(dim(pred)) && dim(pred)[1]==1) { # return a vector if pred contains only one row pred <- as.vector(pred) } if (!is.null(ynew)) { # compute also the log-density target <- .get_standard_y(ynew, weights, proj$family_kl) ynew <- target$y weights <- target$weights lpd <- proj$family_kl$ll_fun(mu, proj$dis, ynew, weights) if (integrated && !is.null(dim(lpd))) { lpd <- as.vector(apply(lpd, 1, log_weighted_mean_exp, proj$weights)) } else if (!is.null(dim(lpd))) { lpd <- t(lpd) } list(pred = pred, lpd = lpd) } else { pred } } # proj_helper lapplies fun to each projection in object proj_helper(object = object, xnew = xnew, offsetnew = offsetnew, weightsnew = weightsnew, nv = nv, seed_samp = NULL, fun = fun, ...) } #' @rdname proj-pred #' @export proj_predict <- function(object, xnew, offsetnew = NULL, weightsnew = NULL, nv = NULL, draws = NULL, seed_samp = NULL, ...) { # function to perform to each projected submodel fun <- function(proj, mu, offset, weights) { if(is.null(draws)) draws <- 1000 draw_inds <- sample(x = seq_along(proj$weights), size = draws, replace = TRUE, prob = proj$weights) t(sapply(draw_inds, function(i) { proj$family_kl$ppd_fun(mu[,i], proj$dis[i], weights) })) } # proj_helper lapplies fun to each projection in object proj_helper(object = object, xnew = xnew, offsetnew = offsetnew, weightsnew = weightsnew, nv = nv, seed_samp = seed_samp, fun = fun, ...) } #' Plot or fetch summary statistics related to variable selection #' #' \code{varsel_stats} can be used to obtain summary statistics related to #' variable selection. The same statistics can be plotted with #' \code{varsel_plot}. #' #' @name varsel-statistics #' #' @param object The object returned by \link[=varsel]{varsel} or #' \link[=cv_varsel]{cv_varsel}. #' @param nv_max Maximum submodel size for which the statistics are calculated. #' For \code{varsel_plot} it must be at least 1. #' @param stats One or several strings determining which statistics to calculate. Available #' statistics are: #' \itemize{ #' \item{elpd:} {(Expected) sum of log predictive densities} #' \item{mlpd:} {Mean log predictive density, that is, elpd divided by the number of datapoints.} #' \item{mse:} {Mean squared error (gaussian family only)} #' \item{rmse:} {Root mean squared error (gaussian family only)} #' \item{acc/pctcorr:} {Classification accuracy (binomial family only)} #' \item{auc:} {Area under the ROC curve (binomial family only)} #' } #' Default is elpd. #' @param type One or more items from 'mean', 'se', 'lower' and 'upper' indicating which of these to #' compute (mean, standard error, and lower and upper credible bounds). The credible bounds are determined so #' that \code{1-alpha} percent of the mass falls between them. #' @param deltas If \code{TRUE}, the submodel statistics are estimated relative to the baseline model #' (see argument \code{baseline}) instead of estimating the actual values of the statistics. #' Defaults to \code{FALSE}. #' @param alpha A number indicating the desired coverage of the credible #' intervals. For example \code{alpha=0.32} corresponds to 68\% probability mass #' within the intervals, that is, one standard error intervals. 
#' @param baseline Either 'ref' or 'best' indicating whether the baseline is the reference model or #' the best submodel found. Default is 'ref' when the reference model exists, and 'best' otherwise. #' @param ... Currently ignored. #' #' #' @examples #' \donttest{ #' ### Usage with stanreg objects #' fit <- stan_glm(y~x, binomial()) #' vs <- cv_varsel(fit) #' varsel_plot(vs) #' #' # print out some stats #' varsel_stats(vs, stats=c('acc'), type = c('mean','se')) #' } #' NULL #' @rdname varsel-statistics #' @export varsel_plot <- function(object, nv_max = NULL, stats = 'elpd', deltas = F, alpha = 0.32, baseline=NULL, ...) { .validate_vsel_object_stats(object, stats) baseline <- .validate_baseline(object$refmode, baseline, deltas) # compute all the statistics and fetch only those that were asked nfeat_baseline <- .get_nfeat_baseline(object, baseline, stats[1]) tab <- rbind(.tabulate_stats(object, stats, alpha = alpha, nfeat_baseline=nfeat_baseline), .tabulate_stats(object, stats, alpha = alpha)) stats_table <- subset(tab, tab$delta==deltas) stats_ref <- subset(stats_table, stats_table$size==Inf) stats_sub <- subset(stats_table, stats_table$size!=Inf) stats_bs <- subset(stats_table, stats_table$size == nfeat_baseline) if(NROW(stats_sub) == 0) { stop(paste0(ifelse(length(stats)==1, 'Statistics ', 'Statistic '), paste0(unique(stats), collapse=', '), ' not available.')) } if(is.null(nv_max)) nv_max <- max(stats_sub$size) else { # don't exceed the maximum submodel size nv_max <- min(nv_max, max(stats_sub$size)) if (nv_max < 1) stop('nv_max must be at least 1') } ylab <- if(deltas) 'Difference to the baseline' else 'Value' # make sure that breaks on the x-axis are integers n_opts <- c(4,5,6) n_possible <- Filter(function(x) nv_max %% x == 0, n_opts) n_alt <- n_opts[which.min(n_opts - (nv_max %% n_opts))] nb <- ifelse(length(n_possible) > 0, min(n_possible), n_alt) by <- ceiling(nv_max/min(nv_max, nb)) breaks <- seq(0, by*min(nv_max, nb), by) minor_breaks <- if(by%%2 == 0) seq(by/2, by*min(nv_max, nb), by) else NULL # plot submodel results pp <- ggplot(data = subset(stats_sub, stats_sub$size <= nv_max), mapping = aes_string(x = 'size')) + geom_linerange(aes_string(ymin = 'lq', ymax = 'uq', alpha=0.1)) + geom_line(aes_string(y = 'value')) + geom_point(aes_string(y = 'value')) if (!all(is.na(stats_ref$se))) # add reference model results if they exist pp <- pp + geom_hline(aes_string(yintercept = 'value'), data = stats_ref, color = 'darkred', linetype=2) if (baseline != 'ref') # add the baseline result (if different from the reference model) pp <- pp + geom_hline(aes_string(yintercept = 'value'), data = stats_bs, color = 'black', linetype=3) pp <- pp + scale_x_continuous(breaks = breaks, minor_breaks = minor_breaks, limits = c(min(breaks), max(breaks))) + labs(x = 'Number of variables in the submodel', y = ylab) + theme(legend.position = 'none') + facet_grid(statistic ~ ., scales = 'free_y') pp } #' @rdname varsel-statistics #' @export varsel_stats <- function(object, nv_max = NULL, stats = 'elpd', type = c('mean','se'), deltas = F, alpha=0.32, baseline=NULL, ...) 
{ .validate_vsel_object_stats(object, stats) baseline <- .validate_baseline(object$refmode, baseline, deltas) # fetch statistics if (deltas) { nfeat_baseline <- .get_nfeat_baseline(object, baseline, stats[1]) tab <- .tabulate_stats(object, stats, alpha=alpha, nfeat_baseline=nfeat_baseline) } else { tab <- .tabulate_stats(object, stats, alpha=alpha) } stats_table <- subset(tab, tab$size != Inf) # these are the corresponding names for mean, se, upper and lower in the stats_table, and their suffices # in the table to be returned qty <- unname(sapply(type, function(t) switch(t, mean='value', upper='uq', lower='lq', se='se'))) suffix <- unname(sapply(type, function(t) switch(t, mean='', upper='.upper', lower='.lower', se='.se'))) # loop through all the required statistics arr <- data.frame(size = unique(stats_table$size), vind = c(NA, object$vind)) for (i in seq_along(stats)) { temp <- subset(stats_table, stats_table$statistic == stats[i], qty) newnames <- sapply(suffix, function(s) paste0(stats[i],s)) colnames(temp) <- newnames arr <- cbind(arr, temp) } if(is.null(nv_max)) nv_max <- max(stats_table$size) if('pctch' %in% names(object)) arr$pctch <- c(NA, diag(object$pctch[,-1])) subset(arr, arr$size <= nv_max) } #' Print methods for vsel/cvsel objects #' #' The \code{print} methods for vsel/cvsel objects created by \code{\link{varsel}} #' or \code{\link{cv_varsel}}) rely on \code{\link{varsel_stats}} to display #' a summary of the results of the projection predictive variable selection. #' #' @name print-vsel #' #' @param x An object of class vsel/cvsel. #' @param digits Number of decimal places to be reported (2 by default). #' @param ... Further arguments passed to \code{\link{varsel_stats}}. #' #' @return Returns invisibly the data frame produced by \code{\link{varsel_stats}}. #' #' @export #' @method print vsel print.vsel <- function(x, digits=2, ...) { stats <- round(varsel_stats(x, ...), digits) print(stats[, -match("vind", colnames(stats))]) invisible(stats) } #' @rdname print-vsel #' @export #' @method print cvsel print.cvsel <- function(x, digits=2, ...) { stats <- round(varsel_stats(x, ...), digits) print(stats[, -match("vind", colnames(stats))]) invisible(stats) } #' Suggest model size #' #' This function can be used for suggesting an appropriate model size #' based on a certain default rule. Notice that the decision rules are heuristic #' and should be interpreted as guidelines. It is recommended that the user #' studies the results via \code{varsel_plot} and/or \code{varsel_stats} #' and makes the final decision based on what is most appropriate for the given #' problem. #' #' @param object The object returned by \link[=varsel]{varsel} or #' \link[=cv_varsel]{cv_varsel}. #' @param stat Statistic used for the decision. Default is 'elpd'. See \code{varsel_stats} for #' other possible choices. #' @param alpha A number indicating the desired coverage of the credible #' intervals based on which the decision is made. E.g. \code{alpha=0.32} corresponds to #' 68\% probability mass within the intervals (one standard error intervals). #' See details for more information. #' @param pct Number indicating the relative proportion between baseline model and null model #' utilities one is willing to sacrifice. See details for more information. #' @param type Either 'upper' (default) or 'lower' determining whether the decisions are #' based on the upper or lower credible bounds. See details for more information. 
#' @param baseline Either 'ref' or 'best' indicating whether the baseline is the reference model or #' the best submodel found. Default is 'ref' when the reference model exists, and 'best' otherwise. #' @param warnings Whether to give warnings if automatic suggestion fails, mainly for internal use. #' Default is TRUE, and usually no reason to set to FALSE. #' @param ... Currently ignored. #' #' @details The suggested model size is the smallest model for which #' either the lower or upper (depending on argument \code{type}) credible bound #' of the submodel utility \eqn{u_k} with significance level \code{alpha} falls above #' \deqn{u_base - pct*(u_base - u_0)} #' Here \eqn{u_base} denotes the utility for the baseline model and \eqn{u_0} the null model utility. #' The baseline is either the reference model or the best submodel found (see argument \code{baseline}). #' The lower and upper bounds are defined to contain the submodel utility with #' probability 1-alpha (each tail has mass alpha/2). #' #' By default \code{ratio=0}, \code{alpha=0.32} and \code{type='upper'} which means that we select the smallest #' model for which the upper tail exceeds the baseline model level, that is, which is better than the baseline #' model with probability 0.16 (and consequently, worse with probability 0.84). In other words, #' the estimated difference between the baseline model and submodel utilities is at most one standard error #' away from zero, so the two utilities are considered to be close. #' #' NOTE: Loss statistics like RMSE and MSE are converted to utilities by multiplying them by -1, so call #' such as \code{suggest_size(object, stat='rmse', type='upper')} should be interpreted as finding #' the smallest model whose upper credible bound of the \emph{negative} RMSE exceeds the cutoff level #' (or equivalently has the lower credible bound of RMSE below the cutoff level). This is done to make #' the interpretation of the argument \code{type} the same regardless of argument \code{stat}. #' #' @examples #' \donttest{ #' ### Usage with stanreg objects #' fit <- stan_glm(y~x, binomial()) #' vs <- cv_varsel(fit) #' suggest_size(vs) #' #' } #' #' @export suggest_size <- function(object, stat = 'elpd', alpha = 0.32, pct = 0.0, type='upper', baseline=NULL, warnings=TRUE, ...) { .validate_vsel_object_stats(object, stat) if (length(stat) > 1) stop('Only one statistic can be specified to suggest_size') if (.is_util(stat)) { sgn <- 1 } else { sgn <- -1 if (type == 'upper') type <- 'lower' else type <- 'upper' } bound <- paste0(stat,'.',type) stats <- varsel_stats(object, stats=stat, alpha=alpha, type=c('mean','upper','lower'), baseline=baseline, deltas = T) util_null <- sgn*unlist(unname(subset(stats, stats$size==0, stat))) util_cutoff <- pct*util_null res <- subset(stats, sgn*stats[,bound] >= util_cutoff, 'size') if(nrow(res) == 0) { # no submodel satisfying the criterion found if (object$nv_max == object$nv_all) ssize <- object$nv_max else { ssize <- NA if (warnings) warning(paste('Could not suggest model size. Investigate varsel_plot to identify', 'if the search was terminated too early. If this is the case,', 'run variable selection with larger value for nv_max.')) } } else { ssize <- min(res) } ssize } #' @method as.matrix projection #' @export as.matrix.projection <- function(x, ...) 
{
  if (x$p_type) {
    warning(paste('Note that the projection was performed using clustering and',
                  'the clusters might have different weights.'))
  }
  res <- t(x$beta)
  if (ncol(res) > 0) colnames(res) <- names(x$vind)
  if (x$intercept) res <- cbind('(Intercept)' = x$alpha, res)
  if (x$family_kl$family == 'gaussian') res <- cbind(res, sigma = x$dis)
  res
}

#' Create cross-validation indices
#'
#' Divide indices from 1 to \code{n} into subsets for \code{k}-fold cross validation.
#' These functions are potentially useful when creating the \code{cvfits} and \code{cvfun}
#' arguments for \link[=init_refmodel]{init_refmodel}. The returned value is different for
#' these two functions, see below for details.
#'
#' @name cv-indices
#'
#' @param n Number of data points.
#' @param k Number of folds. Must be at least 2 and not exceed \code{n}.
#' @param out Format of the output, either 'foldwise' (default) or 'indices'. See below for details.
#' @param seed Random seed so that the same division can be obtained again if needed.
#'
#' @return \code{cvfolds} returns a vector of length \code{n} such that each element is an integer
#' between 1 and \code{k} denoting which fold the corresponding data point belongs to.
#' The returned value of \code{cvind} depends on the \code{out}-argument. If \code{out}='foldwise',
#' the returned value is a list with \code{k} elements,
#' each having fields \code{tr} and \code{ts} which give the training and test indices, respectively,
#' for the corresponding fold. If \code{out}='indices', the returned value is a list with fields
#' \code{tr} and \code{ts}, each of which is a list with \code{k} elements giving the training and
#' test indices for each fold.
#' @examples
#' \donttest{
#' ### compute sample means within each fold
#' n <- 100
#' y <- rnorm(n)
#' cv <- cvind(n, k=5)
#' cvmeans <- lapply(cv, function(fold) mean(y[fold$tr]))
#' }
#'
NULL

#' @rdname cv-indices
#' @export
cvfolds <- function(n, k, seed=NULL) {
  .validate_num_folds(k, n)

  # set random seed but ensure the old RNG state is restored on exit
  if (exists('.Random.seed')) {
    rng_state_old <- .Random.seed
    on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv))
  }
  set.seed(seed)

  # create and shuffle the indices
  folds <- rep_len(1:k, length.out = n)
  folds <- sample(folds, n, replace=FALSE)

  return(folds)
}

#' @rdname cv-indices
#' @export
cvind <- function(n, k, out=c('foldwise', 'indices'), seed=NULL) {
  .validate_num_folds(k, n)
  out <- match.arg(out)

  # set random seed but ensure the old RNG state is restored on exit
  if (exists('.Random.seed')) {
    rng_state_old <- .Random.seed
    on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv))
  }
  set.seed(seed)

  # shuffle the indices
  ind <- sample(1:n, n, replace=FALSE)

  if (out == 'foldwise') {
    cv <- lapply(1:k, function(i) {
      ts <- sort(ind[seq(i,n,k)]) # test set
      tr <- setdiff(1:n, ts) # training set
      list(tr=tr, ts=ts)
    })
  } else if (out == 'indices') {
    cv <- list()
    cv$tr <- list()
    cv$ts <- list()
    for (i in 1:k) {
      ts <- sort(ind[seq(i,n,k)]) # test set
      tr <- setdiff(1:n, ts) # training set
      cv$tr[[i]] <- tr
      cv$ts[[i]] <- ts
    }
  }
  return(cv)
}
projpred/R/cv_varsel.R0000644000176200001440000004520713614333254014423 0ustar liggesusers#' Cross-validate the variable selection (varsel)
#'
#' Perform cross-validation for the projective variable selection for a generalized
#' linear model.
#' @param fit Same as in \link[=varsel]{varsel}.
#' @param method Same as in \link[=varsel]{varsel}.
#' @param ns Number of samples used for selection.
#'   Ignored if nc is provided or if method='L1'.
#' @param nc Number of clusters used for selection. Default is 1 and ignored if method='L1'
#' (L1-search always uses one cluster).
#' @param nspred Number of samples used for prediction (after selection). Ignored if ncpred is given.
#' @param ncpred Number of clusters used for prediction (after selection). Default is 5.
#' @param relax Same as in \link[=varsel]{varsel}.
#' @param nv_max Same as in \link[=varsel]{varsel}.
#' @param intercept Same as in \link[=varsel]{varsel}.
#' @param penalty Same as in \link[=varsel]{varsel}.
#' @param verbose Whether to print out some information during the validation. Default is TRUE.
#' @param cv_method The cross-validation method, either 'LOO' or 'kfold'. Default is 'LOO'.
#' @param nloo Number of observations used to compute the LOO validation (anything between 1 and the
#' total number of observations). Smaller values lead to faster computation but higher uncertainty
#' (larger error bars) in the accuracy estimation. Default is to use all observations, but for
#' faster experimentation, one can set this to a small value such as 100.
#' Only applicable if \code{cv_method = 'LOO'}.
#' @param K Number of folds in the k-fold cross validation. Default is 5 for genuine
#' reference models and 10 for datafits (that is, for penalized maximum likelihood estimation).
#' @param lambda_min_ratio Same as in \link[=varsel]{varsel}.
#' @param nlambda Same as in \link[=varsel]{varsel}.
#' @param thresh Same as in \link[=varsel]{varsel}.
#' @param regul Amount of regularization in the projection. Usually there is no need for
#' regularization, but sometimes the projection can be ill-behaved for some models and we
#' need to add some regularization to avoid numerical problems.
#' @param validate_search Whether to also cross-validate the selection process, that is, whether to
#' perform the selection separately for each fold. Default is TRUE and we strongly recommend not
#' setting this to FALSE, because this is known to bias the accuracy estimates for the selected
#' submodels. However, setting this to FALSE can sometimes be useful because comparing the results
#' to the case where this parameter is TRUE gives an idea of how strongly the feature selection is
#' (over)fitted to the data (the difference corresponds to the search degrees of freedom or the
#' effective number of parameters introduced by the selection process).
#' @param seed Random seed used in the subsampling LOO. By default uses a fixed seed.
#' @param ... Additional arguments to be passed to the \code{get_refmodel}-function.
#'
#' @return An object of type \code{cvsel} that contains information about the feature selection.
#' The fields are not meant to be accessed directly by the user but instead via the helper
#' functions (see the vignettes or type ?projpred to see the main functions in the package).
#'
#' @examples
#' \donttest{
#' ### Usage with stanreg objects
#' fit <- stan_glm(y~x, binomial())
#' cvs <- cv_varsel(fit)
#' varsel_plot(cvs)
#' }
#'
#' @export
cv_varsel <- function(fit, method = NULL, cv_method = NULL, ns = NULL, nc = NULL,
                      nspred = NULL, ncpred = NULL, relax = NULL, nv_max = NULL,
                      intercept = NULL, penalty = NULL, verbose = T, nloo = NULL,
                      K = NULL, lambda_min_ratio = 1e-5, nlambda = 150, thresh = 1e-6,
                      regul = 1e-4, validate_search = T, seed = NULL, ...) {
  refmodel <- get_refmodel(fit, ...)
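  # The overall flow of this function: (1) resolve the arguments, (2) run the
  # cross-validated selection (LOO or k-fold) to obtain the fold-wise search paths
  # and test summaries, (3) run the selection once more using all the data, and
  # (4) compute for each submodel size how often the folds selected the same
  # variables (pctch) before assembling the cvsel object.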
  # resolve the arguments similarly to varsel
  args <- parseargs_varsel(refmodel, method, relax, intercept, nv_max, nc, ns, ncpred, nspred)
  method <- args$method
  relax <- args$relax
  intercept <- args$intercept
  nv_max <- args$nv_max
  nc <- args$nc
  ns <- args$ns
  ncpred <- args$ncpred
  nspred <- args$nspred

  # arguments specific to this function
  args <- parseargs_cv_varsel(refmodel, cv_method, K)
  cv_method <- args$cv_method
  K <- args$K

  # search options
  opt <- list(lambda_min_ratio=lambda_min_ratio, nlambda=nlambda, thresh=thresh, regul=regul)

  if (tolower(cv_method) == 'kfold') {
    # TODO: should we save the cvfits object to the reference model so that it need not be computed again
    # if the user wants to compute the search again?
    sel_cv <- kfold_varsel(refmodel, method, nv_max, ns, nc, nspred, ncpred, relax,
                           intercept, penalty, verbose, opt, K, seed=seed)
  } else if (tolower(cv_method) == 'loo') {
    if (!(is.null(K)))
      warning('K provided, but cv_method is LOO.')
    sel_cv <- loo_varsel(refmodel, method, nv_max, ns, nc, nspred, ncpred, relax,
                         intercept, penalty, verbose, opt, nloo = nloo,
                         validate_search = validate_search, seed = seed)
  } else {
    stop(sprintf('Unknown cross-validation method: %s.', cv_method))
  }

  # run the selection using the full dataset
  if (verbose)
    cat('Performing variable selection using all data...\n')
  sel <- varsel(refmodel, method=method, ns=ns, nc=nc, nspred=nspred, ncpred=ncpred,
                relax=relax, nv_max=nv_max, intercept=intercept, penalty=penalty,
                verbose=verbose, lambda_min_ratio=lambda_min_ratio, nlambda=nlambda, regul=regul)

  # find out how many of the cross-validated iterations select
  # the same variables as the selection with all the data.
  ch <- as.matrix(unname(as.data.frame(sel_cv$vind_cv)))
  w <- sel_cv$summaries$sub[[1]]$w # these weights might be non-constant in case of subsampling LOO
  if (is.null(w)) w <- rep(1/ncol(ch), ncol(ch)) # if weights are not set, then all validation folds have equal weight
  pctch <- t(sapply(seq_along(sel$vind), function(size) {
    c(size = size, sapply(sel$vind, function(var) {
      sum(t(ch[1:size, ] == var) * w, na.rm = T)
    }))
  }))
  colnames(pctch)[-1] <- names(sel$vind)

  # create the object to be returned
  vs <- list()
  vs$refmodel <- refmodel
  vs$spath <- sel$spath
  vs$method <- method
  vs$cv_method <- cv_method
  vs <- c(vs, c(sel_cv[c('d_test', 'summaries')],
                sel[c('family_kl', 'vind', 'kl')],
                list(pctch = pctch)))
  class(vs) <- 'cvsel'
  vs$nv_max <- nv_max
  vs$nv_all <- ncol(refmodel$x)
  vs$ssize <- suggest_size(vs, warnings = F)

  vs
}

parseargs_cv_varsel <- function(refmodel, cv_method, K) {
  #
  # Auxiliary function for parsing the input arguments specific to cv_varsel.
  # This is similar in spirit to parseargs_varsel; the purpose is to keep the main
  # function from becoming too long and complicated to maintain.
  #
  if (is.null(cv_method)) {
    if ('datafit' %in% class(refmodel))
      # only data given, no actual reference model
      cv_method <- 'kfold'
    else
      cv_method <- 'LOO'
  }

  if (cv_method == 'kfold') {
    if (is.null(K))
      K <- if ('datafit' %in% class(refmodel)) 10 else 5
    else
      .validate_num_folds(K, refmodel$nobs)
  }

  list(cv_method=cv_method, K=K)
}

kfold_varsel <- function(refmodel, method, nv_max, ns, nc, nspred, ncpred, relax,
                         intercept, penalty, verbose, opt, K, seed=NULL) {

  # fetch the k_fold list (or compute it now if not already computed)
  k_fold <- .get_kfold(refmodel, K, verbose, seed)

  # check that k_fold has the correct form
  # .validate_kfold(refmodel, k_fold, refmodel$nobs)

  K <- length(k_fold)
  family_kl <- refmodel$fam

  # extract variables from each fit-object (samples, x, y, etc.)
  # to a list of size K
  refmodels_cv <- lapply(k_fold, function(fold) fold$refmodel)

  # List of size K with test data for each fold
  d_test_cv <- lapply(k_fold, function(fold) {
    list(z = refmodel$z[fold$omitted,,drop=F],
         x = refmodel$x[fold$omitted,,drop=F],
         y = refmodel$y[fold$omitted],
         weights = refmodel$wobs[fold$omitted],
         offset = refmodel$offset[fold$omitted])
  })

  # List of K elements, each containing d_train, p_pred, etc. corresponding
  # to each fold.
  msgs <- paste0(method, ' search for fold ', 1:K, '/', K, '.')
  list_cv <- mapply(function(refmod, d_test, msg) {
    d_train <- .get_traindata(refmod)
    p_sel <- .get_refdist(refmod, ns, nc)
    p_pred <- .get_refdist(refmod, nspred, ncpred)
    mu_test <- refmod$predfun(d_test$z, d_test$offset)
    list(d_train = d_train, d_test = d_test, p_sel = p_sel,
         p_pred = p_pred, mu_test = mu_test, dis = refmod$dis, w_test = refmod$wsample, msg = msg)
  }, refmodels_cv, d_test_cv, msgs, SIMPLIFY = F)

  # Perform the selection for each of the K folds
  if (verbose) {
    cat('Performing selection for each fold...\n')
    pb <- utils::txtProgressBar(min = 0, max = K, style = 3, initial=0)
  }
  spath_cv <- lapply(seq_along(list_cv), function(fold_index) {
    fold <- list_cv[[fold_index]]
    out <- select(method, fold$p_sel, fold$d_train, family_kl, intercept, nv_max, penalty, verbose, opt)
    if (verbose)
      utils::setTxtProgressBar(pb, fold_index)
    out
  })
  vind_cv <- lapply(spath_cv, function(e) e$vind)
  if (verbose)
    close(pb)

  # Construct submodel projections for each fold
  as.search <- !relax && !is.null(spath_cv[[1]]$beta) && !is.null(spath_cv[[1]]$alpha)
  if (verbose && !as.search) {
    cat('Computing projections...\n')
    pb <- utils::txtProgressBar(min = 0, max = K, style = 3, initial=0)
  }
  p_sub_cv <- mapply(function(spath, fold_index) {
    fold <- list_cv[[fold_index]]
    vind <- spath$vind
    p_sub <- .get_submodels(spath, c(0, seq_along(vind)), family_kl, fold$p_pred,
                            fold$d_train, intercept, opt$regul, as.search=as.search)
    if (verbose && !as.search)
      utils::setTxtProgressBar(pb, fold_index)
    return(p_sub)
  }, spath_cv, seq_along(list_cv), SIMPLIFY = F)
  if (verbose && !as.search)
    close(pb)

  # Helper function to extract and combine mu and lppd from K lists, each with
  # n/K of the elements, into one list with n elements
  hf <- function(x) as.list(do.call(rbind, x))

  # Apply some magic to manipulate the structure of the list so that instead of
  # a list with K sub_summaries each containing n/K mu:s and lppd:s, we have only
  # one sub_summary-list that contains all n mu:s and lppd:s.
  sub <- apply(
    mapply(function(p_sub, fold) {
      lapply(.get_sub_summaries(p_sub, fold$d_test, family_kl), data.frame)
    }, p_sub_cv, list_cv),
    1, hf)

  ref <- hf(lapply(list_cv, function(fold) {
    data.frame(.weighted_summary_means(fold$d_test, family_kl, fold$w_test, fold$mu_test, fold$dis))
  }))

  # Combine also the K separate test data sets into one list
  # with n y's and weights.
  d_cv <- hf(lapply(d_test_cv, function(d) {
    data.frame(d[c('y', 'weights')])
  }))

  list(vind_cv = vind_cv, summaries = list(sub = sub, ref = ref),
       d_test = c(d_cv, type = 'kfold'))
}

.get_kfold <- function(refmodel, K, verbose, seed) {
  # Fetch the k_fold list or compute it now if not already computed. This function will
  # return a list of length K, where each element is a list with fields 'refmodel' (object
  # of type refmodel computed by init_refmodel) and index list 'omitted' that denotes which
  # of the data points were left out for the corresponding fold.
  if (is.null(refmodel$cvfits)) {
    if (!is.null(refmodel$cvfun)) {
      # cv-function provided so perform the cross-validation now.
In case refmodel # is datafit, cvfun will return an empty list and this will lead to normal cross-validation # for the submodels although we don't have an actual reference model if (verbose && !('datafit' %in% class(refmodel))) cat('Performing cross-validation for the reference model...\n') folds <- cvfolds(refmodel$nobs, k=K, seed=seed) cvfits <- refmodel$cvfun(folds) cvfits <- lapply(seq_along(cvfits), function(k) { # add the 'omitted' indices for the cvfits cvfit <- cvfits[[k]] cvfit$omitted <- which(folds==k) cvfit }) } else # genuine probabilistic model but no k-fold fits nor cvfun provided, so raise an error stop('For a generic reference model, you must provide either cvfits or cvfun for k-fold cross-validation. See function init_refmodel.') } else cvfits <- refmodel$cvfits # transform the cvfits-list to k_fold list, that is, initialize the reference models for each # fold given the prediction function and dispersion draws from the cvfits-list k_fold <- lapply(cvfits, function(cvfit) { ref <- init_refmodel(z=refmodel$z[-cvfit$omitted,,drop=F], y=refmodel$y[-cvfit$omitted], x=refmodel$x[-cvfit$omitted,,drop=F], family=refmodel$fam, predfun=cvfit$predfun, dis=cvfit$dis, offset=refmodel$offset[-cvfit$omitted], wobs=refmodel$wobs[-cvfit$omitted], intercept=refmodel$intercept) list(refmodel=ref, omitted=cvfit$omitted) }) return(k_fold) } loo_varsel <- function(refmodel, method, nv_max, ns, nc, nspred, ncpred, relax, intercept, penalty, verbose, opt, nloo = NULL, validate_search = T, seed = NULL) { # # Performs the validation of the searching process using LOO. # validate_search indicates whether the selection is performed separately for each # fold (for each data point) # fam <- refmodel$fam mu <- refmodel$mu dis <- refmodel$dis n <- nrow(mu) # by default use all observations nloo <- ifelse(is.null(nloo), n, min(nloo, n)) if (nloo < 1) stop('Value of \'nloo\' must be at least 1') # training data d_train <- .get_traindata(refmodel) # the clustering/subsampling used for selection p_sel <- .get_refdist(refmodel, ns=ns, nc=nc) cl_sel <- p_sel$cl # clustering information # the clustering/subsampling used for prediction p_pred <- .get_refdist(refmodel, ns=nspred, nc=ncpred) cl_pred <- p_pred$cl # fetch the log-likelihood for the reference model to obtain the LOO weights if (is.null(refmodel$loglik)) # case where log-likelihood not available, i.e., the reference model is not a genuine model # => cannot compute LOO stop('LOO can be performed only if the reference model is a genuine probabilistic model for which the log-likelihood can be evaluated.') else # log-likelihood available loglik <- refmodel$loglik psisloo <- loo::psis(-loglik, cores = 1, r_eff = rep(1,ncol(loglik))) # TODO: should take r_eff:s into account lw <- weights(psisloo) pareto_k <- loo::pareto_k_values(psisloo) # compute loo summaries for the reference model d_test <- d_train loo_ref <- apply(loglik+lw, 2, 'log_sum_exp') mu_ref <- rep(0,n) for (i in 1:n) mu_ref[i] <- mu[i,] %*% exp(lw[,i]) # decide which points form the validation set based on the k-values validset <- .loo_subsample(n, nloo, pareto_k, seed) inds <- validset$inds # initialize matrices where to store the results vind_mat <- matrix(nrow=n, ncol=nv_max) loo_sub <- matrix(nrow=n, ncol=nv_max+1) mu_sub <- matrix(nrow=n, ncol=nv_max+1) if (verbose) { cat('Computing LOOs...\n') pb <- utils::txtProgressBar(min = 0, max = nloo, style = 3, initial=0) } if (!validate_search) { # perform selection only once using all the data (not separately for each fold), # and perform the 
    # projection then for each submodel size
    # vind <- select(method, p_sel, d_train, fam, intercept, nv_max, penalty, verbose=F, opt)
    spath <- select(method, p_sel, d_train, fam, intercept, nv_max, penalty, verbose=F, opt)
    vind <- spath$vind
  }

  for (run_index in seq_along(inds)) {
    # observation index
    i <- inds[run_index]

    # reweight the clusters/samples according to the IS-LOO weights
    p_sel <- .get_p_clust(fam, mu, dis, wobs=refmodel$wobs, wsample=exp(lw[,i]), cl=cl_sel)
    p_pred <- .get_p_clust(fam, mu, dis, wobs=refmodel$wobs, wsample=exp(lw[,i]), cl=cl_pred)

    if (validate_search) {
      # perform selection with the reweighted clusters/samples
      # vind <- select(method, p_sel, d_train, fam, intercept, nv_max, penalty, verbose=F, opt)
      spath <- select(method, p_sel, d_train, fam, intercept, nv_max, penalty, verbose=F, opt)
      vind <- spath$vind
    }

    # project onto the selected models and compute the prediction accuracy for the left-out point
    as.search <- !relax && !is.null(spath$beta) && !is.null(spath$alpha)
    submodels <- .get_submodels(spath, 0:nv_max, fam, p_pred, d_train, intercept,
                                opt$regul, as.search=as.search)
    d_test <- list(x=matrix(refmodel$x[i,],nrow=1), y=refmodel$y[i],
                   offset=d_train$offset[i], weights=d_train$weights[i])
    summaries_sub <- .get_sub_summaries(submodels, d_test, fam)
    for (k in seq_along(summaries_sub)) {
      loo_sub[i,k] <- summaries_sub[[k]]$lppd
      mu_sub[i,k] <- summaries_sub[[k]]$mu
    }

    vind_mat[i,] <- vind

    if (verbose) {
      utils::setTxtProgressBar(pb, run_index)
    }
  }

  if (verbose)
    # close the progress bar object
    close(pb)

  # put all the results together in the form required by cv_varsel
  summ_sub <- lapply(0:nv_max, function(k){
    list(lppd=loo_sub[,k+1], mu=mu_sub[,k+1], w=validset$w)
  })
  summ_ref <- list(lppd=loo_ref, mu=mu_ref)
  summaries <- list(sub=summ_sub, ref=summ_ref)

  vind_cv <- lapply(1:n, function(i){ vind_mat[i,] })

  d_test <- list(y=d_train$y, weights=d_train$weights, type='loo')

  return(list(vind_cv=vind_cv, summaries=summaries, d_test=d_test))
}

.loo_subsample <- function(n, nloo, pareto_k, seed) {
  # decide which points to go through in the validation (i.e., which points
  # belong to the semi-random subsample of validation points)

  # set random seed but ensure the old RNG state is restored on exit
  if (exists('.Random.seed')) {
    rng_state_old <- .Random.seed
    on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv))
  }
  set.seed(seed)

  resample <- function(x, ...) x[sample.int(length(x), ...)]

  if (nloo < n) {

    bad <- which(pareto_k > 0.7)
    ok <- which(pareto_k <= 0.7 & pareto_k > 0.5)
    good <- which(pareto_k <= 0.5)
    inds <- resample(bad, min(length(bad), floor(nloo/3)))
    inds <- c(inds, resample(ok, min(length(ok), floor(nloo/3))))
    inds <- c(inds, resample(good, min(length(good), floor(nloo/3))))
    if (length(inds) < nloo) {
      # not enough points selected, so choose randomly among the rest
      inds <- c(inds, resample(setdiff(1:n, inds), nloo-length(inds)))
    }

    # assign the weights corresponding to this stratification (for example, the
    # 'bad' values are likely to be overrepresented in the sample)
    w <- rep(0,n)
    w[inds[inds %in% bad]] <- length(bad) / sum(inds %in% bad)
    w[inds[inds %in% ok]] <- length(ok) / sum(inds %in% ok)
    w[inds[inds %in% good]] <- length(good) / sum(inds %in% good)

  } else {

    # all points used
    inds <- c(1:n)
    w <- rep(1,n)
  }

  # ensure weights are normalized
  w <- w/sum(w)

  return(list(inds=inds, w=w))
}
projpred/R/kl_helpers.R0000644000176200001440000001727313361364246014575 0ustar liggesusers# Model-specific helper functions.
#
# \code{kl_helpers(fam)} returns a family object augmented with auxiliary functions that
# are needed for computing KL divergence, log predictive density, projecting dispersion etc.
#
# Missing: Quasi-families not implemented. If dis_gamma is the correct shape
# parameter for projected Gamma regression, everything should be OK for gamma.

kl_helpers <- function(fam) {

  # define the functions for all families but
  # return only the ones that are needed.

  if (.has.fam.extras(fam))
    # if the object was already created using this function, then return
    return(fam)

  # kl-divergences
  # for binomial and poisson it is the mean of the dev.resids divided by 2
  # NOTE: we should get rid of these, they are not much help..
  kl_dev <- function(pref, data, psub) {
    if(NCOL(pref$mu)>1) {
      w <- rep(data$weights, NCOL(pref$mu))
      colMeans(fam$dev.resids(pref$mu, psub$mu, w))/2
    } else {
      mean(fam$dev.resids(pref$mu, psub$mu, data$weights))/2
    }
  }
  kl_gauss <- function(pref, data, psub) colSums(data$weights*(psub$mu-pref$mu)^2) # not the actual kl but a reasonable surrogate..
  kl_student_t <- function(pref, data, psub) log(psub$dis) #- 0.5*log(pref$var) # FIX THIS, NOT CORRECT
  kl_gamma <- function(pref, data, psub) {
    stop('KL-divergence for gamma not implemented yet.')
    # mean(data$weights*(
    #   p_sub$dis*(log(pref$dis)-log(p_sub$dis)+log(psub$mu)-log(pref$mu)) +
    #     digamma(pref$dis)*(pref$dis - p_sub$dis) - lgamma(pref$dis) +
    #     lgamma(p_sub$dis) + pref$mu*p_sub$dis/p_sub$mu - pref$dis))
  }

  # dispersion parameters in draw-by-draw or clustered projection.
  # for gaussian and student-t dispersion is the noise scale, and for gamma it is the shape parameter.
  # in both cases pref is a list with fields mu and var giving the mean and predictive
  # variance for each draw/cluster (columns) and each observation (rows).
  # psub is a list containing mu (analogous to pref$mu) and w, which give the weights
  # of the pseudo-observations at optimal coefficients (needed for student-t projection).
  # wobs denotes the observation weights.
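  # A minimal shape sketch (hypothetical sizes, not run): with n = 100 observations
  # and s = 10 clusters, pref$mu, pref$var and psub$mu are all 100-by-10 matrices,
  # and each dis_* function returns one projected dispersion value per cluster:
  #   dis_gauss(pref, psub, wobs = rep(1, 100))  # numeric vector of length 10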
dis_na <- function(pref, psub, wobs) rep(0, ncol(pref$mu)) dis_gauss <- function(pref, psub, wobs) { sqrt(colSums(wobs/sum(wobs)*(pref$var + (pref$mu-psub$mu)^2))) } dis_student_t <- function(pref, psub, wobs) { s2 <- colSums( psub$w/sum(wobs)*(pref$var+(pref$mu-psub$mu)^2) ) # CHECK THIS sqrt(s2) # stop('Projection of dispersion not yet implemented for student-t') } dis_gamma <- function(pref, psub, wobs) { # TODO, IMPLEMENT THIS stop('Projection of dispersion parameter not yet implemented for family Gamma.') #mean(data$weights*((pref$mu - p_sub$mu)/ # fam$mu.eta(fam$linkfun(p_sub$mu))^2)) } # functions for computing the predictive variance (taking into account # the uncertainty in mu) predvar_na <- function(mu, dis, wsample=1) { 0 } predvar_gauss <- function(mu, dis, wsample=1) { wsample <- wsample/sum(wsample) mu_mean <- mu %*% wsample mu_var <- mu^2 %*% wsample - mu_mean^2 as.vector( sum(wsample*dis^2) + mu_var ) } predvar_student_t <- function(mu, dis, wsample=1) { wsample <- wsample/sum(wsample) mu_mean <- mu %*% wsample mu_var <- mu^2 %*% wsample - mu_mean^2 as.vector( fam$nu/(fam$nu-2)*sum(wsample*dis^2) + mu_var ) } predvar_gamma <- function(mu, dis, wsample) { stop('Family Gamma not implemented yet.')} # log likelihoods ll_binom <- function(mu, dis, y, weights=1) dbinom(y*weights, weights, mu, log=T) ll_poiss <- function(mu, dis, y, weights=1) weights*dpois(y, mu, log=T) ll_gauss <- function(mu, dis, y, weights=1) { dis <- matrix(rep(dis, each=length(y)), ncol=NCOL(mu)) weights*dnorm(y, mu, dis, log=T) } ll_student_t <- function(mu, dis, y, weights=1) { dis <- matrix(rep(dis, each=length(y)), ncol=NCOL(mu)) if (NCOL(y) < NCOL(mu)) y <- matrix(y, nrow=length(y), ncol=NCOL(mu)) weights*(dt((y-mu)/dis, fam$nu, log=T) - log(dis)) } ll_gamma <- function(mu, dis, y, weights=1) { dis <- matrix(rep(dis, each=length(y)), ncol=NCOL(mu)) weights*dgamma(y, dis, dis/matrix(mu), log=T) } # loss functions for projection. these are defined to be -2*log-likelihood, ignoring any additional constants. # we need to define these separately from the log-likelihoods because some of the log-likelihoods # or deviance functions do not work when given the fit of the reference model (float) in place of y (integer), # for instance binomial and poisson models. 
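  # For example, for the binomial family with weights = 1 and y in {0, 1},
  # dev_binom(mu, y) equals -2 * ll_binom(mu, NULL, y) exactly (the binomial
  # normalizing constant vanishes), but dev_binom also accepts non-integer y
  # in (0, 1), such as the fitted values of the reference model.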
  dev_binom <- function(mu, y, weights=1, dis=NULL) {
    if (NCOL(y) < NCOL(mu))
      y <- matrix(y, nrow=length(y), ncol=NCOL(mu))
    -2*weights*(y*log(mu) + (1-y)*log(1-mu))
  }
  dev_poiss <- function(mu, y, weights=1, dis=NULL) {
    if (NCOL(y) < NCOL(mu))
      y <- matrix(y, nrow=length(y), ncol=NCOL(mu))
    -2*weights*(y*log(mu) - mu)
  }
  dev_gauss <- function(mu, y, weights=1, dis=NULL) {
    if (is.null(dis))
      dis <- 1
    else
      dis <- matrix(rep(dis, each=length(y)), ncol=NCOL(mu))
    if (NCOL(y) < NCOL(mu))
      y <- matrix(y, nrow=length(y), ncol=NCOL(mu))
    -2*weights*(-0.5/dis*(y-mu)^2 - log(dis))
  }
  dev_student_t <- function(mu, y, weights=1, dis=NULL) {
    if (is.null(dis))
      dis <- 1
    else
      dis <- matrix(rep(dis, each=length(y)), ncol=NCOL(mu))
    if (NCOL(y) < NCOL(mu))
      y <- matrix(y, nrow=length(y), ncol=NCOL(mu))
    -2*weights*(-0.5*(fam$nu+1)*log(1 + 1/fam$nu*((y-mu)/dis)^2) - log(dis))
  }
  dev_gamma <- function(mu, y, weights=1, dis=NULL) {
    # dis <- matrix(rep(dis, each=length(y)), ncol=NCOL(mu))
    # weights*dgamma(y, dis, dis/matrix(mu), log=T)
    stop('Loss function not implemented for Gamma-family yet.')
  }

  # functions to sample from posterior predictive distribution
  ppd_gauss <- function(mu, dis, weights = 1) rnorm(length(mu), mu, dis)
  ppd_binom <- function(mu, dis, weights = 1) rbinom(length(mu), weights, mu)
  ppd_poiss <- function(mu, dis, weights = 1) rpois(length(mu), mu)
  ppd_student_t <- function(mu, dis, weights = 1) rt(length(mu), fam$nu)*dis + mu
  ppd_gamma <- function(mu, dis, weights = 1) rgamma(length(mu), dis, dis/mu)

  # function for computing mu = E(y)
  mu_fun <- function(x, alpha, beta, offset) {
    if (!is.matrix(x))
      stop('x must be a matrix.')
    if (!is.matrix(beta))
      stop('beta must be a matrix')
    fam$linkinv(cbind(1, x) %*% rbind(alpha, beta) + offset)
  }

  # return the family object with the correct function handles
  c(switch(fam$family,
           'binomial' = list(kl = kl_dev, ll_fun = ll_binom, deviance = dev_binom,
                             dis_fun = dis_na, predvar = predvar_na, ppd_fun = ppd_binom),
           'poisson' = list(kl = kl_dev, ll_fun = ll_poiss, deviance = dev_poiss,
                            dis_fun = dis_na, predvar = predvar_na, ppd_fun = ppd_poiss),
           'gaussian' = list(kl = kl_gauss, ll_fun = ll_gauss, deviance = dev_gauss,
                             dis_fun = dis_gauss, predvar = predvar_gauss, ppd_fun = ppd_gauss),
           'Gamma' = list(kl = kl_gamma, ll_fun = ll_gamma, deviance = dev_gamma,
                          dis_fun = dis_gamma, predvar = predvar_gamma, ppd_fun = ppd_gamma),
           'Student_t' = list(kl = kl_student_t, ll_fun = ll_student_t, deviance = dev_student_t,
                              dis_fun = dis_student_t, predvar = predvar_student_t, ppd_fun = ppd_student_t)
  ),
  list(mu_fun = mu_fun),
  fam)
}

.has.dispersion <- function(fam) {
  # a function for checking whether the family has a dispersion parameter
  fam$family %in% c('gaussian','Student_t','Gamma')
}

.has.fam.extras <- function(fam) {
  # check whether the family object has the extra functions, that is, whether it was
  # created by kl_helpers
  !is.null(fam$deviance)
}
projpred/R/varsel.R0000644000176200001440000002752413513014333013725 0ustar liggesusers#' Variable selection for generalized linear models
#'
#' Perform the projection predictive variable selection for generalized linear models using
#' generic reference models.
#'
#' @param object Either a \code{refmodel}-type object created by \link[=get_refmodel]{get_refmodel}
#' or \link[=init_refmodel]{init_refmodel}, or an object which can be converted to a reference model
#' using \link[=get_refmodel]{get_refmodel}.
#' @param d_test A test dataset, which is used to evaluate model performance.
#' If not provided, training data is used. Currently this argument is for internal use only.
#' @param method The method used in the variable selection. Possible options are
#' \code{'L1'} for L1-search and \code{'forward'} for forward selection.
#' Default is 'forward' if the number of variables in the full data is at most 20, and
#' \code{'L1'} otherwise.
#' @param relax If TRUE, then the projected coefficients after L1-selection are computed
#' without any penalization (or using only the regularization determined by \code{regul}). If FALSE, then
#' the coefficients are the solution from the L1-penalized projection. This option is relevant only
#' if \code{method}='L1'. Default is TRUE for genuine reference models and FALSE if \code{object} is
#' datafit (see \link[=init_refmodel]{init_refmodel}).
#' @param ns Number of posterior draws used in the variable selection.
#' Cannot be larger than the number of draws in the reference model.
#' Ignored if nc is set.
#' @param nc Number of clusters to use in the clustered projection.
#' Overrides the \code{ns} argument. Defaults to 1.
#' @param nspred Number of samples used for prediction (after selection). Ignored if ncpred is given.
#' @param ncpred Number of clusters used for prediction (after selection). Default is 5.
#' @param nv_max Maximum number of variables up to which the selection is continued.
#' Defaults to min(20, D, floor(0.4*n)) where n is the number of observations and
#' D the number of variables.
#' @param intercept Whether to use intercept in the submodels. Defaults to TRUE.
#' @param penalty Vector determining the relative penalties or costs for the variables.
#' Zero means that those variables have no cost and will therefore be selected first,
#' whereas Inf means that those variables will never be selected. Currently works only
#' if method == 'L1'. By default 1 for each variable.
#' @param verbose If TRUE, may print out some information during the selection.
#' Defaults to FALSE.
#' @param lambda_min_ratio Ratio between the smallest and largest lambda in the L1-penalized search.
#' This parameter essentially determines how long the search is carried out, i.e., how large submodels
#' are explored. No need to change the default value unless the program gives a warning about this.
#' @param nlambda Number of values in the lambda grid for L1-penalized search. No need to change unless
#' the program gives a warning about this.
#' @param thresh Convergence threshold when computing L1-path. Usually no need to change this.
#' @param regul Amount of regularization in the projection. Usually there is no need for
#' regularization, but sometimes for some models the projection can be ill-behaved and we
#' need to add some regularization to avoid numerical problems.
#' @param ... Additional arguments to be passed to the \code{get_refmodel}-function.
#'
#'
#' @return An object of type \code{vsel} that contains information about the feature selection. The fields are not
#' meant to be accessed directly by the user but instead via the helper functions (see the vignettes or type ?projpred
#' to see the main functions in the package).
#'
#' @examples
#' \donttest{
#' ### Usage with stanreg objects
#' fit <- stan_glm(y~x, binomial())
#' vs <- varsel(fit)
#' varsel_plot(vs)
#' }
#'
#' @export
varsel <- function(object, d_test = NULL, method = NULL, ns = NULL, nc = NULL,
                   nspred = NULL, ncpred = NULL, relax=NULL, nv_max = NULL,
                   intercept = NULL, penalty=NULL, verbose = F,
                   lambda_min_ratio=1e-5, nlambda=150, thresh=1e-6, regul=1e-4, ...) {

  refmodel <- get_refmodel(object, ...)
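  # Overall flow (a sketch of the steps below): resolve the arguments, build the
  # reference distributions (p_sel for the search, p_pred for evaluating the
  # submodels), run the search, project onto each submodel size, and summarize
  # the predictive performance on the test (or training) data.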
  family_kl <- refmodel$fam

  # fetch the default arguments or replace them by the user defined values
  args <- parseargs_varsel(refmodel, method, relax, intercept, nv_max, nc, ns, ncpred, nspred)
  method <- args$method
  relax <- args$relax
  intercept <- args$intercept
  nv_max <- args$nv_max
  nc <- args$nc
  ns <- args$ns
  ncpred <- args$ncpred
  nspred <- args$nspred

  # training and test data
  d_train <- .get_traindata(refmodel)
  if (is.null(d_test)) {
    d_test <- d_train
    d_type <- 'train'
  } else {
    d_test <- .check_data(d_test)
    d_type <- 'test'
  }

  # reference distributions for selection and prediction after selection
  p_sel <- .get_refdist(refmodel, ns, nc)
  p_pred <- .get_refdist(refmodel, nspred, ncpred)

  # perform the selection
  opt <- list(lambda_min_ratio=lambda_min_ratio, nlambda=nlambda, thresh=thresh, regul=regul)
  searchpath <- select(method, p_sel, d_train, family_kl, intercept, nv_max, penalty, verbose, opt)
  vind <- searchpath$vind

  # statistics for the selected submodels
  as.search <- !relax && !is.null(searchpath$beta) && !is.null(searchpath$alpha)
  p_sub <- .get_submodels(searchpath, c(0, seq_along(vind)), family_kl, p_pred,
                          d_train, intercept, regul, as.search=as.search)
  sub <- .get_sub_summaries(p_sub, d_test, family_kl)

  # predictive statistics of the reference model on test data. if no test data are provided,
  # simply fetch the statistics on the train data
  if ('datafit' %in% class(refmodel)) {
    # no actual reference model, so we don't know how to predict test observations
    ntest <- nrow(d_test$z)
    ref <- list(mu=rep(NA,ntest), lppd=rep(NA,ntest))
  } else {
    if (d_type == 'train') {
      ref <- .weighted_summary_means(d_test, family_kl, refmodel$wsample, refmodel$mu, refmodel$dis)
    } else {
      mu_test <- refmodel$predfun(d_test$z, d_test$offset)
      ref <- .weighted_summary_means(d_test, family_kl, refmodel$wsample, mu_test, refmodel$dis)
    }
  }

  # store the relevant fields into the object to be returned
  vs <- list(refmodel=refmodel,
             spath=searchpath,
             d_test = c(d_test[c('y','weights')], type = d_type),
             summaries = list(sub = sub, ref = ref),
             family_kl = family_kl,
             method = method,
             vind = setNames(vind, refmodel$coefnames[vind]),
             kl = sapply(p_sub, function(x) x$kl))
  class(vs) <- 'vsel'

  # suggest model size
  vs$nv_max <- nv_max
  vs$nv_all <- ncol(refmodel$x)
  vs$ssize <- suggest_size(vs, warnings = F)

  vs
}

select <- function(method, p_sel, d_train, family_kl, intercept, nv_max, penalty, verbose, opt) {
  #
  # Auxiliary function, performs variable selection with the given method,
  # and returns the searchpath, i.e., a list with the following entries
  # (beta and alpha are returned only if a one-cluster projection is used for selection):
  #   vind: the variable ordering
  #   beta: coefficients along the search path
  #   alpha: intercepts along the search path
  #   p_sel: the reference distribution used in the selection (the input argument p_sel)
  #
  if (tolower(method) == 'l1') {
    searchpath <- search_L1(p_sel, d_train, family_kl, intercept, nv_max, penalty, opt)
    searchpath$p_sel <- p_sel
    return(searchpath)
  } else if (tolower(method) == 'forward') {
    if (NCOL(p_sel$mu) == 1) {
      # only one mu column (one cluster or one sample), so use the optimized version of the forward search
      searchpath <- search_forward1(p_sel, d_train, family_kl, intercept, nv_max, verbose, opt)
      searchpath$p_sel <- p_sel
      return(searchpath)
    } else {
      # routine that can be used with several clusters
      tryCatch(vind <- search_forward(p_sel, d_train, family_kl, intercept,
                                      nv_max, verbose, opt),
               'error' = .varsel_errors)
      searchpath <- list(vind=vind, p_sel=p_sel)
      return(searchpath)
    }
  }
}
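# A minimal usage sketch for the search results (assumes a fitted rstanarm model
# 'fit'; not run):
#
#   vs <- varsel(fit, method = 'forward', nv_max = 10)
#   vs$vind           # variable ordering found by the search
#   vs$kl             # KL divergence from the reference model for each size
#   suggest_size(vs)  # heuristic suggestion for a suitable submodel size
#   varsel_plot(vs)   # predictive performance as a function of submodel size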
parseargs_varsel <- function(refmodel, method, relax, intercept, nv_max, nc, ns, ncpred, nspred) { # # Auxiliary function for parsing the input arguments for varsel. The arguments # specified by the user (or the function calling this function) are treated as they are, but if # some are not given, then this function fills them in with the default values. The purpose of this # function is to avoid repeating the same code both in varsel and cv_varsel. # if (is.null(method)) { if (dim(refmodel$x)[2] <= 20) method <- 'forward' else method <- 'L1' } else if (!tolower(method) %in% c('forward', 'l1')) { stop(sprintf('Unknown search method: %s.', method)) } if (is.null(relax)) { if ('datafit' %in% class(refmodel)) relax <- F else relax <- T } if ((is.null(ns) && is.null(nc)) || tolower(method)=='l1') # use one cluster for selection by default, and always with L1-search nc <- 1 if (is.null(nspred) && is.null(ncpred)) # use 5 clusters for prediction by default ncpred <- min(ncol(refmodel$mu), 5) if(is.null(intercept)) intercept <- refmodel$intercept if(is.null(nv_max) || nv_max > NCOL(refmodel$x)) { nv_max_default <- floor(0.4*length(refmodel$y)) # a somewhat sensible default limit for nv_max nv_max <- min(NCOL(refmodel$x), nv_max_default, 20) } list(method=method, relax=relax, intercept=intercept, nv_max=nv_max, nc=nc, ns=ns, ncpred=ncpred, nspred=nspred) } # parse_varsel_args <- function(n, d, method = NULL, cv_method = NULL, # ns = NULL, nc = NULL, nspred = NULL, ncpred = NULL, relax = NULL, # nv_max = NULL, intercept = NULL, penalty = NULL, verbose = NULL, # nloo = NULL, K = NULL, k_fold = NULL, lambda_min_ratio = NULL, # nlambda = NULL, regul = NULL, validate_search = NULL, seed = NULL, ...) { # # Auxiliary function for figuring out the parameters for varsel and cv_varsel. The arguments # specified by the user (or the function calling this function) are treated as they are, but if # some are not given, then this function fills them in with the default values (by default, use # same values for both varsel and cv_varsel). The purpose of this function is to avoid repeating # the same (longish) code both in varsel and cv_varsel. 
#
#   if (is.null(seed))
#     seed <- 134654
#
#   if (is.null(method)) {
#     if (dim(vars$x)[2] <= 20)
#       method <- 'forward'
#     else
#       method <- 'L1'
#   }
#
#   if (is.null(relax)) {
#     if ('datafit' %in% class(refmodel))
#       relax <- F
#     else
#       relax <- T
#   }
#
#   if (is.null(cv_method)) {
#     if ('datafit' %in% class(refmodel))
#       # only data given, no actual reference model
#       cv_method <- 'kfold'
#     else
#       cv_method <- 'LOO'
#   }
#   if (cv_method == 'kfold' && is.null(K)) {
#     if ('datafit' %in% class(refmodel))
#       K <- 10
#     else
#       K <- 5
#   }
#
#   if ((is.null(ns) && is.null(nc)) || tolower(method)=='l1')
#     # use one cluster for selection by default, and always with L1-search
#     nc <- 1
#   if (is.null(nspred) && is.null(ncpred))
#     # use 5 clusters for prediction by default
#     ncpred <- min(ncol(vars$mu), 5)
#
#   if (is.null(intercept))
#     intercept <- vars$intercept
#   if (is.null(nv_max) || nv_max > NCOL(vars$x)) {
#     nv_max_default <- floor(0.4*length(vars$y)) # a somewhat sensible default limit for nv_max
#     nv_max <- min(NCOL(vars$x), nv_max_default, 20)
#   }
#
#   args <- list(method=method, cv_method=cv_method, ns=ns, nc=nc, nspred=nspred, ncpred=ncpred,
#                relax=relax, nv_max=nv_max, intercept=intercept, penalty=penalty, verbose=verbose,
#                nloo=nloo, K=K, k_fold=k_fold, lambda_min_ratio=lambda_min_ratio, nlambda=nlambda,
#                regul=regul, validate_search=validate_search, seed=seed)
#
# }
projpred/R/families.R0000644000176200001440000000333613345277366014231 0ustar liggesusers# #' Extra family objects.
# #'
# #' Family objects not in the set of default \link[=family]{family}-objects.
# #'
# #' @name extra-families
# #'
# #' @param link Specification of the link function, as for the default \link[=family]{family}-objects.
# #' @param nu Degrees of freedom for the Student-t distribution.
# #' @param ... Further arguments, currently ignored.
# #'
# #' @return A family object analogous to those described in \link[=family]{family}
# #'
# NULL
# TODO: uncomment all these documentation lines when Student-t projection ready.
# Currently disabled because we do not want these to appear before this functionality is ready.

# define a student-t family object. Dispersion is defined to be the scale parameter
# of the distribution
# #' @rdname extra-families
# #' @export
Student_t <- function(link='identity', nu=3) {

  if (!is.character(link))
    stop('Link must be a string.')
  if (!(link %in% c('identity','log','inverse')))
    stop(paste0('Non-supported link: ', link))

  # fetch the link statistics
  stats <- make.link(link)

  # variance function
  varfun <- function(mu) {
    if (nu > 2)
      rep(nu/(nu-2), length(mu))
    else
      rep(Inf, length(mu))
  }

  # create the object and append the relevant fields
  fam <- list(
    family = 'Student_t',
    nu = nu,
    link = link,
    linkfun = stats$linkfun,
    linkinv = stats$linkinv,
    variance = varfun,
    dev.resids = function(y, mu, wt, dis=1) wt*(nu+1) * log(1 + 1/nu*((y-mu)/dis)^2),
    aic = function(y, n, mu, wt, dev) stop('aic not implemented for Student-t.'),
    mu.eta = stats$mu.eta,
    initialize = expression({ stop('initialization for Student-t not implemented.') }),
    validmu = function(mu) TRUE,
    valideta = stats$valideta
  )
  structure(fam, class = 'family')
}projpred/R/data.R0000644000176200001440000000271713363326134013347 0ustar liggesusers#' Binomial toy example.
#'
#' @format A simulated classification dataset containing 100 observations.
#' \describe{
#'  \item{y}{target, 0 or 1.}
#'  \item{x}{features, 30 in total.}
#' }
#' @source \url{http://web.stanford.edu/~hastie/glmnet/glmnetData/BNExample.RData}
"df_binom"

#' Gaussian toy example.
#'
#' @format A simulated regression dataset containing 100 observations.
#' \describe{
#'  \item{y}{target, real-valued.}
#'  \item{x}{features, 20 in total. Mean and sd approximately 0 and 1.}
#' }
#' @source \url{http://web.stanford.edu/~hastie/glmnet/glmnetData/QSExample.RData}
"df_gaussian"

#' Mesquite data set.
#'
#' The mesquite bushes yields data set from Gelman
#' and Hill (2007) (\url{http://www.stat.columbia.edu/~gelman/arm/}).
#'
#' @format The outcome variable is the total weight (in grams) of photosynthetic
#' material as derived from actual harvesting of the bush. The predictor
#' variables are:
#' \describe{
#'  \item{diam1}{diameter of the canopy (the leafy area of the bush)
#'  in meters, measured along the longer axis of the bush.}
#'  \item{diam2}{canopy diameter measured along the shorter axis}
#'  \item{canopy height}{height of the canopy.}
#'  \item{total height}{total height of the bush.}
#'  \item{density}{plant unit density (# of primary stems per plant unit).}
#'  \item{group}{group of measurements (0 for the first group, 1 for the second group)}
#' }
#'
#' @source \url{http://www.stat.columbia.edu/~gelman/arm/examples/}
"mesquite"
projpred/R/RcppExports.R0000644000176200001440000000204413611400313014710 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

glm_elnet_c <- function(x, pseudo_obs, lambda, alpha, intercept, penalty, thresh, qa_updates_max, pmax, pmax_strict, beta, beta0, w0, as_updates_max = 50L) {
    .Call('_projpred_glm_elnet_c', PACKAGE = 'projpred', x, pseudo_obs, lambda, alpha, intercept, penalty, thresh, qa_updates_max, pmax, pmax_strict, beta, beta0, w0, as_updates_max)
}

glm_ridge_c <- function(x, pseudo_obs, lambda, intercept, penalty, beta_init, w_init, thresh, qa_updates_max, ls_iter_max = 100L, debug = FALSE) {
    .Call('_projpred_glm_ridge_c', PACKAGE = 'projpred', x, pseudo_obs, lambda, intercept, penalty, beta_init, w_init, thresh, qa_updates_max, ls_iter_max, debug)
}

glm_forward_c <- function(x, pseudo_obs, lambda, intercept, penalty, thresh, qa_updates_max, pmax, w0, ls_iter_max = 50L) {
    .Call('_projpred_glm_forward_c', PACKAGE = 'projpred', x, pseudo_obs, lambda, intercept, penalty, thresh, qa_updates_max, pmax, w0, ls_iter_max)
}
projpred/R/projpred-package.R0000644000176200001440000000403713513014333015641 0ustar liggesusers#' Projection predictive feature selection
#'
#' @docType package
#' @name projpred
#'
#' @useDynLib projpred
#' @importFrom Rcpp sourceCpp
#'
#' @import stats
#' @import ggplot2
#' @importFrom loo psis
#'
#'
#' @description
#' \pkg{projpred} is an R package to perform projection predictive variable selection for generalized
#' linear models. The package is designed to be compatible with \pkg{rstanarm}, but other
#' reference models can also be used (see function \code{\link{init_refmodel}}).
#'
#' Currently, the supported models (family objects in R) include Gaussian,
#' Binomial and Poisson families, but more will be implemented later. See the
#' \href{https://mc-stan.org/projpred/articles/quickstart.html}{quickstart-vignette}
#' for examples.
#'
#'
#' @section Functions:
#'
#' \describe{
#'  \item{\link{varsel}, \link{cv_varsel}, \link{init_refmodel}, \link{suggest_size}}{
#'  Perform and cross-validate the variable selection. \link{init_refmodel} can be used to initialize
#'  a reference model other than an \pkg{rstanarm} fit.}
#'  \item{\link{project}}{
#'  Get the projected posteriors of the reduced models.}
#'  \item{\link{proj_predict}, \link{proj_linpred}}{
#'  Make predictions with a reduced number of features.}
#'  \item{\link{varsel_plot}, \link{varsel_stats}}{
#'  Visualize and get some key statistics about the variable selection.}
#' }
#'
#'
#'
#'
#' @section References:
#'
#' Dupuis, J. A. and Robert, C. P. (2003). Variable selection in qualitative models via
#' an entropic explanatory power. \emph{Journal of Statistical Planning and Inference}, 111(1-2):77–94.
#'
#' Goutis, C. and Robert, C. P. (1998). Model choice in generalised linear models: a Bayesian
#' approach via Kullback–Leibler projections. \emph{Biometrika}, 85(1):29–37.
#'
#' Juho Piironen and Aki Vehtari (2017). Comparison of Bayesian predictive methods for model selection.
#' \emph{Statistics and Computing}, 27(3):711-735. doi:10.1007/s11222-016-9649-y.
#' (\href{https://link.springer.com/article/10.1007/s11222-016-9649-y}{Online}).
NULL
projpred/R/project.R0000644000176200001440000001321613531311224014073 0ustar liggesusers#' Projection to submodels
#'
#' Perform projection onto submodels of selected sizes or a specified feature
#' combination.
#'
#' @param object Either a \code{refmodel}-type object created by \link[=get_refmodel]{get_refmodel}
#' or \link[=init_refmodel]{init_refmodel}, or an object which can be converted to a reference model
#' using \link[=get_refmodel]{get_refmodel}.
#' @param nv Number of variables in the submodel (the variable combination is taken from the
#' \code{varsel} information). If a list, then the projection is performed for each model size.
#' Default is the model size suggested by the variable selection (see function \code{suggest_size}).
#' Ignored if \code{vind} is specified.
#' @param vind Variable indices onto which the projection is done. If specified, \code{nv} is ignored.
#' @param relax If TRUE, then the projected coefficients after L1-selection are computed
#' without any penalization (or using only the regularization determined by \code{regul}). If FALSE, then
#' the coefficients are the solution from the L1-penalized projection. This option is relevant only
#' if L1-search was used. Default is TRUE for genuine reference models and FALSE if \code{object} is
#' datafit (see \link[=init_refmodel]{init_refmodel}).
#' @param ns Number of samples to be projected. Ignored if \code{nc} is specified. Default is 400.
#' @param nc Number of clusters in the clustered projection.
#' @param intercept Whether to use intercept. Default is \code{TRUE}.
#' @param seed A seed used in the clustering (if \code{nc!=ns}). Can be used
#' to ensure same results every time.
#' @param regul Amount of regularization in the projection. Usually there is no need for
#' regularization, but sometimes for some models the projection can be ill-behaved and we
#' need to add some regularization to avoid numerical problems.
#' @param ... Currently ignored.
#'
#' @return A list of submodels (or a single submodel if projection was performed onto
#' a single variable combination), each of which contains the following elements:
#' \describe{
#'  \item{\code{kl}}{The KL divergence from the reference model to the submodel.}
#'  \item{\code{weights}}{Weights for each draw of the projected model.}
#'  \item{\code{dis}}{Draws from the projected dispersion parameter.}
#'  \item{\code{alpha}}{Draws from the projected intercept.}
#'  \item{\code{beta}}{Draws from the projected weight vector.}
#'  \item{\code{vind}}{The order in which the variables were added to the submodel.}
#'  \item{\code{intercept}}{Whether or not the model contains an intercept.}
#'  \item{\code{family_kl}}{A modified \code{\link{family}}-object.}
#' }
#'
#'
#' @examples
#' \donttest{
#' ### Usage with stanreg objects
#' fit <- stan_glm(y~x, binomial())
#' vs <- varsel(fit)
#'
#' # project onto the best model with 4 variables
#' proj4 <- project(vs, nv = 4)
#'
#' # project onto an arbitrary variable combination (variable indices 3, 4 and 8)
#' proj <- project(fit, vind=c(3,4,8))
#' }
#'
#' @export
project <- function(object, nv = NULL, vind = NULL, relax = NULL, ns = NULL, nc = NULL,
                    intercept = NULL, seed = NULL, regul=1e-4, ...) {

  if (!inherits(object, c('vsel', 'cvsel')) && is.null(vind))
    stop(paste('The object is not a variable selection object.',
               'Run variable selection first, or provide the variable indices (vind).'))

  refmodel <- get_refmodel(object)

  if (is.null(relax))
    # use the non-relaxed solution for datafits by default
    relax <- !('datafit' %in% class(refmodel))

  if (is.null(object$spath$beta) ||
      (!is.null(vind) && any(object$spath$vind[1:length(vind)] != vind)))
    # search path not found, or the given variable combination not in the search path
    relax <- TRUE

  if (!is.null(vind)) {
    if (max(vind) > ncol(refmodel$x))
      stop('vind contains an index larger than ', ncol(refmodel$x), '.')
    nv <- length(vind) # if vind is given, nv is ignored (project only onto the given submodel)
  } else {
    vind <- object$vind # by default take the variable ordering from the selection
  }

  if (is.null(ns) && is.null(nc))
    ns <- min(400, NCOL(refmodel$mu)) # by default project at most 400 draws

  if (is.null(nv)) {
    if (!is.null(object$ssize) && !is.na(object$ssize))
      nv <- object$ssize # by default, project onto the suggested model size
    else
      stop('No suggested model size found, please specify nv or vind')
  } else {
    if (!is.numeric(nv) || any(nv < 0))
      stop('nv must contain non-negative values.')
    if (max(nv) > length(vind)) {
      stop(paste('Cannot perform the projection with', max(nv), 'variables,',
                 'because variable selection was run only up to', length(vind), 'variables.'))
    }
  }

  if (is.null(intercept))
    intercept <- refmodel$intercept

  family_kl <- refmodel$fam

  # training data
  d_train <- .get_traindata(refmodel)

  # get the clustering or subsample
  p_ref <- .get_refdist(refmodel, ns = ns, nc = nc, seed = seed)

  # project onto the submodels
  if (relax) {
    subm <- .get_submodels(list(vind=vind), nv, family_kl, p_ref, d_train,
                           intercept, regul, as.search=F)
  } else {
    subm <- .get_submodels(object$spath, nv, family_kl, p_ref, d_train,
                           intercept, regul, as.search=T)
  }

  # add family_kl
  proj <- lapply(subm, function(model) {
    names(model$vind) <- sapply(model$vind, function(i, ch) names(ch)[which(ch == i)], object$vind)
    model <- c(model, list(family_kl = family_kl), list(p_type = is.null(ns)))
    class(model) <- 'projection'
    return(model)
  })

  # If only one model size, just return the proj instead of a list of projs
  .unlist_proj(proj)
}
projpred/R/misc.R0000644000176200001440000003104013614333254013360 0ustar liggesusers.onAttach <- function(...) {
  ver <- utils::packageVersion("projpred")
  msg <- paste0('This is projpred version ', ver, '.')
  packageStartupMessage(msg)
}

weighted.sd <- function(x, w, na.rm=F) {
  if (na.rm) {
    ind <- !is.na(w) & !is.na(x)
    n <- sum(ind)
  } else {
    n <- length(x)
    ind <- rep(T,n)
  }
  w <- w/sum(w[ind])
  m <- sum(x[ind]*w[ind])
  sqrt(n/(n-1)*sum(w[ind]*(x[ind] - m)^2))
}

weighted.cov <- function(x, y, w, na.rm=F) {
  if (na.rm) {
    ind <- !is.na(w) & !is.na(x) & !is.na(y)
    n <- sum(ind)
  } else {
    n <- length(x)
    ind <- rep(T,n)
  }
  w <- w/sum(w[ind])
  mx <- sum(x[ind]*w[ind])
  my <- sum(y[ind]*w[ind])
  n/(n-1)*sum(w[ind]*(x[ind] - mx)*(y[ind] - my))
}

log_weighted_mean_exp <- function(x, w) {
  x <- x + log(w)
  max_x <- max(x)
  max_x + log(sum(exp(x - max_x)))
}

log_sum_exp <- function(x) {
  max_x <- max(x)
  max_x + log(sum(exp(x - max_x)))
}

auc <- function(x) {
  resp <- x[, 1]
  pred <- x[, 2]
  weights <- x[, 3]
  n <- nrow(x)
  ord <- order(pred, decreasing=TRUE)
  resp <- resp[ord]
  pred <- pred[ord]
  weights <- weights[ord]
  w0 <- w1 <- weights
  w0[resp == 1] <- 0 # true negative weights
  w1[resp == 0] <- 0 # true positive weights
  cum_w0 <- cumsum(w0)
  cum_w1 <- cumsum(w1)

  ## ignore tied predicted probabilities, keeping only the rightmost one
  rightmost.prob <- c(diff(pred) != 0, TRUE)
  fpr <- c(0, cum_w0[rightmost.prob]) / cum_w0[n]
  tpr <- c(0, cum_w1[rightmost.prob]) / cum_w1[n]
  delta_fpr <- c(diff(fpr), 0)
  delta_tpr <- c(diff(tpr), 0)

  ## sum the area of the rectangles that fall completely below the ROC curve
  ## plus half the area of the rectangles that are cut in two by the curve
  return(sum(delta_fpr * tpr) + sum(delta_fpr * delta_tpr) / 2)
}

bootstrap <- function(x, fun=mean, b=1000, oobfun=NULL, seed=NULL, ...) {
  #
  # bootstrap an arbitrary quantity fun that takes the sample x
  # as the first input. Other parameters to fun can be passed in as ...
  # Example: bootstrap(x, mean)
  #

  # set random seed but ensure the old RNG state is restored on exit
  if (exists('.Random.seed')) {
    rng_state_old <- .Random.seed
    on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv))
  }
  set.seed(seed)

  seq_x <- seq.int(NROW(x))
  is_vector <- NCOL(x) == 1
  bsstat <- rep(NA, b)
  oobstat <- rep(NA, b)
  for (i in 1:b) {
    bsind <- sample(seq_x, replace=T)
    bsstat[i] <- fun(if (is_vector) x[bsind] else x[bsind, ], ...)
    if (!is.null(oobfun)) {
      oobind <- setdiff(seq_x, unique(bsind))
      oobstat[i] <- oobfun(if (is_vector) x[oobind] else x[oobind, ], ...)
    }
  }
  if (!is.null(oobfun)) {
    return(list(bs=bsstat, oob=oobstat))
  } else
    return(bsstat)
}

.bbweights <- function(N, B) {
  # generate Bayesian bootstrap weights, N = original sample size,
  # B = number of bootstrap samples
  bbw <- matrix(rgamma(N*B, 1), ncol = N)
  bbw <- bbw/rowSums(bbw)
  return(bbw)
}

# from rstanarm
`%ORifNULL%` <- function(a, b) if (is.null(a)) b else a

.is.wholenumber <- function(x) abs(x - round(x)) < .Machine$double.eps^0.5

.validate_num_folds <- function(k, n) {
  if (!is.numeric(k) || length(k) != 1 || !.is.wholenumber(k))
    stop('Number of folds must be a single integer value.')
  if (k < 2)
    stop('Number of folds must be at least 2.')
  if (k > n)
    stop('Number of folds cannot exceed n.')
}

.validate_vsel_object_stats <- function(object, stats) {
  if (!inherits(object, c('vsel', 'cvsel')))
    stop('The object is not a variable selection object.
Run variable selection first') recognized_stats <- c('elpd', 'mlpd','mse', 'rmse', 'acc', 'pctcorr', 'auc') binomial_only_stats <- c('acc', 'pctcorr', 'auc') family <- object$family_kl$family if (is.null(stats)) stop('Statistic specified as NULL.') for (stat in stats) { if (!(stat %in% recognized_stats)) stop(sprintf('Statistic \'%s\' not recognized.', stat)) if (stat %in% binomial_only_stats && family != 'binomial') stop('Statistic \'', stat, '\' available only for the binomial family.') } } .validate_baseline <- function(refmodel, baseline, deltas) { if (is.null(baseline)) { if (inherits(refmodel, 'datafit')) baseline <- 'best' else baseline <- 'ref' } else { if (!(baseline %in% c('ref', 'best'))) stop('Argument \'baseline\' must be either \'ref\' or \'best\'.') if (baseline == 'ref' && deltas == TRUE && inherits(refmodel, 'datafit')) { # no reference model (or the results missing for some other reason), # so cannot compute differences between the reference model and submodels stop('Cannot use deltas = TRUE and baseline = \'ref\' when there is no reference model.') } } return(baseline) } .get_standard_y <- function(y, weights, fam) { # return y and the corresponding observation weights into the 'standard' form: # for binomial family, y is transformed into a vector with values between 0 and 1, # and weights give the number of observations at each x. # for all other families, y and weights are kept as they are (unless weights is # a vector with length zero in which case it is replaced by a vector of ones). if(NCOL(y) == 1) { # weights <- if(length(weights) > 0) unname(weights) else rep(1, length(y)) if(length(weights) > 0) weights <- unname(weights) else weights <- rep(1, length(y)) if (fam$family == 'binomial') { if (is.factor(y)) { if (nlevels(y) > 2) stop('y cannot contain more than two classes if specified as factor.') y <- as.vector(y, mode='integer') - 1 # zero-one vector } else { if (any(y < 0 | y > 1)) stop("y values must be 0 <= y <= 1 for the binomial model.") } } else { if (is.factor(y)) stop('y cannot be a factor for models other than the binomial model.') } } else if (NCOL(y) == 2) { weights <- rowSums(y) y <- y[, 1] / weights } else { stop('y cannot have more than two columns.') } return(list(y=y,weights=weights)) } .get_refdist <- function(refmodel, ns=NULL, nc=NULL, seed=NULL) { # # Creates the reference distribution based on the refmodel-object, and the # desired number of clusters (nc) or number of subsamples (ns). If nc is specified, # then clustering is used and ns is ignored. Returns a list with fields: # # mu: n-by-s matrix, vector of expected values for y for each draw/cluster. here s # means either the number of draws ns or clusters nc used, depending on which one is used. 
  #   var: n-by-s matrix, predictive variances for y for each draw/cluster,
  #     which are needed for projecting the dispersion parameter (note that this can be
  #     unintuitively zero for those families that do not have dispersion)
  #   weights: s-element vector of weights for the draws/clusters
  #   cl: cluster assignment for each posterior draw, that is, a vector that has length equal to the
  #     number of posterior draws and each value is an integer between 1 and s
  #
  if (is.null(seed))
    seed <- 17249420

  # set random seed but ensure the old RNG state is restored on exit
  if (exists('.Random.seed')) {
    rng_state_old <- .Random.seed
    on.exit(assign(".Random.seed", rng_state_old, envir = .GlobalEnv))
  }
  set.seed(seed)

  fam <- refmodel$fam
  S <- NCOL(refmodel$mu) # number of draws in the reference model

  if (!is.null(nc)) {
    # use clustering (ignore ns argument)
    if (nc == 1) {
      # special case, only one cluster
      cl <- rep(1, S)
      p_ref <- .get_p_clust(fam, refmodel$mu, refmodel$dis, wobs=refmodel$wobs, cl=cl)
    } else if (nc == NCOL(refmodel$mu)) {
      # number of clusters equal to the number of samples, so return the samples
      return(.get_refdist(refmodel, ns=nc))
    } else {
      # several clusters
      if (nc > NCOL(refmodel$mu))
        stop('The number of clusters nc cannot exceed the number of columns in mu.')
      p_ref <- .get_p_clust(fam, refmodel$mu, refmodel$dis, wobs=refmodel$wobs, nc=nc)
    }
  } else if (!is.null(ns)) {
    # subsample from the reference model
    # would it be safer to actually randomly draw the subsample?
    if (ns > NCOL(refmodel$mu))
      stop('The number of subsamples ns cannot exceed the number of columns in mu.')
    s_ind <- round(seq(1, S, length.out = ns))
    cl <- rep(NA, S)
    cl[s_ind] <- c(1:ns)
    predvar <- sapply(s_ind, function(j) { fam$predvar(refmodel$mu[,j,drop=F], refmodel$dis[j]) })
    p_ref <- list(mu = refmodel$mu[, s_ind, drop=F], var = predvar, dis = refmodel$dis[s_ind],
                  weights = rep(1/ns, ns), cl=cl)
  } else {
    # use all the draws from the reference model
    predvar <- sapply(1:S, function(j) { fam$predvar(refmodel$mu[,j,drop=F], refmodel$dis[j]) })
    p_ref <- list(mu = refmodel$mu, var = predvar, dis = refmodel$dis,
                  weights = refmodel$wsample, cl=c(1:S))
  }

  return(p_ref)
}

.get_p_clust <- function(family_kl, mu, dis, nc=10, wobs=rep(1,dim(mu)[1]), wsample=rep(1,dim(mu)[2]), cl = NULL) {
  # Function for performing the clustering over the samples.
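  # For intuition (a sketch, not run): with S = 4000 posterior draws and nc = 10,
  # kmeans() on the link-scale fits assigns each draw to one of 10 clusters; the
  # returned p$mu is then an n-by-10 matrix of weighted cluster centers and
  # p$weights a length-10 vector of cluster weights summing to 1.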
  #
  # cluster the samples in the latent space if no clustering provided
  if (is.null(cl)) {
    f <- family_kl$linkfun(mu)
    out <- kmeans(t(f), nc, iter.max = 50)
    cl <- out$cluster # cluster indices for each sample
  } else if (typeof(cl)=='list') {
    # old clustering solution provided, so fetch the cluster indices
    if (is.null(cl$cluster))
      stop('argument cl must be a vector of cluster indices or a clustering object returned by k-means.')
    cl <- cl$cluster
  }

  # (re)compute the cluster centers, because they may be different from the ones
  # returned by kmeans if the samples have differing weights
  nc <- max(cl, na.rm=T) # number of clusters (assumes labeling 1,...,nc)
  centers <- matrix(0, nrow=nc, ncol=dim(mu)[1])
  wcluster <- rep(0,nc) # cluster weights
  eps <- 1e-10
  for (j in 1:nc) {
    # compute normalized weights within the cluster, 1-eps is for numerical stability
    ind <- which(cl==j)
    ws <- wsample[ind]/sum(wsample[ind])*(1-eps)

    # cluster centers and their weights
    centers[j,] <- mu[,ind,drop=F] %*% ws
    wcluster[j] <- sum(wsample[ind]) # unnormalized weight for the jth cluster
  }
  wcluster <- wcluster/sum(wcluster)

  # predictive variances
  predvar <- sapply(1:nc, function(j) {
    # compute normalized weights within the cluster, 1-eps is for numerical stability
    ind <- which(cl == j)
    ws <- wsample[ind]/sum(wsample[ind])*(1-eps)
    family_kl$predvar( mu[,ind,drop=F], dis[ind], ws )
  })

  # combine the results
  p <- list(mu = unname(t(centers)),
            var = predvar,
            weights = wcluster,
            cl = cl)
  return(p)
}

.get_traindata <- function(refmodel) {
  #
  # Returns the training data fetched from the reference model object.
  return(list(z = refmodel$z, x = refmodel$x, y = refmodel$y,
              weights = refmodel$wobs, offset = refmodel$offset))
}

.check_data <- function(data) {
  #
  # Check that the data object has the correct form for internal use. The object must
  # be a list with fields 'z', 'x', 'y', 'weights' and 'offset'.
  # Raises an error if z, x or y is missing, but fills weights and offset with default
  # values if missing.
  #
  if (is.null(data$z)) stop('The data object must be a list with field z giving the reference model inputs.')
  if (is.null(data$x)) stop('The data object must be a list with field x giving the feature values.')
  if (is.null(data$y)) stop('The data object must be a list with field y giving the target values.')
  if (is.null(data$weights)) data$weights <- rep(1, nrow(data$x))
  if (is.null(data$offset)) data$offset <- rep(0, nrow(data$x))
  return(data)
}

.split_coef <- function(b, intercept) {
  if(intercept) {
    list(alpha = b[1, ], beta = b[-1, , drop = F])
  } else {
    list(alpha = rep(0, NCOL(b)), beta = b)
  }
}

.augmented_x <- function(x, intercept) {
  if (intercept)
    return(cbind(1, x))
  else
    return(x)
}

.nonaugmented_x <- function(x, intercept) {
  if (intercept) {
    if (ncol(x) == 1)
      # there is only the column of ones in x, so return an empty matrix
      return(matrix(nrow=nrow(x), ncol=0))
    else
      return(x[,2:ncol(x),drop=F])
  } else
    return(x)
}

.varsel_errors <- function(e) {
  if(grepl('computationally singular', e$message)) {
    stop(paste(
      'Numerical problems with inverting the covariance matrix. Possibly a',
      'problem with the convergence of the stan model? If not, consider',
      'stopping the selection early by setting the variable nv_max accordingly.'
)) } else { stop(e$message) } } .df_to_model_mat <- function(dfnew, var_names) { f <- formula(paste('~', paste(c('0', var_names), collapse = ' + '))) model.matrix(terms(f, keep.order = T), data = dfnew) } .is_proj_list <- function(proj) { !( 'family_kl' %in% names(proj) ) } .unlist_proj <- function(p) if(length(p) == 1) p[[1]] else p projpred/R/refmodel.R0000644000176200001440000004340113610140647014224 0ustar liggesusers#' Get reference model structure #' #' Generic function that can be used to create and fetch the reference model structure #' for all those objects that have this method. All these implementations are wrappers #' to the \code{\link{init_refmodel}}-function so the returned object has the same type. #' #' @name get-refmodel #' #' @param object Object based on which the reference model is created. See possible types below. #' @param ... Arguments passed to the methods. #' #' @return An object of type \code{refmodel} (the same type as returned by \link{init_refmodel}) #' that can be passed to all the functions that #' take the reference fit as the first argument, such as \link{varsel}, \link{cv_varsel}, \link{project}, #' \link[=proj-pred]{proj_predict} and \link[=proj-pred]{proj_linpred}. #' #' @examples #' \donttest{ #' ### Usage with stanreg objects #' dat <- data.frame(y = rnorm(100), x = rnorm(100)) #' fit <- stan_glm(y ~ x, family = gaussian(), data = dat) #' ref <- get_refmodel(fit) #' print(class(ref)) #' #' # variable selection, use the already constructed reference model #' vs <- varsel(ref) #' # this will first construct the reference model and then execute #' # exactly the same way as the previous command (the result is identical) #' vs <- varsel(fit) #' } #' NULL #' @rdname get-refmodel #' @export get_refmodel <- function (object, ...) { UseMethod("get_refmodel", object) } #' @rdname get-refmodel #' @export get_refmodel.refmodel <- function(object, ...) { # if the object is reference model already, then simply return it as is object } #' @rdname get-refmodel #' @export get_refmodel.vsel <- function(object, ...) { # the reference model is stored in vsel-object object$refmodel } #' @rdname get-refmodel #' @export get_refmodel.cvsel <- function(object, ...) { # the reference model is stored in cvsel object object$refmodel } #' @rdname get-refmodel #' @export get_refmodel.stanreg <- function(object, ...) { # the fit is an rstanarm-object if (!requireNamespace("rstanarm", quietly = TRUE)) { stop("You need package \"rstanarm\". Please install it.", call. = FALSE) } if ('lmerMod' %in% class(object)) stop('stan_lmer and stan_glmer are not yet supported.') families <- c('gaussian','binomial','poisson') if (!(family(object)$family %in% families)) stop(paste0('Only the following families are currently supported:\n', paste(families, collapse = ', '), '.')) # fetch the draws samp <- as.data.frame(object) ndraws <- nrow(samp) # data, family and the predictor matrix x z <- object$data # inputs of the reference model (this contains also the target or a transformation of it, but that shouldn't hurt) if (is.null(dim(z))) stop('Model was fitted without a \'data\' argument') fam <- kl_helpers(family(object)) x <- rstanarm::get_x(object) rownames(x) <- NULL # ignore the rownames x <- x[, as.logical(attr(x, 'assign')), drop=F] # drop the column of ones attr(x, 'assign') <- NULL y <- unname(rstanarm::get_y(object)) dis <- samp$sigma %ORifNULL% rep(0, ndraws) # TODO: handle other than gaussian likelihoods.. 
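  # (For gaussian models 'dis' thus holds the posterior draws of sigma; binomial
  # and poisson models have no dispersion parameter, hence the zeros above.)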
offset <- object$offset %ORifNULL% rep(0, nobs(object)) intercept <- as.logical(attr(object$terms,'intercept') %ORifNULL% 0) predfun <- function(zt) t(rstanarm::posterior_linpred(object, newdata=data.frame(zt), transform=T, offset=rep(0,nrow(zt)))) wsample <- rep(1/ndraws, ndraws) # equal sample weights by default wobs <- unname(weights(object)) # observation weights if (length(wobs)==0) wobs <- rep(1,nrow(z)) # cvfun for k-fold cross-validation cvfun <- function(folds) { cvres <- rstanarm::kfold(object, K = max(folds), save_fits = T, folds = folds) fits <- cvres$fits[,'fit'] lapply(fits, function (fit) { dis <- as.data.frame(fit)$sigma # NOTE: this works only for Gaussian family predfun <- function(zt) t(rstanarm::posterior_linpred(fit, newdata=data.frame(zt), transform=T, offset=rep(0,nrow(zt)))) list(predfun=predfun, dis=dis) }) } init_refmodel(z=z, y=y, family=fam, x=x, predfun=predfun, dis=dis, offset=offset, wobs=wobs, wsample=wsample, intercept=intercept, cvfits=NULL, cvfun=cvfun) } #' @rdname get-refmodel #' @export get_refmodel.brmsfit <- function(object, ...) { # the fit is a brms-object if (!requireNamespace("brms", quietly = TRUE)) { stop("You need package 'brms'. Please install it.") } if (brms::is.mvbrmsformula(formula(object))) { stop("Multivariate models are not yet supported.") } bterms <- brms::parse_bf(formula(object)) if (any(!names(bterms$dpars) %in% "mu")) { stop("Distributional regression models are not yet supported.") } mu_btl <- bterms$dpars$mu if (!inherits(mu_btl, "btl")) { stop("Non-linear models are not yet supported.") } specials <- sapply(mu_btl[c("re", "sp", "sm", "gp")], NROW) if (any(as.logical(specials))) { stop("Multilevel or other special terms are not yet supported.") } family <- family(object)$family families <- c("gaussian", "binomial", "bernoulli", "poisson") if (!(family %in% families)) { stop('Only the following families are currently supported:\n', paste0(families, collapse = ", ")) } # fetch the draws samp <- as.data.frame(object) ndraws <- nrow(samp) # data, family and the predictor matrix x z <- model.frame(object) attributes(z)[c("terms", "brmsframe")] <- NULL # bernoulli is just a special case of binomial fam <- ifelse(family == "bernoulli", "binomial", family) fam <- get(fam, mode = "function")() fam <- kl_helpers(fam) x <- model.matrix(mu_btl$fe, data = z) rownames(x) <- NULL # drop the intercept column int_cols <- apply(x, 2, function(v) all(v == 1)) x <- x[, !int_cols, drop = FALSE] # extract response values resp <- brms::data_response(bterms, data = z) y <- resp$Y if (family == "binomial") { if (utils::packageVersion("brms") < "2.5.3") { stop("Binomial models require brms 2.5.3 or higher.") } trials <- resp$trials y <- y / trials # ensure probabilities will be predicted by 'predfun' object$formula$formula <- brms::update_adterms( object$formula$formula, ~ trials(1) ) } # does the model have an intercept? 
  intercept <- any(int_cols)

  # extract draws of an overdispersion parameter
  # this may be changed in the future for non-gaussian models
  dis <- samp[["sigma"]] %ORifNULL% rep(0, ndraws)

  # extract offsets
  if (!is.null(mu_btl$offset)) {
    if (utils::packageVersion("brms") < "2.5.3") {
      stop("Models with offsets require brms 2.5.3 or higher.")
    }
    offset <- model.frame(mu_btl$offset, data = z)
    offset <- unname(model.offset(offset))
  } else {
    offset <- rep(0, nrow(z))
  }

  # equal sample weights by default
  wsample <- rep(1 / ndraws, ndraws)

  # observation weights
  if (family == "binomial") {
    # trials are used as weights in binomial models
    wobs <- trials
    if (!is.null(resp$weights)) {
      stop("Can't handle additional weights in binomial models.")
    }
  } else if (!is.null(resp$weights)) {
    wobs <- resp$weights
  } else {
    wobs <- rep(1, nrow(z))
  }

  # prediction function
  predfun <- function(zt) {
    t(brms::posterior_linpred(
      object, newdata = data.frame(zt),
      transform = TRUE, offset = FALSE
    ))
  }

  # cvfun for k-fold cross-validation
  cvfun <- function(folds) {
    cvres <- brms::kfold(
      object, K = max(folds),
      save_fits = TRUE, folds = folds
    )
    fits <- cvres$fits[, 'fit']
    lapply(fits, function(fit) {
      dis <- as.data.frame(fit, pars = "^sigma$")$sigma
      predfun <- function(zt) {
        t(brms::posterior_linpred(
          fit, newdata = data.frame(zt),
          transform = TRUE, offset = FALSE
        ))
      }
      return(list(predfun = predfun, dis = dis))
    })
  }

  init_refmodel(
    z = z, y = y, family = fam, x = x,
    predfun = predfun, dis = dis, offset = offset,
    wobs = wobs, wsample = wsample, intercept = intercept,
    cvfits = NULL, cvfun = cvfun
  )
}

#' Custom reference model initialization
#'
#' Initializes a structure that can be used as a reference fit for the
#' projective variable selection. This function is provided to allow construction
#' of the reference fit from arbitrary fitted models, because only limited
#' information is needed for the actual projection and variable selection.
#'
#' @param z Predictor matrix of dimension \code{n}-by-\code{dz} containing the training
#' features for the reference model. Rows denote the observations and columns the different features.
#' @param y Vector of length \code{n} giving the target variable values.
#' @param family \link{family} object giving the model family.
#' @param x Predictor matrix of dimension \code{n}-by-\code{dx} containing the candidate
#' features for selection (i.e. variables from which to select the submodel). Rows denote
#' the observations and columns the different features. Notice that this can be
#' different from \code{z}. If missing, same as \code{z} by default.
#' @param predfun Function that takes a \code{nt}-by-\code{dz} test predictor matrix \code{zt} as an input
#' (\code{nt} = # test points, \code{dz} = number of features in the reference model) and outputs
#' a \code{nt}-by-\code{S} matrix of expected values for the target variable \code{y},
#' each column corresponding to one posterior draw for the parameters in the reference model
#' (the number of draws \code{S} can also be 1). Notice that the output should be computed without
#' any offsets; these are automatically taken into account internally, e.g. in cross-validation.
#' If omitted, then the returned object will be a 'data reference', that is, it can be used to compute
#' penalized maximum likelihood solutions such as Lasso (see examples below and in the quickstart vignette).
#' @param dis Vector of length \code{S} giving the posterior draws for the dispersion parameter
#' in the reference model if there is such a parameter in the model family.
#' For Gaussian observation model this is the noise std \code{sigma}.
#' @param offset Offset to be added to the linear predictor in the projection. (Same as in
#' function \code{glm}.)
#' @param wobs Observation weights. If omitted, equal weights are assumed.
#' @param wsample Vector of length \code{S} giving the weights for the posterior draws.
#' If omitted, equal weights are assumed.
#' @param intercept Whether to use an intercept. Default is \code{TRUE}.
#' @param cvfun Function for performing K-fold cross-validation. The input is an \code{n}-element
#' vector where each value is an integer between 1 and K denoting the fold for each observation.
#' Should return a list with K elements, each of which is a list with fields \code{predfun} and
#' \code{dis} (if the model has a dispersion parameter) which are defined the same way as the arguments
#' \code{predfun} and \code{dis} above but are computed using only the corresponding subset of the data.
#' More precisely, if \code{cvres} denotes
#' the list returned by \code{cvfun}, then \code{cvres[[k]]$predfun} and \code{cvres[[k]]$dis} must be computed
#' using only data from indices \code{folds != k}, where \code{folds} is the \code{n}-element input for
#' \code{cvfun}. Can be omitted but either \code{cvfun} or \code{cvfits} is needed for K-fold cross-validation
#' for genuine reference models. See example below.
#' @param cvfits A list with K elements that has the same format as the value returned by \code{cvind} but
#' each element of \code{cvfits} must also contain a field \code{omitted} which indicates the indices that
#' were left out for the corresponding fold. Usually it is easier to specify \code{cvfun} but this can be useful
#' if you have already computed the cross-validation for the reference model and would like to avoid
#' recomputing it. Can be omitted but either \code{cvfun} or \code{cvfits} is needed for K-fold cross-validation
#' for genuine reference models.
#' @param ... Currently ignored.
#'
#' @return An object that can be passed to all the functions that
#' take the reference fit as the first argument, such as \link{varsel}, \link{cv_varsel},
#' \link[=proj-pred]{proj_predict} and \link[=proj-pred]{proj_linpred}.
#'
#' @examples
#' \donttest{
#'
#' # generate some toy data
#' set.seed(1)
#' n <- 100
#' d <- 10
#' x <- matrix(rnorm(n*d), nrow=n, ncol=d)
#' b <- c(c(1,1),rep(0,d-2)) # first two variables are relevant
#' y <- x %*% b + rnorm(n)
#'
#' # fit the model (this uses rstanarm for posterior inference,
#' # but any other tool could also be used)
#' fit <- stan_glm(y~x, family=gaussian(), data=data.frame(x=I(x),y=y))
#' draws <- as.matrix(fit)
#' a <- draws[,1] # intercept
#' b <- draws[,2:(ncol(draws)-1)] # regression coefficients
#' sigma <- draws[,ncol(draws)] # noise std
#'
#' # initialize the reference model structure
#' predfun <- function(xt) t( b %*% t(xt) + a )
#' ref <- init_refmodel(x,y, gaussian(), predfun=predfun, dis=sigma)
#'
#' # variable selection based on the reference model
#' vs <- cv_varsel(ref)
#' varsel_plot(vs)
#'
#'
#' # pass in the original data as 'reference'; this allows us to compute
#' # traditional estimates like Lasso
#' dref <- init_refmodel(x,y,gaussian())
#' lasso <- cv_varsel(dref, method='l1') # lasso
#' varsel_plot(lasso, stat='rmse')
#'
#' }
#'
#' @export
init_refmodel <- function(z, y, family, x=NULL, predfun=NULL, dis=NULL, offset=NULL,
                          wobs=NULL, wsample=NULL, intercept=TRUE, cvfun=NULL, cvfits=NULL, ...)
{
  n <- NROW(z)
  family <- kl_helpers(family)
  if (is.null(x)) x <- z
  if (is.null(offset)) offset <- rep(0, n)

  # y and the observation weights in a standard form
  target <- .get_standard_y(y, wobs, family)
  y <- target$y
  wobs <- target$weights

  if (is.null(predfun)) {
    # no prediction function given, so the 'reference model' will simply contain the
    # observed data as the fitted values
    predmu <- function(z,offset=0) matrix(rep(NA, NROW(z)))
    mu <- y
    proper_model <- FALSE
  } else {
    # genuine reference model. add impact of offset to the prediction function
    predmu <- function(z,offset=0) family$linkinv( family$linkfun(predfun(z)) + offset )
    mu <- predmu(z,offset)
    if (NROW(y)!=NROW(mu))
      stop(paste0('The number of rows in the output of predfun(z) does not match with the given y; ',
                  'predfun seems to be misspecified.'))
    proper_model <- TRUE
  }

  if (proper_model)
    if (.has.dispersion(family) && is.null(dis))
      stop(sprintf('Family %s needs a dispersion parameter so you must specify input argument \'dis\'.',
                   family$family))

  mu <- unname(as.matrix(mu))
  S <- NCOL(mu) # number of samples in the reference model

  if (is.null(dis)) dis <- rep(0, S)
  if (is.null(wobs)) wobs <- rep(1, n)
  if (is.null(wsample)) wsample <- rep(1, S)
  if (is.null(intercept)) intercept <- TRUE
  wsample <- wsample/sum(wsample)

  # compute log-likelihood
  if (proper_model)
    loglik <- t(family$ll_fun(mu,dis,y,wobs))
  else
    loglik <- NULL

  # figure out column names for the variables
  if (!is.null(colnames(x)))
    coefnames <- colnames(x)
  else
    coefnames <- paste0('x',1:ncol(x))

  if (!proper_model) {
    # this is a dummy definition for cvfun, but it will lead to standard cross-validation
    # for datafit reference; see cv_varsel and get_kfold
    cvfun <- function(folds) lapply(1:max(folds), function(k) list())
  }

  refmodel <- list(z=z, x=x, y=y, fam=family, mu=mu, dis=dis, nobs=n, coefnames=coefnames,
                   offset=offset, wobs=wobs, wsample=wsample, intercept=intercept,
                   predfun=predmu, loglik=loglik, cvfits=cvfits, cvfun=cvfun)

  # define the class of the returned object to be 'refmodel' and additionally 'datafit'
  # if only the observed data was provided and no actual function for predicting test data
  class(refmodel) <- 'refmodel'
  if (!proper_model)
    class(refmodel) <- c(class(refmodel),'datafit')

  return(refmodel)
}

#' Predict method for reference model objects
#'
#' Compute the predictions using the reference model, that is, compute the
#' expected value for the next observation, or evaluate the log-predictive
#' density at a given point.
#'
#' @param object The object of class \code{refmodel}.
#' @param znew Matrix of predictor values used in the prediction.
#' @param ynew New (test) target variables. If given, then the log predictive density
#' for the new observations is computed.
#' @param offsetnew Offsets for the new observations. By default a vector of
#' zeros.
#' @param weightsnew Weights for the new observations. For binomial model,
#' corresponds to the number of trials per observation. Has effect only if \code{ynew} is specified.
#' By default a vector of ones.
#' @param type Scale on which the predictions are returned. Either 'link' (the latent function
#' value, from -inf to inf) or 'response' (the scale on which the target \code{y} is measured,
#' obtained by taking the inverse-link from the latent value).
#' @param ... Currently ignored.
#'
#' @return Returns either a vector of predictions, or vector of log predictive densities evaluated
#' at \code{ynew} if \code{ynew} is not \code{NULL}.
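#'
#' @examples
#' \donttest{
#' # A minimal sketch, mirroring the toy setup used in the other examples;
#' # all object names here are illustrative only.
#' dat <- data.frame(y = rnorm(100), x = rnorm(100))
#' fit <- stan_glm(y ~ x, family = gaussian(), data = dat)
#' ref <- get_refmodel(fit)
#' # expected values for the first few training points
#' predict(ref, znew = ref$z[1:5, , drop = FALSE])
#' }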
#' @export
predict.refmodel <- function(object, znew, ynew = NULL, offsetnew = NULL,
                             weightsnew = NULL, type = c('response', 'link'), ...) {
  if ('datafit' %in% class(object))
    stop('Cannot make predictions with data reference only.')
  type <- match.arg(type)
  if (is.null(offsetnew)) offsetnew <- rep(0, nrow(znew))

  mu <- object$predfun(znew, offsetnew)

  if (is.null(ynew)) {
    if (type == 'link')
      pred <- object$fam$linkfun(mu)
    else
      pred <- mu
    # integrate over the samples
    if (NCOL(pred) > 1)
      pred <- rowMeans(pred)
    return(pred)
  } else {
    if (!is.numeric(ynew)) stop('ynew must be a numerical vector')
    # evaluate the log predictive density at the given ynew values
    if (is.null(weightsnew)) weightsnew <- rep(1, nrow(znew))
    loglik <- object$fam$ll_fun(mu, object$dis, ynew, weightsnew)
    S <- ncol(loglik)
    lpd <- apply(loglik, 1, log_sum_exp) - log(S)
    return(lpd)
  }
}
projpred/R/glmfun.R0000644000176200001440000003061713513014333013716 0ustar liggesusers#
# The functions in this file are used to compute the elastic net coefficient paths
# for a GLM. The main function is glm_elnet, other functions are auxiliaries.
# The L1-regularized projection path is computed by replacing the actual data y
# by the fit of the reference model when calling glm_elnet. Uses functions in glmfun.cpp.
#

standardization <- function(x, center=T, scale=T, weights=NULL) {
  #
  # return the shift and scaling for each variable based on data matrix x.
  #
  w <- weights/sum(weights)
  if (center)
    mx <- colSums(x*w)
  else
    mx <- rep(0,ncol(x))
  if (scale)
    sx <- apply(x,2,weighted.sd,w)
  else
    sx <- rep(1,ncol(x))
  return(list(shift=mx, scale=sx))
}

pseudo_data <- function(f, y, family, offset=rep(0,length(f)), weights=rep(1.0,length(f)),
                        obsvar=0, wprev=NULL) {
  #
  # Returns locations z and weights w (inverse-variances) of the Gaussian pseudo-observations
  # based on the linear approximation to the link function at f = eta = x*beta + beta0,
  # as explained in McCullagh and Nelder (1989). Returns also the loss (= negative log likelihood)
  # and its pointwise derivative w.r.t f at the current f.
  #
  mu <- family$linkinv(f+offset)
  dmu_df <- family$mu.eta(f+offset)
  z <- f + (y - mu)/dmu_df

  if (family$family == 'Student_t') {
    # Student-t does not belong to the exponential family and thus it has its own
    # way of computing the observation weights
    if (is.null(wprev)) {
      # initialization of the em-iteration; iterate until stable initial weights are found
      wprev <- weights
      while(T) {
        wtemp <- pseudo_data(f,y,family, offset=offset, weights=weights, wprev=wprev, obsvar=obsvar)$w
        if (max(abs(wtemp-wprev)) < 1e-6)
          break
        wprev <- wtemp
      }
    }
    # given the weights from the previous em-iteration, update s2 based on the previous weights and mu,
    # and then compute new weights w
    nu <- family$nu
    s2 <- sum(wprev*(obsvar+(y-mu)^2)) / sum(weights)
    w <- weights*(nu+1)/(nu + 1/s2*(obsvar+(y-mu)^2))
    loss <- 0.5*sum(family$deviance(mu, y, weights, sqrt(s2))) # 0.5*deviance gives the negative log likelihood
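    # pointwise derivative of the loss with respect to f at the current f
    # (cf. the function description above)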
    grad <- weights*(mu-y)/(nu*s2) * (nu+1)/(1+(y-mu)^2/(nu*s2)) * dmu_df

  } else if (family$family %in% c('gaussian','poisson','binomial')) {
    # exponential family distributions
    w <- (weights * dmu_df^2)/family$variance(mu)
    loss <- 0.5*sum(family$deviance(mu, y, weights)) # 0.5* because the deviance is twice the negative log likelihood
    grad <- -w*(z-f)
  } else {
    stop(sprintf('Don\'t know how to compute quadratic approximation and gradients for family \'%s\'.',
                 family$family))
  }

  return(list(z=z, w=w, loss=loss, grad=grad))
}

lambda_grid <- function(x, y, family, offset, weights, intercept, penalty, obsvar=0,
                        alpha=1.0, lambda_min_ratio=1e-2, nlam=100) {
  #
  # Standard lambda sequence as described in Friedman et al. (2009), section 2.5.
  # The grid will have nlam values, evenly spaced in the log-space between lambda_max
  # and lambda_min. lambda_max is the smallest value for which all the regression
  # coefficients will be zero (assuming alpha > 0, alpha = 0 will be initialized
  # as if alpha = 0.01). Returns also the initial solution corresponding to the largest
  # lambda (intercept and the unpenalized variables will be nonzero).
  #
  n <- dim(x)[1]
  if (alpha == 0)
    # initialize ridge as if alpha = 0.01
    alpha <- 0.01

  # find the initial solution, that is, values for the intercept (if included)
  # and those covariates that have penalty=0 (those which are always included, if such exist)
  init <- glm_ridge(x[,penalty==0,drop=F],y, family=family, lambda=0, weights=weights,
                    offset=offset, obsvar=obsvar, intercept=intercept)
  f0 <- init$beta0*rep(1,n)
  if (length(init$beta) > 0)
    f0 <- f0 + as.vector( x[,penalty==0,drop=F] %*% init$beta )

  obs <- pseudo_data(f0, y, family, offset, weights, obsvar=obsvar)
  resid <- obs$z - f0 # residual from the initial solution
  lambda_max_cand <- abs( t(x) %*% (resid*obs$w) ) / (penalty*alpha)
  lambda_max <- max(lambda_max_cand[is.finite(lambda_max_cand)])
  lambda_max <- 1.001*lambda_max # to prevent some variable from entering at the first step due to numerical inaccuracy
  lambda_min <- lambda_min_ratio*lambda_max
  loglambda <- seq(log(lambda_min), log(lambda_max), len=nlam)

  beta <- rep(0, ncol(x))
  beta[penalty == 0] <- init$beta
  return( list(lambda = rev(exp(loglambda)), beta=beta, beta0=init$beta0, w0=obs$w) )
}

glm_elnet <- function(x, y, family=gaussian(), nlambda=100, lambda_min_ratio=1e-3,
                      lambda=NULL, alpha=1.0, thresh=1e-6,
                      qa_updates_max=ifelse(family$family=='gaussian' &&
                                            family$link=='identity', 1, 100),
                      pmax=dim(as.matrix(x))[2]+1, pmax_strict=FALSE,
                      weights=NULL, offset=NULL, obsvar=0, intercept=TRUE,
                      normalize=TRUE, penalty=NULL) {
  #
  # Fits GLM with elastic net penalty on the regression coefficients.
  # Computes the whole regularization path.
  # Does not handle any dispersion parameters.
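  #
  # A minimal usage sketch (not run; x and y are an illustrative predictor
  # matrix and response, not defined here):
  #   path <- glm_elnet(x, y, family = gaussian(), alpha = 1.0, nlambda = 50)
  #   path$lambda      # fitted lambda sequence, largest value first
  #   path$beta[, 25]  # coefficients at the 25th point of the path
  #   path$beta0[25]   # corresponding intercept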
  #
  if (!.has.fam.extras(family))
    family <- kl_helpers(family)

  # ensure x is in matrix form and fill in missing weights and offsets
  x <- as.matrix(x)
  if (is.null(weights)) weights <- rep(1.0, nrow(x))
  if (is.null(offset)) offset <- rep(0.0, nrow(x))
  if (is.null(penalty))
    penalty <- rep(1.0, ncol(x))
  else if (length(penalty) != ncol(x))
    stop(paste0("Incorrect length of penalty vector (should be ", ncol(x), ")."))

  # standardize the features (notice that the variables are centered only if intercept is used
  # because otherwise the intercept would become nonzero unintentionally)
  transf <- standardization(x, center=intercept, scale=normalize, weights=weights)
  penalty[transf$scale==0] <- Inf # ignore variables with zero variance
  transf$scale[transf$scale==0] <- 1
  x <- t((t(x)-transf$shift)/transf$scale)

  # default lambda-sequence, including optimal start point
  if (is.null(lambda)) {
    temp <- lambda_grid(x, y, family, offset, weights, intercept, penalty, alpha=alpha,
                        obsvar=obsvar, nlam=nlambda, lambda_min_ratio=lambda_min_ratio)
    lambda <- temp$lambda
    w0 <- temp$w0
    beta <- temp$beta
    beta0 <- temp$beta0
  } else {
    beta <- rep(0,ncol(x))
    beta0 <- 0
    w0 <- weights
  }

  # call the c++-function that serves as the workhorse
  pseudo_obs <- function(f,wprev)
    pseudo_data(f,y,family,offset=offset,weights=weights,obsvar=obsvar,wprev=wprev)
  out <- glm_elnet_c(x,pseudo_obs,lambda,alpha,intercept,penalty,
                     thresh,qa_updates_max,pmax,pmax_strict,beta,beta0,w0)
  beta <- out[[1]]
  beta0 <- as.vector(out[[2]])

  # return the intercept and the coefficients on the original scale
  beta <- beta/transf$scale
  beta0 <- beta0 - colSums(transf$shift*beta)

  return(list( beta=beta, beta0=beta0, w=out[[3]], lambda=lambda[1:ncol(beta)],
               npasses=out[[4]], updates_qa=as.vector(out[[5]]),
               updates_as=as.vector(out[[6]]) ))
}

glm_ridge <- function(x, y, family=gaussian(), lambda=0, thresh=1e-7, qa_updates_max=NULL,
                      weights=NULL, offset=NULL, obsvar=0, intercept=TRUE, penalty=NULL,
                      normalize=TRUE, la_approx=FALSE, beta_init=NULL, beta0_init=NULL,
                      ls_iter_max=30) {
  #
  # Fits GLM with ridge penalty on the regression coefficients.
  # Does not handle any dispersion parameters.
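  #
  # A minimal usage sketch (not run; x, y as in glm_elnet above):
  #   fit <- glm_ridge(x, y, family = binomial(), lambda = 0.1)
  #   fit$beta   # penalized coefficients on the original scale
  #   fit$beta0  # intercept on the original scale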
  #
  if (is.null(x)) x <- matrix(ncol=0, nrow=length(y))
  if (!.has.fam.extras(family))
    family <- kl_helpers(family)
  if (family$family == 'gaussian' && family$link == 'identity') {
    qa_updates_max <- 1
    ls_iter_max <- 1
  } else if (is.null(qa_updates_max))
    qa_updates_max <- 100
  if (is.null(weights)) weights <- rep(1.0, length(y))
  if (is.null(offset)) offset <- rep(0.0, length(y))
  if (is.null(beta0_init)) beta0_init <- 0
  if (is.null(beta_init)) beta_init <- rep(0, NCOL(x))
  if (intercept)
    beta_start <- c(beta0_init, beta_init)
  else
    beta_start <- beta_init
  if (is.null(penalty)) penalty <- rep(1.0, NCOL(x))

  if (length(x) == 0) {
    if (intercept) {
      # model with intercept only (fit like model with no intercept but with one constant predictor)
      x <- matrix(rep(1,length(y)), ncol=1)
      w0 <- weights
      pseudo_obs <- function(f,wprev)
        pseudo_data(f,y,family,offset=offset,weights=weights,obsvar=obsvar,wprev=wprev)
      out <- glm_ridge_c(x, pseudo_obs, lambda, FALSE, 1, beta_start, w0,
                         thresh, qa_updates_max, ls_iter_max)
      return( list(beta=matrix(integer(length=0)), beta0=as.vector(out[[1]]), w=out[[3]],
                   loss=out[[4]], qa_updates=out[[5]]) )
    } else {
      # null model with no predictors and no intercept
      pseudo_obs <- function(f,wprev)
        pseudo_data(f,y,family,offset=offset,weights=weights,obsvar=obsvar,wprev=wprev)
      pobs <- pseudo_obs(rep(0,length(y)), weights)
      return( list( beta=matrix(integer(length=0)), beta0=0, w=pobs$w, qa_updates=0 ) )
    }
  }

  # normal case, at least one predictor
  x <- as.matrix(x) # ensure x is a matrix

  # standardize the features (notice that the variables are centered only if intercept is used
  # because otherwise the intercept would become nonzero unintentionally)
  transf <- standardization(x, center=intercept, scale=normalize, weights=weights)
  penalty[transf$scale==0] <- Inf # ignore variables with zero variance
  transf$scale[transf$scale==0] <- 1
  x <- t((t(x)-transf$shift)/transf$scale)

  # compute the solution
  w0 <- weights
  pseudo_obs <- function(f,wprev)
    pseudo_data(f,y,family,offset=offset,weights=weights,obsvar=obsvar,wprev=wprev)
  out <- glm_ridge_c(x, pseudo_obs, lambda, intercept, penalty, beta_start, w0,
                     thresh, qa_updates_max, ls_iter_max)
  beta <- out[[1]]
  beta0 <- as.vector(out[[2]])
  w <- out[[3]]
  loss <- out[[4]]

  # return the intercept and the coefficients on the original scale
  beta_orig <- beta/transf$scale
  beta0_orig <- beta0 - sum(transf$shift*beta_orig)

  out <- list( beta=beta_orig, beta0=beta0_orig, w=w, qa_updates=out[[5]] )
  return(out)
}

glm_forward <- function(x, y, family=gaussian(), lambda=0, thresh=1e-7, qa_updates_max=NULL,
                        weights=NULL, offset=NULL, obsvar=0, intercept=TRUE, penalty=NULL,
                        normalize=TRUE, pmax=dim(as.matrix(x))[2]) {
  #
  # Runs forward stepwise regression. Does not handle any dispersion parameters.
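  #
  # A minimal usage sketch (not run; x, y as above):
  #   fwd <- glm_forward(x, y, family = gaussian(), pmax = 5)
  #   fwd$varorder  # indices of the variables in the order they were added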
  #
  if (is.null(x)) x <- matrix(ncol=0, nrow=length(y))
  if (!.has.fam.extras(family))
    family <- kl_helpers(family)
  if (family$family == 'gaussian' && family$link == 'identity')
    qa_updates_max <- 1
  else if (is.null(qa_updates_max))
    qa_updates_max <- 100
  if (is.null(penalty)) penalty <- rep(1.0, ncol(x))

  # compute the null model
  out <- glm_ridge(NULL, y, family=family, lambda=lambda, thresh=thresh,
                   qa_updates_max=qa_updates_max, weights=weights, offset=offset,
                   obsvar=obsvar, intercept=intercept, penalty=penalty)
  nullmodel <- list(beta=out$beta, beta0=out$beta0, varorder=integer(length=0), w=out$w)

  if (length(x) == 0) {
    # return only the null model
    nullmodel$varorder <- integer(length=0)
    return(nullmodel)
  }

  # normal case, at least one predictor
  x <- as.matrix(x)
  if (is.null(weights)) weights <- rep(1.0, nrow(x))
  if (is.null(offset)) offset <- rep(0.0, nrow(x))

  # standardize the features (notice that the variables are centered only if intercept is used
  # because otherwise the intercept would become nonzero unintentionally)
  transf <- standardization(x, center=intercept, scale=normalize, weights=weights)
  penalty[transf$scale==0] <- Inf # ignore variables with zero variance
  transf$scale[transf$scale==0] <- 1
  x <- t((t(x)-transf$shift)/transf$scale)

  # forward search (use the c++ function)
  w0 <- weights
  pseudo_obs <- function(f,wprev)
    pseudo_data(f,y,family,offset=offset,weights=weights,obsvar=obsvar,wprev=wprev)
  path <- glm_forward_c(x, pseudo_obs, lambda, intercept, penalty, thresh,
                        qa_updates_max, pmax, w0)
  beta <- cbind(rep(0,ncol(x)), path[[1]])
  beta0 <- c(nullmodel$beta0, as.vector(path[[2]]))

  # return the intercept and the coefficients on the original scale
  beta <- beta/transf$scale
  beta0 <- beta0 - colSums(transf$shift*beta)

  return(list( beta=beta, beta0=beta0, varorder=as.vector(path[[3]])+1,
               w=cbind(nullmodel$w, path[[4]]) ))
}
projpred/R/projfun.R0000644000176200001440000001406213361364246014121 0ustar liggesusers# Function handles for the projection
#

project_gaussian <- function(vind, p_ref, d_train, family_kl, intercept, regul = 1e-12) {

  x <- d_train$x
  mu <- p_ref$mu
  dis <- p_ref$dis

  if ("weights" %in% names(d_train))
    wobs <- d_train$weights
  else
    wobs <- rep(1.0, NROW(mu))
  if ("weights" %in% names(p_ref))
    wsample <- p_ref$weights
  else
    wsample <- rep(1.0, NCOL(mu))

  # ensure the weights are normalized
  wobs <- wobs/sum(wobs)
  wsample <- wsample/sum(wsample)

  if (intercept) {
    # add vector of ones to x and transform the variable indices
    x <- cbind(1, x)
    vind <- c(1, vind + 1)
  } else if (length(vind) == 0) {
    # no intercept used and vind is empty, so projecting to the completely
    # null model with eta=0 always
    pobs <- pseudo_data(0, mu, family_kl, offset=d_train$offset, weights=wobs)
    beta_sub <- matrix(integer(length=0), ncol=NCOL(mu))
    dis_sub <- family_kl$dis_fun(list(mu=pobs$z, var=p_ref$var), list(mu=0), pobs$w)
    kl <- weighted.mean(colSums(wobs*pobs$z^2), wsample)
    submodel <- list(kl = kl, weights = wsample, dis = dis_sub, vind = vind,
                     intercept = intercept)
    return(c(submodel, .split_coef(beta_sub, intercept)))
  }

  xp <- x[, vind, drop = F]
  Dp <- dim(xp)[2]
  regulmat <- diag(regul*rep(1.0, Dp), Dp, Dp)

  # Solve the projection equations (with l2-regularization)
  pobs <- pseudo_data(0, mu, family_kl, offset=d_train$offset, weights=wobs) # this will remove the offset
  wsqrt <- sqrt(pobs$w)
  beta_sub <- solve( crossprod(wsqrt*xp)+regulmat, crossprod(wsqrt*xp, wsqrt*pobs$z) )
  musub <- xp%*%beta_sub
  dis_sub <- family_kl$dis_fun(list(mu=pobs$z, var=p_ref$var), list(mu=musub), pobs$w)
  kl <-
weighted.mean(colSums(wobs*((pobs$z-musub)^2)), wsample) # not the actual kl-divergence, but a reasonable surrogate.. submodel <- list(kl = kl, weights = wsample, dis = dis_sub) # split b to alpha and beta, add it to submodel and return the result submodel <- c(submodel, .split_coef(beta_sub, intercept)) if(length(vind) == 1 && intercept) { submodel$vind <- integer(length=0) } else { submodel$vind <- vind[(1+intercept*1):length(vind)] - intercept*1 } submodel$intercept <- intercept return(submodel) } project_nongaussian <- function(vind, p_ref, d_train, family_kl, intercept, regul=1e-9, coef_init=NULL) { # find the projected regression coefficients for each sample xsub <- d_train$x[, vind, drop = F] d <- NCOL(xsub) n <- NROW(p_ref$mu) S <- NCOL(p_ref$mu) # loop through each draw and compute the projection for it individually beta <- matrix(0, nrow=d, ncol=S) alpha <- rep(0, S) w <- matrix(nrow=n, ncol=S) for (s in 1:S) { out <- glm_ridge(x = xsub, y = p_ref$mu[, s, drop = F], family=family_kl, lambda=regul, weights=d_train$weights, offset=d_train$offset, obsvar=p_ref$var[,s], intercept=intercept) beta[,s] <- out$beta alpha[s] <- out$beta0 w[,s] <- out$w } # compute the dispersion parameters and kl-divergences, and combine the results submodel <- list() mu <- family_kl$mu_fun(xsub, alpha, beta, d_train$offset) submodel$dis <- family_kl$dis_fun(p_ref, list(mu=mu,w=w), d_train$weights) submodel$kl <- weighted.mean(family_kl$kl(p_ref, d_train, list(mu=mu,dis=submodel$dis)), p_ref$weights) submodel$weights <- p_ref$weights submodel$alpha <- alpha submodel$beta <- beta submodel$vind <- vind submodel$intercept <- intercept return(submodel) } # function handle for the projection over samples. Gaussian case # uses analytical solution to do the projection over samples. .get_proj_handle <- function(family_kl, regul=1e-9) { # Use analytical solution for gaussian because it is faster if(family_kl$family == 'gaussian' && family_kl$link == 'identity') { return( function(vind, p_ref, d_train, intercept) { project_gaussian(vind, p_ref, d_train, family_kl, intercept, regul=regul) }) } else { # return handle to project_nongaussian with family_kl set accordingly return( function(vind, p_ref, d_train, intercept) { project_nongaussian(vind, p_ref, d_train, family_kl, intercept, regul=regul) }) } } .get_submodels <- function(searchpath, nv, family_kl, p_ref, d_train, intercept, regul, as.search=F) { # # # Project onto given model sizes nv. Returns a list of submodels. If as.search=TRUE, # submodels parameters will be as they were computed during the search, so there is # no need to project anything anymore, and this function simply fetches the information # from the searchpath list, which contains the parameter values. 
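  #
  # For example, nv = c(0, 2, 5) would return the projections onto the empty
  # model (intercept only, if an intercept is used), onto the first two
  # variables on the search path, and onto the first five.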
# varorder <- searchpath$vind p_sel <- searchpath$p_sel if (as.search) { # simply fetch the already computed quantities for each submodel size fetch_submodel <- function(nv) { submodel <- list() vind <- utils::head(varorder, nv) w <- searchpath$w[,nv+1,drop=F] alpha <- searchpath$alpha[nv+1] if (nv==0) beta <- matrix(0,nrow=0, ncol=1) else beta <- searchpath$beta[1:nv,nv+1,drop=F] xsub <- d_train$x[, vind, drop = F] mu <- family_kl$mu_fun(xsub, alpha, beta, d_train$offset) submodel$dis <- family_kl$dis_fun(p_sel, list(mu=mu,w=w), d_train$weights) submodel$kl <- weighted.mean(family_kl$kl(p_sel, d_train, list(mu=mu,dis=submodel$dis)), p_sel$weights) submodel$weights <- p_sel$weights submodel$alpha <- alpha submodel$beta <- beta submodel$vind <- vind submodel$intercept <- intercept return(submodel) } } else { # need to project again for each submodel size projfun <- .get_proj_handle(family_kl, regul) fetch_submodel <- function(nv) { if (nv == 0) vind <- integer(length=0) # empty else vind <- varorder[1:nv] return(projfun(vind, p_ref, d_train, intercept)) } } submodels <- lapply(nv, fetch_submodel) return(submodels) } projpred/R/summary_funs.R0000644000176200001440000002240513531311224015151 0ustar liggesusers .get_sub_summaries <- function(submodels, d_test, family_kl) { res <- lapply(submodels, function(model) { vind <- model$vind if(NROW(model$beta) == 0) { xt <- matrix(0, nrow = length(d_test$weights), ncol = 0) } else if(!is.matrix(d_test$x)) { xt <- matrix(d_test$x[vind], nrow = 1) } else { xt <- d_test$x[, vind, drop = F] } mu <- family_kl$mu_fun(xt, model$alpha, model$beta, d_test$offset) .weighted_summary_means(d_test, family_kl, model$weights, mu, model$dis) }) } # Calculates weighted means of mu and lppd given samples of # mu and dis, and the test data. .weighted_summary_means <- function(d_test, family_kl, wsample, mu, dis) { loglik <- family_kl$ll_fun(mu, dis, matrix(d_test$y,nrow=NROW(mu)), d_test$weights) if (length(loglik) == 1) { # one observation, one sample list(mu = mu, lppd = loglik) } else if (is.null(dim(loglik))){ # loglik is a vector, but not sure if it means one observation with many samples, or vice versa? stop('Internal error encountered: loglik is a vector, but should be a scalar or matrix') } else { # mu is a matrix, so apply weighted sum over the samples list(mu = c(mu %*% wsample), lppd = apply(loglik, 1, log_weighted_mean_exp, wsample)) } } .tabulate_stats <- function(varsel, stats, alpha = 0.05, nfeat_baseline=NULL) { # # Calculates the desired statistics, their standard errors and credible bounds with given # credible level alpha based on the variable selection information. If nfeat_baseline # is given, then compute the statistics relative to the baseline model with that size # (nfeat_baseline = Inf means reference model). 
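  # For example, nfeat_baseline = 0 gives the statistics relative to the empty
  # (size zero) submodel, and nfeat_baseline = Inf gives them relative to the
  # reference model.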
  stat_tab <- data.frame()
  summ_ref <- varsel$summaries$ref
  summ_sub <- varsel$summaries$sub

  # fetch the mu and lppd for the baseline model
  if (is.null(nfeat_baseline)) {
    # no baseline model, i.e., compute the statistics on the actual (non-relative) scale
    mu.bs <- NULL
    lppd.bs <- NULL
    delta <- FALSE
  } else {
    if (nfeat_baseline == Inf)
      summ.bs <- summ_ref
    else
      summ.bs <- summ_sub[[nfeat_baseline+1]]
    mu.bs <- summ.bs$mu
    lppd.bs <- summ.bs$lppd
    delta <- TRUE
  }

  for (s in seq_along(stats)) {
    stat <- stats[s]

    # reference model statistics
    summ <- summ_ref
    res <- get_stat(summ$mu, summ$lppd, varsel$d_test, varsel$family_kl, stat,
                    mu.bs=mu.bs, lppd.bs=lppd.bs, weights=summ$w, alpha=alpha)
    row <- data.frame(data = varsel$d_test$type, size=Inf, delta=delta, statistic=stat,
                      value=res$value, lq=res$lq, uq=res$uq, se=res$se)
    stat_tab <- rbind(stat_tab, row)

    # submodel statistics
    for (k in seq_along(varsel$summaries$sub)) {
      summ <- summ_sub[[k]]
      if (delta == F && sum(!is.na(summ_ref$mu)) > sum(!is.na(summ$mu))) {
        # special case (subsampling loo): reference model summaries computed for more points
        # than for the submodel, so utilize the reference model results to get a more accurate
        # statistic for the submodel on the actual scale
        res_ref <- get_stat(summ_ref$mu, summ_ref$lppd, varsel$d_test, varsel$family_kl, stat,
                            mu.bs=NULL, lppd.bs=NULL, weights=summ_ref$w, alpha=alpha)
        res_diff <- get_stat(summ$mu, summ$lppd, varsel$d_test, varsel$family_kl, stat,
                             mu.bs=summ_ref$mu, lppd.bs=summ_ref$lppd, weights=summ$w, alpha=alpha)
        val <- res_ref$value+res_diff$value
        val.se <- sqrt(res_ref$se^2+res_diff$se^2)
        lq <- qnorm(alpha/2, mean=val, sd=val.se)
        uq <- qnorm(1-alpha/2, mean=val, sd=val.se)
        row <- data.frame(data = varsel$d_test$type, size=k-1, delta=delta, statistic=stat,
                          value=val, lq=lq, uq=uq, se=val.se)
      } else {
        # normal case
        res <- get_stat(summ$mu, summ$lppd, varsel$d_test, varsel$family_kl, stat,
                        mu.bs=mu.bs, lppd.bs=lppd.bs, weights=summ$w, alpha=alpha)
        row <- data.frame(data = varsel$d_test$type, size=k-1, delta=delta, statistic=stat,
                          value=res$value, lq=res$lq, uq=res$uq, se=res$se)
      }
      stat_tab <- rbind(stat_tab, row)
    }
  }

  stat_tab
}

get_stat <- function(mu, lppd, d_test, family, stat, mu.bs=NULL, lppd.bs=NULL,
                     weights=NULL, alpha=0.1, seed=1208499, B=2000) {
  #
  # Calculates given statistic stat with standard error and confidence bounds.
  # mu.bs and lppd.bs are the pointwise mu and lppd for another model that is
  # used as a baseline for computing the difference in the given statistic,
  # for example the relative elpd. If these arguments are not given (NULL) then
  # the actual (non-relative) value is computed.
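  #
  # A usage sketch (not run; the argument objects are illustrative):
  #   get_stat(mu_sub, lppd_sub, d_test, family, 'mlpd',
  #            mu.bs = mu_ref, lppd.bs = lppd_ref, alpha = 0.05)
  # estimates the mlpd of a submodel relative to the reference model together
  # with a 95% normal-approximation interval.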
n <- length(mu) if (stat %in% c('mlpd','elpd')) n_notna <- sum(!is.na(lppd)) else n_notna <- sum(!is.na(mu)) if (is.null(weights)) # set default weights if not given weights <- rep(1/n_notna, n) # ensure the weights sum to n_notna weights <- n_notna*weights/sum(weights) if (stat == 'mlpd') { if (!is.null(lppd.bs)) { value <- mean((lppd-lppd.bs)*weights, na.rm=T) value.se <- weighted.sd(lppd-lppd.bs, weights, na.rm=T) / sqrt(n_notna) } else { value <- mean(lppd*weights, na.rm=T) value.se <- weighted.sd(lppd, weights, na.rm=T) / sqrt(n_notna) } } else if (stat == 'elpd') { if (!is.null(lppd.bs)) { value <- sum((lppd-lppd.bs)*weights, na.rm=T) value.se <- weighted.sd(lppd-lppd.bs, weights, na.rm=T) / sqrt(n_notna) * n_notna } else { value <- sum(lppd*weights, na.rm=T) value.se <- weighted.sd(lppd, weights, na.rm=T) / sqrt(n_notna) * n_notna } } else if (stat == 'mse') { y <- d_test$y if (!is.null(mu.bs)) { value <- mean(weights*((mu-y)^2 - (mu.bs-y)^2), na.rm=T) value.se <- weighted.sd((mu-y)^2 - (mu.bs-y)^2, weights, na.rm=T) / sqrt(n_notna) } else { value <- mean(weights*(mu-y)^2, na.rm=T) value.se <- weighted.sd((mu-y)^2, weights, na.rm=T) / sqrt(n_notna) } } else if (stat == 'rmse') { y <- d_test$y if (!is.null(mu.bs)) { mu.bs[is.na(mu)] <- NA # make sure the relative rmse is computed using only those points for which mu[is.na(mu.bs)] <- NA # both mu and mu.bs are non-NA value <- sqrt(mean(weights*(mu-y)^2, na.rm=T)) - sqrt(mean(weights*(mu.bs-y)^2, na.rm=T)) value.bootstrap1 <- bootstrap((mu-y)^2, function(resid2) sqrt(mean(weights*resid2, na.rm=T)), b=B, seed=seed) value.bootstrap2 <- bootstrap((mu.bs-y)^2, function(resid2) sqrt(mean(weights*resid2, na.rm=T)), b=B, seed=seed) value.se <- sd(value.bootstrap1-value.bootstrap2) } else { value <- sqrt(mean(weights*(mu-y)^2, na.rm=T)) value.bootstrap <- bootstrap((mu-y)^2, function(resid2) sqrt(mean(weights*resid2, na.rm=T)), b=B, seed=seed) value.se <- sd(value.bootstrap) } } else if (stat == 'acc' || stat == 'pctcorr') { y <- d_test$y if (!is.null(mu.bs)) { value <- mean( weights*((round(mu)==y) - (round(mu.bs)==y)), na.rm=T ) value.se <- weighted.sd((round(mu)==y) - (round(mu.bs)==y), weights, na.rm=T) / sqrt(n_notna) } else { value <- mean(weights*(round(mu) == y), na.rm=T) value.se <- weighted.sd(round(mu) == y, weights, na.rm=T) / sqrt(n_notna) } } else if (stat == 'auc') { y <- d_test$y auc.data <- cbind(y, mu, weights) if (!is.null(mu.bs)) { mu.bs[is.na(mu)] <- NA # compute the relative auc using only those points mu[is.na(mu.bs)] <- NA # for which both mu and mu.bs are non-NA auc.data.bs <- cbind(y, mu.bs, weights) value <- auc(auc.data) - auc(auc.data.bs) value.bootstrap1 <- bootstrap(auc.data, auc, b=B, seed=seed) value.bootstrap2 <- bootstrap(auc.data.bs, auc, b=B, seed=seed) value.se <- sd(value.bootstrap1 - value.bootstrap2, na.rm=TRUE) } else { value <- auc(auc.data) value.bootstrap <- bootstrap(auc.data, auc, b=B, seed=seed) value.se <- sd(value.bootstrap, na.rm=TRUE) } } lq <- qnorm(alpha/2, mean=value, sd=value.se) uq <- qnorm(1-alpha/2, mean=value, sd=value.se) return(list(value=value, se=value.se, lq=lq, uq=uq)) } .is_util <- function(stat) { # a simple function to determine whether a given statistic (string) is # a utility (we want to maximize) or loss (we want to minimize) # by the time we get here, stat should have already been validated if (stat %in% c('rmse','mse')) return(F) else return(T) } .get_nfeat_baseline <- function(object, baseline, stat) { # get model size that is used as a baseline in comparisons. 
  # baseline is one of 'best' or 'ref', stat is the statistic according to which
  # the selection is done
  if (baseline == 'best') {
    # find number of features that maximizes the utility (or minimizes the loss)
    tab <- .tabulate_stats(object, stat)
    stats_table <- subset(tab, tab$size != Inf)
    # tab <- .tabulate_stats(object)
    # stats_table <- subset(tab, tab$delta == F & tab$statistic == stat & tab$size != Inf)
    optfun <- ifelse(.is_util(stat), which.max, which.min)
    nfeat_baseline <- stats_table$size[optfun(stats_table$value)]
  } else {
    # use reference model
    nfeat_baseline <- Inf
  }
  return(nfeat_baseline)
}
projpred/NEWS.md0000644000176200001440000000467313531312077013211 0ustar liggesusers
# News

## projpred 1.1.4

Better validation of function arguments.

## projpred 1.1.3

Added print methods for vsel and cvsel objects. Added AUC statistics for the binomial family. A few additional minor patches.

## projpred 1.1.2

Removed the dependency on the ```rngtools``` package.

## projpred 1.1.1

This version contains only a few patches, no new user-facing features.

## projpred 1.1.0

### New features

* Added support for ```brms``` models.

### Bug fixes

* The program crashed with ```rstanarm``` models fitted with syntax like ```stan_glm(log(y) ~ log(x), ...)```, that is, it did not allow a transformation of ```y```.

## projpred 1.0.0

### New features and improvements

* Changed the internals so that all fit objects (such as rstanarm fits) are now converted to ```refmodel``` objects using the generic ```get_refmodel``` function, and all the functions use only this object. This makes it much easier to use projpred with other reference models by writing a new ```get_refmodel``` function for them. The syntax is now changed so that ```varsel``` and ```cv_varsel``` both return an object that always has a similar structure, and the reference model is stored in this object.
* Added more examples to the vignette.
* Added the possibility to change the baseline in ```varsel_plot/varsel_stats```. Now it is possible to compare also to the best submodel found, not only to the reference model.
* Bug fix: RMSE was previously computed incorrectly; this is now fixed.
* Small changes: ```nloo = n``` by default in ```cv_varsel```. ```regul=1e-4``` now by default in all functions.

## projpred 0.9.0

### New features and improvements

* Added the ```relax``` argument for the main functions (```varsel```, ```cv_varsel```, ```project``` and the prediction functions). Now it is possible to make predictions also with those parameter estimates that were computed during the L1-penalized search. This change also allows the user to compute the Lasso solution by providing the observed data as the 'reference fit' for ```init_refmodel```. An example will be added to the vignette.

### Bug fixes

* The projection with a nonzero regularization parameter value did not produce an exactly correct result, although the difference from the correct result was often so small that the user would not see it. Fixed this.

## projpred 0.8.0 and earlier

Until this version, we did not keep a record of the changes between versions. We started doing so from version 0.9.0 onwards.
projpred/MD50000644000176200001440000000536713614525572012434 0ustar liggesusers33b8b130147c52434afa53a7fc2a6ff7 *DESCRIPTION 0cd06af4f060beb975d30e955842c1d0 *NAMESPACE 045a9efa3538820316e53e97aa98ac9a *NEWS.md 4c27668f69d985b1d632647921616786 *R/RcppExports.R 5cfe7efe1b88ad3c8e424b5b15ee95b8 *R/cv_varsel.R 5b26ac302967a6ec9096c2018f46c868 *R/data.R be4e252d89e7220ef86d42650b2d937d *R/families.R fb16e74e8b597dc11c43142d13a252b8 *R/glmfun.R 1fc27afef4d25f9b19fcfa7cfcc32124 *R/kl_helpers.R 307e03e6b4e70372c2dc14a0548cd1d2 *R/methods.R c33968bdcd67da802e7f9a5e1263d771 *R/misc.R 0f589f7dfefb0dce44c95569e2d29246 *R/project.R b7f55dbfc6aede931abd7ba0dd530149 *R/projfun.R 78796abe25c8551e4103aa82da89118f *R/projpred-package.R 134f6e377e6d233e54182dd84c26e330 *R/refmodel.R 83bb2f1587c8a86c2ae55b6cd87e988d *R/search.R 198391b1da05dbe56947a9e8a35f4001 *R/summary_funs.R ca29c9ad8f5c1d9aa77b27025718e7b5 *R/varsel.R 45d5b4d5e3937d3c419ca1adeb372ebf *data/df_binom.rda 8674bc6a5094f87786bcc0ea263cdb27 *data/df_gaussian.rda aebedcc5fe4fc707275c40e3513a04a5 *data/mesquite.rda bf1b2e4ebd759c9bd1aa95740412367f *man/cv-indices.Rd 23d684195f789b5123e49dee8d61a96a *man/cv_varsel.Rd 9cec48782781b1299ba39e31bcd17e59 *man/df_binom.Rd 4258af1e967d6b85de608de48cc15fd5 *man/df_gaussian.Rd 5c12986d8dbd4e143459c83e7587c987 *man/get-refmodel.Rd 97ca767b05d42255a9ce47e39dde3eb1 *man/init_refmodel.Rd 13bafdd470c193ff6d32a0f93e4d904e *man/mesquite.Rd 1797df0e51928fe4e320baa538cdfb07 *man/predict.refmodel.Rd cbc5112b9adb4ee94edd5633da4ac57f *man/print-vsel.Rd b8efbdb3036900a23d4f3430d7e10881 *man/proj-pred.Rd 16d5161f340c64bc397a9e85509ad86e *man/project.Rd 3534282c20b273a7f6ee2c58f1cc1b11 *man/projpred.Rd 8a3a426d37359437379f7c9d161cfdb9 *man/suggest_size.Rd e57ba4f0e04b3bb2ab6c082f08763814 *man/varsel-statistics.Rd ce22f955ec2bbf64042386019cb86b88 *man/varsel.Rd 2a6f9e9e044a78154d3cfda5936d6f48 *src/Makevars 8d46d69896a1f1d31ed76035a0e49d67 *src/Makevars.win c3fa06b84b1b3db1cf697b9b6feeb72a *src/RcppExports.cpp b1f8ce46d56e5d5156d04258e264116e *src/glmfun.cpp 6b94d036531d78970a0719a4a139fdcc *tests/testthat.R a93ca41c3a701a2051edc8d178721fd2 *tests/testthat/helpers/SW.R 6842a4cffdd2109064c769d920a13090 *tests/testthat/test_as_matrix.R a511f7779d17cc0b8e774f9cb6fd3952 *tests/testthat/test_cvindices.R d2d11b8c34eabed82153f52d371759ae *tests/testthat/test_datafit.R 2cc303a2175951f88a278218e23a18ab *tests/testthat/test_glm_elnet.R a41ad00ab6af57809df462c69cfb0161 *tests/testthat/test_glm_ridge.R 916e4f13c8c8dc119b15ed5e91c13a20 *tests/testthat/test_misc.R 57c76dc59c67065f6f15caa904555f0d *tests/testthat/test_proj_pred.R 821d5d3283d687b88dea5600ea19f3b3 *tests/testthat/test_project.R 0d44519e3b1bb048d362fb96b2ac3556 *tests/testthat/test_refmodel.R ae28e462cdc0e7ea5c4adca3c6bc6c1e *tests/testthat/test_syntax.R dbc7818cd44344df985d30390cd701f3 *tests/testthat/test_varsel.R