cba/0000755000175100001440000000000014344060573011023 5ustar hornikuserscba/NAMESPACE0000644000175100001440000000217113037706735012250 0ustar hornikusersimport("grid") import("proxy") export(proximus, rockCluster, rockLink, ccfkms, sdists,sdists.trace,sdists.center,sdists.center.align, cluster.dist, gknn, as.dummy, lmplot,clmplot, rlbmat, order.length,order.optimal,order.greedy, order.dist,order.matrix,order.data.frame, stress,stress.dist, implot, circleplot.dist) S3method("as.dummy",logical) S3method("as.dummy",integer) S3method("as.dummy",factor) S3method("as.dummy",matrix) S3method("as.dummy",list) S3method("as.dummy","data.frame") S3method(predict,"rock") S3method(fitted,"rock") S3method(print,rock) S3method(fitted,proximus) S3method(print,proximus) S3method(summary,proximus) S3method(print,"summary.proximus") S3method(predict,ccfkms) S3method(cut,ordered) S3method(plot,"sdists.graph") useDynLib("cba", .registration = TRUE) ## 2015/7 importFrom("methods", "as") importFrom("stats", "hclust", "runif") importFrom("graphics", "plot", "lines", "text", "segments", "axis", "image.default") importFrom("grDevices", "heat.colors", "gray.colors") cba/data/0000755000175100001440000000000012724613221011726 5ustar hornikuserscba/data/Votes.rda0000644000175100001440000000464514344040324013525 0ustar hornikusersBZh91AY&SYWQ;cQ9HTM-g$d<`f :}UT̼Mb}sd"yqz #)r{\IVZboU ̯'E{W0kKC Nb&ҟթbٗTNJ6Q]w2͠Ce EGӴAE{̤c ]-Og[M,tB@QN~Pu)݇t$n8o!UrEO'0ykåe PMA!:QKr0X#HK83Ձ`Mjm|=~?\n;OZ7\3qp=@y'#D7b$$mj .vs@z>'I(NV;|PfOJ<1(*2N-WpcKޢᏗA貝W,Ua,`+&vMϦVuW[()qX#X_{FIJŵW`Z_HQ ck{7mP!w^L0֖IMgD,Gy(Lcf6rVuCcӫLX(kdoXw.W5m^f߲uDh;vRs ?ln|yGՀxhi6'lyv2PK.|WLJ1< DhS+.`rZY 3f &h`kq\D9!I}t15J;R DT+xϱ0=,ޮA|OF찆yƣtfkH҉sF %XfTzx:dyUW[o2|Q`v+>Ck*mDxf-GNm6px)DI#fC9ddň.|p&P0bU$Dxr~հ)9lyJ=Mf`goNedGY}@THKnjSvV󌷗4R?-zcX}P2f02` ]lq K]-vB2taUjv:3wA[}1LrR"8+So5ֲxu5L~qBףQb=ep\ N¤Zbra~f mdKuW{*S35%~Q@`Û i"OxM>+ n[(g+3|yZъMUΌ1(1iS[NoTyQ8$m9x)깔u73[X"2s4GLlibm{F}*R! 7jg?7,eZB!G#M1Mq#U;ZbJ:Ig*@\%l\Y͞vG!pNT7V F]2X#_ `}ش4ŻzO9fY0VZ۶uq'RO~* 6=bY],[k]q!PkO˝@ +:|QL?莙`:Nd~6쀵JwM !._%:\%WTH灻'1%ø_ >WSVӺdxV/Q!oq{:rUYqw6sNvX#ڙ/XIc}]Wor߅7P~BqO'&< ,30QpIx23:moU0y<jSM $zEȖy6ڴ="ΧL6f˒cċp]~_cU,@y HM0P(-ʽcJ4OGܜ()a A.PrT&G+觗-lfzgx29/Vxĝ3,BbqW{öhRAܸa1 ,袃ʜ)upipFlj=&6MWXN^aR̂^t9"JnX*Le28bxN3戅9Rx>deW}9X3g &á o[{D[XrDkw?zI7*ƌLK%zBVخ%J{SL$>3ς,sVέHLYiM!߳kK=>ت*:O 5).*04QC<BmVX`]{ 0vPw. Yzse`Lnqg,-W^lԺ[tL1j v */6?Z!%V@,'=9WWj0uI\{di>L8B|Nڢ~Qe,LӠ$\ڕS}^L85"TV gnfj0Ƚ^2IJUPZEu%~[;*x}I\6 ;B9wyD_6$q(iQ_c1lǺe)8NhÁ1D|\IS7u=\ּ͔xy2XT>0`{lf.$B6[{8io%Cͨ8AY65Ђ=VswkX\H3f'-i4L w.A_Ζ`8:|MaՖtW' FBqMEj  YW*GpQ/Н=Q;JŠֆG@!%wt,ϘZN# Xd2氟i2pbcJw޴6 r2x.0Tw Snx!-6Z"ʋ1P}EOr&r(}`f XxuۥWv;c9>v?lx,5Q7w0lYA0."O#| TH+a3uTRBcD)JJ6IiLJLG;T8ʵ :j/ fx(f\ުG]1\' XdbGJ̈bCGPqfXNE(00p=jN]G=~WnqURcC}xE~D衊͐B[g "e]f鎌 㜲j#:v|]1CUBwH0E{VDBL>CVu;B*0%uXH e;荌J|I/G#Ձ͆RF4X`7^Wo'NT(cqZ߿!J޶i{3vWX2a..O+NRHxxc$ lJqݣ7Nr:</4;GL*u_k:hs"Y㰢 [>!|sbi|2%y W$"_Pca3PM1sNwk5 k<rkC\~rYo`:F^ocTF<X*;{#?+f鰒SRBϥ,j.jJk.@ʊCқLfF. *v}UՁsgRE4&zR|&]@E~1K|R?B8eRgdqgRd:9^W%hn Q($On~QH׬2bÂȞ71³wphS7!%CdBJ?^>_U.zU{ҬR[A VBhI}\jR2\LbPIޯg<`B:Og޼hfaB;Fƾ'!J#_W` dL0C]VKVbw} _g/m(=C_CqQh>,s]#S&]+iEwHǒֺ{&#;$-bRqeJ&(bc*GoeAEj\*Ndyqmu#a=T*ԙW=I凤P/)NJh`JNC5@,y|E@rv2Z%W\dZUcჯUxS3,^|c ,-mtX`>›sH%TmվفFu@7 5LsMQL;@wX c7nK` iYAJɽm&B4][ f@t&+صm+tdDa窱a{D"W_A">9~o'q!6s͊bh鮁X׌x;SK`9[q;ExY7%ܔM`EWv]ZL؊C ͤLIZejpÖm?6:]#n)ͻ9!v%1~09>>x$l6ޟNG}֋%cR(Uӿ`q z9pOk_ wUxHyp-s]b;O)/RTByp7ޞpBZjp;ai0*A8ddR܁34~ \3S=/m .dhвo'hތ?S~b6/<:)7leVAE+&@Ǯ4\ j$8U)q\U,1#6 îhAF67SA7}/d-S9їg1KD hgF4'I6$6i1%14,]0RC=ВJbGXVYfooGfҁt5S8Ll~+޷%۶(K,> 0Rޤ1_E?p;s%Pz5蠿x;_A.=5/:X\'&ߋF0f $ȱy S9U%IQmLyGʺP2s%z㜼)"?JٵoMf쪃yuU6! {YG5VG'B}ZTШ#$in@b>k-`U{Ys9,g2,RURfyc!u&Hf4x&x(хwc9gN$"Vg~3^(8\d.V(y_3$5]d 1UMdlO7shƋg-z#zĆ:fګ%riU `dzX ӇBp%tm8-[4ڡ%#:ӎCt8b^"@nԂhJYOV񔁃7ERTwOC ¯ߛ^O' 'y+mYL 'VQDA-G$ zzmIzTŌSENET1~uL!=CBpBgEXM2i]9Nh UsԌ,Z,} X'Ѯ^5=%,X XI~7O`*GtגJZCk<ᅣtY gOiX64D m'u'W9q;|3#]D?ݰbZu+<&zlmL=#GG6&< /%݈I&[v >?  #+5__f&0{C`CYQ#B8̺~K ,HMfp~Dbj;h@17~;{XDsf6MO%E~(W˪a{N^+nT-Vs$ U p? ϭn`$xٖ+e޺M[?V x)QvTJDiH$2]A l\_=rLXp]^o 1*K WkêO#dǝHJ FZFHkźhFBJ,f]/p3/HA i NW#u%| 0M*b) *p1Ww גʾpJTIeU,#7ϽWpls:'\"&Wn|LpVnEH8=>wlF[ Azr |VnG.3߳LWa8xFAO'i=[z_ 5<XA-j.("݌dDXVTwͿQm '*㢔ryE\M2;[lx5/4?s_[WK`(i$HL2yBFI^!;[! K.eg;m<±*{1mxNqv%x'~>&ݸڐwuvϋ|j&A懑`=IRGT?Qd_Ȼ&#kFRSU3ShԮ N!Z '] nTQ 3I6(SSQErf2b'mA[Ep| F7} K5Z/^GK13:g|1J !3 ~0: Zjr)Ё]ٴ tzo2ynvFԤ>oq-eA@ЈDDQޝ Κ|shn60ҌW(G'ԁ2Խ]9N{4V֯=^硤ݘl5>gNvk'=u7@:s4L2J6.B$ GJ s`eoEkE Z6 ю ?[Y_r-(:hݠ(,*TpT=HE xzOGboBDg6Ma-ʍ ^0tZic7úUchz. ìL@sĨCϋm䝯aN̈́Eܢ6J-h(gߥHkդ*R=&߄{/>zqs1F(.r=nʼn㉙~m/*8 .e:&LcN{ѐl.^ǃZ19.ezdCy Q"&lN }K쉬0tDdYD|ҴֺT TG3pT'z5dLD 9DGZh'$&\ 'lS!WN&+WQ.LvD$&dqzP;_Aq~byrhr'b5 QFR&**! 5rofz"m}>:NVAĜLo~4ͽJ a+O:Y_`}6u ""ٜ9Z}ItNY*{>Mp7$"@dr &>9Xs2! Hyb^5^c4d Ԟ:{|xAe?K_Ԏn Yo}bM3r{|{iNs0㩌CRҳ|65PHq%Fy%wÿul`"=68Fz/Xq^<= `3mexfȓWRy~>( ba{.$I7ZgPeЏ 0V}RP>KΥs(Z$ofaRӮLY6|@ WՌ+ču#&` rՌs?^oz\%G}NUڂ=s9HeThymn4}j7tPCډi Phyɾ`LptFIH6` iG qvπ.P=aM*:l{^JOgHs$zeTܲRuRAP:'G ]1ć-DCg$ B|퐯k)#yuF?Y=+!Jl$eXcX@)~jڧXoxC(1.IKɉzC l:}Gm8qD~)%_3wv]GE[.'}n~z* /ޢ?y>D0!W-CG 7A^^i$e2j])9ŖzW,`7 9?)@6ig+%+` RBGNK ytfO1|iN9P|eA,mo3wǾ|Y;SWSs|Lk/i?v"AMEI  lZmFv9mO:4`h&˵e$+0y2-dj#zs%jbswgoH8@4mhO ߁p8̻k5^#Bxg8TkwjS+||K“vdA))fV^o.h.mmSҫSrw o$F > ?Y4U|>>"W!fqRƼ0P=KҍD21"}I- YVB6>mZ 2DD#ve,c陼z׮m4ek85^ik_O^ӜݭR*1u ? tAN3 t{:h!6){ʭ eU8gǭ6*`|cED m66I_4}(F@ Op05 : ,b|(W:J06#tD݈U\IƓ.5]lT~T-#|ǰŜt6Px;VԽs i yDߞɟ2[P{ tWXLBqᾒ"TmIe?o0c <>鶋j~r$ K0C SwRлiL2Tg3gӲʌjloaBvWeL}'*x铿]3YyH< 0Vt´4>AQ@N(4F0O(#bf["{@|pR< ZP(v[Z;l $۸fF!2.6 mߊQ#يT~[YclG"BxWQ&WIŦ>akEco$g'!e_R$6J"c OCw@BZZ,Wlc 4/Ø(DDa"rRODBe𶠞NL oLeGwL$|inY.M6#Bt #Kk,8#W }/ pYά5y!HE$ u5ٺOGT괽 _SQTF\])=pH }~i Dݪ fX宩m'yfvȺ%Ѳk%a!LLF[yz>H)N#hTi*.T6sFt`t$0:T#I  vwHa&D&YW-l ڛ\wgnԕ+@E XGr88 4c', >MH[M'cpW?Quiޭ>u`NqN&V족K\(1P, J+X DaD{#39yMKY):=_IX޽ Ǖ sT9K͘,wi28k%"c$٠F۴0GEp)UØz "Z557P.+ATbZ8֓i.w[cxeſ$mH? ĺ>~q25sIjku *>r'hER˿]!ȏE N 7hN#ʼn *Q*YQg.뀍Ae:UTJV`PCg(c #<v (<\ѹfX)1]/{a>szWLH'\E]٠1BZHaoC[3=p;NO@>l`Zqfg5.EqRC=l ֕ dD%hT/(SV>2(A(3"3̓#Y tV;I:#Njj)~i=1!.y=0:EaId>^FpFR^lcn9n\]v;?%˳ $qne u>Uc[cX@zEJm*mR@R77(NT(biYZ8{7{SRg^ *4xe"Zჾ <dO;h t UJg{F˞JcEZҫ+/ N&6[sgp-`mmhxD(Pb=)/VB|rDQF.7'kYR5ZZ8in޿]SgY푏ȇ\pMs|{w}lKIz$lae,+Cy-$/,/ÑG&NN(#%ͩ0+y+~LWl re$ֹccVLvl5̻*y?ΥWxĭ~fh8ζ{@IKVkA!4& zፊ%¶&/YkȨnxf4$ƚb\N'_4^Uګ1 H|Ǹ`qNEW? ԟ.!- ĉ\aFaż4hF)g*>Hkbg0d\ijd"Ѝ&U33AQI}ڍ߅;ƯN8 M"rp!O -UuwPG>"#2ڨ0('(&SxDjϿ~WG2d8j4km'څ1i+Imئ&u͏S/dLj2b[ G/t 4'Gj*8qR)ԏd;Z^x taVb4bN 4|+:ҀמWg.ȹoiRn(zY;8q+f >DY ('rGށ;Y< T*>s67yغd./b\"*e=t/LpAQl+8&~Vj s1uɒ@#lm =gWϊKB;o^MT|>V7ŋ5Z1{Dƅfäh]Zw rm(jgr76?mSɳ{ pd>I0<ud8REdCsb ՘< [y6bWuy⍇$%l4>⮃VFfX:7٦~ܧ{jGDÓCk(!q]犤_Mf]ʑnqvGJ񬧖`Ա:>>$iCiӮ0Ħx#Xu23(j`CV[從mBz=1t&%)^G;]нR2"K 2I pR k# *6bOЯev_$5]jB^$e}!LĬgF< Wx> IDeH߸̷B|قla[C@>pGNadCݕ)P,QLX`jR&NDj~O.w揓Մm $~2I< bW[(io-ii46?w~CK cܹ +YT͌Knr VM;ܟ'Bo#:qkNmi9D2Z Ε36l]FV6@ҘgP 9 vƞ0T؅Sy^S`]՟'(bke C!+#5\vQ/%J\RPUpG>׎vW؉’3f0R%<ޮۆ⽏Bx0K~K{R_}p:YrZ\p_lbX싿u[b_Brt˼Rz\w&~YMh|_-G|u=/ vמ ڼd?Xa4qcFx~+%Ԃ*[Q$VWsnH#,*Fi?s%sҽ,בur,~9{N@0+Ķ2lnQw.^P~S;vQ2boG-,K7 ce'қ 1]%q)\9|uutMevm{Jۅ Dޙ f M=`7H:Lkmؙ9Ws^~w(ܪ"~jBk09ϔς P:AF]Hp8׍*}Mב7\տ. Ԓʧ&- HAw5s5; gaiM^SwqT:E+oelP!~Vm7 99}\ͪyB򛽣H48]s\jxʽU7[vRxiZ%8*$)]ȾD>zB`d+hLVJ79VPߤ0%g|ɑZFOWDB[Yz@"lj75&8]MS' . Frほ{6\Hja{0YA/.9=X~c|#7:C U\9tg_0  P˜ R 1];8}c5]l:%|1ۛ2yIR]0xTޕ@(|-~2gNIp <R+j]뙸2w -Y9I'͌z̚O,Z(e\o¡<p.QrBVLE%u!wJu`KHU2 ~S.# 9*F͟/\^8 A>{=&ПEBmP 2Qf/ 4g,e㶖+G!R_z]30$n9=S>j'ϑyMSR<'J@/'Dǂ g(٬^-˧yNeSAZЀ`$?wIDR! .Om]aNW/1鈙6u{!3o:@s-&/?d?R+)#l3r+enTNXHtk B X8Td]Zg  Ǟ" o͚ȓ Q!H&iD6{@s"v_ b(}w,%4csnh}3kaCͽyТHJJol(-#ǀ~ 4o8t0[{ooQ:я 1,OȍaK&ìKPIc9ܡwZ 98*zAVQ]5Ckܠ6Lh,|?{VU9.UH_ޟZq0ks? }Ok[a;/ϒQR`7[F%bcL6)4uΖ(0PCmυx3zk,ΰ"so<,įG=-X鹼Qk"GU׳z;I) AⶎT|=:?Ա昛DMAr[Ե#-L: ĝM+R,3\|t t*hj4No.o,{)7stXĉȂ*gZ}Շ=L ap*@+U[X~* b^ ;`jVP M6:I1w@trbcn&dnǬֿ\C%A=v@Fr&&D`B"!4[bXr ? F"ٖ*F,2+k;t饵lsuԃ,mnӴ%+ DIgc޶ϭ~V\fv`EzC0>4\Gyt1eIz~{^ebpӈYopҖf&>#ldd@0e0A$%bB0뢐6]@!)TG{0T|awƿ^s)$=uqx,sfֽqc*xhv@neBYʩ2\`TC>~v2*˔NIn{:c~y)L%x:N0XL;7.Z0,V#=c&SЭƑ]o",B'%W9۷:6[D:9>֒-uD/g 6 v0m3ܑ&/B*! ǚ>4Ct`ZI/mmy[% ŻP/f=T%Hz1Ili6C~5#xbG6ϯ>n =8o!jMW(~U6*؄j3Dl,N.j;",{Q ĪBd~wv"VK,c d6 am-e=ɕ6Ny^4_p^Lp&gQ~L>>bv`k \X*@? LOJ$ Ncd)+)y H~lsAIid!/%X-VfROuq69ܪtG=m$c ,,j1n_vfI;REuh: <5^C9MWaB2Ao#2UriGvQY7tlkS9o|Z^9_dS R4<V=d3D:"YUZYSB@͠<< m\Y=їlG˕T0}ڞ?ˎrw2,C.-NTg@zt1*c|*|L~=\$Xwl({p!Rl*nLQ7 }{o'8}^cm+T+!z'Jj|ZT u=X.;5گUf׀:sUf~"u(fN(V+ea 2ߍ<&#hz:+#멜8m'~25!¹\*? ų2えnȰdbMn\ef kM3t">*ٸC:+ 4ekp}?y)R|_pyIz& NQWF⭔M?&96YkOvweׯj%]3'thj Dze#*tB|$Pʃ6xIԶD==c:U&,5 v!فe?QpC {(7Zw#O# $vb/st  JܬuLjHX >Ve i(񊪻')珵$ymlHݠ1l^x F 4|eC"/p|J?v/\< >eepO(W3c=W^r.Rj <#UI\\-z}(A 1mW52Bh,g>=^{+n55ʮTU>Ss yW`%B)KTtQUUҶS:c5:+o\+"6'#+ CO1֌WRSFЉx^n iE0:7l>s5x|6XVʟGRkÍʕvly ~n?0v_^06@UY3(xS*3wC9l4i?K,G{avhr|0yDG ̜Fѭ b$Րk>tMWշOe#K4 Q0Sf!Ņ6$c\SE`4Ǟ~@$#IϦp5WPt ;7[+vdI){;N&#+ ӆ( U\S0(ijᔞ"J)rA+Z!.N3.cWUr @eyZ?%lܹӗVfyYtň֮`3L?62y}!C|$ԫFP]Y)6YFGԜt-MB1YMC!6[1 q2|ʩAie%ry<7F&%3t)o9v&WXO5x]Cu=uaQ/ N[0uѻ% Ros'$ π`a0Wu^ZS0v"mBYIo,so|;`]2. 픯٭liSIa Y _N@07Hq5,-))X2-f9/Z:$Ʋnm䉿M334`,F}|_kڣV—F1EC` lens}#l F&(;wcO2mA&?YO;6Lz[5$>mB!DlX;}Va;!G}.kڞA{S2~e2؀b#N7\u3-5Cq#[xॳ}tì([Q=O0pZLNd~Ը;':U}xLs "ߋh3-'ƴn2EC J> #`kr(q'noW0w5=,qYzMt¶3`dPIVp(8 4)2xv[CŻznt%޴9cj5IKor6:9{)ۙ7NE&soGREڭ1{#nQ=kȮʈQfrnP}g XUkc!^q|U@ȰlI@CbBΜJ:%e!8?9fHej eirFuǟ'Eʝ"n(C#!SĴ UCo ȧ_ Սq^MCD6˂?B&.4*hxbOM<*%>Jy@pBwwz QJǰ৯}l5cO(_q%[cm b5|jyT~hO$ڝU8"!ǭtz@l cK٧'6n}M0ș%V 7aI>Ïv2 ӓ|Z6akPj0}r~D 㹶2A@5qs[kts{,<ȯ)ͬdb4$ԵlyCLU֜~/)0t -+/uG:~| [>w)%@f~f >>/+__- V(vGkaKM͔txfɻzb x,栈gZIFxғ03eg-ڼvwئp1cS`rI9lnq͢X a)_YX-l馛h 0|ZuTem\SLjrNlξv\{ߟ|-q2WvTtFr%GgLjB`دG(N0ewH.֌ h{0Ky..Җ1jSn`I)Dmu>͊ %y> r=Xtt"17b/7vG&onRowpbξ,T\Ўr$,h1hIޙ<K4k \ԄˬC/N`a~OtJƂ]H$yv?@]WP]݁۲m0O.wIPw9-t*1JƓT>TS7.-rMvʊtR@j r;&(m "/@>`T\[hnBiwL2Xc+o=fT6ä~> o"h[ g.A+YOAw*x3`}UW]WnCA/P?~ש|%Xq10|}{,7B&.b&q;䵲6Y8@n3cnB? ȁE!s\tot&# VyGK Zݙ/= lF8&M^xXRoLfZQkC,RނP42hSjp^hSKY]HM.8ܐ?v:W HU)d%]NIS0efh=PxIRPki_T[|a7eQ?IJKKCB ,b"KH7Wu(<1bq&B^:v`k 3F0IP7 H]} SK7rA7^*j ,*]L@ΨŸ':ZFOWfN}CxVIzs_iD:]s-괣q儇O34c*uk"XPX qt@n-ԃ/9>Dh+!u?pS'ATD$.V+ⳖcHI=Ź, tc'@V_`opF9L|< `~jLoXpNOV&„Fr4AҢQM nE5K0֫ԞGryIo g>$~+LnNf !b V.Rh`AՅ-bP7J[×C~.Z 1((kG9,p7ĥ&j[Ț}`گpxЬn35gWA'#p4ѕ/lf2JVXTk]wJqXn?ri%ZT`/ӣa&뱞e_+өގB89/[J{H5"l`ƙ!x`XX@"1Nɾ'me^LEȧeRr1YlK[q\GѶBR\lNE =鸪D籟|ʰZKp`Izi(]&;iMYӳ Ύ_hZhkoIPlV)\\ri7Z-m7Kb(jO5:#BzWηvU{]9D(:5(9.ܘ  ;ƿa^#!ZsceĄO242 \AI!I h FwBS[pԐi [0{wڱ-D;!)lyk{ӂ+a:1c**1ۏo<8zE}>%x D.x ՟V֭%(O\` HΌC`"U50%IQDN[Txx s^9+ pi]JѰ2 w/%s,%!Pn|X^y^}̎#crZ^ K;?eH,N`[ 2O_Qeǽ\ϫPΚ=:nq]+5F@\}Kzz I9a:)l;! n'%H[X췐jEX?_^۾368\!Ԁ-%r41$AS]%D>?K%\vfYa`&zRαQ(t<5~j] DE"E1vz 1Y cB15=[`6>j,-o V/" F2C3jDj!d"r6|f{I-R۷:}0[+œ$j#g[Š|ޞm|< 9s +ڜ٤\D]- ݏͩ1&dֱɲ&hyR]i2"ܠfxUsn:Q=k\`nQgnq,m~'wW_p,#ƣCs)^j45 +IH9m9e=5Ӌ8~m.~z9Sv~"[D*:( 筠V_U6kX cd0zxVpU2L;#TT"\HqJfiq^`=K7E -::y!09PTo=8@Ѫ)Nt=PW+Zx>\5 'VOk A-H>R[.U$tƮ ' ֚?H KGť`H)R^_so>fYV@5noԯO g#t)}-g#hqԐ3݀TUVT==k8p%T^4s=oiIշ% L:|*nCYO7!RR F^*yF֊SO4^]{CCC,ވ?,Z͈`~Iן #WdlO ! 4nFCԇoefFt܍Z`;Ll:aA3(:儿P#b E7LJo%K)G m=ע6բX"ư*ܲ%CG"r`X3P#EVoy,EpQ k?J$M* BdJi5.QsF^Y]sֹ*ۢiF|aEUAYD,{ ܭ~bIe3@&gSH 5ɛ"d gKVHߖ2VS앻\$0ߚ]5>نMe+[~>&摱Z|`VՓEP?5XRp&g"OA(bB ;zdT;h?)T3e%^6KɼTMJ&oKhdW͞6eN޶YN,_A1bA(V[vJs[5b;O׫XŲUCF+7C0h,OwvU^G>7 T=Xѷ2K_rJ]ceeY:@?,[?:P-ڗBbF2`6P!wN*Ƃ>x+/Z6" y"zӾ2BAL~KmDm8gRc߆G*wC ~zԶ?ZJwޫapt6Ey~zpj}ٹrf{Ps0[[kᠭ0=dSH5˿ قH~mQ`L")9}*@Dc."H'tR?"38պtW~K bcfp֍zb\N ~ʦ U+0s-jR ՞ vnK;f(ʒ҂Xs Qݤ}5뿋8rJSqq(-A>VPEE{ahp"_,BQ<&,L;o"کRQH9F{h蓤OnzgD#B妬$9 >Fr~p98TbLU(-]p* 2yQmq˪jfuWa<>ptWf"ēlm-|"N.8]Ih?>4-}FEHY \Cz-IFYq5Hf b"C`7.wOEӞe/_J0`Mf9h\%U=H^ ^ ir춠{RZ XQc{w%{-b룜;$XL̚[Q[6m! =[ol؉rVrkA% |Gr۳:[L>.P2ͳm#V @fPKCd@l/e#ZWO"`g?RU!e&OTc \7H7Dd G ppF۴w8bip˹C6GuXbDd}\ ZH2!V4CI LiVLCn7B@rtdC HY4b^$[U`"7Xn=,P yv^`JF{ %)Zr*7Ph3\ v*9vĪfLoo'= 9tXNl+ bV(ш2ؿa(銒gPX%&,PFAo[3»g>/hC,b#z,qdՅgX N>O !'nSa~6#l(nhhUj4]ΒѴtns zUxzJ=%ntlfWMJ-Lݻ/[7^j'>AjR`zN.*Į0k2GDX4O1__/M^:kqк9m*)\Y *y?J:cA4 6OlTx֧Z4ב,e`ﺬ'ɨ|ռQ1S.F&F4\D.G!!tN[+KʢٲQ<13Kk)k$ 2ЁCk$hOWҠ8Z`P#^Tc@ ycsE8o4 1k^o_z}pj-ƦwpMz3Y/LIQ <1bPdzގ;8=;V;:(S0j361QAtNv*ޣhҔ=~T IE8,E>m'}@E2?ٽD@P2S/#({FD5_kk4K 1A0,XZ cfdEZ?Nd#:ԓ@Sː)`>re2)E.kdB~-Yo/XlAn5X ȥ|j?H}Jre{l貙Iѱbv-+UvB#pŧ+s ;%H_gEMHM67&}0W4V*ơ[ x96^]nswT|B&s ZxT oOjPbգt h(Ë4Yd~D{ב F%{{UaV(@Q3qf*" XmZXs=b0yReJD[<u^ܤv&\ dX.q(BSpb7oit89{Z8_Hߐ6oU&Y ^*eCɉ*@,;uS%O`h_ކ&;-ϦעNFZ`sў_= ܶK4-Vψ0B[i&pm:\a챡O tO6ySWgrvOήT FV[3s/~ %G[\R`O7E<5,LAB%Pv7~+W(NHOS؃9=i*ELTV,2ox|_d /}DZu%>$4uCIi_~5 XZ]k7y9+${Yx#" &U")]5 MUӑC5 /(r8-AG&7ݸ r37:pIe( VrQ=f'bN͙+lQ \JrD .@U0U>D8o_M  a'u~GYatKT^dX9e*r{ v'L碭Bާgka =*_&Hzp  w( Gv_@k^bM2$e?HĎG\lTwFLtx"2*ۀ gZtW)64N{cSI^砦]+Q0:-2L}bB7+ Kh[va_P୿D㼖WC'lQm"}>Z탆ձDi@pыlTjj %vK-K/">0 YZcba/data/townships.rda0000644000175100001440000000063514344040324014456 0ustar hornikusers͔O0 ȧ"0k3 ,)22 6ha$t`BF$sƗKg>9!5(ٸq AD$AR g 8'Ys .J \\PZ@h"<"=}. Note that distances of value \code{NA} and \code{NaN} are ignored. This is not strictly correct but avoids computing \eqn{2^k} possible solutions if there are k \code{NA} values. The time complexity is \eqn{O(n^2)} with n the number of rows/columns. } \value{ A factor of cluster labels (indexed 1,2,\dots,k). } %\references{ % fixme %} \author{Christian Buchta} \note{ Fixme: can the time complexity be improved? } \seealso{ \code{\link{dist}} and \code{\link{sdists}} for distance computation.} \examples{ ## 3 clusters (1 = connected) x <- matrix(c(1,1,0,0,0,0, 1,1,0,0,0,0, 0,0,1,1,0,0, 0,0,1,1,0,0, 0,0,0,0,1,1, 0,0,0,0,1,1), ncol=6) c <- cluster.dist(as.dist(!x), beta = 0) # invert and note that 0 >= 0 c } \keyword{cluster} cba/man/clmplot.Rd0000644000175100001440000000534011633160276013540 0ustar hornikusers\name{clmplot} \alias{clmplot} \title{Plotting Logical Matrices} \description{ A wrapper function to \code{image} that produces a level plot with the option to color the rows (or columns) based on a clustering and/or classification of the data, and the option to reorder the rows and columns for better presentation. } \usage{ clmplot(x, col, col.bycol = FALSE, order=FALSE, dist.method = "binary", hclust.method = "average", axes = FALSE, xlab = "", ylab = "", ...) } \arguments{ \item{x}{an logical matrix.} \item{col}{an optional vector defining a coloring.} \item{col.bycol}{option to color by columns.} \item{order}{option to (pre)order the rows and columns.} \item{dist.method}{method to be used by \code{dist}.} \item{hclust.method}{method to be used by \code{hclust}.} \item{axes}{option to plot axes.} \item{xlab, ylab}{labels for the x and y axis.} \item{\dots}{further arguments to \code{image}.} } \details{ For dummy coded data the level \code{FALSE} is assumed to carry no information and is colored \code{white}. Thus, the level \code{TRUE} can be colored according to some classification of the rows (or columns) of the data matrix. If no color specification is supplied the default color \code{black} is used. If \code{col} is of type character it is assumed to contain color codes. Otherwise, it must be a factor and the levels are assigned colors from \code{\link{heat.colors}}. If \code{order} is \code{TRUE} the rows and columns are ordered by \code{hclust} where the distances are computed by \code{dist}. Note that an axis is only plotted if the number of elements (rows or columns) is less than 100. } \value{ A list with the following components: \item{rows}{the row order.} \item{cols}{the column order.} } \author{Christian Buchta} \seealso{ \code{\link{lmplot}} for plotting of logical data at reduced resolutions, \code{\link{heatmap}} for ordered plotting of real-valued data, and package \pkg{gclus} for ordering functions. } \examples{ x <- matrix(sample(c(FALSE,TRUE),100,rep=TRUE),ncol=10) clmplot(x, order=TRUE, axes=TRUE) clmplot(x, col=rep(c(1,2),each=5)) clmplot(x, col=rep(c("red","blue"),each=5)) clmplot(x, col=rep(c("red","blue"),each=5), col.bycol=TRUE) \dontrun{ ### continue example (see rockCluster) col <- Votes$Class # color by party levels(col) <- c("red","blue") op <- par(mfrow=c(1,2), pty="s") clmplot(x, order=TRUE, col=as.character(col), main="Parties") col <- rf$cl # color by cluster levels(col) <- c("blue","red","green", "black") # map NA to black clmplot(x, order=TRUE, col=as.character(col), main="Clusters") par(op) } } \keyword{cluster} \keyword{hplot} cba/man/image.Rd0000644000175100001440000000244711633160276013155 0ustar hornikusers\name{image} \alias{implot} \title{Matrix Image Plots} \description{ Implements a wrapper function to \code{image} for proper plotting of objects of class \code{matrix} and \code{dist}. } \usage{ implot(x, xlab = "", ylab = "", axes = FALSE, ticks = 10, las = 2, ...) } \arguments{ \item{x}{an object of class \code{matrix} or \code{dist}.} \item{xlab, ylab}{labels for the x and y axis.} \item{axes}{logical, indicating whether \code{dimnames(x)} should be drawn on the plot.} \item{ticks}{the number of tick-marks to use.} \item{las}{the axis style to use (see \code{par}).} \item{\dots}{further arguments to \code{image}.} } \details{ Plots an object of class \code{matrix} in its original row and column orientation. This means, in a plot the columns become the x-coordinates and the reversed rows the y-coordinates. If \code{x} is of class \code{dist} it is coerced to full-storage \code{matrix} representation. } \value{ Returns the transformed \code{x} \emph{invisibly}. } \author{Christian Buchta} \seealso{ \code{\link{image}} and \code{\link{par}} for details.} \examples{ x <- matrix(sample(c(FALSE, TRUE),100,rep=TRUE),ncol=10, dimnames=list(1:10,LETTERS[1:10])) implot(x) implot(x, col=c("white","black"), axes = TRUE) } \keyword{cluster} \keyword{hplot} cba/man/lmplot.Rd0000644000175100001440000000271311633160276013376 0ustar hornikusers\name{lmplot} \alias{lmplot} \title{Plotting Logical Matrices} \description{ Implements a wrapper function to \code{image} that produces a black and white or gray-scale plot of a logical matrix. } \usage{ lmplot(x, block.size = 1, gray = FALSE, xlab = "", ylab = "", axes = FALSE, ...) } \arguments{ \item{x}{a logical matrix.} \item{block.size}{the interpolation block size.} \item{gray}{optionally use a gray scale.} \item{xlab}{title for the x axis.} \item{ylab}{title for the y axis.} \item{axes}{option to plot axes.} \item{\dots}{further arguments to \code{image}.} } \details{ \code{TRUE} is represented by the color white and \code{FALSE} by the color black. A lower resolution can be obtained by specifying an (interpolation) block size greater than one. Block densities can then be visualized by using the gray scale option. The number of levels of the palette corresponds to the block size but is capped to 8 levels (excluding white). Note that the opacity (blackness) corresponds with density (as on photographic film). } \author{Christian Buchta} %\section{Warning}{} \seealso{ \code{\link{lminter}} for interpolating logical matrices and \code{\link{image}} for further plotting options} \examples{ ### x <- matrix(sample(c(FALSE, TRUE), 64, rep=TRUE), ncol=8) lmplot(x) ### use lower resolution lmplot(x, block.size=2) ### use gray scale lmplot(x, block.size=2, gray=TRUE) } \keyword{cluster} \keyword{hplot} cba/man/proximus.Rd0000644000175100001440000001040014332123330013732 0ustar hornikusers\encoding{utf-8} \name{proximus} \alias{proximus} \title{Proximus} \description{ Cluster the rows of a logical matrix using the Proximus algorithm. The compression rate of the algorithm can be influenced by the choice of the maximum cluster radius and the minimum cluster size. } \usage{ proximus(x, max.radius = 2, min.size = 1, min.retry = 10, max.iter = 16, debug = FALSE) } \arguments{ \item{x}{a logical matrix.} \item{max.radius}{the maximum number of bits a member in a row set may deviate from its dominant pattern.} \item{min.size}{the minimum split size of a row set.} \item{min.retry}{number of retries to split a pure rank-one approximation (translates into a resampling rate).} \item{max.iter}{the maximum number of iterations for finding a local rank-one approximation.} \item{debug}{optional debugging output.} } \details{ The intended area of application is the compression of high-dimensional binary data into representative patterns. For instance, purchase incidence (market basket data) or term-document matrices may be preprocessed by Proximus for later association rule mining. The algorithm is of a recursive partitioning type. Specifically, at each step a binary split is attempted using a local rank-one approximation of the current submatrix (row set). That is a specialization of principal components to binary data which represents a matrix as the outer product of two binary vectors. The node expansion stops if a submatrix is pure, i.e., the column (presence set) vector indicates all the rows and the Hamming distances from the row (dominant attribute set) pattern vector, or the size of the row set, are less than or equal the specified threshold. In the case the rank-one approximation does not result in a split but the radius constraint is violated, the matrix is split using a random row and the radius constraint. The debug option can be used to gain some insight into how the algorithm proceeds: a right angle bracket indicates a split and the return to a recursion level is indicated by a left one. Leafs in the recursion tree are indicated by an asterisk and retries by a plus sign. The number of retries is bounded by the size of the current set divided by \code{min.retry}. Double angle brackets indicate a random split (see above). The numbers between square brackets indicate the current set size, the size of the presence (sub)set, and its radius. The adjoining numbers indicate the depth of the recursion and the count of retries. Finally, a count of the leaf nodes found so far is shown to the right of an asterisk. } \value{ An object of class \code{proximus} with the following components: \item{nr}{the number of rows of the data matrix.} \item{nc}{the number of columns of the data matrix.} \item{a}{a list containing the approximations (patterns).} \item{a$x}{a vector of row (presence set) indexes.} \item{a$y}{a vector of column (dominant attribute set) indexes.} \item{a$n}{the number of ones in the approximated submatrix.} \item{a$c}{the absolute error reduction by the approximation.} \item{max.radius}{see arguments.} \item{min.size}{see arguments.} \item{rownames}{rownames of the data matrix.} \item{colnames}{colnames of the data matrix.} } \references{ M. Koyutürk, A. Graham, and N. Ramakrishnan. Compression, Clustering, and Pattern Discovery in Very High-Dimensional Discrete-Attribute Data Sets. \emph{IEEE Transactions On Knowledge and Data Engineering}, Vol. 17, No. 4, (April) 2005. } \author{Christian Buchta} \note{ The size of a set need not be equal or greater than the user defined threshold. } \section{Warning}{Deep recursions may exhaust your computer.} \seealso{ \code{\link{summary.proximus}} for summaries, \code{\link{fitted}} for obtaining the approximated matrix and the pattern labels of the samples, and \code{\link{lmplot}} for plotting logical matrices. } \examples{ x <- matrix(sample(c(FALSE, TRUE), 200, rep=TRUE), ncol=10) pr <- proximus(x, max.radius=8) summary(pr) ### example from paper x <- rlbmat() pr <- proximus(x, max.radius=8, debug=TRUE) op <- par(mfrow=c(1,2), pty="s") lmplot(x, main="Data") box() lmplot(fitted(pr)$x, main="Approximation") box() par(op) } \keyword{cluster} cba/man/lminter.Rd0000644000175100001440000000237511304023136013531 0ustar hornikusers\name{lminter} \alias{lminter} \title{Interpolating Logical Matrices} \description{ Interpolate a logical matrix into a lower-resolution representation. } \usage{ lminter(x, block.size = 1, nbin = 0) } \arguments{ \item{x}{a logical matrix.} \item{block.size}{the interpolation block size.} \item{nbin}{the number of density bins.} } \details{ Partitions a binary matrix into square blocks of specified size (length) and interpolates the number of \code{TRUE} values per block using the specified number of bins. Note that the effective number of bins is one greater than the specified number because the zero bin is always included. Excess rows and columns at the lower or right margins of the matrix are ignored. If the number of bins is null counts are mapped to zero and one thresholding at half of the number of distinct count values including zero. Thus, for even numbered block sizes there is a bias towards zero. } \value{ An integer matrix of bin numbers. } \author{Christian Buchta} \note{Package internal function.} \seealso{\code{\link{lmplot}} for plotting logical matrices.} \examples{ \dontrun{ x <- matrix(sample(c(FALSE, TRUE), 4 ,rep=TRUE), ncol=2) lminter(x, block.size=2, nbin=2) }} \keyword{cluster} \keyword{hplot} cba/man/order.length.Rd0000644000175100001440000000233511304023136014446 0ustar hornikusers\name{order.length} \alias{order.length} \title{Conciseness of Presentation Measures} \description{ Compute the length of a Hamilton path through a distance matrix. } \usage{ order.length(dist, order) } \arguments{ \item{dist}{an object of class \code{dist}.} \item{order}{an optional permutation of the row (column) indexes.} } \details{ Ordering a distance matrix such that low distance values are placed close to the diagonal may improve its presentation. The length of an order is the corresponding objective measure. The order corresponds to a path through a graph where each node is visited only once, i.e. a Hamilton path. The length of a path is defined as the sum of the edge weights, i.e. distances. If \code{order} is missing the identity order is used. If \code{order} is not unique \code{NA} is returned. If there are non-finite distance values \code{NA} is returned. } \value{ A scalar real value. } \references{ R. Sedgewick. (2002). \emph{Algorithms in C. Part 5. Graph Algorithms}. 3rd Edition, Addison-Wesley. } \author{Christian Buchta} \examples{ d <- dist(matrix(runif(10),ncol=2)) order.length(d) o <- sample(5,5) # random order order.length(d, o) } \keyword{hplot} \keyword{cluster} cba/man/Votes.Rd0000644000175100001440000000552511726357705013203 0ustar hornikusers\name{Votes} \alias{Votes} \docType{data} \title{Congressional Votes 1984 Data Set} \description{ This data set includes votes for each of the U.S. House of Representatives Congressmen on the 16 key votes identified by the CQA. The CQA lists nine different types of votes: voted for, paired for, and announced for (these three simplified to yea), voted against, paired against, and announced against (these three simplified to nay), voted present, voted present to avoid conflict of interest, and did not vote or otherwise make a position known (these three simplified to an unknown disposition). } \usage{data(Votes)} \format{ A data frame with 435 observations on the following 17 variables. \describe{ \item{\code{handicapped-infants}}{a factor with levels \code{n} and \code{y}} \item{\code{water-project-cost-sharing}}{a factor with levels \code{n} and \code{y}} \item{\code{adoption-of-the-budget-resolution}}{a factor with levels \code{n} and \code{y}} \item{\code{physician-fee-freeze}}{a factor with levels \code{n} and \code{y}} \item{\code{el-salvador-aid}}{a factor with levels \code{n} and \code{y}} \item{\code{religious-groups-in-schools}}{a factor with levels \code{n} and \code{y}} \item{\code{anti-satellite-test-ban}}{a factor with levels \code{n} and \code{y}} \item{\code{aid-to-nicaraguan-contras}}{a factor with levels \code{n} and \code{y}} \item{\code{mx-missile}}{a factor with levels \code{n} and \code{y}} \item{\code{immigration}}{a factor with levels \code{n} and \code{y}} \item{\code{synfuels-corporation-cutback}}{a factor with levels \code{n} and \code{y}} \item{\code{education-spending}}{a factor with levels \code{n} and \code{y}} \item{\code{superfund-right-to-sue}}{a factor with levels \code{n} and \code{y}} \item{\code{crime}}{a factor with levels \code{n} and \code{y}} \item{\code{duty-free-exports}}{a factor with levels \code{n} and \code{y}} \item{\code{export-administration-act-south-africa}}{a factor with levels \code{n} and \code{y}} \item{\code{Class}}{a factor with levels \code{democrat} and \code{republican}} } } \details{ The records are drawn from: \emph{Congressional Quarterly Almanac}, 98th Congress, 2nd session 1984, Volume XL: Congressional Quarterly Inc. Washington, D.C., 1985. It is important to recognize that \code{NA} in this database does not mean that the value of the attribute is unknown. It means simply, that the value is not "yea" or "nay" (see above). } \source{ \url{http://www.ics.uci.edu/~mlearn/MLRepository.html} } \references{ Blake, C.L. & Merz, C.J. (1998). UCI Repository of Machine Learning Databases. Irvine, CA: University of California, Department of Information and Computer Science. } \examples{ data(Votes) summary(Votes) ## maybe str(Votes) ; plot(Votes) ... } \keyword{datasets} cba/TODO0000644000175100001440000000301011304023136011470 0ustar hornikusers * check which functions are now in seriation and remove such from cba. * move sdists and friends to a seperate package. ceeboo 2009 * complete this list with known issues. * improve the documentation. * consider an option for handling of ties in clusterers (which support it). * standard methods for ccfkms are missing * unit tests to detect bugs caused by changes in the R source code. at least my trust is gone with the copy on write issue. * reconsider overloading of image.matrix * replace Calloc with calloc, etc. and use own checks and cleanups. * replace initializations of R vector memory that use a for loop with, e.g. memset(REAL(x), sizeof(double) * n) * dist and sdist could issue a warning if NAs occur in the result. * check again the duplicate or not issue :-( [current experience indicates it should work] * recheck the handling of NA and NaN in dists and elsewhere. * check if the R interface to ccfkms checks for unique initial solutions. reconsider how these solutions are generated. consider providing a default interface that searches for a "proper" number of clusters. ceeboo 2006 + rethink the current approach to dists in favor of the dist C level interface. this might be more developper friendly in the long run. + we cannot optionally include package Matrix. + the fixation on double for dists computations may be too restrictive. at least we need to shift more checking to the C level functions due to the migration to package proxy. ceeboo 2007 cba/DESCRIPTION0000644000175100001440000000113714344060573012533 0ustar hornikusersPackage: cba Type: Package Title: Clustering for Business Analytics Version: 0.2-23 Author: Christian Buchta and Michael Hahsler Maintainer: Christian Buchta Description: Implements clustering techniques such as Proximus and Rock, utility functions for efficient computation of cross distances and data manipulation. Depends: R (>= 2.10), grid, proxy Imports: stats, graphics, grDevices, methods Suggests: gclus, colorspace Encoding: UTF-8 License: GPL-2 NeedsCompilation: yes Packaged: 2022-12-07 07:29:24 UTC; hornik Repository: CRAN Date/Publication: 2022-12-07 09:48:43 UTC cba/src/0000755000175100001440000000000014344034315011605 5ustar hornikuserscba/src/rock.c0000644000175100001440000003331214344034315012711 0ustar hornikusers #include #include /* rock.c * * implements the paper: * * S. Guha, R. Rastogi, and K. Shim. Rock: A Robust Clustering Algorithm * for Categorical Attributes. Information Systems, Vol. 25, No. 5, 2000. * * The implementation uses a lower triangular matrix representation and * comes in three parts: a function that computes link counts from * distances, another that constructs a cluster solution by merging, and * a function that classifies samples. Implementation of the clustering * problem by separate functions is slightly inefficient but allows for * reuse and further experimentation. * * The auxiliary functions for computation of "binary" distances are * considerably faster than R dist, and the second usage even does not * seem to be available in R (moved to dists.c). * * Release: 0.1-1 * Release date: 2005-06-30 * * (C) ceeboo 2005 */ /* * Compute Rock link counts (of the number of common neighbors of * any two data points). * * As input we exepct a lower triangular matrix organized by columns * but with the diagonal omitted which contains the distance values * for any pair of data points. Fixme: currently, the vector has no * attribute indicating its type. * * The same data structure is used as the return value. * * we now test for NA and NaN (at the cost of runtime) as we do not * want to rely on the implementation detail that NA is a large number. * as befor we settle for ignoring such values. although this is not * correct we avoid computing 2^k results (see also cluster.dist) * * ceeboo 2005, 2006 */ SEXP rockLink(SEXP R_x, SEXP R_beta) { if (TYPEOF(R_x) != REALSXP) error("rockLink: 'x' invalid storage type"); if (TYPEOF(R_beta) != REALSXP) error("rockLink: 'beta' invalid storage type"); int m, n; int i, j, k, kk, l; int *v, *p; double z, beta; double *x; SEXP R_obj; m = LENGTH(R_x); n = 1 + (int) sqrt(2*m); if (m < 3 || m != n*(n-1)/2) /* logical constraint */ error("rockLink: 'x' invalid length"); x = REAL(R_x); beta = REAL(R_beta)[0]; if (ISNAN(beta)) error("rockLink: 'beta' NA or NaN"); PROTECT(R_obj = NEW_INTEGER(m)); for (l = 0; l < m; l++) INTEGER(R_obj)[l] = 0; /* this sucks! */ v = Calloc(n, int); p = Calloc(n, int); /* column offset */ for (k = 0; k < n; k++) p[k] = k*(n-1)-k*(k+1)/2-1; for (i = 0; i < n; i++) { l = 0; for (k = 0; k < i; k++) { z = x[i+p[k]]; if (ISNAN(z)) continue; if (beta >= z) v[l++] = k; } kk = p[i]; for (k = i+1; k < n; k++) { z = x[k+kk]; if (ISNAN(z)) continue; if (beta >= z) v[l++] = k; } for (j = 1; j < l; j++) for (k = 0; k < j; k++) { kk = p[v[k]]; INTEGER(R_obj)[v[j]+kk]++; } } Free(p); Free(v); UNPROTECT(1); return R_obj; } /* * Successively merge two clusters, either unitl the desired * number of clusters is reached, or stop if there are all but * zero link counts left. * * returns a list containing a factor with levels labled * contiguously and starting with "1", and a table of cluster * sizes. * * This code is optimized for low memory footprint and computation * time. However, currently it does not use sparse representations. * * The search for the maximum merger among n candiates has time * complexity O(n*(n-1)/2) in the worst case and O(n-1) in the * best case. * * Note that we do not check the link count matrix for NAs or NaNs * because 1) the above function should not return these values, * and 2) the present function is conceptually internal. * * The neighborhood paramtere is constrained to the interval [0,1) * because inclusion of one results in divison by zero in all * calculations of the merging criterion. For zero link counts * this would result in NaNs and undesirable additional checks. * * Fixme: tie breaking! */ SEXP rockMerge(SEXP R_x, SEXP R_n, SEXP R_theta, SEXP R_debug) { int debug, m, n, nn; int i, j, k, l, ii, jj, kk, ll, iii, jjj, kkk; int *x, *o, *c, *f, *p, *w; double y, z; double *t, *v; char *s; SEXP R_obj, R_tmp, R_str, R_dim; debug = INTEGER(R_debug)[0]; m = LENGTH(R_x); n = 1 + (int) sqrt(2*m); /* number of samples */ if (m < 3 || m != n*(n-1)/2) error("rockMerge: invalid vector length"); nn = INTEGER(R_n)[0]; /* number of clusters */ if (nn < 1) error("rockMerge: invalid number of clusters"); z = REAL(R_theta)[0]; /* neigborhood parameter */ if (z < 0 || z >= 1) error("rockMerge: invalid neigborhood parameter"); z = 1 + 2 * (1-z) / (1+z); x = Calloc(m, int); /* link counts */ Memcpy(x, INTEGER(R_x), m); o = Calloc(n, int); /* sample index */ c = Calloc(n, int); /* cluster index */ f = Calloc(n, int); /* cluster size */ p = Calloc(n, int); /* column offset in dist */ t = Calloc(n+1, double); /* table of powers */ v = Calloc(n-1, double); /* column maximum */ w = Calloc(n-1, int); /* row index */ for (k = 0; k < n; k++) { o[k] = k; c[k] = -1; f[k] = 1; p[k] = k*(n-1)-k*(k+1)/2-1; t[k+1] = pow(k+1, z); } /* find the maximum of a column (in the lower * triangular part of it) and the corresponding * row index. */ y = t[2]-2*t[1]; /* initially constant */ k = 0; for (i = 0; i < n-1; i++) { v[i] = -1; for (j = i+1; j < n; j++) { z = x[k++] / y; if (z > v[i]) { v[i] = z; w[i] = j; } } } if (debug) Rprintf(" #cls clids sizes goodness\n"); m = n; while (m > nn) { z = -1; /* find the maximum */ for (ii = 0; ii < m-1; ii++) if (v[ii] > z) { z = v[ii]; i = ii; } if (z == 0) break; ii = o[i]; j = w[i]; jj = o[j]; if (debug) { Rprintf(" %4i %4i %4i [%4i,%4i] %12.6f", m, ii, jj, f[ii], f[jj], z); if (f[ii] > 1 && f[jj] > 1) Rprintf("+\n"); else Rprintf("\n"); } /* merge the frequencies and link counts; check * if the new cluster provides a new column maximum; * this is slightly inefficient in the worst case. */ f[ii] += f[jj]; for (k = 0; k < i; k++) { kk = o[k]; kkk = p[kk]; x[ii+kkk] += x[jj+kkk]; z = x[ii+kkk] / (t[f[ii]+f[kk]] - t[f[ii]] - t[f[kk]]); if (z > v[k]) { v[k] = z; w[k] = i; } else if (w[k] == i || w[k] == j) { v[k] = -1; /* be deterministic */ for (l = k+1; l < m; l++) { if (l == j) continue; ll = o[l]; z = x[ll+kkk] / (t[f[kk]+f[ll]] - t[f[kk]] - t[f[ll]]); if (z > v[k]) { v[k] = z; w[k] = l; } } } } v[i] = -1; /* column changed */ iii = p[ii]; for (k = i+1; k < j; k++) { kk = o[k]; kkk = p[kk]; x[kk+iii] += x[jj+kkk]; z = x[kk+iii] / (t[f[ii]+f[kk]] - t[f[ii]] - t[f[kk]]); if (z > v[i]) { v[i] = z; w[i] = k; } if (w[k] == j) { v[k] = -1; for (l = k+1; l < m; l++) { if (l == j) continue; ll = o[l]; z = x[ll+kkk] / (t[f[kk]+f[ll]] - t[f[kk]] - t[f[ll]]); if (z > v[k]) { v[k] = z; w[k] = l; } } } } jjj = p[jj]; for (k = j+1; k < m; k++) { kk = o[k]; x[kk+iii] += x[kk+jjj]; z = x[kk+iii] / (t[f[ii]+f[kk]] - t[f[ii]] - t[f[kk]]); if (z > v[i]) { v[i] = z; w[i] = k; } } /* reorganize the indexes of the clusters, * of the rows corresponding to the maxima, * and shrink the vectors. */ if (c[ii] == -1) c[ii] = ii; if (c[jj] == -1) c[jj] = c[ii]; else { iii = c[ii]; jjj = c[jj]; for (k = 0; k < n; k++) if (c[k] == jjj) c[k] = iii; } for (k = 0; k < j; k++) /* for clarity here */ if (w[k] > j) w[k]--; for (k = j+1; k < m-1; k++) { o[k-1] = o[k]; v[k-1] = v[k]; w[k-1] = w[k]-1; } if (k < m) o[k-1] = o[k]; m--; } Free(x); Free(p); Free(t); Free(v); Free(w); if (m > nn) Rprintf("rockMerge: terminated with %i clusters\n", m); PROTECT(R_obj = NEW_LIST(2)); PROTECT(R_tmp = NEW_INTEGER(n)); /* reorganize the indexes of the * clusters to be contiguous and * to start with one. */ for (k = 0; k < n; k++) o[k] = -1; m = 0; for (k = 0; k < n; k++) { if (c[k] == -1) c[k] = k; kk = c[k]; if (o[kk] == -1) o[kk] = ++m; INTEGER(R_tmp)[k] = o[kk]; } int sn = (int) log10(m) + 2; s = Calloc(sn, char); /* stringified integers */ PROTECT(R_str = NEW_STRING(m)); for (j = 0; j < m; j++) { snprintf(s,sn,"%i",j+1); SET_STRING_ELT(R_str, j, mkChar(s)); } Free(s); SET_LEVELS(R_tmp, R_str); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str, 0, mkChar("factor")); SET_CLASS(R_tmp, R_str); UNPROTECT(1); SET_ELEMENT(R_obj, 0, R_tmp); UNPROTECT(1); PROTECT(R_tmp = NEW_INTEGER(m)); for (k = 0; k < n; k++) { kk = c[k]; if (o[kk] != -1) { INTEGER(R_tmp)[o[kk]-1] = f[kk]; o[kk] = -1; } } Free(o); Free(c); Free(f); PROTECT(R_dim = NEW_INTEGER(1)); INTEGER(R_dim)[0] = m; SET_DIM(R_tmp, R_dim); UNPROTECT(1); PROTECT(R_dim = NEW_LIST(1)); SET_ELEMENT(R_dim, 0, GET_LEVELS(VECTOR_ELT(R_obj, 0))); SET_DIMNAMES(R_tmp, R_dim); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str, 0, mkChar("table")); SET_CLASS(R_tmp, R_str); UNPROTECT(1); SET_ELEMENT(R_obj, 1, R_tmp); UNPROTECT(1); UNPROTECT(1); return R_obj; } /* * compute a classification based on a Rock clustering. since we * use a threshold on distances a data point may be assigned to * more than one cluster, or even none. * * we expect the cluster indexes to be a factor, i.e. to be contiguous * and to start with one. the supplied distances have to be equal or * greater than zero. NAs and NaNs are allowed (see the explanation * above). * * note: * * 1) ties are broken at random (this may obfuscate that the data * actually has no structure). * 2) points that are not in any neighborhood are assigned the class * value NA. * * */ SEXP rockClass(SEXP R_x, SEXP R_l, SEXP R_beta, SEXP R_theta) { int nr, nc, nl, na; int i, j, h, k; int *l, *c, *cf; double beta; double t, z, y; double *n, *x; SEXP R_lev, R_obj, R_tmp, R_str, R_dim; nr = INTEGER(GET_DIM(R_x))[0]; nc = INTEGER(GET_DIM(R_x))[1]; if (LENGTH(R_l) != nc) error("rockClass: invalid vector length or number of columns"); R_lev = GET_LEVELS(R_l); nl = LENGTH(R_lev); t = REAL(R_theta)[0]; if (t < 0 || t > 1) error("rockMerge: invalid neigborhood parameter"); t = 1 + 2 * (1-t) / (1+t); l = INTEGER(R_l); /* number of levels */ n = Calloc(nc, double); /* expected neighbors */ /* check the validity of the indexes and * compute the expected number of neighbors */ for (j = 0; j < nc; j++) { i = l[j]; if (i == NA_INTEGER || i < 1 || i > nl) { Free(n); error("rockClass: invalid cluster index(es)"); } n[i-1]++; } for (j = 0; j < nl; j++) { z = n[j]; if (z == 0) { /* not contiguous */ Free(n); error("rockClass: invalid cluster index(es)"); } n[j] = pow(1+z, t); } x = REAL(R_x); /* distances */ beta = REAL(R_beta)[0]; /* threshold */ c = Calloc(nl, int); PROTECT(R_obj = NEW_LIST(2)); PROTECT(R_tmp = NEW_INTEGER(nr)); /* class indexes */ cf = Calloc(nl+1, int); /* class frequencies */ GetRNGstate(); for (j = 0; j < nl; j++) cf[j] = 0; for (i = 0; i < nr; i++) { for (j = 0; j < nl; j++) /* initialize */ c[j] = 0; for (j = 0; j < nc; j++) /* count neighbors */ if (beta >= x[i+j*nr]) c[l[j]-1]++; k = nl; /* include NAs */ h = 0; /* compiler hack */ z = 0; for (j = 0; j < nl; j++) { /* determine maximum */ y = c[j] / n[j]; if (y > z) { z = y; k = j; h = 1; } else if (h > 0 && y == z) { /* break ties */ if (unif_rand() > (double) h/(h+1)) k = j; h++; } } cf[k]++; INTEGER(R_tmp)[i] = k+1; } PutRNGstate(); Free(n); Free(c); na = nl+(cf[nl]>0); PROTECT(R_str = NEW_STRING(na)); for (j = 0; j < nl; j++) SET_STRING_ELT(R_str, j, STRING_ELT(R_lev, j)); if (na>nl) SET_STRING_ELT(R_str, j, NA_STRING); SET_LEVELS(R_tmp, R_str); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str, 0, mkChar("factor")); SET_CLASS(R_tmp, R_str); UNPROTECT(1); SET_ELEMENT(R_obj, 0, R_tmp); UNPROTECT(1); PROTECT(R_tmp = NEW_INTEGER(na)); Memcpy(INTEGER(R_tmp), cf, na); Free(cf); PROTECT(R_dim = NEW_INTEGER(1)); INTEGER(R_dim)[0] = na; SET_DIM(R_tmp, R_dim); UNPROTECT(1); PROTECT(R_dim = NEW_LIST(1)); SET_ELEMENT(R_dim, 0, GET_LEVELS(VECTOR_ELT(R_obj, 0))); SET_DIMNAMES(R_tmp, R_dim); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str , 0, mkChar("table")); SET_CLASS(R_tmp, R_str); UNPROTECT(1); SET_ELEMENT(R_obj, 1, R_tmp); UNPROTECT(1); UNPROTECT(1); return R_obj; } /**/ cba/src/ccfkms.c0000644000175100001440000002710414344034315013223 0ustar hornikusers/* * ccfkms.c * * parameter and logistic k-means based on conjugate covex functions * using sparse data structures and centering (or optionally * standardizing) of the data. * * For details see: * * Helmut Strasser and Klaus Poetzelberger. Data Compression by * Unsupervised Classification. SFB Report Series, No. 10, 1997. * * convex function: f(x) = |x|^q/q * * kohonen k-means: q = 1 * ordinary k-means: q = 2 * * convex function: f(x) = 2*ln(cosh(|x|))/2 * * logistic k-means: q = 0 * * Sparse data structure means that zero data values in the data are not * stored because they can be ignored in all vector operations, so that * computations get a considerable boost if the data are highly sparse, * i.e. the ratio of the number of non-zero data values to the number of * all data values is low. * * Centering means that the mean of the data is subtracted from the * samples, and standardizing that we further devide by the standard * deviation. Note that only for Euclidian distances the solution does * not depend on centering. * * In the case of degenerate and non-convergent solutions the program * gives a warning message. * * fixme: 1) second, ... winner not implemented. * 2) maybe return a flag indicating convergence issues? * * Note that the code is prepared for direct intefacing with sparse * data structures, such as dgRMatrix from the R package Matrix. * * (C) ceeboo 2003, 2004, 2005, 2007 */ #include #include static int debug = FALSE; /* be silent */ /* matrix structure for data in sparse row format */ typedef struct { int *ri; /* pointer to array of column start indexes */ int *ci; /* pointer to array of column indexes */ double *cv; /* pointer to array of column values */ int nr; /* number of rows */ int nc; /* number of columns */ int s; // non-sparse input data = 0 } SMAT; static void FreeSMat(SMAT *m) { if (m->s != 0) return; Free(m->ri); Free(m->ci); Free(m->cv); Free(m); } /* copy R matrix in full-storage representation to sparse * representation. treat as read only (!) */ static SMAT *matrix2smat(SEXP R_mat) { extern int debug; int nr, nc, n; int i, j, k; int *ri, *ci; double z; double *x, *cv; SMAT *m; nr = INTEGER(GET_DIM(R_mat))[0]; nc = INTEGER(GET_DIM(R_mat))[1]; x = REAL(R_mat); ri = Calloc(nr+1, int); /* row start indexes */ n = 1024; /* initial memory */ ci = Calloc(n, int); /* column indexes */ cv = Calloc(n, double); /* column values */ k = 0; for (i = 0; i < nr; i++) { /* rows */ ri[i] = k; for (j = 0; j < nc; j++) { /* columns */ z = x[i+j*nr]; if (R_FINITE(z) && z != 0.0) { if (k == n) { n *= 2; /* double memory */ ci = Realloc(ci, n, int); cv = Realloc(cv, n, double); } ci[k] = j; cv[k++] = z; } } } ri[i] = k; if (n > k) { ci = Realloc(ci, k, int); cv = Realloc(cv, k, double); } if (debug) { Rprintf("Non-Zero: %i\n", k); Rprintf("Sparsity: %4.2f\n",k / (double) (nr * nc)); } m = Calloc(1, SMAT); m->ri = ri; m->ci = ci; m->cv = cv; m->nr = nr; m->nc = nc; return m; } static SMAT *dgRMatrix2smat(SEXP R_x) { SMAT *m; SEXP x = getAttrib(R_x, install("x")); if (TYPEOF(x) != REALSXP) error("ccfkms: slot 'x' of dgRMatrix not of storage type real"); m = (SMAT *) malloc(sizeof(SMAT)); m->ri = INTEGER(getAttrib(R_x, install("p"))); m->ci = INTEGER(getAttrib(R_x, install("j"))); m->cv = REAL(x); m->nr = INTEGER(getAttrib(R_x, install("Dim")))[0]; m->nc = INTEGER(getAttrib(R_x, install("Dim")))[1]; m->s = 1; return m; } static SMAT *dgCMatrix2smat(SEXP R_x) { SMAT *m; SEXP x = getAttrib(R_x, install("x")); if (TYPEOF(x) != REALSXP) error("ccfkms: slot 'x' of dgCMatrix not of storage type real"); m = (SMAT *) malloc(sizeof(SMAT)); m->ri = INTEGER(getAttrib(R_x, install("p"))); m->ci = INTEGER(getAttrib(R_x, install("i"))); m->cv = REAL(x); m->nr = INTEGER(getAttrib(R_x, install("Dim")))[1]; m->nc = INTEGER(getAttrib(R_x, install("Dim")))[0]; m->s = 1; return m; } SEXP ccfkms(SEXP R_x, SEXP R_p, SEXP R_par, SEXP R_max_iter, SEXP R_opt_std, SEXP R_debug) { extern int debug; int opt_std, np, max_iter; int i, j, k, l, iter, ap; int *pf, *pm; double par; double x = 0, y, z, max_var, max_inf, old_inf, inf, var; double *am, *as, *p, *pt, *cc, *ct; char *s; SMAT *m = NULL; SEXP R_obj, R_tmp; debug = INTEGER(R_debug)[0]; if (inherits(R_x, "dgRMatrix")) m = dgRMatrix2smat(R_x); else if (inherits(R_x, "dgCMatrix")) m = dgCMatrix2smat(R_x); else m = matrix2smat(R_x); /* data matrix */ /* compute attribute means. standardization * is optional. if used we transform so that * we do not need to revert to a full-storage * representation. */ opt_std = INTEGER(R_opt_std)[0]; /* standardization option */ am = Calloc(m->nc, double); /* attribute means */ as = NULL; /* attribute standard deviations */ if (opt_std) as = Calloc(m->nc, double); for (i = 0; i < m->nr; i++) for (j = m->ri[i]; j < m->ri[i + 1]; j++) { am[m->ci[j]] += m->cv[j]; if (opt_std) as[m->ci[j]] += pow(m->cv[j], 2); } for (i = 0; i < m->nc; i++) { am[i] /= m->nr; if (opt_std) { as[i] = sqrt(as[i] / m->nr - pow(am[i], 2)); if (as[i] == 0) { Free(am); if (opt_std) Free(as); FreeSMat(m); error("ccfkms: zero standard deviation"); } am[i] /= as[i]; } } if (opt_std) /* prepere data */ for (i = 0; i < m->nr; i++) for (j = m->ri[i]; j < m->ri[i + 1]; j++) m->cv[j] /= as[m->ci[j]]; /* get initial protoypes and allocate * R result object. */ np = INTEGER(GET_DIM(R_p))[0]; /* number of prototypes */ if (INTEGER(GET_DIM(R_p))[1] != m->nc) { /* check */ Free(am); if (opt_std) Free(as); FreeSMat(m); error("ccfkms: \"x\" and \"p\" do not conform"); } PROTECT(R_obj = NEW_LIST(4)); /* result object */ PROTECT(R_tmp = allocMatrix(REALSXP, np, m->nc)); /* prototypes */ Memcpy(REAL(R_tmp), REAL(R_p), np * m->nc); /* copy prototypes */ p = REAL(R_tmp); SET_VECTOR_ELT(R_obj, 0, R_tmp); UNPROTECT(1); /* center (standardize) initial prototypes */ for (i = 0; i < np; i++) for (j = 0; j < m->nc; j++) { if (opt_std) p[i + j * np] /= as[j]; p[i + j * np] -= am[j]; } /* get parameter */ par = REAL(R_par)[0]; /* get maximum number of iterations */ max_iter = INTEGER(R_max_iter)[0]; /* compute the maximum information and variance, * i.e., each point is a prototype */ z = 0; for (i = 0; i < m->nc; i++) z += pow(am[i], 2); max_var = 0; max_inf = 0; for (i = 0; i < m->nr; i++) { y = z; for (j = m->ri[i]; j < m->ri[i + 1]; j++) y += m->cv[j] * (m->cv[j] - 2 * am[m->ci[j]]); max_var += y; max_inf += pow(sqrt(y), par) / par; } max_var /= m->nr; max_inf /= m->nr; /* allocate remaining result objects * and iterate to a fixpoint solution */ PROTECT(R_tmp = NEW_INTEGER(np)); /* prototype frequencies */ pf = INTEGER(R_tmp); SET_VECTOR_ELT(R_obj, 1, R_tmp); UNPROTECT(1); PROTECT(R_tmp = NEW_INTEGER(m->nr)); /* prototype memberships */ pm = INTEGER(R_tmp); SET_VECTOR_ELT(R_obj, 2, R_tmp); UNPROTECT(1); GetRNGstate(); pt = Calloc(np * m->nc, double); /* prototype temporary */ cc = Calloc(np, double); /* conjugate convex */ ct = Calloc(np, double); /* conjugate convex temporary */ if (debug) Rprintf("\n %3s %5s %5s %3s\n","#","inf","var","nap"); old_inf = -1; inf = 0; iter = 0; while(inf > old_inf && iter < max_iter) { /* map prototype means into domain of dual problem * and compute conjugate convex function */ for (i = 0; i < np; i++) { y = 0; for (j = 0; j < m->nc; j++) y += pow(p[i + j * np], 2); y = sqrt(y); if (par) z = pow(y, par-2); else { /* logistic */ x = (exp(y) - 1) / (exp(y) + 1); z = x / y; } for (j = 0; j < m->nc; j++) p[i + j * np] *= z; z = 0; for (j = 0; j < m->nc; j++) z += p[i + j * np] * am[j]; if (par) z += pow(y, par) * (par-1) / par; else z += (1 + x) * log(1 + x) + (1 - x) * log(1 - x); cc[i] = z; pf[i] = 0; /* initialize */ for (j=0; j < m->nc; j++) pt[i + j * np] = 0; } /* determine partition and * calculate prototype means */ for (i = 0; i < m->nr; i++) { for (k = 0; k < np; k++) { ct[k] = -cc[k]; for (j = m->ri[i]; j < m->ri[i + 1]; j++) ct[k] += m->cv[j] * p[k + m->ci[j] * np]; } /* find the closest prototype. * note that tie breaking is used. */ l = 1; k = 0; z = ct[0]; for (j = 1; j < np; j++) if (z < ct[j]) { k = j; z = ct[j]; } else if (z == ct[j]) { if (unif_rand() > l/(l+1)) k = j; l++; } pm[i] = k; /* update prototype frequency and means */ pf[k]++; for (j = m->ri[i]; j < m->ri[i + 1]; j++) pt[k + m->ci[j] * np] += m->cv[j]; } /* update the stopping criterion. compute the means * and the information and variance of the partition */ old_inf = inf; ap = 0; inf = 0; var = 0; for (i = 0; i < np; i++) { if (pf[i] != 0) { ap++; for (j = 0; j < m->nc; j++) p[i + j * np] = pt[i + j * np] / pf[i] - am[j]; z =0; for (j = 0; j < m->nc; j++) z += pow(p[i + j * np], 2); var += z * pf[i] / m->nr; z = sqrt(z); if (par) z = pow(z, par) / par; else { z = 2 * log((exp(z / 2) + exp(-z / 2)) / 2); } inf += z * pf[i] / m->nr; } } iter++; if (debug) Rprintf(" %3i %5.3f %5.3f %3i\n", iter, inf / max_inf, var / max_var, ap); /* degenrate solution */ if (old_inf > inf) warning("ccfkms: decrease in information"); } Free(pt); Free(cc); Free(ct); PutRNGstate(); if (max_iter > 1 && old_inf != inf) warning("ccfkms: no convergence"); /* invert the information */ inf = max_inf - inf; PROTECT(R_tmp = NEW_NUMERIC(1)); REAL(R_tmp)[0] = inf; SET_VECTOR_ELT(R_obj, 3, R_tmp); UNPROTECT(1); /* decenter (destandardize) the prototype means */ for (i = 0; i < np; i++) { for (j = 0; j < m->nc; j++) { p[i + j * np] += am[j]; if (opt_std) p[i + j * np] *= as[i]; } } Free(am); if (opt_std) Free(as); /* offset memberships to R indexing. */ for (i = 0; i < m->nr; i++) pm[i]++; FreeSMat(m); /* levels attribute */ int sn = np/10+2; s = Calloc(sn, char); /* stringified integers */ PROTECT(R_tmp = NEW_STRING(np)); for (j = 0; j < np; j++) { snprintf(s,sn,"%i",j+1); SET_STRING_ELT(R_tmp, j, mkChar(s)); } Free(s); SET_LEVELS(VECTOR_ELT(R_obj, 2), R_tmp); UNPROTECT(1); UNPROTECT(1); return R_obj; } /**/ cba/src/optimal.c0000644000175100001440000003406711304023136013420 0ustar hornikusers #include #include /* compute the lenght of an order, i.e. the sum of * the edge weights along the path defined by the * order. * * note that the order is a tour with the leg between * the first and the last city omitted. * * ceeboo 2005 */ static double orderLength(double *x, int *o, int n) { double v, z; int i, j, k; z = 0; /* path length */ i = o[0]; for (k = 0; k < n-1; k++) { j = o[k+1]; if (i > j) v = x[i+j*(n-1)-j*(j+1)/2-1]; else if (i == j) return NA_REAL; else v = x[j+i*(n-1)-i*(i+1)/2-1]; if (!R_FINITE(v)) return NA_REAL; z += v; i = j; } return z; } /* R wrapper */ SEXP order_length(SEXP R_dist, SEXP R_order) { int n, k; int *o; SEXP R_obj; n = 1 + (int) sqrt(2 * LENGTH(R_dist)); if (LENGTH(R_dist) < 1 || LENGTH(R_dist) != n*(n-1)/2) error("order_cost: invalid length"); if (LENGTH(R_order) != n) error("order_length: \"dist\" and \"order\" do not match"); o = Calloc(n, int); for (k = 0; k < n; k++) /* offset to C indexing */ o[k] = INTEGER(R_order)[k]-1; PROTECT(R_obj = NEW_NUMERIC(1)); REAL(R_obj)[0] = orderLength(REAL(R_dist), o, n); Free(o); UNPROTECT(1); return R_obj; } /* check validity of a merge tree representation */ int checkRmerge(int *x, int n) { int k, v; if (x[0] > 0 || x[n-1] > 0) /* initial merge */ return 0; for (k = 0; k < 2*(n-1); k++) { v = x[k]; if (v < -n || v > n-1) return 0; if (v > 0 && v > k+1) return 0; } return 1; } /* Z. Bar-Joseph, E. D. Demaine, D. K. Gifford, and T. Jaakkola. * (2001) Fast Optimal Leaf Ordering for Hierarchical Clustering. * Bioinformatics, Vol. 17 Suppl. 1, pp. 22-29. * * this implementation builds on the improvements of a more recent paper * available at the website of Bar-Joseph! * * as input we exepct a matrix with the distances in the lower triangle, * a merge tree, i.e. two arrays holding n-1 indexes of the left and right * subtrees (or leaves) merged at the kth step (for details see dist and * hclust). * * returns a list with a matrix (merge) and two vectors (order and length). * * The algorithm has the following stages: * * 1) find a leaf ordering consistent with the supplied merge tree. * the order of the leaves of a tree consists of the order of the * leaves in the left subtree followed by the order of the leaves * in the right subtree. * * note that the tree (leaf) indexes must have an offset of one because * the leaves are coded as negative numbers. subtrees are referenced by * their position in the merge sequence (see hclust). this sucks! * * we compute for each left and right subtree the offset of the leftmost * leaf in the total order of leaves, and the number of leaves in both * trees, i.e. in the parent tree. * * 2) recursively compute for each pair of outer endpoints, i.e. a left * endpoint from the left subtree and a right endpoint from the right * subtree the length of the optimal ordering of the leaves. * * the temporary tables are stored in the lower triangle as well as the * similarities. the lengths of the best linear orderings are stored in * the upper triangle. * * for the improved computations at the root the diagonal is used as * storage for temporary results. * * the time complexity of finding all the partial optimal leaf orderings * is O(n^3). * * the suggested improvement based on early termination of the search is * currently not implemented. however, ties are broken randomly. * * 3) recursively find the total optimal leaf ordering. * * 4) find the merge tree corresponding to the optimal ordering. * * fixme: using similarities would allow a remapping of non-finite * values to zero and thus sanitizing of overflows. also for * missing values this would be a more user friendly approach. * * (C) ceeboo 2005 */ static int calcAllOrder(double *x, int *e, int *oi, int *ok, int *oj, int ci, int ck, int cj, int n) { int i, ii, j, jj, k, kk, h = 0, l; double s, z; for (i = 0; i < ci; i++) { ii = oi[i]; for (j = 0; j < cj; j++) { jj = oj[j]; l = 0; z = R_PosInf; for (k = 0; k < ck; k++) { kk = ok[k]; if (ii > kk) s = x[kk+ii*n]; else s = x[ii+kk*n]; if (kk > jj) s += x[kk+jj*n]; else s += x[jj+kk*n]; if (s < z) { z = s; h = kk; l = 1; } else if (s == z) { if (unif_rand() > (double) l/(l+1)) h = kk; l++; } } if (!R_FINITE(z)) return 0; /* error */ if (ii > jj) x[jj+ii*n] = z; else x[ii+jj*n] = z; e[ii+jj*n] = h; } } return 1; } static int calcEndOrder(double *x, int *e, int *oi, int *ok, int ci, int ck, int n) { int i, ii, k, kk, h = 0, l; double s, z; for (i = 0; i < ci; i++) { ii = oi[i]; l = 0; z = R_PosInf; for (k = 0; k < ck; k++) { kk = ok[k]; if (ii > kk) s = x[kk+ii*n]; else s = x[ii+kk*n]; if (s < z) { z = s; h = kk; l = 1; } else if (s == z) { if (unif_rand() > (double) l/(l+1)) h = kk; l++; } } if (!R_FINITE(z)) return 0; x[ii+ii*n] = z; e[ii+ii*n] = h; } return 1; } static int debug = FALSE; SEXP order_optimal(SEXP R_dist, SEXP R_merge) { int n, i, ii, j, jj, k, kk, h, a = 0, b = 0; int cl = 0, cll = 0, clr = 0, cr = 0, crl = 0, crr = 0; int *l, *r, *c, *e; int *left, *right, *o, *ol = 0, *oll = 0, *olr = 0, *or = 0, *orl = 0, *orr = 0; double s, z, zz; double *x; SEXP R_obj; n = 1 + (int) sqrt(2 * LENGTH(R_dist)); if (LENGTH(R_dist) < 3 || LENGTH(R_dist) != n*(n-1)/2) error("order_optimal: invalid length"); if (LENGTH(GET_DIM(R_merge)) != 2) error("order_optimal: \"merge\" invalid"); if (INTEGER(GET_DIM(R_merge))[0] != n-1) error("order_optimal: \"dist\" and \"merge\" do not conform"); if (!checkRmerge(INTEGER(R_merge), n)) error("order_optimal: \"merge\" invalid"); /* copy similarities into lower triangle */ x = Calloc(n*n, double); /* data + part order lengths + temporary */ k = 0; for (i = 0; i < n-1; i++) for (j = i+1; j < n; j++) { z = REAL(R_dist)[k++]; if (!R_FINITE(z)) { Free(x); error("order_optimal: \"dist\" invalid"); } else x[j+i*n] = z; } PROTECT(R_obj = NEW_LIST(3)); /* result list */ SET_ELEMENT(R_obj, 0, duplicate(R_merge)); /* merge */ SET_ELEMENT(R_obj, 1, NEW_INTEGER(n)); /* order */ SET_ELEMENT(R_obj, 2, NEW_NUMERIC(1)); /* length */ left = INTEGER(VECTOR_ELT(R_obj, 0)); right = INTEGER(VECTOR_ELT(R_obj, 0))+n-1; o = INTEGER(VECTOR_ELT(R_obj, 1)); GetRNGstate(); l = Calloc(n, int); /* offset of leftmost leaf of left tree */ r = Calloc(n, int); /* offset of leftmost leaf of right tree; * reverse mapping of order */ c = Calloc(n-1, int); /* number of leaves in a tree */ e = Calloc(n*n, int); /* inner endpoints */ /* for each tree count the number of leaves. */ for (k = 0; k < n-1; k++) { if (left[k] > 0) c[k] += c[left[k]-1]; else c[k] = 1; if (right[k] > 0) c[k] += c[right[k]-1]; else c[k] += 1; } /* backpropagate the counts to obtain the current * leaf order and the offset of the leftmost leaf * of the left and right subtree. */ for (k = n-2; k >= 0; k--) { if (left[k] > 0) { h = l[k] + c[left[k]-1]; if (right[k] > 0) l[right[k]-1] = h; else o[h] = -right[k]-1; l[left[k]-1] = l[k]; } else { h = l[k] + 1; if (right[k] > 0) l[right[k]-1] = h; else o[h] = -right[k]-1; o[l[k]] = -left[k]-1; } r[k] = h; } /* determine for each subtree the optimal order * for each pair of left and right endpoints * (leaves). this is done in the order provided * by the merge tree. */ for (k = 0; k < n-1; k++) { ol = o + l[k]; /* order of left subtree */ or = o + r[k]; /* order of right subtree */ cl = r[k] - l[k]; /* number of leaves in left subtree */ cr = c[k] - cl; /* number of leaves in right subtree */ if (cl > 1) { /* a left tree */ h = left[k]-1; oll = o + l[h]; olr = o + r[h]; cll = r[h] - l[h]; clr = c[h] - cll; } else { /* a left leaf */ oll = olr = ol; cll = clr = cl; } if (cr > 1) { /* a right tree */ h = right[k]-1; orl = o + l[h]; orr = o + r[h]; crl = r[h] - l[h]; crr = c[h] - crl; } else { /* a right leaf */ orl = orr = or; crl = crr = cr; } if (k == n-2) /* optimized search at the root */ break; /* compute temporary sums for all endpoints */ if (!calcAllOrder(x, e, oll, olr, or, cll, clr, cr, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (olr != oll) if (!calcAllOrder(x, e, olr, oll, or, clr, cll, cr, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } /* copy temporary sums to lower triangle */ for (i = 0; i < cl; i++) { ii = ol[i]; for (j = 0; j < cr; j++) { jj = or[j]; if (ii > jj) x[ii+jj*n] = x[jj+ii*n]; else x[jj+ii*n] = x[ii+jj*n]; } } /* compute best orders for all endpoints */ if (!calcAllOrder(x, e, orl, orr, ol, crl, crr, cl, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (orr != orl) if (!calcAllOrder(x, e, orr, orl, ol, crr, crl, cl, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } /* now that we know both endpoints we can store * the inner endpoint from the left tree at the * correct addresse. */ for (i = 0; i < cr; i++) { ii = or[i]; for (j = 0; j < cl; j++) { jj = ol[j]; kk = e[ii+jj*n]; if (ii > jj) x[ii+jj*n] = (double) e[jj+kk*n]; else x[jj+ii*n] = (double) e[jj+kk*n]; } } /* copy back */ for (i = 0; i < cl; i++) { ii = ol[i]; for (j = 0; j < cr; j++) { jj = or[j]; if (ii > jj) e[ii+jj*n] = (int) x[ii+jj*n]; else e[ii+jj*n] = (int) x[jj+ii*n]; } } } /* find the best linear order for each endpoint * of the left and right subtree of the root */ if (!calcEndOrder(x, e, oll, olr, cll, clr, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (olr != oll) if (!calcEndOrder(x, e, olr, oll, clr, cll, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (!calcEndOrder(x, e, orl, orr, crl, crr, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } if (orr != orl) if (!calcEndOrder(x, e, orr, orl, crr, crl, n)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } /* find the best linear order at the root */ k = 0; z = R_PosInf; for (i = 0; i < cl; i++) { ii = ol[i]; zz = x[ii+ii*n]; for (j = 0; j < cr; j++) { jj = or[j]; s = zz + x[jj+jj*n]; if (ii > jj) s += x[ii+jj*n]; else s += x[jj+ii*n]; if (s < z) { z = s; a = ii; b = jj; k = 1; } else if (s == z) { if (unif_rand() > (double) k/(k+1)) { a = ii; b = jj; } k++; } } if (!R_FINITE(z)) { Free(x); Free(r); Free(l); Free(c); Free(e); error("order_optimal: non-finite values"); } } REAL(VECTOR_ELT(R_obj, 2))[0] = z; /* set length */ /* the order can be found by double recursion. * for this we use a stack, one for the left * and one for the right endpoints. */ l[0] = b; /* push endpoints of right tree on the stack*/ r[0] = e[b+b*n]; i = e[a+a*n]; /* start with endpoints of left tree */ j = a; h = 0; k = 1; while (h < n) { if (i == j) { /* backtrack */ o[h++] = i; k--; if (k < 0) break; i = l[k]; /* pop endpoints */ j = r[k]; } else { l[k] = e[j+i*n]; /* push endpoints of right tree on the stack */ r[k] = j; k++; j = e[i+j*n]; /* recurse left tree */ } } /* adjust the merge tree to the optimal order * * 1) for each pair of leaves from a left and right * subtree the order relation is the same. thus, * use the leftmost leaves as representatives. * * 2) if the order is reversed we must swap the * subtrees at the parent. */ for (k = 0; k < n; k++) /* reverse mapping of optimal order */ r[o[k]] = k; for (k = 0; k < n-1; k++) { if (left[k] > 0) /* left leaf in left subtree */ i = l[left[k]-1]; else i = -left[k]-1; if (right[k] > 0) /* left leaf in right subtree */ j = l[right[k]-1]; else j = -right[k]-1; if (r[i] > r[j]) { /* swap the subtrees */ h = right[k]; right[k] = left[k]; left[k] = h; } l[k] = i; /* left leaf in parent tree */ } for (k = 0; k < n; k++) /* offset to R indexing */ o[k]++; if (debug) { i = e[a+a*n]; j = e[b+b*n]; if (i > j) x[j+i*n] = z; else x[i+j*n] = z; for (k = 0; k < n-1; k++) { if (left[k] > 0) l[k] = l[left[k]-1]; else l[k] = -left[k]-1; if (right[k] > 0) r[k] = r[right[k]-1]; else r[k] = -right[k]-1; i = l[k]; j = r[k]; if (i > j) z = x[j+i*n]; else z = x[i+j*n]; Rprintf(" %3i | %4i %4i | %3i %3i | %f\n", k+1, left[k], right[k], i+1, j+1, z); } } Free(x); Free(l); Free(r); Free(c); Free(e); PutRNGstate(); UNPROTECT(1); return R_obj; } /**/ cba/src/gknn.c0000644000175100001440000001106611304023136012702 0ustar hornikusers #include #include /* gknn.c * * generic k-nearest neighbor algorithm which operates on a * user-supplied distance matrix. * * implements: * * 1) inclusion of tied (equi-distant) kth neighbors (see option * use_all) * 2) random selection of tied (equi-distant) kth neighbors (see * option use_all) * 3) minimum vote test (otherwise NA is returned for 'doubt') * 3) breaking of tied votes * * expects as input a cross-distance matrix, a factor of class values, * the number of neighbors to use, the minimum number of votes * (for a definite decision), options for for handling ties in votes * and/or distances (see above), and an option for inclusion of the * proportions of winning votes. note that classe values may be NA * but missing distance values are ignored. * * returns a factor of class predictions and, optionally, a vector * of proportions of winning votes as attribute "prob". * * ceeboo (2005) */ SEXP gknn(SEXP R_x, SEXP R_y, SEXP R_k, SEXP R_l, SEXP R_break_ties, SEXP R_use_all, SEXP R_prob) { int nr, n, nc, nn, nv; int break_ties, use_all; int *y, *o, *c; int i, j, k, l, m, v; double *x; SEXP R_obj, R_pro, R_str; nr = INTEGER(GET_DIM(R_x))[0]; /* number of test instances */ n = INTEGER(GET_DIM(R_x))[1]; /* number of training instances */ if (LENGTH(R_y) != n) error("gknn: \"x\" and \"y\" do not conform"); nc = LENGTH(GET_LEVELS(R_y)); /* number of classes */ if (nc < 1) error("gknn: \"y\" invalid number of levels"); if (STRING_ELT(GET_LEVELS(R_y), nc-1) == NA_STRING) error("gknn: \"y\" invalid level"); y = INTEGER(R_y); /* class indexes (R shifted) */ for (i = 0; i < n; i++) if (y[i] == NA_INTEGER || y[i] < 1 || y[i] > nc) error("gknn: \"y\" invalid value"); nn = INTEGER(R_k)[0]; /* number of neighbors */ if (nn < 1 || nn > n) error("gknn: invalid number of neighbors"); nv = INTEGER(R_l)[0]; /* number of minimum votes */ if (nv < 0 || nv > nn) error("gknn: invalid minimum number of votes"); break_ties = LOGICAL(R_break_ties)[0]; /* tie breaking */ use_all = LOGICAL(R_use_all)[0]; /* use all neighbors */ o = Calloc(n, int); /* order */ c = Calloc(nc+1, int); /* class counts */ x = Calloc(n, double); /* distances */ PROTECT(R_obj = NEW_INTEGER(nr)); if (LOGICAL(R_prob)[0]) { /* return proportions */ PROTECT(R_pro = NEW_NUMERIC(nr)); setAttrib(R_obj, install("prob"), R_pro); UNPROTECT(1); } else R_pro = R_NilValue; GetRNGstate(); for (i = 0; i < nr; i++) { for (j = 0; j < n; j++) { o[j] = j; x[j] = REAL(R_x)[i+j*nr]; /* copy distances */ } rsort_with_index(x, o, n); for (j = 1; j < nc+1; j++) /* R shifted */ c[j] = 0; k = 0; /* invalid class */ for (j = 0; j < nn; j++) { /* count classes */ if (ISNAN(x[j])) break; k = y[o[j]]; c[k]++; } if (use_all) { /* use tied */ while (j < n && x[j] == x[j-1]) { k = y[o[j++]]; c[k]++; } } else { /* draw from tied */ while (j < n && x[j] == x[j-1]) j++; if (j > nn) { l = nn - 1 + (int) (unif_rand() * (j-nn+1)); l = y[o[l]]; if (l != k) { c[k]--; k = l; c[k]++; } } } l = 0; /* number of ties */ v = 0; /* number of votes */ m = 0; /* max votes */ for (j = 1; j < nc+1; j++) { v += c[j]; if (c[j] > m) { m = c[j]; k = j; l = 1; } else if (l > 0 && c[j] == m) { if (unif_rand() > (double) l/(l+1)) k = j; l++; } } if (R_pro != R_NilValue) { if (v > 0) REAL(R_pro)[i] = (double) m/v; else REAL(R_pro)[i] = NA_REAL; } if (nv > m) { /* below minimum vote */ INTEGER(R_obj)[i] = NA_INTEGER; } else { if (l > 0) { if (break_ties) INTEGER(R_obj)[i] = k; else { if (l > 1) INTEGER(R_obj)[i] = NA_INTEGER; else INTEGER(R_obj)[i] = k; } } else INTEGER(R_obj)[i] = NA_INTEGER; } } Free(o); Free(c); Free(x); PutRNGstate(); SET_LEVELS(R_obj, duplicate(GET_LEVELS(R_y))); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str , 0, mkChar("factor")); SET_CLASS(R_obj, R_str); UNPROTECT(1); UNPROTECT(1); return R_obj; } /**/ cba/src/proximus.c0000644000175100001440000003270214332123330013634 0ustar hornikusers/* * proximus.c * * implements the paper: * * M. Koyutürk, A. Graham, and N. Ramakrishnan. Compression, Clustering, * and Pattern Discovery in Very High-Dimensional Descrete-Attribute Data * Sets. IEEE Transactions On Knowledge and Data Engineering, Vol. 17, * No. 4, (April) 2005 * * As data mining algorithms are supposed to deal with large amounts of * data this code was optimized for low memory footprint and low execution * times. Fasten your seat belts ;-) * * * (C) ceeboo 2005 */ #include #include static int debug = FALSE; /* user defined */ /* copy a variable to R */ static SEXP var2R(int v) { SEXP R_obj; R_obj = NEW_INTEGER(1); INTEGER(R_obj)[0] = v; return R_obj; } /* vector for counting and indexing */ typedef struct { int *v; /* pointer to array of values */ int n; /* number elements */ } VEC; static VEC *newVec(int n) { int *v = Calloc(n, int); VEC *p = Calloc(1, VEC); p->v = v; p->n = n; return p; } static VEC *copyVec(VEC *v) { int i; VEC *r = newVec(v->n); for (i = 0; i < v->n; i++) r->v[i] = v->v[i]; return r; } static void freeVec(VEC *v) { if (v->v != NULL) Free(v->v); Free(v); } /* copy a vector to R where an offset is added * to each element */ static SEXP vec2R(VEC *v, int o) { int j; SEXP R_obj; R_obj = NEW_INTEGER(v->n); for (j = 0; j < v->n; j++) INTEGER(R_obj)[j] = v->v[j] + o; return R_obj; } /* for debugging static void VecPrintf(VEC *v, char *s) { int j; Rprintf("%s", s); for (j = 0; j < v->n; j++) Rprintf(" (%i,%i)", j, v->v[j]); Rprintf("\n"); } */ /* matrix for binary data in sparse column format */ typedef struct { int *ri; /* pointer to array of row indexes */ int *ci; /* pointer to array of column start indexes */ int nr; /* number of rows */ int nc; /* number of columns */ } MAT; static void freeMat(MAT *m) { Free(m->ri); Free(m->ci); Free(m); } /* copy and transpose R matrix to sparse matrix */ static MAT *R_mat2mat(SEXP R_mat) { extern int debug; int nr, nc; int i, j, k, n; int *x, *ci, *ri; MAT *m; x = INTEGER(R_mat); nr = INTEGER(GET_DIM(R_mat))[0]; /* number of rows */ nc = INTEGER(GET_DIM(R_mat))[1]; /* number of columns */ ci = Calloc(nr+1, int); /* column start */ n = 1024; /* initial size */ ri = Calloc(n, int); /* row indexes */ k = 0; for (j = 0; j < nr; j++) { ci[j] = k; for (i = 0; i < nc; i++) if (x[i * nr + j] == 1) { if (k == n) { /* used up */ n *= 2; /* double size */ ri = Realloc(ri, n, int); } ri[k++] = i; } } ci[j] = k; /* length of ri */ if (n > k) /* free unused */ ri = Realloc(ri, k, int); if (debug) { Rprintf("Non-Zero: %i\n", k); Rprintf("Sparsity: %4.2f\n",k / (double) (nr * nc)); } m = Calloc(1, MAT); m->ri = ri; m->ci = ci; m->nr = nc; m->nc = nr; return m; } /* multiply a matrix in sparse column format (m) with a sparse * vector (v) from the left using a subset of the columns (s). the * caller is reponsible for providing a proper results vector (r). */ static void matLeft(VEC *r, VEC *v, VEC *s, MAT *m) { int i, j, k, z; for (i = 0; i < s->n; i++) { /* columns */ z = 0; k = 0; j = m->ci[s->v[i]]; do { /* rows */ if (m->ri[j] == v->v[k]) { z++; j++; k++; } else if (m->ri[j] < v->v[k]) j++; else k++; } while (j < m->ci[s->v[i] + 1] && k < v->n); r->v[i] = z; } r->n = s->n; } /* as above but multiply from the right */ static void matRight(VEC *r, VEC *v, MAT *m) { int i, j; for (i = 0; i < m->nr; i++) r->v[i] = 0; r->n = m->nr; for (i = 0; i < v->n; i++) for (j = m->ci[v->v[i]]; j < m->ci[v->v[i] + 1]; j++) r->v[m->ri[j]]++; } /* linked list of approximation results */ typedef struct resNode { VEC *x; /* presence vector (column indexes) */ VEC *y; /* dominant pattern vector (row indexes) */ int n; /* number of ones ... */ int c; /* approximation criterion */ int r; /* hamming radius */ struct resNode *next; /* pointer to result element */ } RES; static int res_cnt; /* number of result elements */ static RES *res_last; /* last element of result list */ static int freeRes(RES *r) { int i; RES *p, *q; i = 0; for (p = r; p != NULL; p = q) { q = p->next; freeVec(p->x); freeVec(p->y); Free(p); i++; } return i; } /* copy result list to R and clean up * * fixme: pointer protection should be * on the level of the caller */ static SEXP res2R(RES *r, MAT *m) { int i, nr, nc; RES *p, *q; SEXP R_ret, R_obj, R_lst, R_res; nc = m->nr; /* transpose */ nr = m->nc; PROTECT(R_ret = NEW_LIST(3)); /* results header */ SET_ELEMENT(R_ret, 0, PROTECT(var2R(nr))); SET_ELEMENT(R_ret, 1, PROTECT(var2R(nc))); UNPROTECT(2); PROTECT(R_obj = NEW_STRING(3)); SET_STRING_ELT(R_obj, 0, mkChar("nr")); SET_STRING_ELT(R_obj, 1, mkChar("nc")); SET_STRING_ELT(R_obj, 2, mkChar("a")); SET_NAMES(R_ret, R_obj); UNPROTECT(1); PROTECT(R_lst = NEW_LIST(res_cnt)); /* results list */ i = 0; for (p = r; p != NULL; p = q) { q = p->next; PROTECT(R_res = NEW_LIST(5)); SET_ELEMENT(R_res, 0, PROTECT(vec2R(p->x,1))); SET_ELEMENT(R_res, 1, PROTECT(vec2R(p->y,1))); UNPROTECT(2); SET_ELEMENT(R_res, 2, PROTECT(var2R(p->n))); SET_ELEMENT(R_res, 3, PROTECT(var2R(p->c))); SET_ELEMENT(R_res, 4, PROTECT(var2R(p->r))); UNPROTECT(3); freeVec(p->x); freeVec(p->y); Free(p); PROTECT(R_obj = NEW_STRING(5)); SET_STRING_ELT(R_obj, 0, mkChar("x")); SET_STRING_ELT(R_obj, 1, mkChar("y")); SET_STRING_ELT(R_obj, 2, mkChar("n")); SET_STRING_ELT(R_obj, 3, mkChar("c")); SET_STRING_ELT(R_obj, 4, mkChar("r")); SET_NAMES(R_res, R_obj); UNPROTECT(1); if (i == res_cnt) { i += freeRes(q); freeMat(m); error("res2R result count error [%i:%i]", i, res_cnt); } SET_ELEMENT(R_lst, i++, R_res); UNPROTECT(1); } if (i != res_cnt) error("res2R result count error [%i:%i]", i, res_cnt); SET_ELEMENT(R_ret, 2, R_lst); UNPROTECT(2); return R_ret; } /* compute the rank-one approximation of a column subset of a * matrix. the code is optimized for minimal memory usage */ static int min_size = 1; /* user defined */ static int max_iter = 16; /* user defined */ static RES *approximate(VEC *s, MAT *m) { extern int min_size; /* minimum set size */ extern int max_iter; /* maximum iterations */ extern int debug; int i, j, l, c, z; VEC *x, *y, *v; RES *p; x = newVec(s->n); /* presence set (column indexes) */ y = newVec(m->nr); /* dominant pattern (row indexes) */ v = newVec((s->n > m->nr) ? s->n : m->nr); /* result vector (counts) */ if (s->n > min_size) { i = (int) (unif_rand() * s->n); /* sample a column */ y->n = 0; for (j = m->ci[s->v[i]]; j < m->ci[s->v[i] + 1]; j++) y->v[y->n++] = m->ri[j]; } else { for (j = 0; j < s->n; j++) x->v[j] = s->v[j]; } z = 0; /* number of ones in pattern */ c = -1; /* stopping criterion */ i = 0; while (i < max_iter) { if (s->n > min_size) { matLeft(v, y, s, m); x->n = 0; for (j = 0; j < v->n; j++) if (2 * v->v[j] >= y->n) /* holds for at least one */ x->v[x->n++] = s->v[j]; } matRight(v, x, m); z = 0; y->n = 0; for (j = 0; j < v->n; j++) if (2 * v->v[j] >= x->n) { /* may not hold for any */ z += v->v[j]; y->v[y->n++] = j; } l = c; c = 2 * z - x->n * y->n; if (c == l) /* convergence */ break; i++; if (debug > 1) Rprintf("%2i %6i %i\n", i, x->n, c); } if (i == max_iter) /* no convergence */ warning("approximation: no convergence"); /* compute the Hamming radius of the presence set */ matLeft(v, y, x, m); l = 0; for (i = 0; i < x->n; i++) { j = m->ci[x->v[i] + 1] - m->ci[x->v[i]]; j += y->n - 2 * v->v[i]; if (j > l) l = j; } freeVec(v); x->v = Realloc(x->v, x->n, int); /* see above */ if (y->n) /* see above */ y->v = Realloc(y->v, y->n, int); else { Free(y->v); y->v = NULL; } /* package result */ p = Calloc(1, RES); p->x = x; p->y = y; p->n = z; p->c = c; p->r = l; p->next = NULL; return p; } /* produce a presence set. for now, draw a pattern * and select additional patterns that are within the user * defined radius. this may result in a singular set and has * nothing todo with the approximation idea! this is more * like vodoo. */ static int max_radius = 1; /* user defined */ static VEC *presenceSet(VEC *s, MAT *m) { extern int debug; extern int max_radius; int i, j, z; VEC *y, *x; y = newVec(m->nr); /* pattern vector */ x = newVec(s->n); /* presenece vector */ i = (int) (unif_rand() * s->n); /* sample a column */ y->n = 0; for (j = m->ci[s->v[i]]; j < m->ci[s->v[i] + 1]; j++) y->v[y->n++] = m->ri[j]; /* select all rows that are within the * radius of the selected pattern */ matLeft(x, y, s, m); x->n = 0; for (i = 0; i < s->n; i++) { z = m->ci[s->v[i] + 1] - m->ci[s->v[i]]; z += y->n - 2 * x->v[i]; if (z <= max_radius) x->v[x->n++] = s->v[i]; } if (debug > 1) Rprintf(" %i %i\n", s->n, x->n); freeVec(y); x->v = Realloc(x->v, x->n, int); return x; } /* remove the set x from set s (column indexes). note: this is * not a general implementation of the setminus operation. */ static void remSet(VEC *x, VEC *s) { int i, j, k; j = 0; k = 0; for (i = 0; i < s->n; i++) if (j < x->n && x->v[j] == s->v[i]) j++; else s->v[k++] = s->v[i]; s->n = k; } /* partition a binary matrix over the columns. the code is optimized * for minimal memory usage. for the sake of algorithmic clarity * shortcuts with respect to terminal nodes are not implemented. */ static int min_retry = 10; /* user defined */ static RES *partition(VEC *s, MAT *m, int d, int i) { extern int max_radius; extern int min_retry; extern int min_size; /* see approximation */ extern int debug; extern int res_cnt; extern RES *res_last; VEC *xx, *ss; RES *z, *zz; z = approximate(s, m); if (debug) Rprintf("%3i [%i,%i,%i] %i", d, s->n, z->x->n, z->r, i); if (z->x->n == s->n) { /* pure */ if (z->r <= max_radius || /* homogenous */ z->x->n <= min_size) { /* min size */ res_cnt++; if (debug) Rprintf(" * %i\n", res_cnt); return res_last = z; } else if (min_retry && s->n >= min_retry * i) { /* retry */ if (debug) Rprintf(" +\n"); freeRes(z); return partition(s, m, d, i+1); } else { /* vodoo !!! */ if (debug) Rprintf(" >>\n"); freeRes(z); /* compare below */ xx = presenceSet(s, m); ss = copyVec(xx); zz = partition(ss, m, d+1, i); freeVec(ss); remSet(xx, s); freeVec(xx); if (s->n) { z = res_last; z->next = partition(s, m, d+1, i); } if (debug) Rprintf("%3i <<\n", d); return zz; } } if (debug) Rprintf(" >\n"); /* in order to prevent excessive memory consumption we reuse the * subset vector for the next zero set. as its contents may get * changed in the recursion, the next one set must be a copy of * the current one set. */ ss = copyVec(z->x); zz = partition(ss, m, d+1, i); freeVec(ss); remSet(z->x, s); freeRes(z); z = res_last; z->next = partition(s, m, d+1, i); if (debug) Rprintf("%3i <\n", d); return zz; } /* R interface */ SEXP proximus(SEXP R_mat, SEXP R_max_radius, SEXP R_min_size, SEXP R_min_retry, SEXP R_max_iter, SEXP R_debug) { extern int max_radius; /* see partition */ extern int min_size; extern int min_retry; extern int max_iter; /* see approximation */ extern int debug; extern int res_cnt; int j; VEC *s; MAT *m; RES *r; SEXP R_res; if (!LENGTH(R_max_radius) || !LENGTH(R_min_size ) || !LENGTH(R_min_retry ) || !LENGTH(R_max_iter ) || !LENGTH(R_debug )) error("proximus: missing parameter"); max_radius = INTEGER(R_max_radius)[0]; min_size = INTEGER(R_min_size )[0]; min_retry = INTEGER(R_min_retry )[0]; max_iter = INTEGER(R_max_iter )[0]; debug = LOGICAL(R_debug )[0]; if (!IS_LOGICAL(R_mat)) error("proximus: matrix not logical"); m = R_mat2mat(R_mat); s = newVec(m->nc); /* column subset vector */ for (j = 0; j < s->n; j++) s->v[j] = j; GetRNGstate(); res_cnt = 0; /* reset results counter */ r = partition(s, m, 0, 1); /* recursion */ PutRNGstate(); freeVec(s); R_res = res2R(r, m); freeMat(m); return R_res; } /***/ cba/src/stress.c0000644000175100001440000003017612265434154013310 0ustar hornikusers #include #include // arrayIndex.c extern SEXP _int_array_subscript(int, SEXP, const char *, const char *, SEXP, Rboolean, SEXP); /* compute the stress measure based on Moor Neighborhoods, i.e. the * sums of the squared distances of a point to its eight (five at the * margins and three at the corners) adjacent neighbors as defined by * the row and column indexes (or subsets of it). * * this function counts each edge distance only once! so, if you * prefer the measure from the paper you have to take twice the * value. * * note that NAs are omitted. however, the function does not return * NA if there was no legal edge at all. */ double stressMoore(double *x, int *r, int *c, int nr, int nc, int nrx) { double d, v, z; int i, j, l, ll, k, kk; z = 0; l = r[0]; for (i = 0; i < nr-1; i++) { ll = r[i+1]; k = c[0] * nrx; for (j = 0; j < nc-1; j++) { kk = c[j+1] * nrx; v = x[l+k]; if (!ISNAN(v)) { d = v - x[ll+k]; if (!ISNAN(d)) z += d * d; d = v - x[ll+kk]; if (!ISNAN(d)) z += d * d; d = v - x[l+kk]; if (!ISNAN(d)) z += d * d; } d = x[ll+k] - x[l+kk]; k = kk; if (!ISNAN(d)) z += d * d; } d = x[l+k] - x[ll+k]; l = ll; if (!ISNAN(d)) z += d * d; R_CheckUserInterrupt(); } k = c[0] * nrx; for (j = 0; j < nc-1; j++) { kk = c[j+1] * nrx; d = x[l+k] - x[l+kk]; k = kk; if (!ISNAN(d)) z += d * d; } return z; } /* same as above but use a von Neumann neighborhood, i.e. the * neighboring points on the diagonals are excluded. */ double stressNeumann(double *x, int *r, int *c, int nr, int nc, int nrx) { double d, v, z; int i, j, l, ll, k, kk; z = 0; l = r[0]; for (i = 0; i < nr-1; i++) { ll = r[i+1]; k = c[0] * nrx; for (j = 0; j < nc-1; j++) { kk = c[j+1] * nrx; v = x[l+k]; if (!ISNAN(v)) { d = v - x[ll+k]; if (!ISNAN(d)) z += d * d; d = v - x[l+kk]; if (!ISNAN(d)) z += d * d; } k = kk; } d = x[l+k] - x[ll+k]; l = ll; if (!ISNAN(d)) z += d * d; R_CheckUserInterrupt(); } k = c[0] * nrx; for (j = 0; j < nc-1; j++) { kk = c[j+1] * nrx; d = x[l+k] - x[l+kk]; k = kk; if (!ISNAN(d)) z += d * d; } return z; } /* R wrapper to the stress functions */ SEXP stress(SEXP R_x, SEXP R_r, SEXP R_c, SEXP R_type) { int nrx, nr, nc; int k; int *r, *c; SEXP R_obj; #ifdef _COMPAT_ PROTECT(R_r = arraySubscript(0, R_r, GET_DIM(R_x), getAttrib, (STRING_ELT), R_x)); PROTECT(R_c = arraySubscript(1, R_c, GET_DIM(R_x), getAttrib, (STRING_ELT), R_x)); #else PROTECT(R_r = _int_array_subscript(0, R_r, "dim", "dimnames", R_x, TRUE, R_NilValue)); PROTECT(R_c = _int_array_subscript(1, R_c, "dim", "dimnames", R_x, TRUE, R_NilValue)); #endif nrx = INTEGER(GET_DIM(R_x))[0]; /* number of rows */ nr = LENGTH(R_r); nc = LENGTH(R_c); /* remap R indexes to C indexes * this sucks! */ r = Calloc(nr, int); c = Calloc(nc, int); /* copy and shift indexes */ for (k = 0; k < nr; k++) r[k] = INTEGER(R_r)[k]-1; for (k = 0; k < nc; k++) c[k] = INTEGER(R_c)[k]-1; PROTECT(R_obj = NEW_NUMERIC(1)); switch (INTEGER(R_type)[0]) { case 1: REAL(R_obj)[0] = stressMoore(REAL(R_x), r, c, nr, nc, nrx); break; case 2: REAL(R_obj)[0] = stressNeumann(REAL(R_x), r, c, nr, nc, nrx); break; default: Free(r); Free(c); error("stress: type not implemented"); } Free(r); Free(c); UNPROTECT(3); return R_obj; } /* calculate the Moore distances between all pairs of rows or columns. * of a matrix. for a given (fixed) row or column ordering the distances * could be used to search for an optimal column or row ordering using * an alternating scheme. * * if the calculation are over the rows ncx = 1, otherwise the roles * of rows and columns are swapped and nrx = 1. * * the caller must provide the result array d and the temporary array t. * * the distances are arranged in lower triangular column format (compare * the R function dist). * * note that the edge distances are computed only once! * * (C) ceeboo 2005, 2006 */ void distMoore(double *x, int *r, int *c, int nr, int nc, int nrx, int ncx, double *d, double *t) { double v, w, z; int i, ii, j, jj, k, kk, kkk, l; for (k = 0; k < nr*(nr-1)/2; k++) /* initialize distances */ d[k] = 0; for (i = 0; i < nr; i++) { z = 0; ii = r[i] * ncx; kk = c[0] * nrx; for (k = 0; k < nc-1; k++) { kkk = c[k+1] * nrx; w = x[ii+kk] - x[ii+kkk]; if (!ISNAN(w)) z += w * w; kk = kkk; } t[i] = z; R_CheckUserInterrupt(); } l = 0; for (i = 0; i < nr-1; i++) { ii = r[i] * ncx; for (j = i+1; j < nr; j++) { z = t[i] + t[j]; jj = r[j] * ncx; kk = c[0] * nrx; for (k = 0; k < nc-1; k++) { kkk = c[k+1] * nrx; v = x[ii+kk]; if (!ISNAN(v)) { w = v - x[jj+kk]; if (!ISNAN(w)) z += w * w; w = v - x[jj+kkk]; if (!ISNAN(w)) z += w * w; } w = x[jj+kk] - x[ii+kkk]; if (!ISNAN(w)) z += w * w; kk = kkk; } w = x[ii+kk] - x[jj+kk]; if (!ISNAN(w)) z += w * w; d[l++] = z; R_CheckUserInterrupt(); } } } /* calculate the von Neumann distances over the rows or columns of a * matrix. * * compare above. */ void distNeumann(double *x, int *r, int *c, int nr, int nc, int nrx, int ncx, double *d, double *t) { double w, z; int i, ii, j, jj, k, kk, kkk, l; for (k = 0; k < nr*(nr-1)/2; k++) /* initialize distances */ d[k] = 0; for (i = 0; i < nr; i++) { z = 0; ii = r[i] * ncx; kk = c[0] * nrx; for (k = 0; k < nc-1; k++) { kkk = c[k+1] * nrx; w = x[ii+kk] - x[ii+kkk]; if (!ISNAN(w)) z += w * w; kk = kkk; } t[i] = z; R_CheckUserInterrupt(); } l = 0; for (i = 0; i < nr-1; i++) { ii = r[i] * ncx; for (j = i+1; j < nr; j++) { z = t[i] + t[j]; jj = r[j] * ncx; for (k = 0; k < nc-1; k++) { kk = c[k] * nrx; w = x[ii+kk]- x[jj+kk]; if (!ISNAN(w)) z += w * w; } kk = c[k] * nrx; w = x[ii+kk] - x[jj+kk]; if (!ISNAN(w)) z += w * w; d[l++] = z; R_CheckUserInterrupt(); } } } /* R wrapper */ SEXP stress_dist(SEXP R_x, SEXP R_r, SEXP R_c, SEXP R_bycol, SEXP R_type) { int nrx, nr, nc; int k; int *r, *c; double *d, *t; SEXP R_obj = R_NilValue; /* compiler hack */ #ifdef _COMPAT_ PROTECT(R_r = arraySubscript(0, R_r, GET_DIM(R_x), getAttrib, (STRING_ELT), R_x)); PROTECT(R_c = arraySubscript(1, R_c, GET_DIM(R_x), getAttrib, (STRING_ELT), R_x)); #else PROTECT(R_r = _int_array_subscript(0, R_r, "dim", "dimnames", R_x, TRUE, R_NilValue)); PROTECT(R_c = _int_array_subscript(1, R_c, "dim", "dimnames", R_x, TRUE, R_NilValue)); #endif nrx = INTEGER(GET_DIM(R_x))[0]; /* number of rows */ nr = LENGTH(R_r); nc = LENGTH(R_c); /* remap R indexes to C indexes * this sucks! */ r = Calloc(nr, int); c = Calloc(nc, int); /* copy and shift indexes */ for (k = 0; k < nr; k++) r[k] = INTEGER(R_r)[k]-1; for (k = 0; k < nc; k++) c[k] = INTEGER(R_c)[k]-1; switch(LOGICAL(R_bycol)[0]) { case 0: PROTECT(R_obj = NEW_NUMERIC(nr*(nr-1)/2)); d = REAL(R_obj); t = Calloc(nr, double); switch(INTEGER(R_type)[0]) { case 1: distMoore(REAL(R_x), r, c, nr, nc, nrx, 1, d, t); break; case 2: distNeumann(REAL(R_x), r, c, nr, nc, nrx, 1, d, t); break; default: Free(r); Free(c); Free(t); error("stress_dist: \"type\" not implemented"); } Free(t); break; case 1: PROTECT(R_obj = NEW_NUMERIC(nc*(nc-1)/2)); d = REAL(R_obj); t = Calloc(nc, double); switch(INTEGER(R_type)[0]) { case 1: distMoore(REAL(R_x), c, r, nc, nr, 1, nrx, d, t); break; case 2: distNeumann(REAL(R_x), c, r, nc, nr, 1, nrx, d, t); break; default: Free(r); Free(c); Free(t); error("stress_dist: type not implemented"); } Free(t); break; default: Free(r); Free(c); error("stress_dist: \"bycol\" invalid"); } Free(r); Free(c); UNPROTECT(3); return R_obj; } /* in ordering problems we want find a path of minimum length through * a distance graph. this is a TSP problem with a dummy city that is * equally distant from all other cities, i.e. the length of the path * i -> 0 -> j, closing the tour is irrelevant. however, the length of * the leg i -> j is greater or equal than any remaining leg k -> l on * the optimum tour. * * see: Climer, S. and Zhang, W. (2006) Rearrangement Clustering: * Pitfalls, Remedies, and Applications. Journal of Machine * Learning Research 7, pp. 919-943. * * orderTSP implements a greedy heuristic that exchanges two edges * immediately if this improves the tour length and stops if no further * improvement (over all combinations of edges) is possible. exchanging * edges amounts to reversing subpaths. * * the time complexity is O(n^2) with n the number of cities. * * note: the algorithm could easily be extended to a simulated * annealing algorithm. the code is slightly optimized. */ SEXP orderTSP(SEXP x, SEXP t) { if (TYPEOF(x) != REALSXP) error("'x' invalid storage type"); if (TYPEOF(t) != INTSXP) error("'t' invalid storage type"); int i, n, f = 0; n = 1 + (int) sqrt(2*LENGTH(x)); if (LENGTH(x) != n*(n-1)/2) error("'x' invalid length"); if (LENGTH(t) != n) error("'t' invalid length"); for (i = 0; i < n; i++) if (INTEGER(t)[i] < 1 || INTEGER(t)[i] > n) error("'t' invalid"); PROTECT(t = duplicate(t)); do { int i, j, k = 0, l = 0, c1, c2, c3, c4 = n-1; double e23, e13, e12, e34, e24, e31, e41; f = 0; c1 = INTEGER(t)[0]-1; for (i = 1; i < n-1; i++) { c2 = INTEGER(t)[i]-1; c3 = INTEGER(t)[i+1]-1; if (c2 > c3) e23 = REAL(x)[c2+c3*(n-1)-c3*(c3+1)/2-1]; else e23 = REAL(x)[c3+c2*(n-1)-c2*(c2+1)/2-1]; if (c1 > c3) e13 = REAL(x)[c1+c3*(n-1)-c3*(c3+1)/2-1]; else e13 = REAL(x)[c3+c1*(n-1)-c1*(c1+1)/2-1]; if (e23 > e13) { f++; for (k = 0; k < (i+1)/2; k++) { l = INTEGER(t)[i-k]; INTEGER(t)[i-k] = INTEGER(t)[k]; INTEGER(t)[k] = l; } c1 = INTEGER(t)[0]-1; } } for (i = 0; i < n-3; i++) { c1 = INTEGER(t)[i]-1; c2 = INTEGER(t)[i+1]-1; if (c1 > c2) e12 = REAL(x)[c1+c2*(n-1)-c2*(c2+1)/2-1]; else e12 = REAL(x)[c2+c1*(n-1)-c1*(c1+1)/2-1]; for (j = i+2; j < n-1; j++) { c3 = INTEGER(t)[j]-1; c4 = INTEGER(t)[j+1]-1; if (c3 > c4) e34 = REAL(x)[c3+c4*(n-1)-c4*(c4+1)/2-1]; else e34 = REAL(x)[c4+c3*(n-1)-c3*(c3+1)/2-1]; if (c2 > c4) e24 = REAL(x)[c2+c4*(n-1)-c4*(c4+1)/2-1]; else e24 = REAL(x)[c4+c2*(n-1)-c2*(c2+1)/2-1]; if (c3 > c1) e31 = REAL(x)[c3+c1*(n-1)-c1*(c1+1)/2-1]; else e31 = REAL(x)[c1+c3*(n-1)-c3*(c3+1)/2-1]; if (e12+e34 > e24+e31) { f++; for (k = 0; k < (j-i)/2; k++) { l = INTEGER(t)[j-k]; INTEGER(t)[j-k] = INTEGER(t)[i+1+k]; INTEGER(t)[i+1+k] = l; } c2 = INTEGER(t)[i+1]-1; if (c1 > c2) e12 = REAL(x)[c1+c2*(n-1)-c2*(c2+1)/2-1]; else e12 = REAL(x)[c2+c1*(n-1)-c1*(c1+1)/2-1]; } } if (c4 > c1) e41 = REAL(x)[c4+c1*(n-1)-c1*(c1+1)/2-1]; else e41 = REAL(x)[c1+c4*(n-1)-c4*(c4+1)/2-1]; if (e12 > e41) { f++; for (k = 0; k < (j-i)/2; k++) { l = INTEGER(t)[j-k]; INTEGER(t)[j-k] = INTEGER(t)[i+1+k]; INTEGER(t)[i+1+k] = l; } } R_CheckUserInterrupt(); } } while (f); UNPROTECT(1); return t; } /**/ cba/src/arrayIndex.c0000644000175100001440000001662112265434154014072 0ustar hornikusers #include #include // workaround i18n #define _(x) (x) // copied from 2.14-2 src/main/subscript.c // // ceeboo 2011/11 2014/1 // #define ECALL(call, yy) if(call == R_NilValue) error(yy); else errorcall(call, yy); static SEXP nullSubscript(int n) { int i; SEXP indx; indx = allocVector(INTSXP, n); for (i = 0; i < n; i++) INTEGER(indx)[i] = i + 1; return indx; } static SEXP logicalSubscript(SEXP s, int ns, int nx, int *stretch, SEXP call) { int canstretch, count, i, nmax; SEXP indx; canstretch = *stretch; if (!canstretch && ns > nx) { ECALL(call, _("(subscript) logical subscript too long")); } nmax = (ns > nx) ? ns : nx; *stretch = (ns > nx) ? ns : 0; if (ns == 0) return(allocVector(INTSXP, 0)); count = 0; for (i = 0; i < nmax; i++) if (LOGICAL(s)[i%ns]) count++; indx = allocVector(INTSXP, count); count = 0; for (i = 0; i < nmax; i++) if (LOGICAL(s)[i%ns]) { if (LOGICAL(s)[i%ns] == NA_LOGICAL) INTEGER(indx)[count++] = NA_INTEGER; else INTEGER(indx)[count++] = i + 1; } return indx; } static SEXP negativeSubscript(SEXP s, int ns, int nx, SEXP call) { SEXP indx; int stretch = 0; int i, ix; PROTECT(indx = allocVector(LGLSXP, nx)); for (i = 0; i < nx; i++) LOGICAL(indx)[i] = 1; for (i = 0; i < ns; i++) { ix = INTEGER(s)[i]; if (ix != 0 && ix != NA_INTEGER && -ix <= nx) LOGICAL(indx)[-ix - 1] = 0; } s = logicalSubscript(indx, nx, nx, &stretch, call); UNPROTECT(1); return s; } static SEXP positiveSubscript(SEXP s, int ns, int nx) { SEXP indx; int i, zct = 0; for (i = 0; i < ns; i++) { if (INTEGER(s)[i] == 0) zct++; } if (zct) { indx = allocVector(INTSXP, (ns - zct)); for (i = 0, zct = 0; i < ns; i++) if (INTEGER(s)[i] != 0) INTEGER(indx)[zct++] = INTEGER(s)[i]; return indx; } else return s; } static SEXP integerSubscript(SEXP s, int ns, int nx, int *stretch, SEXP call) { int i, ii, min, max, canstretch; Rboolean isna = FALSE; canstretch = *stretch; *stretch = 0; min = 0; max = 0; for (i = 0; i < ns; i++) { ii = INTEGER(s)[i]; if (ii != NA_INTEGER) { if (ii < min) min = ii; if (ii > max) max = ii; } else isna = TRUE; } if (max > nx) { if(canstretch) *stretch = max; else { ECALL(call, _("subscript out of bounds")); } } if (min < 0) { if (max == 0 && !isna) return negativeSubscript(s, ns, nx, call); else { ECALL(call, _("only 0's may be mixed with negative subscripts")); } } else return positiveSubscript(s, ns, nx); return R_NilValue; } /* This uses a couple of horrible hacks in conjunction with * VectorAssign (in subassign.c). If subscripting is used for * assignment, it is possible to extend a vector by supplying new * names, and we want to give the extended vector those names, so they * are returned as the use.names attribute. Also, unset elements of the vector * of new names (places where a match was found) are indicated by * setting the element of the newnames vector to NULL. */ /* The original code (pre 2.0.0) used a ns x nx loop that was too * slow. So now we hash. Hashing is expensive on memory (up to 32nx * bytes) so it is only worth doing if ns * nx is large. If nx is * large, then it will be too slow unless ns is very small. */ static SEXP stringSubscript(SEXP s, int ns, int nx, SEXP names, int *stretch, Rboolean in, SEXP call) { SEXP indx, indexnames; int i, j, nnames, sub, extra; int canstretch = *stretch; /* product may overflow, so check factors as well. */ Rboolean usehashing = in && ( ((ns > 1000 && nx) || (nx > 1000 && ns)) || (ns * nx > 15*nx + ns) ); PROTECT(s); PROTECT(names); PROTECT(indexnames = allocVector(VECSXP, ns)); nnames = nx; extra = nnames; /* Process each of the subscripts. First we compare with the names * on the vector and then (if there is no match) with each of the * previous subscripts, since (if assigning) we may have already * added an element of that name. (If we are not assigning, any * nonmatch will have given an error.) */ if(usehashing) { /* must be internal, so names contains a character vector */ /* NB: this does not behave in the same way with respect to "" and NA names: they will match */ PROTECT(indx = match(names, s, 0)); /* second pass to correct this */ for (i = 0; i < ns; i++) if(STRING_ELT(s, i) == NA_STRING || !CHAR(STRING_ELT(s, i))[0]) INTEGER(indx)[i] = 0; for (i = 0; i < ns; i++) SET_VECTOR_ELT(indexnames, i, R_NilValue); } else { PROTECT(indx = allocVector(INTSXP, ns)); for (i = 0; i < ns; i++) { sub = 0; if (names != R_NilValue) { for (j = 0; j < nnames; j++) { SEXP names_j = STRING_ELT(names, j); if (!in && TYPEOF(names_j) != CHARSXP) { ECALL(call, _("character vector element does not have type CHARSXP")); } if (NonNullStringMatch(STRING_ELT(s, i), names_j)) { sub = j + 1; SET_VECTOR_ELT(indexnames, i, R_NilValue); break; } } } INTEGER(indx)[i] = sub; } } for (i = 0; i < ns; i++) { sub = INTEGER(indx)[i]; if (sub == 0) { for (j = 0 ; j < i ; j++) if (NonNullStringMatch(STRING_ELT(s, i), STRING_ELT(s, j))) { sub = INTEGER(indx)[j]; SET_VECTOR_ELT(indexnames, i, STRING_ELT(s, j)); break; } } if (sub == 0) { if (!canstretch) { ECALL(call, _("subscript out of bounds")); } extra += 1; sub = extra; SET_VECTOR_ELT(indexnames, i, STRING_ELT(s, i)); } INTEGER(indx)[i] = sub; } /* We return the new names as the names attribute of the returned subscript vector. */ if (extra != nnames) setAttrib(indx, install("use.names"), indexnames); if (canstretch) *stretch = extra; UNPROTECT(4); return indx; } /* Array Subscripts. dim is the dimension (0 to k-1) s is the subscript list, dn is the attribute name of dim dnn is the attribute name of dimnames x is the array to be subscripted. */ SEXP _int_array_subscript(int dim, SEXP s, const char *dn, const char *dnn, SEXP x, Rboolean in, SEXP call) { int nd, ns, stretch = 0; SEXP dnames, tmp; ns = LENGTH(s); nd = INTEGER(getAttrib(x, install(dn)))[dim]; switch (TYPEOF(s)) { case NILSXP: return allocVector(INTSXP, 0); case LGLSXP: return logicalSubscript(s, ns, nd, &stretch, call); case INTSXP: return integerSubscript(s, ns, nd, &stretch, call); case REALSXP: PROTECT(tmp = coerceVector(s, INTSXP)); tmp = integerSubscript(tmp, ns, nd, &stretch, call); UNPROTECT(1); return tmp; case STRSXP: dnames = getAttrib(x, install(dnn)); if (dnames == R_NilValue) { ECALL(call, _("no 'dimnames' attribute for array")); } dnames = VECTOR_ELT(dnames, dim); return stringSubscript(s, ns, nd, dnames, &stretch, in, call); case SYMSXP: if (s == R_MissingArg) return nullSubscript(nd); default: if (call == R_NilValue) error(_("invalid subscript type '%s'"), type2char(TYPEOF(s))); else errorcall(call, _("invalid subscript type '%s'"), type2char(TYPEOF(s))); } return R_NilValue; } // R interface SEXP R_arraySubscript(SEXP x, SEXP dim, SEXP s, SEXP dn, SEXP dnn) { // FIXME return _int_array_subscript(INTEGER(dim)[0], s, (const char *) CHAR(STRING_ELT(dn, 0)), (const char *) CHAR(STRING_ELT(dnn, 0)), x, TRUE, R_NilValue); } // cba/src/cluster.c0000644000175100001440000000564414344034315013443 0ustar hornikusers #include #include /* cluster_dist * * cluster an undirected graph as representable by an R dist object, * i.e. find all the disconnected subgraphs of graph. * * as input we expect R_x the vector storage representation of the * upper/lower triangle of a distance matrix (see dist) and R_beta * the distance threshold. * * returns a factor of cluster labels (integers 1,2, ..., k, with * k the number of clusters). * * NA or NaN distance values are interpreted as no link! this is a * simplification as we do not want to check for the 2^k possible * clusterings given each indeterminate link is actually either above * or below the threshold. * * NA or NaN threshold values result in an error as we do not want * to to check for all the possible clusterings given the threshold * assumes a value in the range of the distances. * * fixme: 1) can we do this in less than O(n^2) time? * 2) can we use a strict threshold? * * ceeboo 2006 */ SEXP cluster_dist(SEXP R_x, SEXP R_beta) { if (TYPEOF(R_x) != REALSXP) error("cluster_dist: 'x' invalid storage type"); if (TYPEOF(R_beta) != REALSXP) error("cluster_dist: 'beta' invalid storage type"); int i, j, k, l, n, o, na, *c, *b; char *s; double beta, *x; SEXP R_str, R_obj; n = (int) sqrt(2 * length(R_x)) + 1; if (n < 3 || n * (n - 1) / 2 != length(R_x)) error("cluster_dist: 'x' invalid length"); beta = REAL(R_beta)[0]; /* distance threshold */ if (ISNAN(beta)) error("cluster_dist: 'beta' NA or NaN"); PROTECT(R_obj = NEW_INTEGER(n)); c = INTEGER(R_obj); for (i = 0; i < n; i++) c[i] = i; x = REAL(R_x); k = na = 0; for (i = 0; i < n - 1; i++) for (j = i + 1; j < n; j++) { if (ISNAN(x[k])) { na++; continue; } if (beta >= x[k++]) { if (c[j] == c[i]) continue; if (c[j] == j) c[j] = c[i]; else { o = c[j]; for (l = 0; l < n; l++) if (c[l] == o) c[l] = c[i]; } } } if (na) warning("cluster_dist: found NA (NaN) distance values, different solutions may be possible."); /* make indexes contiguous */ b = Calloc(n, int); k = 0; for (i = 0; i < n; i++) { j = c[i]; if (b[j] == 0) b[j] = ++k; c[i] = b[j]; } Free(b); /* make return value a factor */ int sn = k/10+2; s = Calloc(sn, char); /* stringified integers */ PROTECT(R_str = NEW_STRING(k)); for (j = 0; j < k; j++) { snprintf(s,sn,"%i",j+1); SET_STRING_ELT(R_str, j, mkChar(s)); } Free(s); SET_LEVELS(R_obj, R_str); UNPROTECT(1); PROTECT(R_str = NEW_STRING(1)); SET_STRING_ELT(R_str, 0, mkChar("factor")); SET_CLASS(R_obj, R_str); UNPROTECT(1); /* we are done */ UNPROTECT(1); return R_obj; } cba/src/coding.c0000644000175100001440000000142111304023136013202 0ustar hornikusers #include #include /* dummy code a factor where NAs are ignored, * i.e. all indicators are FALSE */ SEXP as_dummy(SEXP R_x) { int n, l, i, j; SEXP R_obj, R_tmp; n = LENGTH(R_x); l = LENGTH(GET_LEVELS(R_x)); if (l == 0) return R_NilValue; PROTECT(R_obj = NEW_LOGICAL(n*l)); for (i = 0; i < n*l; i++) /* this sucks! */ LOGICAL(R_obj)[i] = FALSE; for (i = 0; i < n; i++) { j = INTEGER(R_x)[i]; if (j == NA_INTEGER) continue; LOGICAL(R_obj)[i+(j-1)*n] = TRUE; } PROTECT(R_tmp = NEW_INTEGER(2)); INTEGER(R_tmp)[0] = n; INTEGER(R_tmp)[1] = l; SET_DIM(R_obj, R_tmp); UNPROTECT(1); SET_LEVELS(R_obj, duplicate(GET_LEVELS(R_x))); UNPROTECT(1); return R_obj; } /**/ cba/src/interpolate.c0000644000175100001440000000311011304023136014262 0ustar hornikusers #include #include /* interpolate a logical matrix to a lower resolution. * * note 1) that we currently use the full storage representation * of a binary matrix and 2) that some rows and/or columns at the * lower and left margins may get cut off * * (C) ceeboo 2005 */ SEXP lminter(SEXP R_x, SEXP R_block_size, SEXP R_nbin) { int nr, nc, np, zr, zc; int i, j; int *x, *z; SEXP R_obj, R_dim; nr = INTEGER(GET_DIM(R_x))[0]; /* number of rows */ nc = INTEGER(GET_DIM(R_x))[1]; /* number of columns */ x = LOGICAL(R_x); np = INTEGER(R_block_size)[0]; /* number of pixels */ zr = nr / np; /* reduced number of rows */ zc = nc / np; /* reduced number of columns */ PROTECT(R_obj = NEW_INTEGER(zr * zc)); z = INTEGER(R_obj); for (j = 0; j < zr * zc; j++) /* this sucks! */ z[j] = 0; for (j = 0; j < zc * np; j++) for (i = 0; i < zr * np; i++) z[i / np + (j / np) * zr] += x[i + j * nr]; i = INTEGER(R_nbin)[0]; /* number of bins */ if (i < 0 || i > np) error("lminter: invalid number of bins"); if (i == 0) { /* majority */ i = np * np / 2 + 1; for (j = 0; j < zr * zc; j++) z[j] /= i; } else { /* bins */ i = np * np / i; for (j = 0; j < zr * zc; j++) z[j] = ceil((double) z[j] / i); } PROTECT(R_dim = NEW_INTEGER(2)); INTEGER(R_dim)[0] = zr; INTEGER(R_dim)[1] = zc; SET_DIM(R_obj, R_dim); UNPROTECT(2); return R_obj; } /**/ cba/src/greedy.c0000644000175100001440000000604311304023136013223 0ustar hornikusers #include #include /* greedy endpoint ordering based on arbitrary similarities. * this is trivial. * * input is a lower triangular distance matrix. returns the * merge tree), the corresponding order, and the height (see * hclust). * * note that the height need not be monotonically increasing! * * (C) ceeboo 2005 */ typedef struct { double v; int i; } MDS; static MDS minDist(double *x, int j, int *c, int *p, int n) { int i, k, l; double v; MDS m = {R_PosInf, 0}; l = 0; for (k = 0; k < n; k++) { i = c[k]; if (i > j) v = x[i+p[j]]; else v = x[j+p[i]]; if (v < m.v) { m.v = v; m.i = i; l = 1; } else if (v == m.v) { if (unif_rand() > (double) l/(l+1)) m.i = i; l++; } } return m; } /* swap */ static void swap(int *x1, int *x2) { int x = *x1; *x1 = *x2; *x2 = x; } SEXP order_greedy(SEXP R_dist) { int n, i, j, h, k; int *left, *right, *order, *c, *p; double *x, *height; MDS l, ll = {R_NaN, 0}, r, rr = {R_NaN, 0}; SEXP R_obj; n = 1 + (int) sqrt(2 * LENGTH(R_dist)); if (LENGTH(R_dist) != n*(n-1)/2) error("order_greedy: \"dist\" invalid length"); PROTECT(R_obj = NEW_LIST(3)); SET_ELEMENT(R_obj, 0, allocMatrix(INTSXP, n-1, 2)); /* merge */ SET_ELEMENT(R_obj, 1, NEW_INTEGER(n)); /* order */ SET_ELEMENT(R_obj, 2, NEW_NUMERIC(n-1)); /* height */ left = INTEGER(VECTOR_ELT(R_obj, 0)); right = INTEGER(VECTOR_ELT(R_obj, 0))+n-1; order = INTEGER(VECTOR_ELT(R_obj, 1)); height = REAL(VECTOR_ELT(R_obj, 2)); x = REAL(R_dist); /* distance matrix */ GetRNGstate(); p = Calloc(n-1, int); /* column pointers */ c = Calloc(n, int); for (k = 0; k < n-1; k++) { c[k] = k; /* candidate leaves */ p[k] = k*(n-1)-k*(k+1)/2-1; order[k] = k; /* here backreference */ } c[k] = k; order[k] = k; i = (int) (unif_rand() * n); /* initial leaf */ h = l.i = ll.i = r.i = rr.i = i; for (k = 0; k < n-1; k++) { swap(c+order[h], c+n-k-1); swap(order+h, order+c[order[h]]); if (ll.i == h) ll = minDist(x, l.i, c, p, n-k-1); if (k == 0) rr = ll; else if (rr.i == h) rr = minDist(x, r.i, c, p, n-k-1); if (!R_FINITE(ll.v) || !R_FINITE(rr.v)) { Free(c); Free(p); error("order_greedy: non-finite values"); } if (ll.v < rr.v) { l = ll; h = l.i; left[k] = -h-1; right[k] = k; height[k] = l.v; } else { r = rr; h = r.i; left[k] = k; right[k] = -h-1; height[k] = r.v; } } left[0] = -i-1; /* in each step a leaf was merged. so, we can simply * descend the tree and place it on the next left * or right position. */ i = 0; j = n-1; for (k = n-2; k >= 0; k--) if (left[k] > 0) order[j--] = -right[k]; else order[i++] = -left[k]; order[j] = -right[0]; Free(c); Free(p); PutRNGstate(); UNPROTECT(1); return R_obj; } /**/ cba/src/dll.c0000644000175100001440000000474513041567251012541 0ustar hornikusers #include #include #include extern SEXP ccfkms(SEXP R_x, SEXP R_p, SEXP R_par, SEXP R_max_iter, SEXP R_opt_std, SEXP R_debug); extern SEXP cluster_dist(SEXP R_x, SEXP R_beta); extern SEXP as_dummy(SEXP R_x); extern SEXP gknn(SEXP R_x, SEXP R_y, SEXP R_k, SEXP R_l, SEXP R_break_ties, SEXP R_use_all, SEXP R_prob); extern SEXP order_optimal(SEXP R_dist, SEXP R_merge); extern SEXP order_length(SEXP R_dist, SEXP R_order); extern SEXP order_greedy(SEXP R_dist); extern SEXP lminter(SEXP R_x, SEXP R_block_size, SEXP R_nbin); extern SEXP proximus(SEXP R_mat, SEXP R_max_radius, SEXP R_min_size, SEXP R_min_retry, SEXP R_max_iter, SEXP R_debug); extern SEXP rockLink(SEXP R_x, SEXP R_beta); extern SEXP rockMerge(SEXP R_x, SEXP R_n, SEXP R_theta, SEXP R_debug); extern SEXP rockClass(SEXP R_x, SEXP R_l, SEXP R_beta, SEXP R_theta); extern SEXP sdists(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_weight, SEXP R_pairwise); extern SEXP sdists_transcript(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_weight, SEXP R_table); extern SEXP sdists_graph(SEXP x); extern SEXP sdists_align(SEXP R_x, SEXP R_y, SEXP t); extern SEXP stress(SEXP R_x, SEXP R_r, SEXP R_c, SEXP R_type); extern SEXP stress_dist(SEXP R_x, SEXP R_r, SEXP R_c, SEXP R_bycol, SEXP R_type); extern SEXP orderTSP(SEXP x, SEXP t); static const R_CallMethodDef CallEntries[] = { {"R_ccfkms", (DL_FUNC) ccfkms, 6}, {"R_cluster_dist", (DL_FUNC) cluster_dist, 2}, {"R_as_dummy", (DL_FUNC) as_dummy, 1}, {"R_gknn", (DL_FUNC) gknn, 7}, {"R_order_optimal", (DL_FUNC) order_optimal, 2}, {"R_order_length", (DL_FUNC) order_length, 2}, {"R_order_greedy", (DL_FUNC) order_greedy, 1}, {"R_lminter", (DL_FUNC) lminter, 3}, {"R_proximus", (DL_FUNC) proximus, 6}, {"R_rockLink", (DL_FUNC) rockLink, 2}, {"R_rockMerge", (DL_FUNC) rockMerge, 4}, {"R_rockClass", (DL_FUNC) rockClass, 4}, {"R_sdists", (DL_FUNC) sdists, 5}, {"R_sdists_transcript", (DL_FUNC) sdists_transcript, 5}, {"R_sdists_graph", (DL_FUNC) sdists_graph, 1}, {"R_sdists_align", (DL_FUNC) sdists_align, 3}, {"R_stress", (DL_FUNC) stress, 4}, {"R_stress_dist", (DL_FUNC) stress_dist, 5}, {"R_orderTSP", (DL_FUNC) orderTSP, 2}, {NULL, NULL, 0} }; void R_init_cba(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } cba/src/sdists.c0000644000175100001440000005300513455363167013301 0ustar hornikusers #include #include #define min(A,B) ((A)>(B) ? (B) : (A)) #define max(A,B) ((A)>(B) ? (A) : (B)) /* compute auto-distances, or cross-distances on lists of * sequences, i.e. vectors of integers representing the * alphabet used. * * D. Gusfield (1997) Algorithms on Strings, Trees, and * Sequences. Cambridge University Press. * * for an interface for two (atomic) sequences which returns * the distance (similarity) as well as the complete edit * (alignment) traceback see below. * * fixme: 1) distance between two empty sequences. * 2) FASTA does not seem to be GPL but we may ask * if we could use parts of it in R. * * note that we do not implement the most efficient algorithmic * concepts known in the field. * * (C) ceeboo 2005, 2006, 2011 */ /* compute the operation weighted edit distance of two * sequences, i.e. insertion, deletion and substitution * may have different costs. * * note that the occurence of missing values results * in NA as a match or mismatch cannot be determined. */ double edist_ow(int *x, int *y, double *w, int nx, int ny, int nw, double *z0, char *b, double *v) { int i, j, x0 = 0, y0 = 0; double z1 = 0, z2 = 0, s0 = 0, s1 = 0, s2 = 0; for (i = 0; i <= nx; i++) { for (j = 0; j <= ny; j++) if (i == 0) { if (j == 0) { z0[j] = z2 = 0; if (b) b[0] = 0; if (v) v[0] = 0; } else { if (y[j-1] == NA_INTEGER) return NA_REAL; z2 = z0[j] = j * (nw > 5 ? w[5] : w[1]); if (b) b[j*(nx+1)] = 2; if (v) v[j*(nx+1)] = z2; } } else if (j == 0) { x0 = x[i-1]; if (x0 == NA_INTEGER) return NA_REAL; z1 = z2 = i * (nw > 4 ? w[4] : w[0]); if (b) b[i] = 1; if (v) v[i] = z1; } else { y0 = y[j-1]; s0 = z0[j] + w[0]; s1 = z1 + w[1]; s2 = z0[j-1] + ((x0 == y0) ? w[2] : w[3]); z2 = min(s0, s1); z2 = min(z2, s2); if (b) b[i+j*(nx+1)] = (s0 == z2) + (s1 == z2) * 2 + (s2 == z2) * ((x0 != y0) ? 4 : 8); if (v) v[i+j*(nx+1)] = z2; z0[j-1] = z1; if (j == ny) z0[j] = z2; else z1 = z2; } } return z2; } /* compute the alphabet-weighted distance. actually, we compute * the global sequential alignment with maximum similarity (see * Gusfield pp. 225) and return it as a negative number. */ double edist_aw(int *x, int *y, double *w, int nx, int ny, int nw, double *z0, char *b, double *v) { int i, j, x0 = 0, y0 = 0; double z1 = 0, z2 = 0, z3 = 0, s0 = 0, s1 = 0, s2 = 0; for (i = 0; i <= nx; i++) { for (j = 0; j <= ny; j++) if (i == 0) { if (j == 0) { z0[j] = z2 = z3 = w[0]; if (b) b[0] = 0; if (v) v[0] = z3; } else { y0 = y[j-1]; if (y0 == NA_INTEGER) return NA_REAL; z2 = z0[j] = z0[j-1] + w[(y0-1)*nw]; if (b) b[j*(nx+1)] = 2; if (v) v[j*(nx+1)] = z2; } } else if (j == 0) { x0 = x[i-1]; if (x0 == NA_INTEGER) return NA_REAL; z3 += w[(x0-1)]; z1 = z2 = z3; if (b) b[i] = 1; if (v) v[i] = z1; } else { y0 = y[j-1]; s0 = z0[j] + w[(x0-1)]; s1 = z1 + w[(y0-1)*nw]; s2 = z0[j-1] + w[(x0-1)+(y0-1)*nw]; z2 = max(s0, s1); z2 = max(z2, s2); if (b) b[i+j*(nx+1)] = (s0 == z2) + (s1 == z2) * 2 + (s2 == z2) * ((x0 != y0) ? 4 : 8); if (v) v[i+j*(nx+1)] = z2; z0[j-1] = z1; if (j == ny) z0[j] = z2; else z1 = z2; } } return -z2; } /* as above but align locally instead of globally. * * notes: 1) a value of zero indicates the empty sequence. * 2) an optimal non-empty solution is indicated by * the fifth bit in the traceback table */ double edist_awl(int *x, int *y, double *w, int nx, int ny, int nw, double *z0, char *b, double *v) { int i, j, x0 = 0, y0 = 0, k = 0, l = 0, *p = 0; double z1 = 0, z2 = 0, z = 0, s0 = 0, s1 = 0, s2 = 0; if (b) p = Calloc(nx*ny, int); for (i = 0; i <= nx; i++) { for (j = 0; j <= ny; j++) if (i == 0) { if (j == 0) { z0[j] = z = 0; if (b) b[0] = 0; if (v) v[0] = 0; } else { if (y[j-1] == NA_INTEGER) return NA_REAL; z0[j] = 0; if (b) b[j*(nx+1)] = 2; if (v) v[j*(nx+1)] = 0; } } else if (j == 0) { x0 = x[i-1]; if (x0 == NA_INTEGER) return NA_REAL; z1 = 0; if (b) b[i] = 1; if (v) v[i] = 0; } else { y0 = y[j-1]; s0 = z0[j] + w[(x0-1)]; s1 = z1 + w[(y0-1)*nw]; s2 = z0[j-1] + w[(x0-1)+(y0-1)*nw]; z2 = max(0, s0); z2 = max(z2, s1), z2 = max(z2, s2); if (b) { k = i+j*(nx+1); b[k] = (z2 > 0 && s0 == z2) + (z2 > 0 && s1 == z2) * 2 + (z2 > 0 && s2 == z2) * ((x0 != y0) ? 4 : 8); if (z2 > z) { l = 0; p[l++] = k; } else if (z2 > 0 && z2 == z) p[l++] = k; } if (v) v[i+j*(nx+1)] = z2; if (z2 > z) z = z2; z0[j-1] = z1; if (j == ny) z0[j] = z2; else z1 = z2; } } if (b) { while (l-- > 0) b[p[l]] = b[p[l]] + 16; Free(p); } return -z; } /* provide a common interface to all internal functions that * compute distances (similarities) of sequences. * * we expect two lists of integer vectors, an integer code for * the internal function, and a double vector (matrix) for the * weights to use. * * internal functions for distance computation have six + three * arguments: the first two are pointers to arrays of integer (the * sequences). the third is a pointer to an array of double (the * weights). the seventh is a pointer to an array of double long * enough to hold temporary results. the eighth is either null or * a pointer to a temporary array of character used in the computation * of the edit transcripts or alignments (see below). the ninth is * either null or a pointer to an array of double large enough to * hold the values of the dynamic programming table. * * returns either a vector in lower triangular format (see dist) * or a matrix of distances. * * in the case of auto-distances we check for asymmetric weights * as these may result in asymmetric distances. * * todo: warning if NA results are encountered (?) */ // test for exact symmetry int is_symmetric(double *x, int n) { int i, j, r = 1; // true for (i = 0; i < n-1; i++) for (j = i+1; j < n; j++) if (x[i+j*n] != x[j+i*n]) { r = 0; break; } return r; } SEXP sdists(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_weight, SEXP R_pairwise) { if (TYPEOF(R_x) != VECSXP || (!isNull(R_y) && TYPEOF(R_y) != VECSXP)) error("invalid sequence parameters"); if (TYPEOF(R_method) != INTSXP) error("invalid method parameter"); if (TYPEOF(R_weight) != REALSXP) error("invalid weight parameter"); if (TYPEOF(R_pairwise) != LGLSXP) error("invalid pairwise parameter"); double (*sdfun)(int *, int *, double *, int, int, int, double *, char *, double *v) = NULL; int nx, ny, nw; int i, j, k, n, m = 0; /* default symmetric */ SEXP x, y, t, r; /* return value */ nw = LENGTH(R_weight); switch (INTEGER(R_method)[0]) { case 1: sdfun = edist_ow; break; case 2: if (!isMatrix(R_weight)) error("invalid weight parameter"); sdfun = edist_aw; nw = INTEGER(GET_DIM(R_weight))[0]; break; case 3: if (!isMatrix(R_weight)) error("invalid weight parameter"); sdfun = edist_awl; nw = INTEGER(GET_DIM(R_weight))[0]; break; default: error("method not implemented"); } if (isNull(R_y)) { if ((isMatrix(R_weight) && !is_symmetric(REAL(R_weight), nw)) || (!isMatrix(R_weight) && REAL(R_weight)[0] != REAL(R_weight)[1])) error("auto-similarities for asymmetric weights not implemented"); R_y = R_x; } else if (LOGICAL(R_pairwise)[0] == TRUE) m = 2; else m = 1; nx = LENGTH(R_x); ny = LENGTH(R_y); if (m == 2 && nx != ny) error("invalid number of rows for pairwise mode"); if (m == 0) PROTECT(r = allocVector(REALSXP, nx*(nx-1)/2)); else if (m == 1) PROTECT(r = allocMatrix(REALSXP, nx, ny)); else PROTECT(r = allocVector(REALSXP, nx)); PROTECT(t = allocVector(REALSXP, 256)); /* temporary storage */ k = 0; n = nx; for (j = 0; j < ny; j++) { if (m == 0) i = j + 1; else if (m == 1) i = 0; else { i = j; n = j + 1; } y = VECTOR_ELT(R_y, j); if (LENGTH(y) >= LENGTH(t)) { /* more storage */ UNPROTECT(1); PROTECT(t = allocVector(REALSXP, LENGTH(y) * 2)); } for (; i < n; i++) { x = VECTOR_ELT(R_x, i); REAL(r)[k++] = (*sdfun)(INTEGER(x), INTEGER(y), REAL(R_weight), LENGTH(x), LENGTH(y), nw, REAL(t), 0, 0); R_CheckUserInterrupt(); } } UNPROTECT(2); return r; } /* get the next edit transcript. the input arguments are a pointer * to an array of traceback codes, the lenghts of the sequences * compared, and pointers to the transcript and its length. the * possible edit oparations are indicated by four bits. the lowest * bit is decoded and if more than one bit is set it is set to zero. * * returns -1 on error, 0 if no more transcripts are left, and * otherwise the backtrack position in the code array. */ static int next_transcript(char *b, int i, int j, char *s, int *l) { int b0 = 0, k0 = 0, k1 = 0, k = 0, n = i+1; while (i > 0 || j > 0) { if (i < 0 || j < 0) { REprintf("next_transcript: coding error\n"); return -1; } k0 = i+j*n; b0 = b[k0]; if (b0 & 1) { s[k++] = 'D'; if (b0 & 2 || b0 & 4 || b0 & 8) k1 = k0; i--; } else if (b0 & 2) { s[k++] = 'I'; if (b0 & 4 || b0 & 8) k1 = k0; j--; } else { if (b0 == 4) s[k++] = 'R'; else if (b0 == 8) s[k++] = 'M'; else { REprintf("edit_transcript: coding error\n"); return -1; } i--; j--; } } *l = k; s[k] = (char)0; if (k1) { b0 = b[k1]; if (b0 & 1) b[k1] = b0 ^ 1; else if (b0 & 2) b[k1] = b0 ^ 2; } return k1; } /* get the next transcript for a local alignment. first we have to find * the endpoint of a local alignment (if any). then we proceed as above * until we hit zero or either of the two sequences is exhausted. * remaining prefixes or suffixes are aligned by padding with wildcards * where insertions or deletions at the ends are used to account for * differences in lenghths (shifting the shorter prefix or suffix in the * direction of the local alignment). we use the codes {'i', 'd', '?'} * in order to distinguish these edit operations from those necessary * to obtain the local alignment. * * endpoints within a local alignment are ignored as we seek only * alignments of maximum length. bits 6 and 7 are used as temporary * storage for bits 1 and 2 which we restore after all solutions for * one endpoint have been generated. */ static int next_local_transcript(char *b, int i, int j, char *s, int *l) { int b0 = 0, k0 = 0, k1 = 0, k2 = 0, k = 0, n = i+1, m = j+1; for (; i > 0; i--) for (j = m-1; j > 0; j--) { k0 = i+j*n; if (b[k0] & 16) { k2 = k0; goto next; } } return 0; next: while (k < n-i-m+j) s[k++] = 'd'; while (k < m-j-n+i) s[k++] = 'i'; while (k < n-i-1 || k < m-j-1) s[k++] = '?'; while (i > 0 && j > 0) { k0 = i+j*n; b0 = b[k0]; if (b0 == 0) break; else if (b0 & 16) b[k0] = b0 = b0 ^ 16; if (b0 & 1) { s[k++] = 'D'; if (b0 & 2 || b0 & 4 || b0 & 8) k1 = k0; i--; } else if (b0 & 2) { s[k++] = 'I'; if (b0 & 4 || b0 & 8) k1 = k0; j--; } else { if (b0 & 4) s[k++] = 'R'; else if (b0 & 8) s[k++] = 'M'; else { REprintf("edit_transcript: coding error\n"); return -1; } i--; j--; } } for (; i > 0 && j > 0; i--, j--) s[k++] = '?'; for (; i > 0; i--) s[k++] = 'd'; for (; j > 0; j--) s[k++] = 'i'; *l = k; s[k] = (char)0; if (k1) { b0 = b[k1]; if (b0 & 1) b[k1] = (b0 ^ 1) | 32; else if (b0 & 2) b[k1] = (b0 ^ 2) | 64; b[k2] |= 16; } else for (k = 1; k < k2; k++) { b0 = b[k]; if (b0 & 16) k1 = k; if (b0 & 32) b0 = (b0 ^ 32) | 1; if (b0 & 64) b0 = (b0 ^ 64) | 2; b[k] = b0; } return k1; } /* compute the distance for two sequences and the corresponding set * of equivalent edit transcripts. the input arguments are the same * as above with the exception that the first two are integer vectors * instead of lists. * * the transcripts are coded as strings over the alphabet {'I', 'D', * 'R', 'M'} indicating an insert, delete, replace, or match operation * to be applied to the first (second) sequence (supplied). for the * extended symbol set for local alignments see above. * * the distance is returned as attribute 'value'. the values of the * dynamic programming table may be returned as attribute 'table' for * plotting, etc. Attribute 'pointer' contains an R ''segments'' * compatible representation of the (back)pointers (see also below). */ SEXP sdists_transcript(SEXP R_x, SEXP R_y, SEXP R_method, SEXP R_weight, SEXP R_table) { if (TYPEOF(R_x) != INTSXP || TYPEOF(R_y) != INTSXP) error("invalid sequence parameters"); if (TYPEOF(R_method) != INTSXP) error("invalid method parameter"); if (TYPEOF(R_weight) != REALSXP) error("invalid weight parameter"); if (TYPEOF(R_table) != LGLSXP) error("invalid option parameter"); double (*sdfun)(int *, int *, double *, int, int, int, double *, char *, double *) = NULL; int (*stfun)(char *, int, int, char *, int *) = NULL; int i, j, k, n, nx, ny, nw; double d, *v = 0, *t; // temporary storage char c, *b, *s; // temporary storage SEXP r, tv = (SEXP)0, tb = (SEXP)0; nw = length(R_weight); switch (INTEGER(R_method)[0]) { case 1: sdfun = edist_ow; stfun = next_transcript; break; case 2: if (!isMatrix(R_weight)) error("invalid weight parameter"); sdfun = edist_aw; stfun = next_transcript; nw = INTEGER(GET_DIM(R_weight))[0]; break; case 3: if (!isMatrix(R_weight)) error("invalid weight parameter"); sdfun = edist_awl; stfun = next_local_transcript; nw = INTEGER(GET_DIM(R_weight))[0]; break; default: error("method not implemented"); } nx = length(R_x); ny = length(R_y); if (LOGICAL(R_table)[0] == TRUE) { PROTECT(tv = allocMatrix(REALSXP, nx+1, ny+1)); PROTECT(tb = allocVector(VECSXP, 4)); v = REAL(tv); } // R-2.9.x b = (char *) RAW(PROTECT(allocVector(RAWSXP, (nx+1)*(ny+1)))); t = Calloc(ny+1, double); d = (*sdfun)(INTEGER(R_x),INTEGER(R_y), REAL(R_weight), nx, ny, nw, t, b, v); Free(t); if (!R_FINITE(d)) { UNPROTECT(1); if (LOGICAL(R_table)[0] == TRUE) UNPROTECT(2); return ScalarReal(d); } #ifdef TB_DEBUG Rprintf("traceback codes: 1 = up, 2 = left, 4 = replace, 8 = match\n\n"); for (i = 0; i <= nx; i++) { Rprintf("[%2i]", i); for (j = 0; j <= ny; j++) if (b[i+j*(nx+1)] & 16) Rprintf("(%2i)",b[i+j*(nx+1)] ^ 16); else Rprintf(" %2i ",b[i+j*(nx+1)]); Rprintf("\n"); } #endif if (LOGICAL(R_table)[0] == TRUE) { int b0; SEXP x0, y0, x1, y1; k = 0; for (i = 1; i < (nx+1)*(ny+1); i++) { b0 = b[i]; k += ((b0 & 1) == 1) + ((b0 & 2) == 2) + (((b0 & 4) == 4) || ((b0 & 8) == 8)); } SET_VECTOR_ELT(tb, 0, (x0 = allocVector(INTSXP, k))); SET_VECTOR_ELT(tb, 1, (y0 = allocVector(INTSXP, k))); SET_VECTOR_ELT(tb, 2, (x1 = allocVector(INTSXP, k))); SET_VECTOR_ELT(tb, 3, (y1 = allocVector(INTSXP, k))); k = 0; for (i = 0; i <= nx; i++) for (j = 0; j <= ny; j++) { b0 = b[i+j*(nx+1)]; if (b0 & 1) { INTEGER(x0)[k] = i-1; INTEGER(y0)[k] = j; INTEGER(x1)[k] = i; INTEGER(y1)[k] = j; k++; } if (b0 & 2) { INTEGER(x0)[k] = i; INTEGER(y0)[k] = j-1; INTEGER(x1)[k] = i; INTEGER(y1)[k] = j; k++; } if (b0 & 4 || b0 & 8) { INTEGER(x0)[k] = i-1; INTEGER(y0)[k] = j-1; INTEGER(x1)[k] = i; INTEGER(y1)[k] = j; k++; } } } // R-2.9.x s = (char *) RAW(PROTECT(allocVector(RAWSXP, nx+ny+1))); r = R_NilValue; do { n = (*stfun)(b, nx, ny, s, &k); for (i = 0; i < k/2; i++) { c = s[i]; s[i] = s[k-i-1]; s[k-i-1] = c; } PROTECT(r); r = CONS(mkChar(s), r); UNPROTECT(1); PROTECT(r); R_CheckUserInterrupt(); UNPROTECT(1); } while (n); UNPROTECT(2); PROTECT(r); r = PairToVectorList(r); UNPROTECT(1); PROTECT(r); SET_TYPEOF(r, STRSXP); setAttrib(r, install("value"), PROTECT(ScalarReal(d))); UNPROTECT(1); if (LOGICAL(R_table)[0] == TRUE) { setAttrib(r, install("table"), tv); setAttrib(r, install("pointer"), tb); UNPROTECT(3); } else UNPROTECT(1); return r; } // align two sequences according to an edit transcript SEXP sdists_align(SEXP R_x, SEXP R_y, SEXP t) { if (TYPEOF(R_x) != INTSXP || TYPEOF(R_y) != INTSXP) error("invalid sequence parameter(s)"); if (TYPEOF(t) != STRSXP || LENGTH(t) != 1) error("invalid transcript parameter"); int i, j, k, i0, j0; SEXP r, x = (SEXP)0, y = (SEXP)0; t = STRING_ELT(t, 0); PROTECT(r = allocVector(VECSXP, 2)); SET_VECTOR_ELT(r, 0, (x = allocVector(INTSXP, LENGTH(t)))); SET_VECTOR_ELT(r, 1, (y = allocVector(INTSXP, LENGTH(t)))); if (isFactor(R_x)) { SET_LEVELS(x, GET_LEVELS(R_x)); setAttrib(x, install("class"), PROTECT(mkString("factor"))); UNPROTECT(1); } if (isFactor(R_y)) { SET_LEVELS(y, GET_LEVELS(R_y)); setAttrib(y, install("class"), PROTECT(mkString("factor"))); UNPROTECT(1); } i = j = i0 = j0 = 0; for (k = 0; k < LENGTH(t); k++) { if (i > LENGTH(R_x) || j > LENGTH(R_y)) error("invalid edit transcript"); switch (CHAR(t)[k]) { case 'i': case 'I': INTEGER(x)[i0++] = NA_INTEGER; INTEGER(y)[j0++] = INTEGER(R_y)[j++]; break; case 'd': case 'D': INTEGER(x)[i0++] = INTEGER(R_x)[i++]; INTEGER(y)[j0++] = NA_INTEGER; break; case 'R': case 'M': case '?': INTEGER(x)[i0++] = INTEGER(R_x)[i++]; INTEGER(y)[j0++] = INTEGER(R_y)[j++]; break; default: error("invalid edit symbol"); } } if (i < LENGTH(R_x) || j < LENGTH(R_y)) error("invalid edit transcript"); UNPROTECT(1); return r; } /* * transform a vector of transcripts into a graph, i.e. a set of edges * with weights the number of times the edge is a member of a path in * the dynamic programming table. * * returns a list of 4 vectors of coordinates for use with 'segments', * x0, y0, x1, y1, where x denotes the first and y the second sequence, * and a vector of edge frequencies. * * notes: we code the edges into scalar integers so that we can sort * and thus efficiently count them. the cells of the dynamic * programming table are numbered column by column. an edit * path is therfore transformed into a sequence of indexes and * pairs of consecutive indexes indicate entries in the edge * table. the latter we number again by columns. the time * complexity thus depends on sorting. * * fixme: Calloc may raise an error so that we cannot free memory * previously allocated with calloc or Calloc. * * ceeboo 2006 */ SEXP sdists_graph(SEXP x) { if (TYPEOF(x) != STRSXP) error("invalid type"); int i = 0, j = 0, h, k, l, p = 0, q = 0, k0, k1, nx = 0, ny = 0, n = 0; int *i0, *i1; SEXP r, x0, y0, x1, y1, f; k0 = 0; for (k = 0; k < LENGTH(x); k++) k0 += LENGTH(STRING_ELT(x, k)); i0 = Calloc(k0, int); k0 = 0; for (h = 0; h < LENGTH(x); h++) { SEXP c = STRING_ELT(x, h); if (h == 0) { nx = ny = LENGTH(c); for (k = 0; k < LENGTH(c); k++) switch (CHAR(c)[k]) { case 'i': case 'I': nx--; break; case 'd': case 'D': ny--; } n = (nx+1) * (ny+1); } p = q = LENGTH(c); i = l = 0; for (k = 0; k < LENGTH(c); k++) { switch (CHAR(c)[k]) { case 'i': case 'I': i += nx+1; p--; break; case 'd': case 'D': i += 1; q--; break; case 'R': case 'M': case '?': i += nx+2; break; default: Free(i0); error("invalid symbol"); } i0[k0++] = l + i * n; l = i; } if (p != nx || q != ny) { Free(i0); error("transcripts do not conform"); } } R_isort(i0, k0); i1 = Calloc(k0, int); l = i0[0]; k1 = 0; for (k = 0; k < k0; k++) { if (i0[k] != l) { l = i0[k]; i0[++k1] = l; } i1[k1]++; } k1++; PROTECT(r = allocVector(VECSXP, 5)); SET_VECTOR_ELT(r, 0, (x0 = allocVector(INTSXP, k1))); SET_VECTOR_ELT(r, 1, (y0 = allocVector(INTSXP, k1))); SET_VECTOR_ELT(r, 2, (x1 = allocVector(INTSXP, k1))); SET_VECTOR_ELT(r, 3, (y1 = allocVector(INTSXP, k1))); SET_VECTOR_ELT(r, 4, (f = allocVector(INTSXP, k1))); for (k = 0; k < k1; k++) { l = i0[k]; i = l % n; j = (l - i) / n; INTEGER(x0)[k] = l = i % (nx+1); INTEGER(y0)[k] = (i - l) / (nx+1); INTEGER(x1)[k] = l = j % (nx+1); INTEGER(y1)[k] = (j - l) / (nx+1); INTEGER(f )[k] = i1[k]; } Free(i0); Free(i1); UNPROTECT(1); return r; } /**/ cba/CHANGELOG0000644000175100001440000001101214344034315012223 0ustar hornikusers Realease 0.2-23 sprintf was not an issue. Release 0.2-22 change encoding of source files. Release 0.2-21 fixed UNPROTECT in proximus. fixed missing PROTECT in sdists. Release 0.2-20 changed UN / PROTECT approach in proximus. Release 0.2-18 added DLL registry. Release 0.2-17 fixed duplicated factor levels in Mushroom data. Release 0.2-16 fixed colnames in fitted.proximus. Release 0.2-15 Release date: 2015-07-23 fixed imports. Release 0.2-14 renamed int_arraySubscript. Release 0.2-13 adjusted package dependencies and imports. Release 0.2-12 Release date: 2013-04-30 fixed invalid memory access in rockMerge. Release 0.2-11 Release date: 2012-11-30 duplicated arraySubscript for code isolation. Release 0.2-10 Release date: 2012-08-31 resolved native symbols. Release 0.2-9 Release date: 2012-01-18 extended the weight argument in all sdists interfaces. added approximate substring matching to sdists.trace. added text option to plot.sdists.graph. Release 0.2-8 Release date: 2011-09-15 fixed lty issue in plot.sdists.graph. fixed possible issue on sparc-solaris in proximus.c. Release 0.2-7 Release date: 2011-09-09 changed image.matrix to implot so that other packages which use image work with cba. fixed justification issues in documentation. Release 0.2-6 Release date: 2009-01-07 dists, row.dist, col.dist, colSums.dist, colMeans.dist, rowSums.dist, rowMeans.dist, [[.dist, subset.dist, dim.dist, dimnames.dist, dapply, dapply.list were removed as the same functionality is provided in package proxy. cluproxplot and seriation were removed as these are provided and further developed in package seriation. in sdists.c changed temporary storage from CHARSXP to RAWSXP for compatibility with R-2.9.x. added pairwise option to sdists. added sdists.center and sdists.center.align. reversed CHANGELOG. Release 0.2-5 Release date: 2008-05-25 fixed encoding declaration in proximus documentation (C) fixed example in rockCluster documentation (C) Release 0.2-1 Release date: 2006-09-04 fixed Windoze C99 versus gnu99 issues (C) fixed order.matrix and order.data.frame (C) Release 0.2-0 Release date: 2006-08-31 added stress (C) fixed future problems with strict type checking in (C) by using SET_STRING_ELT instead of SET_ELEMENT. Release 0.1-9 Release date: see next fixed asymmetric weights bug in sdists (C) added traceback function for sdists (C) fixed bug due to copy on change in all distance functions (C) added rowSums.dist and dapply (C) added interrupt checks to all distance functions (C) added traceback plot for sdists (C) added cluster.dist (C) Release 0.1-8 Release date: 2006-03-01 fixed random-access bug in subset_dist C code which made the return value dependent on the order of the subset indexes (C) added names to cluster labels in rockCluster and ccfksm (C) added C code for fuzzy binary distance computation (C) added C code for distance computation on sequences (C) reduced cluproxplots memory needs (M) introduced hcl colorspace and fixed colorkey for cluproxplot (M) Release 0.1-7 Release date: 2006-01-27 added cluProxMatrix class w/ print and plot methods (M) simplified the cluproxplot interface by using plotOptions for all plotting related options (m) added threshold to cluproxplot (M) Release 0.1-6 Release date: 2005-12-12 cluproxplot uses now really grid (M) added colorkey for cluproxplot (M) Release 0.1-5 Release date: 2005-10-26 Fixed similarity conversion for intraClusterSim in clusimplot (Michael) renamed clusimplot to cluproxplot (Cluster Proximity plot) (M) cluproxplot uses grid now and includes a silhouettes plot (M) added interface for seriation (M) Release 0.1-4 Release date: 2005-09-12 Added drop option to as.dummy and sparse K-means code. Added clusimplot (Michael) Added Mushroom data set (Michael) Release 0.1-3 Release date: 2005-09-08 The package develops into a trash bag: moved optimal leaf ordering and stuff in. This sucks! (C) Release 0.1-2 Release date: 2005-08-30 Major changes include merging in of Proximus. Visualization is sketchy but useable. Most of the stuff definitely belongs to separate packages. However, it all ended up here. The C code is at least -Wall -pedantic clean. Release 0.1-1 Release date: 2005-07-02 Bug fixes, better fitting in with existing R functionality (e.g. class dist). Added efficient conversion of nominal variables to dummy coding. Release 0.1 Release date: 2005-05-30 Alpha release of the Rock package which implements the Rock algorithm. Focus is on efficient implementation of time critical functions: distance computation and merging. cba/R/0000755000175100001440000000000014332123330011210 5ustar hornikuserscba/R/proximus.r0000644000175100001440000001155114332123330013264 0ustar hornikusers # # proximus.r - according to the paper: # # M. Koyutürk, A. Graham, and N. Ramakrishnan. Compression, # Clustering, and Pattern Discovery in Very High-Dimensional # Descrete-Attribute Data Sets. IEEE Transactions On Knowledge # and Data Engineering, Vol. 17, No. 4, (April) 2005 # # Contents: # # wrapper(s) for my C implementation of PROXIMUS. a better # implementation may use two sparse matrices holding the pair # of approximating matrices X and Y so that hat A = X * Y. # # Version: 0.1-1 # # (C) ceeboo, 2005 proximus <- function(x, max.radius=2, min.size=1, min.retry=10, max.iter=16, debug=FALSE) { if (!is.logical(x)) stop(paste(sQuote("x"),"not logical")) storage.mode(max.radius) <- storage.mode(min.size) <- "integer" storage.mode(min.retry) <- storage.mode(max.iter) <- "integer" storage.mode(debug) <- "logical" obj <- .Call(R_proximus, x, max.radius, min.size, min.retry, max.iter, debug) obj$max.radius <- max.radius obj$min.size <- min.size obj$rownames <- rownames(x) obj$colnames <- colnames(x) class(obj) <- c("proximus") invisible(obj) } # get the full storage representation + pattern (cluster) labels fitted.proximus <- function(object, drop=FALSE, ...) { x <- matrix(FALSE, nrow=object$nr, ncol=object$nc) c <- vector("integer", object$nr) for (i in 1:length(object$a)) { x[object$a[[i]]$x, object$a[[i]]$y] <- TRUE c[object$a[[i]]$x] <- i } k <- rep(TRUE, object$nr) # keep if (drop) { for (i in 1:length(object$a)) if (length(object$a[[i]]$x) < object$min.size || object$a[[i]]$r > object$max.radius) k[object$a[[i]]$x] <- FALSE x <- x[k,] c <- c[k] } rownames(x) <- object$rownames[k] colnames(x) <- object$colnames attr(c, "Index") <- which(k) # x <- list(x=x, pl=factor(c)) x } ### print.proximus <- function(x, ...) { cat("an object of class:",class(x),"\n") invisible(x) } summary.proximus <- function(object, ...) { n <- length(object$a) s <- as.data.frame(matrix(nrow=n, ncol=7)) names(s) <- c("Size","Length","Radius","Error","Fnorm","Jsim","Valid") e <- j <- 0 for (i in 1:n) { # pattern summaries a <- object$a[[i]] # approximation nx <- length(a$x) ny <- length(a$y) s[i,] <- c(nx, ny, a$r, (a$n - a$c) / (nx * object$nc), # Error sqrt(a$n - a$c), # Frobenius norm if (a$c == 0 && ny == 0) 1 # definition! else 1 / (1 + 2 * (a$n - a$c) / (a$c + nx * ny)),# Jaccard (nx >= object$min.size & a$r <= object$max.radius)) # valid e <- e + a$n - a$c # total Error j <- j + a$c + nx * ny # total Jaccard } storage.mode(s[,7]) <- "logical" s <- list(nr=object$nr, nc=object$nc, error=e / (object$nr * object$nc), fnorm=sqrt(e), jsim=if (j == 0 && e == 0) 1 # definition! else j / (j + e / 2), valid=sum(s$Valid), pattern=s) class(s) <- "summary.proximus" s } print.summary.proximus <- function(x, ...) { cat("approximates",x$nr,"x",x$nc, "matrix\n") cat("total Error:",format(x$error, digits=2), "\n") cat("total Fnorm:",format(x$fnorm, digits=2), "\n") cat("total Jsim:",format(x$jsim, digits=2), "\n") cat("total Valid:",x$valid,"\n") cat("Pattern Summary:\n") print(x$pattern[order(x$pattern$Size, decreasing=TRUE),], digits=2) invisible(x) } ### # Generate a matrix containing blocks of (overlapping) uniform # binary patterns on a noisy background. The perfect switch allows # for overlap between the first and last pattern block, making the # test case balanced. # # ceeboo 2005 rlbmat <- function(npat=4, rows=20, cols=12, over=4, noise=0.01, prob=0.8, perfect=FALSE) { rlmat <- function(nrow, ncol, prob=0.5) { x <- matrix(as.logical(runif(nrow*ncol) <= prob), ncol=ncol) x } nrow <- npat * rows ncol <- cols * npat + over x <- rlmat(nrow, ncol, noise) r <- c <- 1 while (r < nrow) { x[r:(r+rows-1), c:(c+cols+over-1)] <- rlmat(rows, cols+over, prob) r <- r + rows c <- c + cols } # overlap first and last block, too if (perfect) x[(r-rows):(r-1), 1:over] <- rlmat(rows, cols, prob) x } ### cba/R/cluster.r0000644000175100001440000000041512020453760013061 0ustar hornikusers ### ceeboo 2006 cluster.dist <- function(x, beta) { if (!inherits(x, "dist")) stop("'x' not of class dist") storage.mode(x) <- storage.mode(beta) <- "double" obj <- .Call(R_cluster_dist, x, beta) names(obj) <- attr(x,"Labels") obj } ### cba/R/sdists.r0000644000175100001440000002056313037611552012723 0ustar hornikusers # implements a wrapper to distance (similarity) computation on # collections of sequences. auto and cross distances can be # computed (compare with dist in package proxy) # # note that 1) we can supply lists of vectors or vectors of # character (strings) # 2) operation weights are in the order of # insertion/deletion, equality, and replacing # 3) the first row/column of the matrix of alphabet # weights are used for replacement with the empty # symbol (space) # 4) include NA, etc if exclude = NULL # 5) but the C function returns NA if NAs are encounterd # 6) use parallel mode only if y != NULL # # ceeboo 2006, 2008 sdists <- function(x,y=NULL, method="ow", weight=c(1,1,0,2), exclude=c(NA,NaN,Inf,-Inf), pairwise = FALSE) { METHODS <- c("ow","aw","awl") code <- pmatch(method, METHODS) if (is.na(code)) stop("invalid method") if (code == -1) stop("ambiguous method") if (is.character(x)) x <- strsplit(x,"") if (!is.list(x)) stop("'x' not a list") if (!is.null(y)) { if (is.character(y)) y <- strsplit(y,"") if (!is.list(y)) stop("'y' not a list") } if (code >= 2) { if (!is.matrix(weight)) stop("'weight' not a matrix") if (dim(weight)[1] != dim(weight)[2]) stop("'weight' not square") if (is.null(colnames(weight))) stop("'weight' no colnames") l <- colnames(weight) } else { if (length(weight) < 4) stop("'weight' invalid") # determine common symbol set l <- sort(unique(c(unlist(x),unlist(y),"")),na.last=TRUE) } x <- lapply(x,function(x) factor(x,levels=l,exclude=if(is.integer(x))NA else exclude)) if (!is.null(y)) { y <- lapply(y,function(x) factor(x,levels=l,exclude=if(is.integer(x))NA else exclude)) if (pairwise && length(x) != length(y)) stop("'pairwise', lengths of 'x' and 'y' do not conform") } if (!is.double(weight)) storage.mode(weight) <- "double" obj <- .Call(R_sdists,x,y,as.integer(code),weight,pairwise) if (is.null(y)) obj <- structure(obj, Size=length(x), class="dist", Diag=FALSE, Upper=FALSE, Labels=names(x), method=method) else if (!pairwise) { rownames(obj) <- names(x) colnames(obj) <- names(y) } obj } # as there is no unique space symbol available not 'excluding' # NA has NA as result (see the C implementation). # # if graph = TRUE the vector of transcripts is transformed into # graph data that can be supplied to 'segments', or 'grid.segments', # etc. the dynmic programming table is returned as attribute # 'table' and the traceback graph in attribute 'graph'. sdists.trace <- function(x,y, method="ow", weight=c(1,1,0,2), exclude=c(NA,NaN,Inf,-Inf), graph = FALSE, partial = FALSE) { METHODS <- c("ow","aw","awl") code <- pmatch(method, METHODS) if (is.na(code)) stop("invalid method") if (code == -1) stop("ambiguous method") if (is.character(x)) { if (length(x) != 1) stop("'x' not a scalar string") x <- strsplit(x,"")[[1]] } if (is.factor(x)) x <- as.character(x) if (!is.vector(x)) stop("'x' not a vector") if (is.character(y)) { if (length(y) != 1) stop("'y' not a scalar string") y <- strsplit(y,"")[[1]] } if (is.factor(y)) y <- as.character(y) if (!is.vector(y)) stop("'y' not a vector") if (code >= 2) { if (partial) stop("'partial' not implemented") if (!is.matrix(weight)) stop("'weight' not a matrix") if (is.null(colnames(weight))) stop("'weight' no colnames") l2 <- colnames(weight) if (is.null(rownames(weight))) { if (dim(weight)[1] != dim(weight)[2]) stop("'weight' not square") l1 <- l2 } else l1 <- rownames(weight) } else { if (length(weight) < 4) stop("'weight' invalid") if (partial) { if (length(weight) < 5) weight <- c(weight, weight[1], 0) if (length(weight) < 6) weight <- c(weight, 0) } # determine symbol sets l1 <- l2 <- sort(unique(c(x,y,"")),na.last=TRUE) } x <- factor(x,levels=l1,exclude=if(is.integer(x))NA else exclude) y <- factor(y,levels=l2,exclude=if(is.integer(y))NA else exclude) if (!is.double(weight)) storage.mode(weight) <- "double" t <- .Call(R_sdists_transcript, x, y, as.integer(code), weight, graph) if (is.na(t[1])) return(t) # reduce set of transcripts/paths if (partial) { z <- t ## reduce to maximum number of trailing inserts k <- attr(regexpr("I+$", z), "match.length") z <- z[k == max(k)] ## reduce to maximum number of matches k <- sapply(lapply(strsplit(z, ""), table), "[", "M") k <- which(k == max(k, na.rm = TRUE)) if (length(k)) z <- z[k] ## reduce to maximum number of leading inserts k <- attr(regexpr("^I+", z), "match.length") z <- z[k == max(k)] attributes(z) <- attributes(t) t <- z } if (graph) { dimnames(attr(t, "table")) <- list(x = c("", as.character(x)), y = c("", as.character(y))) attr(t, "graph") <- .Call(R_sdists_graph, t) names(attr(t, "graph")) <- c("x0", "y0", "x1", "y1", "weight") names(attr(t, "pointer")) <- c("x0", "y0", "x1", "y1") class(t) <- "sdists.graph" return(t) } z <- lapply(t, function(t) .Call(R_sdists_align, x, y, t)) names(z) <- t attr(z, "value") <- attr(t, "value") attr(z, "partial") <- attr(t, "partial") class(z) <- "sdists.trace" z } ### experimental plot function for # # idea from: http://home.uchicago.edu/~aabbott/ # # in R 2.4.x we will fix yscale = c(ny, 0) # # label in grid.xaxis cannot contain "", i.e. does not # produce output if it does. # # with pdf() it produces a garbage file that segfaults # xpdf (but not acroread) :-( # # fixme: use another line type for prefixes or suffixes # of local alignments. # # ceeboo 2006 plot.sdists.graph <- function(x, circle.col = 1, graph.col = 2, circle.scale = c("mean", "max", "last", "text"), main = "", ...) { circle.scale <- match.arg(circle.scale) g <- attr(x, "graph") b <- attr(x, "pointer") t <- attr(x, "table") nx <- dim(t)[2] ny <- dim(t)[1] if (circle.scale == "text") fontsize <- 24 ## FIXME else { t <- t - min(t) t <- t / switch(circle.scale, mean = mean(t), max = max(t), last = t[ny,nx]) } cn <- colnames(t) rn <- rownames(t) # bug?fix cn[cn == ""] <- " " rn[rn == ""] <- " " grid.newpage() grid.text(y = 0.95, label = main, gp = gpar(fontface = "bold")) vp <- viewport(xscale = c(0, nx), yscale = c(0, ny), width = nx / max(nx, ny) * 0.70, height = ny / max(nx, ny) * 0.70) pushViewport(vp) grid.grill(h = seq(ny)-0.5, v = seq(nx)-0.5, default.units = "native") grid.xaxis(at = seq(nx)-0.5, label = cn) grid.yaxis(at = seq(ny)-0.5, label = rn) if (circle.scale == "text") mapply(grid.text, label = t, x = rep(1:nx, each = ny) - 1/2, y = rep(1:ny, times = nx) - 1/2, MoreArgs = list( check.overlap = TRUE, default.units = "native", gp = gpar(col = "lightgrey", fontsize = fontsize)) ) else grid.circle(x = rep(1:nx, each = ny) - 1/2, y = rep(1:ny, times = nx) - 1/2, r = t / 2, default.units = "native", gp = gpar(col = circle.col)) grid.segments(x0 = b$y0 + 1/2, y0 = b$x0 + 1/2, x1 = b$y1 + 1/2, y1 = b$x1 + 1/2, default.units = "native", gp = gpar(lty = 3)) grid.segments(x0 = g$y0 + 1/2, y0 = g$x0 + 1/2, x1 = g$y1 + 1/2, y1 = g$x1 + 1/2, default.units = "native", gp = gpar(col = graph.col, lwd = g$weight, lty = (g$y1 > g$y0 & g$x1 > g$x0 & cn[g$y1+1] == rn[g$x1+1]) + 1)) popViewport() } ### cba/R/cut.ordered.r0000644000175100001440000000127211304023136013612 0ustar hornikusers # cutting of ordinal variables # # ceeboo 2005 cut.ordered <- function(x, breaks, ...) { if (is.logical(breaks)) { if (length(breaks) != nlevels(x)) stop("levels of",paste(sQuote("x"),"and",sQuote("breaks"), "do not conform")) breaks <- which(breaks) } else breaks <- sort(unique(breaks)) if (is.character(breaks)) breaks <- pmatch(breaks, levels(x)) else breaks <- match(breaks, 1:nlevels(x)) if (any(is.na(breaks))) stop(paste(sQuote("breaks"),"invalid")) # breaks <- unique(c(breaks, nlevels(x))) levels(x) <- rep(levels(x)[breaks], diff(c(0,breaks))) x <- as.ordered(x) x } ### cba/R/stress.r0000644000175100001440000000773713437252001012737 0ustar hornikusers # stuff for improving the presentation of tables, etc. # a.k.a. bertin matrices. # # (C) ceeboo 2005, 2006 # the interface to the stress functions allows for # arbitrary subsetting (see the wrapper in C). stress <- function(x, rows=NULL, cols=NULL, type="moore") { TYPE <- c(1,2,3) names(TYPE) <- c("moore", "neumann") if (inherits(x, "dist")) x <- as.matrix(x) if (!is.matrix(x)) stop(paste(sQuote("x"),"not a matrix")) if (!is.double(x)) storage.mode(x) <- "double" if (is.null(rows)) rows <- as.integer(1:dim(x)[1]) if (is.null(cols)) cols <- as.integer(1:dim(x)[2]) type <- as.integer(TYPE[type]) x <- .Call(R_stress, x, rows, cols, type) x } # interface to distance computation based on the above # stress functions (auto-distances only) stress.dist <- function(x, rows=NULL, cols=NULL, bycol=FALSE, type="moore") { TYPE <- c(1,2) names(TYPE) <- c("moore", "neumann") if (inherits(x, "dist")) as.matrix(x) if (!is.matrix(x)) stop(paste(sQuote("x"),"not a matrix")) if (!is.double(x)) storage.mode(x) <- "double" if (is.null(rows)) rows <- as.integer(1:dim(x)[1]) if (is.null(cols)) cols <- as.integer(1:dim(x)[2]) type <- as.integer(TYPE[type]) storage.mode(bycol) <- "logical" # obj <- .Call(R_stress_dist, x, rows, cols, bycol, type) # return dist object if (bycol) obj <- structure(obj, Size= if (bycol) dim(x)[2] else dim(x)[1], class="dist", Diag=FALSE, Upper=FALSE, Labels= if (bycol) { if (is.null(colnames(x))) cols else colnames(x)[cols] } else { if (is.null(rownames(x))) rows else rownames(x)[rows] }, method=names(TYPE[type])) obj } # reorder table like objects (we may use S3 dispatch in the # future order.dist <- function(x, index = FALSE) { if (!inherits(x, "dist")) stop("'x' not of class dist") k <- .Call(R_orderTSP, x, sample(attr(x, "Size"))) cat("length:", order.length(x, k),"\n") if (index) return(k) subset(x, k) } order.matrix <- function(x, type = "neumann", by = c("both","rows","cols"), index = FALSE) { if (!is.matrix(x)) stop("'x' not a matrix") by <- match.arg(by) if (by == "both") { r <- sample(dim(x)[1]) c <- sample(dim(x)[2]) c <- c[.Call(R_orderTSP, stress.dist(x,r,c,TRUE, type), seq(c))] r <- r[.Call(R_orderTSP, stress.dist(x,r,c,FALSE,type), seq(r))] } else if (by == "rows") { r <- sample(dim(x)[1]) c <- seq(dim(x)[2]) r <- r[.Call(R_orderTSP, stress.dist(x,r,c,FALSE,type), seq(r))] } else if (by == "cols") { r <- seq(dim(x)[1]) c <- sample(dim(x)[2]) c <- c[.Call(R_orderTSP, stress.dist(x,r,c,TRUE, type), seq(c))] } cat("stress:",stress(x,r,c,type),"\n") if (index) return(list(rows=r, cols=c)) x <- x[r,c] if (is.null(rownames(x))) rownames(x) <- r if (is.null(colnames(x))) colnames(x) <- c x } order.data.frame <- function(x, type = "neumann", by = c("both","rows","cols"), index = FALSE) { if (!inherits(x, "data.frame")) stop("'x' not a data frame") by <- match.arg(by) k <- sapply(x, function(x) is.numeric(x) || is.logical(x)) if (!any(k)) { warning("cannot order on ordinal attributes only") if (index) return(list(rows=seq(dim(x)[1]),cols=seq(dim(x)[2]))) x } z <- as.matrix(as.data.frame(lapply(x[k], function(x) { if (is.logical(x)) as.integer(x) else { m <- min(x) (x+m)/(max(x)-m) } }))) o <- order.matrix(z, type, by, index=TRUE) if (by == "cols" || by == "both") { c <- o$cols o$cols <- seq(k) o$cols[k] <- c } if (index) return(o) x[o$rows,o$cols] } ### the end cba/R/rock.r0000644000175100001440000000777312020453760012354 0ustar hornikusers # wrapper functions for the Rock algorithm. # # note that the behavior for other than the binary distance functions # has not been tested. therefore, the default relationship between beta # and theta may not be meaningful in all cases. # # (C) ceeboo 2005 # compute link counts (internal function) # # let me stress that the semantics are unscaled # similarities but we package as a dist object # for possible future use in different contexts. rockLink <- function(x, beta=0.5) { if (!inherits(x, "dist")) stop(paste(sQuote("x"),"not of class dist")) if (!is.double(x)) storage.mode(x) <- "double" storage.mode(beta) <- "double" obj <- .Call(R_rockLink, x, beta) obj <- structure(obj, Size=attr(x,"Size"), class="dist", Diag=FALSE, Upper=FALSE, Labels=attr(x, "Labels"), method="rock") #invisible(obj) obj } # merge into clusters (internal function) rockMerge <- function(x, n, theta=0.5, debug=FALSE) { if (!inherits(x, "dist")) stop(paste(sQuote("x"),"not of class dist")) if (n < 1) stop(paste(sQuote("n"),"illegal value")) if (theta < 0 || theta >= 1) stop(paste(sQuote("theta"),"illegal value")) if (!is.integer(x)) storage.mode(x) <- "integer" storage.mode(n) <- "integer" storage.mode(theta) <- "double" storage.mode(debug) <- "logical" obj <- .Call(R_rockMerge, x, n, theta, debug) names(obj) <- c("cl","size") names(obj$cl) <- attr(x,"Labels") invisible(obj) } # classify based on distances to clustered samples # (we have to compute these separately; for an # example wrapper see below; internal function) rockClass <- function(x, cl, beta=1-theta, theta=0.5) { if (!is.matrix(x)) stop(paste(sQuote("x"),"not a mtrix")) if (!is.factor(cl)) stop(paste(sQuote("cl"),"not a factor")) if (!is.double(x)) storage.mode(x) <- "double" storage.mode(beta) <- storage.mode(theta) <- "double" storage.mode(cl) <- "integer" obj <- .Call(R_rockClass, x, cl, beta, theta) names(obj) <- c("cl","size") names(obj$cl) <- rownames(x) invisible(obj) } # cluster interface rockCluster <- function(x, n, beta=1-theta, theta=0.5, fun="dist", funArgs=list(method="binary"), debug=FALSE) { if (!is.matrix(x)) warning(paste(sQuote("x"),"not a matrix")) if (n < 1) stop(paste(sQuote("n"),"illegal value")) if (is.function(fun)) fun <- deparse(substitute(fun)) # cluster cat("Clustering:\n") cat("computing distances ...\n") rc <- do.call(fun, c(list(x=x), as.list(funArgs))) cat("computing links ...\n") rc <- rockLink(rc, beta) cat("computing clusters ...\n") rc <- rockMerge(rc, n, theta, debug) rc <- list(x=x, cl=rc$cl, size=rc$size, beta=beta, theta=theta, fun=fun, funArgs=funArgs) class(rc) <- "rock" rc } # wrapper for predicting the class of new (or existing) samples # predict.rock <- function(object, x, drop=1, ...) { if (!is.matrix(x)) warning(paste(sQuote("x"),"not a matrix")) # drop if (drop > 0) { d <- which(object$size <= drop) if (length(d) > 0) { cat("dropping",length(d),"clusters\n") object$size <- object$size[-d] k <- !object$cl %in% d # keep object$cl <- factor(object$cl[k]) # enforce contiguous indexing !!! object$x <- object$x[k,] } } # classify cat("computing distances ...\n") x <- do.call(object$fun, c(list(x=x, y=object$x), as.list(object$funArgs))) cat("computing classes ...\n") x <- rockClass(x, object$cl, object$beta, object$theta) x } fitted.rock <- function(object, ...) predict.rock(object, object$x) print.rock <- function(x, ...) { cat(" data:",dim(x$y)[1],"x",dim(x$y)[2],"\n") cat(" beta:",x$beta,"\n") cat("theta:",x$theta,"\n") cat(" fun:",x$fun,"\n") cat(" args:",deparse(x$funArgs, control=NULL),"\n") print(x$size) invisible(x) } ### the end cba/R/order.r0000644000175100001440000000356112020453760012520 0ustar hornikusers # wrapper to the optimal leaf ordering algorithm # # ceeboo 2005 order.optimal <- function(dist, merge) { if (!inherits(dist,"dist")) stop(paste(sQuote("dist"),"not of class dist")) if (!is.matrix(merge)) stop(paste(sQuote("merge"),"not a matrix")) if (length(dim(merge)) != 2) stop(paste(sQuote("merge"),"invalid")) if (dim(merge)[1] != attr(dist,"Size")-1) stop(paste(sQuote("dist"),"and",sQuote("merge"),"do not conform")) if (!is.double(dist)) storage.mode(dist) <- "double" storage.mode(merge) <- "integer" obj <- .Call(R_order_optimal, dist, merge) names(obj) <- c("merge","order","length") names(obj$order) <- attr(dist,"Labels") obj } # wrapper to computing the lenght of the order # under a distance matrix, e.g. a tour where the # leg between the first and last city is omitted. # that this is a (Hamilton) path. # # note that this corresponds to the sum of distances # along the first off diagonal of the ordered distance # matrix. # order.length <- function(dist, order) { if (!inherits(dist,"dist")) stop(paste(sQuote("dist"),"not of class dist")) if (missing(order)) order <- 1:attr(dist, "Size") else { if (length(order) != attr(dist,"Size")) stop(paste(sQuote("order"),"invalid lenght")) } if (!is.double(dist)) storage.mode(dist) <- "double" if (!is.integer(order)) storage.mode(order) <- "integer" x <- .Call(R_order_length, dist, order) x } # wrapper to greedy ordering inspired by F. Murtagh # actually a hierarchical cluster algorithm. order.greedy <- function(dist) { if (!inherits(dist, "dist")) stop(paste(sQuote("dist"),"not of class dist")) if (!is.double(dist)) storage.mode(dist) <- "double" obj <- .Call(R_order_greedy, dist) names(obj) <- c("merge", "order", "height"); obj } ### cba/R/circleplot.R0000644000175100001440000000153211304023136013473 0ustar hornikusers ## ceeboo 2007 circleplot.dist <- function(x, cutoff = 0.5, col = 1, circle = FALSE, scale = 1.4) { if (!inherits(x, "dist")) stop("'x' not of class dist") x <- order.dist(x) # seriation z <- seq(-pi, pi, length.out = attr(x, "Size") + 1) x0 <- cos(z) y0 <- sin(z) r <- c(-1,1) * scale plot(x0, y0, type = "p", xlim = r, ylim = r, xlab = "", ylab = "", xaxt = "n", yaxt = "n", pty="s",) if (circle) { z <- seq(-pi, pi, 0.01) lines(cos(z), sin(z), lty = 2) } text(x0, y0, labels = dimnames(x), pos = sign(x0) + 3) w <- c(cut(c(x), seq(0, cutoff, length.out = 4))) k <- !is.na(w) if (any(k)) { i <- row.dist(x)[k] j <- col.dist(x)[k] segments(x0[i], y0[i], x0[j], y0[j], lwd = w[k], col = col) } invisible() } ## cba/R/sdists.util.R0000644000175100001440000000744212020453760013634 0ustar hornikusers ### ### stuff for analyzing sequences ### ### ceeboo 2006, 2007 ## Find the centroid (medoid) sequence(s), i.e. which have minimum ## sum of distance among a collection of sequences. ## ## Alternativley, apply FUN to the distances and select the ## distance with the minimum value of FUN (mean, median, etc.) ## ## Option 'unique' specifies to reduce the result set to a distinct ## set of sequences. ## sdists.center <- function(x, d = NULL, method = "ow", weight = c(1, 1, 0, 2), exclude = c(NA, NaN, Inf, -Inf), FUN = NULL, ..., unique = FALSE) { if (is.null(d)) d <- sdists(x, method = method, weight = weight, exclude = exclude) r <- if (is.null(FUN)) rowSums.dist(d) else apply(as.matrix(d), 1, FUN, ...) k <- which(r == min(r)) r <- x[k] if (unique && length(r) > 1) r <- r[!duplicated(sapply(r, paste, collapse = ""))] r } ## Compute a global alignment of a collection of sequences using ## the center star tree heuristic, i.e. 'c' is assumed to be the ## center and each remaining sequence 'x' is aligned in turn ## against the (aligned) center. Spaces are inserted as needed. ## ## If transitive = TRUE the space symbols in the center are ## replaced by the (non-space) symbols in the current sequence ## the center sequence was aligned with. This results in a ## 'transitive' global alignment, i.e. each pair of sequences ## is implicitly aligned, too. However, this usually results ## in spreading out the alignments (considerably). ## ## NOTE unfortunately, there may not exist a unique alignment ## of a pair of sequences, so that the global ## alignment may not be unique either. In this case we ## make a first or random choice. ## sdists.center.align <- function(x, center, method = "ow", weight = c(1, 1, 0, 2), exclude = c(NA, NaN, Inf, -Inf), break.ties = TRUE, transitive = FALSE, to.data.frame = FALSE) { if (!is.list(x) && !is.character(x)) stop("'x' not a list") if (missing(center)) { center <- sdists.center(x, method = method, weight = weight, exclude = exclude, unique = TRUE) k <- length(center) if (break.ties && k > 1) k <- sample(k, 1) else k <- 1 center <- center[[k]] } n <- 0 ## total number of ties r <- list() for (s in x) { a <- sdists.trace(center, s, method, weight, exclude) k <- length(a) if (break.ties && k > 1) { n <- n + k k <- sample(k, 1) ## random choice } else k <- 1 ## first center <- a[[k]][[1]] na <- is.na(center) if (any(na)) { center[na] <- "" if (length(r) > 0) { ## update t <- gsub("[Dd]","?", names(a)[k]) if (regexpr("[Ii]", t) > -1) r <- lapply(r, function(x) { x <- .Call(R_sdists_align, x, center, t)[[1]] x[is.na(x)] <- "" x }) } if (transitive) center[na] <- a[[k]][[2]][na] } s <- a[[k]][[2]] s[is.na(s)] <- "" r <- c(r, list(s)) } names(r) <- names(x) if (to.data.frame) { if (is.null(names(r))) names(r) <- seq_len(length(r)) r <- data.frame(center = center, r, check.names = FALSE) } else { is.na(center) <- center == "" ## recode space symbol names(center) <- seq(length(center)) ## add positional index r <- lapply(r, function(x) { is.na(x) <- x == "" names(x) <- names(center) x }) attr(r, "center") <- center attr(r, "ties") <- n } r } ### cba/R/ccfkms.r0000644000175100001440000000560213037706735012665 0ustar hornikusers# # k-means based on conjugate convex functions using sparse data # structures and centering (and optionally standardizing). # # for details see the C source code. # # (C) ceeboo 2005, 2007 ccfkms_sample <- function(x, n) { if (inherits(x, "dgCMatrix")) as(t(x[,sample(dim(x)[2],n)]), "matrix") else x[sample(dim(x)[1],n),] } ccfkms <- function (x, n, p=NULL, par=2, max.iter=100, opt.std=FALSE, opt.retry=0, debug=FALSE) { ## dgRMatrix is currently broken if (inherits(x, "dgTMatrix")) x <- t(as(x, "dgCMatrix")) else if (inherits(x, "dgCMatrix")) x <- t(x) else if (!is.matrix(x)) stop(paste(sQuote("x"), "invalid argument")) if (!missing(n) && length(n) != 1) stop(paste(sQuote("n"), "invalid argument")) if (is.null(p)) p <- ccfkms_sample(x, n) else if (!is.matrix(p) || ifelse(inherits(x,"dgCMatrix"), dim(x)[1], dim(x)[2]) != dim(p)[2]) stop(paste(sQuote("p"), "invalid argument")) if (is.matrix(x) && !is.double(x)) storage.mode(x) <- "double" if (!is.double(p)) storage.mode(p) <- "double" storage.mode(par) <- "double" storage.mode(max.iter) <- "integer" storage.mode(opt.std) <- storage.mode(debug) <- "logical" obj <- .Call(R_ccfkms, x, p, par, max.iter, opt.std, debug) if (opt.retry > 0) { for (i in 1:opt.retry) { p <- ccfkms_sample(x,n) robj <- .Call(R_ccfkms, x, p, par, max.iter, opt.std, debug) if (robj[[4]] < obj[[4]]) obj <- robj } } names(obj) <- c("centers", "size", "cl", "inv.inf") rownames(obj$centers) <- names(obj$size) <- levels(obj$cl) colnames(obj$centers) <- if (inherits(x, "dgCMatrix")) rownames(x) else colnames(x) names(obj$cl) <- if (inherits(x, "dgCMatrix")) colnames(x) else rownames(x) obj <- c(obj, par=par, opt.std=opt.std) class(obj) <- "ccfkms" obj } predict.ccfkms <- function(object, x, drop=1, ...) { if (inherits(x, "dgTMatrix")) x <- t(as(x, "dgCMatrix")) else if (inherits(x, "dgCMatrix")) x <- t(x) else if (!is.matrix(x)) stop(paste(sQuote("x"), "invalid argument")) if (ifelse(inherits(x, "dgCMatrix"), dim(x)[1], dim(x)[2]) != dim(object$centers)[2]) stop(paste(sQuote("x"), "and", sQuote("object"), "do not conform")) if (drop > 0) { d <- which(object$size <= drop) if (length(d) > 0) { cat("dropping", length(d), "clusters\n") object$size <- object$size[-d] k <- !object$cl %in% d object$cl <- factor(object$cl[k]) } } x <- ccfkms(x, p=object$centers, par=object$par, opt.std=object$opt.std, max.iter=1) x$par <- x$opt.std <- NULL # prohibit reuse x } ### cba/R/coding.r0000644000175100001440000000343213037611552012651 0ustar hornikusers # coding.r # # dummy coding for data mining applications # # fixme: no reverse methods implemented # # ceeboo 2005 as.dummy <- function(x, ...) UseMethod("as.dummy") as.dummy.logical <- function(x, ...) { x <- as.dummy(as.factor(x)) x } as.dummy.integer <- function(x, ...) { x <- as.dummy(as.factor(x)) x } as.dummy.factor <- function(x, ...) { x <- .Call("R_as_dummy", x) x } as.dummy.matrix <- function(x, sep=" ", drop=FALSE, ...) { if (is.null(colnames(x))) colnames(x) <- paste("V", 1:dim(x)[2], sep="") obj <- NULL levels <- NULL colnames <- NULL varnames <- NULL for (i in 1:dim(x)[2]) { z <- as.dummy(x[,i]) if (drop && nlevels(z) == 1) next obj <- cbind(obj, z) levels <- c(levels, list(levels(z))) varnames <- c(varnames, colnames(x)[i]) colnames <- c(colnames, paste(colnames(x)[i], levels(z), sep=sep)) } rownames(obj) <- rownames(x) colnames(obj) <- colnames names(levels) <- varnames attr(obj, "levels") <- levels obj } as.dummy.list <- function(x, ...) lapply(x, function(z) as.dummy(z)) as.dummy.data.frame <- function(x, sep=" ", drop=FALSE, ...) { if (is.null(names(x))) names(x) <- paste("V", 1:length(x), sep="") obj <- NULL levels <- NULL colnames <- NULL varnames <- NULL for (name in names(x)) { z <- as.dummy(x[[name]]) if (drop && nlevels(z) == 1) next obj <- cbind(obj, z) levels <- c(levels, list(levels(z))) varnames <- c(varnames, name) colnames <- c(colnames, paste(name, levels(z), sep=sep)) } rownames(obj) <- rownames(x) colnames(obj) <- colnames names(levels) <- varnames attr(obj, "levels") <- levels obj } ### cba/R/gknn.r0000644000175100001440000000140412020453760012334 0ustar hornikusers # knn.r # # implements generic k-nearest neighbors, i.e. for arbitrary distance # measures, in a way that is compatible with "knn" in package class. # # ceeboo 2005 gknn <- function(x, y, k=1, l=0, break.ties=TRUE, use.all=TRUE, prob=FALSE) { if (!is.matrix(x)) stop(paste(sQuote("x"),"not a matrix")) if (!is.factor(y)) stop(paste(sQuote("y"),"not a factor")) if (length(y) != dim(x)[2]) stop(paste(sQuote("x"),"and",sQuote("y"),"non-conformable")) storage.mode(x) <- "double" storage.mode(y) <- storage.mode(k) <- storage.mode(l) <- "integer" storage.mode(break.ties) <- storage.mode(use.all) <- storage.mode(prob) <- "logical" # y <- .Call(R_gknn, x, y, k, l, break.ties, use.all, prob) y } ### the end cba/R/plots.r0000644000175100001440000000752112020453760012546 0ustar hornikusers # Wrapper function for interpolating a logical matrix into # non-overlapping square blocks of user-specified size. # Returns binned values of the counts of TRUE values per # block. Note that the effective number of bins is one # greater the specified number because the zero bin is # always included. # # ceeboo 2005 lminter <- function(x, block.size=1, nbin=0) { if (!is.logical(x)) stop(paste(sQuote("x"),"not logical")) if (nbin < 0) stop(paste(sQuote("nbin"),"illegal value")) storage.mode(block.size) <- storage.mode(nbin) <- "integer" x <- .Call(R_lminter, x, block.size, nbin) x } # plot a logical matrix with the option to reduce the resolution lmplot <- function(x, block.size=1, gray=FALSE, xlab="", ylab="", axes = FALSE, ...) { if (!is.logical(x)) stop(paste(sQuote("x"),"not logical")) if (block.size < 1) stop(paste(sQuote("block.size"),"illegal value")) nbin <- 0 # majority mode if (block.size > 1) { if (gray) nbin <- min(block.size, 8) # maximum palette x <- lminter(x, block.size, nbin) } # density equals opacity # this sucks! gray <- rev(gray.colors(max(2, nbin + 1), start=0, end=1) )[is.element(0:max(1, nbin), x)] implot(x, xlab=xlab, ylab=ylab, col=gray, axes = axes, ...) } # plot a logical matrix with the option to color (by rows or # columns) and to reorder by rows and columns (using hclust). clmplot <- function(x, col, col.bycol=FALSE, order=FALSE, dist.method="binary", hclust.method="average", axes=FALSE, xlab="", ylab="", ...) { if (!is.logical(x)) stop(paste(sQuote("x"),"not logical")) if (order) { ro <- hclust(dist(x, method=dist.method), method=hclust.method)$order co <- hclust(dist(t(x), method=dist.method), method=hclust.method)$order x <- x[ro, co] } else { ro <- 1:dim(x)[1] co <- 1:dim(x)[2] } if (missing(col)) col <- factor("black") else { if (length(col) != if (col.bycol) length(co) else length(ro)) stop(paste(sQuote("x"),"and",sQuote("col"),"do not conform")) if (col.bycol) col <- col[co] else col <- col[ro] if (is.character(col)) col <- as.factor(col) else { col <- as.factor(col) levels(col) <- heat.colors(nlevels(col)) } if (col.bycol) x <- x * rep(as.integer(col), each=dim(x)[1]) else x <- x * rep(as.integer(col), dim(x)[2]) } implot(structure(x, dimnames = list(ro, co)), zlim=c(1,nlevels(col)), col=levels(col), xlab=xlab, ylab=ylab, axes = axes, ...) invisible(list(rows=ro, cols=co)) } # Make a proper image plot of a matrix. That is, # the rows and columns are swapped and the order of the # columns (original rows) is reversed. implot <- function(x, xlab="", ylab="", axes = FALSE, ticks = 10, las = 2, ...) { if (inherits(x, "dist")) x <- as.matrix(x) else { if (!is.matrix(x)) stop("'x' not of class matrix") x <- t(x) } x <- x[,rev(seq_len(dim(x)[2])),drop = FALSE] image.default(seq_len(dim(x)[1]), seq_len(dim(x)[2]), x, axes=FALSE, xlab=xlab, ylab=ylab, ...) if (axes) { if (ticks < 1) stop("'ticks' invalid") ticks <- as.integer(ticks) if (length(rownames(x))) { at <- seq(1, dim(x)[1], length.out = min(ticks, dim(x)[1])) axis(1, at, labels = rownames(x)[at], las = las, line = -0.5, tick = 0, cex.axis = 0.2 + 1/log10(length(at))) } if (length(colnames(x))) { at <- seq(1, dim(x)[2], length.out = min(ticks, dim(x)[2])) axis(4, at, labels = colnames(x)[at], las = las, line = -0.5, tick = 0, cex.axis = 0.2 + 1/log10(length(at))) } } invisible(x) } ### cba/MD50000644000175100001440000000603214344060573011334 0ustar hornikusers5474109e3c1721873eeffef3444b4fd3 *CHANGELOG 5f4d3d8d89dda5b564b63788be0692b5 *DESCRIPTION ee83dfb5779aac9143c472be1e0c33a4 *NAMESPACE b55737d69dd264044306e179d174d017 *R/ccfkms.r b3f65fab6ff0043e305d77f43f1698f1 *R/circleplot.R c72e86a160f440a10cf9510b14b13fff *R/cluster.r bc88c9402ecdd6052788df9c9de7879f *R/coding.r bf4f81f977a8b4b33913bb39089b10a5 *R/cut.ordered.r 8d6b01de986c64b57f048fe36b6f30ab *R/gknn.r feb86496c8eafc9b9eb6546cf1b71dce *R/order.r c41a46cc9d9e1c409f03426bbad0c1fb *R/plots.r 8f2d4d1812261bf2807f643d98da152b *R/proximus.r b9ec985120759f11b82f51dac96b37ee *R/rock.r 643c8ea3ec692a00994078661a279447 *R/sdists.r b477f489bcccf323f5005220358a59a3 *R/sdists.util.R 8366a3f21b1cbfd5ae6b386d08c462fc *R/stress.r 4d22d82120f151d8a1aba261ee1721e4 *TODO 1d082e34751b86f2eec9002530ef0683 *data/Mushroom.rda 29252ea1e818bb6234d87dd45d7e7023 *data/Votes.rda ee0710bb02aafe991232bed602b9af89 *data/townships.rda 2e65b9e3b935a2d706aee404065623eb *man/Mushroom.Rd e7d0d43b11487f4fcc1c85554de6f0b2 *man/Votes.Rd 33bee73777d10f2b368a7f2c34131c9e *man/ccfkms.Rd e0384dd29d5be7df7a85db9beaac6530 *man/circleplot.dist.Rd dbbe9cfc0c79a2b5337060a6f2cd9e85 *man/clmplot.Rd 503dda383258bad096b43115ac527ef6 *man/cluster.dist.Rd 09a0a957026071ccaeb1520508b48e15 *man/coding.Rd 2b231b8b6e0bdc38cb7f59d6c4457f57 *man/cut.ordered.Rd ada771779c072aaa56a08a44b6c5f853 *man/fitted.proximus.Rd 79d0bd6669e7dc11b9b0a58d0546c256 *man/gknn.Rd 66a3c9718ee92b9824f78bc210f0fcbc *man/image.Rd b0208e716aa6d9e4423657a295274167 *man/lminter.Rd 940308c3cff8ccafa9a76bd465212274 *man/lmplot.Rd f8f2259574ffae733ec04247c3551b2f *man/order.Rd b78b9419e53528c84ddbc7329161e4ca *man/order.greedy.Rd c1b36ba2096a39cbcee4c9be20940746 *man/order.length.Rd 59c0d131ca165bc0f33eab6d0e5f831d *man/order.optimal.Rd 9fb50f3e806b85e2a083876805600662 *man/plot.sdists.graph.Rd 4add29e1d8ca3101c9c0f499d6ffaa3a *man/predict.ccfkms.Rd fe35e1ffc9d3b99109d7051d70b29b18 *man/predict.rock.Rd be793dd63a489009151361be90fe0d0e *man/proximus.Rd d16cc9c1022c241850648cb9632a3873 *man/rlbmat.Rd a4710e715566cbe4cc4c894cf6b83aa0 *man/rockCluster.Rd f86cfb9cf7e0d48890b0596b7eba7c52 *man/sdists.Rd 980bfcccb80195048cd88e1b72c4d68a *man/sdists.center.Rd 658471bdec54892304b264ac06622275 *man/sdists.center.align.Rd f6f2e066701e66c42079552978da753a *man/sdists.trace.Rd b400b9082051a9387ca3aa94b65a6979 *man/stress.Rd d6cf6d45781842f2e405165bb84448fd *man/summary.proximus.Rd 85110593cfcfd8e14e87834012bd95df *man/townships.Rd b4a706d255fa2aeb729a733b122586af *src/arrayIndex.c 9ca98783cd41c91a125a91185312e181 *src/ccfkms.c 111261fcfaa5901cde5a0f4cc7166a19 *src/cluster.c de9a955fe73b30af2c2aec47c7f24251 *src/coding.c ef8c122e8f74586bbad0f10e8735d795 *src/dll.c c64aa0a75887b9a61da5bff3e3354d88 *src/gknn.c 17e1e159b79439e8c6a5e6bb0757908d *src/greedy.c 754456b8a5172943507a549604994f39 *src/interpolate.c 5899a7d270acda77b1e527a0443ad0d8 *src/optimal.c c72ac54ae06b7bf17c5cfbef9c6ec481 *src/proximus.c 6704480fe9751a7f71c0e6607028c772 *src/rock.c dad8cd9333cc78a732108fb4c271f680 *src/sdists.c fea7854b09482f20f42dd2b41beaf991 *src/stress.c