fastcluster/0000755000176200001440000000000013146533700012613 5ustar liggesusersfastcluster/inst/0000755000176200001440000000000013146376104013573 5ustar liggesusersfastcluster/inst/CITATION0000644000176200001440000000133312147514400014721 0ustar liggesuserscitHeader("To cite fastcluster in publications use:") citEntry(entry = "Article", title = "{fastcluster}: Fast Hierarchical, Agglomerative Clustering Routines for {R} and {Python}", author = personList(as.person("Daniel M\\\"ullner")), journal = "Journal of Statistical Software", year = "2013", volume = "53", number = "9", pages = "1--18", url = "http://www.jstatsoft.org/v53/i09/", textVersion = paste("Daniel Müllner (2013).", "fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python.", "Journal of Statistical Software, 53(9), 1-18.", "URL http://www.jstatsoft.org/v53/i09/.") ) fastcluster/inst/doc/0000755000176200001440000000000013146376104014340 5ustar liggesusersfastcluster/inst/doc/fastcluster.pdf0000644000176200001440000034473213146376104017407 0ustar liggesusers%PDF-1.5 % 22 0 obj <> stream xڕXKW<|f'89drPY\E>%qGLFǕZ%VdU??S+,q2LYm %(ǃ[{לʢ yvf"BJ2Y"2T}آ?nޘ,mMUC8k,T"SI"smIڿ&n}ߡL(49ogf46cy)MZl]L$/U]bu&~|2m.ZHQma, RbE߬u.=8*{|a]2R# j4:tv"k׺^$XG^op?pbPIU Z oa)OeǓ}~wž{fVdwS}Ҹ#eT2mO.:it[]?R)1Zi! b J,pev?n{'E*ط^VT6Nh{PتDT]n nkMxDzp= Ct D+qЕZ t7ڍz^ k=/4Uh8x PdDR#!~%`7uuwc%%)|n^CҀIs0/1P y'Y_Ƃ*LRh XRrҪ|j"ؗ(kK-Ӽ\UE.3@N!ic_N2f9~8}Uj)+Bᵐ)E|}P4R$̞1Ta&P3-^䡔Ak/ΤL>]SWsЙJ+9 z:9u^,kKbj8Ex@w%LuO\[U ʥ/ Cbʤ ϵ*CvW9+E=@!6$4+9e!4ny*з7A, 0"A/I"a~p= "s5Zi3Vխ31`a4C뀛b[ꎇGp J6XC.-'\ ⑮p\Qx#o륚 \1[: 3=U.S[XmOZSK)wP.G 5",FꁐF>BZ\,%+^%J\?#κ:iS"c;ΌVФS jDw@YTaX@&*`- Nϊa4 q]\t$4Pq A74NH>0J;_%$cd¬r@s8[Dbe5f]ܷ.W4H YG,q'B$'szlp۩ƀDK,pZ8s%<<N`1s1"ASXFQy-$zs,Cal N48`yQ_@7 9? 
f17|Hlpt{T"Af.R%5?/PHS3g}Ρt;łNש5B+|=7T*;^8ĥqX L:alJ;ږ*z{H஢WoB|6 >KVA>+[xG񩹥Oa󴎊tJ"B=%:UiAn/˷QB- #a)5>)o6\ךk ]J7QГ6',7M@fQ1;$‚ P=bMB7ot XwBP{{봦]G >` Ĵ6#c 1 9gRCOkw7x7-&CO&eAF$x&^&Rԉ^M^ڏ @՚F U ԵN> stream xZKs6x1J*J&lh 8CI[n?Fwo)эF/(cf]uvw1I#wGaH h4!BKw;oo u]W5&lkrJO!tLt?4|{3AU9܋>d$ut^BEwrmXw A_Gl◟j Gg\W0(Oe`jA&L+ q2ƒvE6Hy%G$S|>te0o TN꜋(=0܃B j&wKnb_gk6]t [{Sd8)E♀jL趯SeB P$KX0$|]5V% V,|YȃI/ 1ʸѹ=xljbЮƺ$0 9+ (mh8 IiLJ黝ELnQLc5'ed!SZPCɇj4ǵOT~ۥKa] UhN*YB-|4հx*OϧehnP9il5*.8<x촞qXה}\ Dqǭ=ڗB K KMI1sLawO϶Zu y/K !O pL 40h-L\u̸Iծ N.?`ƥj TbŇ޹&|͓_zbk>sySz+;^:i:D;e43 >SJN#'zϪ0^YB.<DRxM Yx&O.PhI= FO#KJd7X\(x'p[zQZ BA RR@¡Whć1Hn]s]lIv%4uGA-3p]뢅f=9 *`{0bXP؟ c @~h+W9l`]-(zm-!iH ^ `b4@D0ol0!+c!7qHME X_"!صvACj, 9`4~N8oCo'j|Ʃﶈþvtd\ofB~9s =vdP@ PN8pM5oٯ"hBAM+hK կՔJ,q!Za`FA#_qapJu@n澯8D:sΘJ4 aK(@/L9U`U{g= bl[7>S5uz?Qb0Kŀ ;]]%rT> stream xڽYKsWjśS{cښT&N.qK%E-Ii hlkDht[P9?X7 J ?c]vrQRs}X(A Q,VBÐXn=~X~?-{/@-f0Ùй!wKwUDb)sjct-NX1IJ)8n]m\ {݃+^d_=U' 4iUo<6۶ ;k770ߧ!F)ɹwB9GUp$Ĕ+B,8)E9.H硭)ܜg=pQ>rSq}Bۯ0Y0 "kkr6=c40sD< T J!UXMb' -ZY ~3VEcIf4q9`†C_ml?Dŕ$"**l.%/"34x53ƍyc(Fsf87cX ,7h-@"RI&j,k2AXW/,o_Hxlȟ_%=/0-_a&OЎ4qc }|=VvmbrUUZO ws ҕ{Q}=6_s\ye5TƗ}S @UQ*X{xP냕1=mJec}"X+I'^|}V2FzP}0zE@SB}s6'e1P&l,brŴnO>07焉rTZS{CzU(Ngc|!x5xΝU`c.~3)NR[gƘ_ yk+it~_!.wZJR -й{3K^*~}JV;ak]mmǸgLm9#’Vy‰~.el 4e[󜕁!ag34j3Z5H%P:^(Os  '7F-ad7к)I4ޫҼqp{C!dY.VRv'84d][;v"IR ' 2O03H4!o "Cawڃ=TɃ>qECU2y,ɰU؍M&xFF#4b. )ĕqqHHێneIvco1ֻ(YGQ#.ۤ3`b]~lT%%zt E 9kd,4]yu2wOh735QJoqF30BLs4xO}>Mf >s9?|=<]@ckr6j(Nas; z.Ѿ endstream endobj 85 0 obj <> stream xZێ}W-d}v6 k, GHX<|}I.F&1cd/u9u_V|.-.t-B K(&SU+z$[ju ,)"` sU5$F1>~ !M|L'8,ɶ[ju)ߜϥ2ہ+JUnVDH.lQ疨rX!-n/f@ezb "{ =/7{k9yH'&'L:?r.w姪b-.ȅF=ލHSW&L 67g1+`<엲$A{cP(=w+9%dY-c&B~SRŒQSISB|a#z/|".X)~YfU&Zu:Lm-lJT~Hřzo2 *9E@-ޱ+*=2\%^m{ַz:3+Mp3@RDl?@)H4HNKi3|w`e fe[b2͐HTsQTˆA œU.+ TPa:}R0n&q>Z7s152o;x⇦Ct҂'Q!oRTC21`H Fܴe!%AR33# 0=1*0UKg\M?@{7"qqƈ[@^+|. yGy45y4ՓIfTÎFF!b,;_Cl6e` @P ~14_ƟX` _,/RPYJ\l)1UOR@@ez -&OT1֯|i4ƿK1ܽS)z&#ɔWoUTRSMpu!Bw}҇7h~o %>Uį 뼡5%?"! *! 
i$.t#&Br8:LNz ܔ BpP(h) r]ЀJB^:#2s|A#)RͿFE endstream endobj 89 0 obj <> stream x[o_ʸ5+~S{]CCPb+[Jr{_Pq6mQb-Qp8f;<ó$7-7U2 "f3NjbdStd_`T!8}zIRr}Ԅ>ʮ{H!R{otaWsbFƛ8s=>1Iָb=kS5] #ESB8MVͺ׶mU˦ve]ن뺱u).J+YRts"9QIQ$YwmW4n8Jte*ªZ PP7q߮m榬nWb"3ɂKQ%SQK]9L`Ym Ps[}*+=^g~-+ʻ>qHʤ,mŔ8O3i:H™)/֮RPDly(‘Ucyz1mlku |LˮnZPVvHnK-!zu7uSv,C$͂=ցTO+@c%FHpkppܼjemnkZ]U&7-v~HQkӓ1BG׼-@9[Lk0rvokmQuU L!k95~s<@I!{7?ew/])2OL/jkN B>1BB !z>p+c *A)0 + 0F*X7!Qf)^KO6a^}K(RG"aV55_bN8AUc$1Ф/ eL%9T02+c8p&N++~>DxSy#o( S7hq7}up6Ek(>,`3ㄣ̸ؗd%gJ! 9,YV uLC{iaNWyA̢P6`j%m>*49apH < Hc`mؤר?AI{OG:ΐ >XK!6ΆNb29<O1~bt;11 S{z}= } y ̏BD$p"3.xUQ]sPzkB@UC:ɡ4)$xf C̫(){xJvJvrJI *LS}7a7}dJp N郥>.De b"Rhsq =EHA֔0r!{ My@eJYmR8æ\9@)}\Vį}K<scp۟,9Yo".3]%PjkS2a}pHqQR /+q> DTi<8F"xLS& z7yN."e۲YɮfҥJ3mתm֫j+z L 4ؕ<ɍ -R14M~`p qh󤞥'ħ؛14~1q$0- ~$͵nA<$$aB`)@p  !|}a!x#1"?#b#iJ/<`9I t)kЖn1ZkQh;G.r0h`%LM˧T\~)d32HkmK@_a=nKZF5\:)@>,٣ćTVǩ(e_% Q wV#ȩ_'_c]y4ɆCeoBEt;SQJ^t?sٯ׻tR~Zueb*Axq}dU@"> Us|}&pif R>S' @ܔ)MQ說\Yhb_߷MѶ&O)4EDW `TIluc->QpG')x˩_8lOb]$^'6V#5k EMWXsk_KTb5w/<1~J5?Nt2R_iMUjXrJ9,%oWor_B;PCaXfܝ sl9”}ŘL\]b>2.LI쯓~(>/scH%X_ǘĘlQ0zgumT斫}\U A[Y;W'ΫQ8Fy͛\\"TW2ZL$ƒ!?J1Ȳgs^5[fL^Egה3esϛyu6ى將뷑m5 nN\+3+tˌ]bP[F x z3nl5Eys zV`L7t;}X$E;̮ι02Ii |. 
M_8Zwx^ɶG> ;{;K7d¸X Bfzp62Db*3 B4uaPЀdb f),3c˂lI(ۏ@'PNf!sP* 12-e endstream endobj 98 0 obj <> stream xڽZ[o~[(d5" )ٸhlhHCIv )4dR3gs?tK X9b))wF K⪸+!Vb.Dq_=yltOQ-U)pvH'-#w/6eZq"5{>UjpI| ( /EVXRXfV kEH(y'CIs `Sܷ\ٜr~1z 9eZ G 5=vwp 4ް<ݲSwG{i1gXv=W!^6;⤩ܷ <Nm ةV$R\JO}adTC:3K$`s /ov L߭vP|t+6@dE ,@%2؋b1{|DYH$=&"cΝgc'*hpm9R#fbn;@X6#m,(3^Q[~eJE%L$i(_BC;*!p93bj50~?PЉh¬pL%* apE8z1Ʃ2`$JaS`Cӓ+BDrtYæ \';if/Uyb]|Oœ_7_x:lH B@}|qa&Œa9)E^ew]7/TI>͢ugEaozS!'/ݢooI>!1p.dtZ/)t+fS;wAcҝ++{ZŰKept$N@pb}aEI x| av GF)D.4P5BcJeʒ#HHEC BkR;䨯!L^t%*(gaJ>9`˝9l^Ve1CASS,HSh!\R"j"X]*Xp~e.9Rĕ-UnvmBHdͧJ$Dfe{Ї){20x)PQWK-q:mö]C miq?TvqfPNkD`ncOdL @ypl0W\Pz.IuԐ8IS>KS8NcʏR1򏄐ݜaI&w$=u/f8|MTD2X= Q T d}(S8=Duۅq~2, ljjX۪mG;Iu x#6 PA˲mHޅMLqC1VNX+x>LgQ")iޅQ-H]iup7.a\M z]@{KyR.2YtBg~RHC9@̚`Tuxc>n?p )MڌPG].Ahbq g웇ۓQDz.nLk|~*Y6Դ8 ɥFʮͪ 4j+%c߇,~ƕٚqp8CA]'xr zl1ReHɐ#GX,V~tf(Vudٜ)s8*tn t9d+K1|2;NP x]Z fUJVDACm}clLR) w[סRyI¬)N)jOp$D$C&Mu*=YZ?&K {$0oL,|ON^|.Ȗn'w-Il}@>H8 %NYFjPMn([uAȦ*~W/:zuutg<]YbS} Ox$\CRIFBv1veOVx ?=Nرмv9żL'\?Ov&7E K{▻&?x\UMь4f6ſW'{p3\K:6PXb C V?DL endstream endobj 115 0 obj <> stream xY[~p1+ީ%Hl&6[,y)y&{IْG^kv yxn<].4-]ojL1}XHN 7W0%ӃydRd qA2qf.WLIS[IӋ5hE㚯&6$*>/pg[wM(@uA͔IdvE!ytEדPM]evK0LǭMre3K1Brn5,n+)]BEm}#īqj )OnapJGT0 %nw}Y:9 ;M݆>Q'B֗ŏ?޺M(rAiF 3^*w7/#kjh](~5&ӶN 5ABT1( z$9DN4ӄfc=!t,Au_ۏ٩-20j9 V1pø abLM(PsK3BHiFt cBݻَRY00柗+!dSoNZ"^DK4%K<"y9>`wO7a~ߗ:TVRyFm%b;¡ ]+r}$K!㠐f"6袼Ŋo\M$m[x(zTa,ǥI^taM]/w/)(XnBC3ѴQ c,DCw`8k#1CSU!b83}4@(RWeu[1B%!r'l6o̒ڂI L+IcQwW;t`h.{&v"ϜH 4ԁ?p4̠ѷ&LH[Sb!AV/~K #_~3m;#)NȠ }r5_qҘ-)h`Tߕw\`m M3,0&ǟ<7RKt:z uٶ宬rwЁv9(A/Kgсܮ.'+r L@*i endstream endobj 123 0 obj <> stream xZݏ۸_aKe fo1Cyp-晴%$fpHYrYZ"(r8C/+_Yj{X}~1gY߯d,ViK~~\oD}%񮢮=v-l6? г3z &eԱ_kUCZn8e΂ь;fUˉPw6W D6;A~+nLs Μ~;~"](- >xjcݵ?Cq]Ԫ!sj*ڟ{+XS%[*XfㅗamNHVsXzWW}okb=9P竍>,D ,-P3^*na&vidWD#HxM!xPDW{S+4 SF6ְ,N>D9a$.ѹɥʱnoͣpW(cBudɂB >BD=p$7HəA̬M{:wSp%V2!t wf덱*{y%<&ϳ]9ԊQiU˶T~]cW?V;"0dЦ<j]'fvB;D. ^l9>#Ȣ"0\#)JLJHʑKJReyAUԆ1 `gwyPk}ۂx<~ڧbLp|EB  mIaBbFy<zW^ 8g1-@^/a==eCE#v i 3^;(nʾ) Hڞ7-}Wo% %{Y*b. 
Хl2bGl]4 1 *Qp,W++i DEq) ءloOMLE[Tք s@'2!$\ *Y@Q(|i=x.u$P&Tj@ѓߥ= !}PHKSS%Σ3"HL"62vaܝH$}׽OU{`h#rru/ɔcSNy'AT*ND! rA۹VYBHPJi[l_h6-)Þ)6ŜcWs )> Mo:QjoAk1וeS,ݛx0;Tpڦ+Өq$`"-un!0 _Rg-pP`ןkN1 v ΁s"hjሮ*␁1$02xJ?#I8o%Vp Z{!I)g)S= \!<, z՗z 2BCǀœ_ mV=rlx/Gju-"pZ A?ߨ]$^7A^-X ǧ#žK}Ε[ơ:te Wx3#:"̹.x͏k h$CX)WgFOМG\<;/jyhAag<ҧ"v(ͩ ">8jmݩ:W8?JìT =EGt˽'oGO:|.ZŬ.-Mimu}x4)/|&tт77f4}H)6WS꡷D蝎*_YlA)C/^-]P7ջpT67r2' cp7A)҈pE)9l!LP271J &\q uMhKv`ȽkMRjYg%/3cC =g~/y, ؇z9tTNda/|3%."Ԇ+*xWR,,86mc={g7"^V?GUaf[u!PKm$×#:Ww]΅ϿkiXx>"J IL@v#~xBU0ɁQB|IBOW-\/[1`yYC> stream xZK۸W6OvW:[[[&p#$egӍ(PLK[ 6F_^E&tn~0FBK|nZ=OƥȒK' հߍݔ7U&uá4[ztÈW"YC5a?M}JYU`z}l4/X>f.E˛ʠQ)Ldmݠ̾nX=0.O]Gc@8u ػǮwʻ-]u{7YAKXǩT%U Fc½+4iJ4so"=YョnZk"nhh;7ݗ}Ip 8~(dRbɻG\h*eNRk(_I (n `I`!`Tu e㮜;ࡍ=* cM,aj.)+{@^Ov"|v,;uUH%۹`X|wFVe,zF1r??D66%/`jZR>HȽ_خÈ+ کO"NrōIOF?[kyC Wri̖Mc) pSPx5LNxC=6ȅ6Lr= }QCVӻpGݸ'zEFҸ{eQrFc 5Or$Pzp^A`; {48D*9>~B@A%Ʀ%/i9͑YgH~KǕYvuP@^be]LI${fX⤿ dRD˩zuaG Tuþ֋fiw ={Cvgq$Y;U^N x_MjRYB?@Qo Z vҞ'CbTLx[C&a_D=ʰ$iT)L/rΦP21xEtQ㭈QbM(ꤧjxPob #~yg@#Bbo`ۺZX xwau5ި4 !^"A[<~FzTG~!Y0=]6 ˳C 8?f֏p9/a}"?1$,ŐLY+!Y0&L'h~#bzgb*s{8f_~Mdg$J0shiG@ s&/ P~T]bWH(md; ۯ6m$\i zXx X-k\lmF2,>6a{T^P4@_OjXf0kG^ߪ6{n$IY' Q)'ҟ%ր2ͮj~XO5"̈+qY`R``<4դn?-ۻލE2eRMG 1@Y9zۨ!3C\ruk<WM3.Lxao踆r_ʛQ# >UK;:pxѓY=~_5`#3I m=zzSг({_b<=3:聧ݾ c8;QS[w$OykHx YLN9?0SaSLD"iTrGx.,y%g(h@xic0`WmlR3?"_ c\W%Q k:ɽCbjxzW= #2PO.{fi7lM[<fKv*  -WYc\ɰ4̲a S<{..=56`e@?_m/!*ROMLl .!J/PJVLg1|ډ ?07:ͽ8tԥdL4sƲy?ž_dibLz秞RΔyX[Rg15B~{wy OZ9Od-KO4@i~4';FM5LfKs,v 7;+RIrmvvs8-XgG)w _khݔ/ty1NH&& "nq~j{4#KXzd=yx8 &l;"(-2*|'@ b'cҋ@ :7qgz45Ӂ_mJG+*Ka[_CԣPHr+@#؀5BU1%+m.x#ѿ8?g&-Mq"W0V.Jy1? 
&DaA~9jiBJ{* BD>)d66쯊)@*_~6T[?lV52gR+l_eqn4}Fߢ//ֺo΀X[[1\|G|i7R #yf_U4ihepB' WB>ګWsS 69 ‹e1 endstream endobj 130 0 obj <> stream xZo_>UtYLp5qAyWuޯ8~$y|)(rg~3cL_duvq"H "Lr$T@ͮɯٜ)/+7 v=W݌iv߫KфXew9"]K# IXXn]X\9#"ufԔxbbV쨘 X14v V쯝>-խi =&.(R!Byu+y~H!&e53\I4‚dIւmW Q +x`,K(:#"h߲T ƅ #!C+ØD tu$A^) 8СǏ a[9SJj:"(g@^Or $3QR8X35Farumn  [Z' +{f~KO:#Bw=<~0lZZsֽ;QUѧuyo)K1ۭ~FZitk[m u\1}v0tMטmv 1}̍S()rZ7C0,cJ3mʏj=}  s5Me#-Z&[G{k)Z{_US .S$)&8͵gHyK>h@pRCdE9" D*o/:5)> x*$@6d,oi~X~[m405UW &U}ɈHd $PQB'CJ/*'M{\) Xe6@!r,Fԡ̉Z˙$-; zާ&PHv> X2bHqrɦs ɂeC? O"}I',w fUhL-Pf"xev!t>w6cR8)$"ϰ |bF$"  x騜I"=3΅8xX. 8,B dMNa?v$D6!5'Cx]rT}AX𰮗'H_x>6,YySM Fz&XI~&SLgA5P+HT2/ka!}W@GKQNVi#4(=DIBYCqs ݧc2^ 6w:J@bc<֛U ]m:ս66TRIKJ#(7uXM86CӤr1y<5!5ǘ )'2%%fKyc:flu:T?|j8l?SA1z/a &(q8<~ >F7`$hL|KBt%cI-Yx`Rf1DͮV鬄āyVBie^i*Iƌ[Wa]f+>")ef$^媽.%*V 0}pU|(dַ_=)k'` >$EJc3eQHD^g zYh+K{PMT`W)mH/+?;)> stream xZoܸ޿BNU\עP$zv'i:q0E Ù߼~x?'Q^_EeQ2#Ltu 2ɣ40%?uE}zײ^Vd 23%JqH̦*{e*3 O+:Lip\WywRMXO : h][&>+O7ͅNOދ R|3ѫX)uYf']bR}:\fw]?%N4cZ Bxi32%3&#st\% 3Py¸^2lI%jIz5 0[WmozW~ Lds rUSF?8$bmS}<#G9{C3`r'-4O^:3&.%,cΕUcѼd(LYhÈx(`f@E2Iibm7?g7UTٱ#1̸*bas6w\0;zkhØo-*Ǡ-ib\fS=SI0+af#HM3SPa@xbWC,jng4P$3CE9 p e "kc^w4h#Y #dt,fСv{9B!M]+`? d|G}صwÐonn˔IAJmΛԅQR`%Qŕc$Xm*|zS~[ "r#s /Zb!| _˴sP޶>]Xc XU9̏0"9 <-P@[ƭã/q~ !I!z_ŀ/|P1~Y\h2c)̣uΚ P }bXR[#)@ %n|ٺN6 nok&m^L 2|ض!d$VBZvP 9ǡd2LM,e6_w,µPkJ`FC` 7KK9yKM(vk 9f1VN1vMq e;LEφS.jTCO%!OZɖ{aUvVE%L.<˷%Q/&Aŧ<[4njߓakJ{%u4֌2rw3&d64`B[TyM3 TjoZMxicy22`f iP;Pಲ@!ΦN@٫so:f<~wc\!,mGehS endstream endobj 144 0 obj <> stream xZ[o6~_ه-mEA>(hb[-g&ԅth[AH)\sCg0LQ*"ۏ23H*L0 &.jVPz}kmٔ3/ S!5RJ߻" cD7)8:6me6݊rhʶ^pySw[(xw"P)f-ڧ|[?EZV)qo]:\ ʳdbD ZmH7*\<Nvܦz7[ٖy6j t[ZUwkyu߉YVu=s@k){7/Uw۶te=sUl̟[ $J#⯝5B?cދ|Y/ 9spN8-g{{FܘHe(wP>RuAY`6vs XA4!BduY+G \1oE?. 
Ȱ!VHhYAMLEbDT~ޏH.(GFڥPȌkaSncjy'Fd+IY$ڸB&)| I ~!f 2"feQ e[h-!ҏqQS1(ՁH0 1e8^bh945LXpY~oׁُp#$>D,bӈBMmv$~ PqD'OFvBQ)DZ/$!@ҕP=##yWq`)0 I0@&#a?3Q!$ Fz],, oqx(8hF (kZpu ext0ݎDo'F[%\1%ěEid'gmH LR aH@sdG G0 ռm6I9'j>$W+YF_cFb44|@72],/ç3IjQ}?LeNc3 ]#yp -n5M d [q%PTۍ]nRrs[U6 `^ŠMɋu$'1pAm&$I9kD`~񫮉bu[J؛EWu ~}8Wym̵\!նqmݬ~X3a!J2Rh8>R{H>^gp> e[%u,𔎏-M L1QSQ2֨p:51s;@דY6;;f1n*?L6I"C݄-%va¶c1< pU].K U%d;e`}} +(xyBkcBCD6# rq^c JEÎV9P*S;PLɍi^yP T"!dF 农 2ڤ!A@ɛ(X8M'Enڪkuݕ˾\Fj u{(a/PWÚ @}Rm|ylz)u{w*;@VX઼-庹) Sr)_ߺ؅Ţ,,-+'JV*飐==H1 `H7d(6(:O{NA5Rir' IUh N ޴ao4ElkSjtBC줉Pw=jQ;;[=ѡZac@6'O[_?eUnϐO 33AI$e/%˷طחji' a%mɇA@uy@ C8HxAɼAJAmXn,r&e6߄v@ާSPeIC 0|v'{ b_3Ri ,,?KFs-AQ?0ve8BaBG!G1 N8(~̆6Eo'9JYЯǺJ:O*~17S0o t0lKj OmүҚ2г)0gG>Xt^OWƦ]g%}q:IJuAM-۶0}E.@NIyO:(z nANO1RSN(!$ FYO!?& 9_|ۇ! d>xq~[]=ɓEb^HU$KZٚj|v2=D^IO$Kh:6,\4T]`(T lN9҃ճQ fj`S!5> stream xZ[~͘d[EPlyXY֌YYt}IQmnA y;G30$_:7& brw?)&3*-v/X'G(J9bcٯ 1zB" '}k c(UO%\WQ`B b4"ǒH~MU77a%_*ExqD %Ya:B$)^t:#R&^Hrp(M>TRm뗩 2v%YE̘B3'H=oMCϯ[lkh cFEM*jq߶AٟA hp7HR/g̪2dѦ3WZ!Wm7O~5A~~< g[3쪁.rK_k:03ck(yP蘵)`7Sƙn/cm6XqtҠei: El=n~J4>UC:k(rѵvI`O[߾1eGq%ԗkrmLK/U\#*,q>Wg>W{/|(_,kB̫" 9l-ln*EN}{!^i[3kh1RXEV-6*7|zf9$k[W12#|d&ڐ|qY6d=8l]kxjs3ʨ]o9CͻÆ!˯=O&%dcu vtcV7E5֑3ǻ9+}L[[b3x뾇f0 F8[/bc٭X1So߱iť )Udm]w7w=ߔG퇦'_7ͧS O&"a]VCΪrq|`n9Te ̠q}0 Fn[⺈ѡx4&:EfPkEv>|{_ Qjml4j4FoܓU=e$67\kGv=p*{+Hr@` !Uam6&Fb-Fϟަ ΌR ?{f??4w._ONچ}4C8u݃hپ<^GiGðZ0@K{C @H w ?ڤ^`ˬzhEp97 @2d J;XŌ;gXrLc/9ɽI ;OÈ0Tܢ'=6F~!G'y8 ;L f0pԯ/  endstream endobj 150 0 obj <> stream xZ[o6~߯dlxÊn0`]EfwJrR..MðvC\j$#'e$%AFR%!t1 C,9Y~ȩCXP枿ˋ +fL43HIH5Ʌ]eU^yvt|W/xb^·N˭w݃WT7ۦmb( +iYn# ߬Zgeծg`&ݹE?=""I M-ZefMV'8or*'pԶC*J?z'Uc݌2rTƒw/nкܟnm^LFALBLs/هxahiL ퟮ2"%)Ĉ;@Yc )f?}\V)nB(s59`!"d#ij 3/_$w/F+W  % w=`@? a\AaPV! P8A90G|b vCJ5ь`mfwCؤ6E>+/<֛~Z8@vl4v0/]V݈~57reUeu 5cx9& S.;*lTts}7w݊n0o+oV5$uEXqhuИVz7r֚Lovoѕjc;.?i ]uM> _ iQ: t ">DP;mwL1`.1?xOb73XaЖHr svknx@+lfSF5VY1kef`p s+H|^}l:|̏~$[:c}O0EQcfD>Ͽ"n-ț(TC?I9^. 
tgVk Lu:oDv}ሲ ZC:-"ͪmU"׬R- X04nV' snK̨EERűYcUj'}4^} )0QwjL!)¡ YzNqL^,S{nWzWqľXdoCޞ#բ3Kqn+wù,e8KD" u7$C$6#/5v\Ftd2`9HI|raıfq Õ3Y@L8ͣ), U$ =w@;];`x<Vܳ2CQ+qe^';"/I t3@2nXM= Imܿ Frm]n d,t"W_&E"a%aB)&/ 7^By=A7_] endstream endobj 156 0 obj <> stream xYKWbuOf:z$b(R&)O&>U]9ija?Uu+$2O|KtI! \rX-r';`H'wn@ȴHf8xik[C>2cJ;wPZ4KvV @ B}Dlyfv'|]c%2۝ʲ̅&q8ᬌI.r<%J9"(MrJ"U@=:vZh U/ ksJ&SjFfDD ZS̄.EʩL& 3?+١5gD~M"]:L-\d!jj堹`IujA>h\[Z3"Bp !Yru{XBIJ:apW8YȽd}UrRKWBO@/`s<-ȫ0k;j!3Y1v20he?vc]Τz3ˑZOGbl׭6UaC7]K+Dmn6`LϯJYh+*/p,OW<g_cV$NK )CJQXRIك>VAʓp&c6wgēfo9$($'T)ȱ=T]S<84R3u| VB0]i)ASfє:! "4MY!9T]?_>&] \6k\0$:d,ef*CQ뢸!݊+Mn L&J C?SZz5/}.scXBOifkZUƼUwȪU*ȃ*&{rk hCH-lVBQoU6%SFTDk,G&35AtU$-dߔlNuT$T"r5U/B.7M]^knV# :zWPS#@;k,VüH  hdm=z-^U鴵*h's`b5cX5a9>O%D5  [hI0vE3xMHXLK[8PS=eCܦ ˙v+ n֕=v0:Ȑ#e=}Ӷ)ؠz, @v+QKCKRp4>}-&VF5?U+OO}~/Z73;2 N5r@Ϣ5-C׃NJꞺ m`g!lf6?{ˆ ٥PåW3iy]Ba&؛뚂3 >hź\K0l}]-DQC*b Px +h7ñ4{"h%BԐ9``2.KtO?T'D'Jʜ.}4w4W}ߋOLP$<5"fPDrt(ރ@6" txJPWGe RLY/J1@)XYfWPībj~H,AP.jo )]d˕BP#qdN?`K=d ayRs]FS8P2 Ud  ׵sya-ʹЀsĥ1.Aըz~C gb _t^\>Y\~K\脠Fh!5! 4rW荏4 _M, pJTc:2d0^y8A 1]DŽ>0Ut#Ebb)(% hIC/.>98xhMr5|;Q'3)dX W_[|QRaT` s_۽)l)[6xڑ=hO %MMkNd}ۓH߇_ݤeRm-^Zi<F_~_> endstream endobj 171 0 obj <> stream xڽYKsW0Ѽ%ةl %J`$%%ߧ{zHQ>ytOnξČ?1sj?泴%Lp7[( Cjv+z}=m7*_*QZfr*n_bxJu^ΥZTѪ_O'-uUvhv[;k<7Őc3ZM ?%nqcjn0lf:O6`oq b`Lf O`'"-1EKQ$Jhi۽MS]Z\hu񝖀^ijM!>z=DR՘mZy[iLgg(- _8YV? Ny82@6P]M6i1]nr"1W JyfG+Vց'KP$3Rў:2vh7;e7 J_+ϩI|\@˔ J*/ ܾ<ʆC.*0uJwD5-'/. "'e+e}%ڒGR `Z7 I*Ka>9j0pDa m5hW_۴( #!YpkZo($ P*k}4-xPЯz )\Jb7r%9O__};z<'L~Y<237إEHI9a:y>xÉS3w$)o:s)R0LqCE_p̬Tactrq|e7Ë:q񦬸AflvO=ڙta374p6/zwNlR:sq7/>X>C_ ץz4UVR@ 4gA4`OT 2vL{`4 1)*d/7$!Bh3k;Q(M- f?5G^S‹X3ɌL"JPp9Y $b6S_EÏY(1J0|wy>FU?l#>`!{o/xygXf|^{Gcc3D_HG]S,(CMq_l܇+ʸ>>vdۋBt] &1lY@ lZvE:Y J 宺64PUo$΄|*[mTTŠ$RJzSN6_G >dl`; 09!^v oBz;ӦSWV=-|j <9khbtԫW` B;Z2gz0MHKD /w>_'V^7f^W%b@&*y=苐~=+$ZHA![CYw׌c 0W͠_Om|Mmw;v^C?}9o_> ݭlx#q@G*cC(_g!)>7]LoMy?fUߪܼ}pmcgC_f&՟תZA5~?%Vt'L *v6Zqn-Ht>LS)sIo):$P%ʅ3P"TEw0հ&v\W z 5|q(@ )7Bci#! 
o3-7gT>KLF-#y 5-nAm"ß C襹'd8Q䠦Mi3n 7 pDu_WiO:ɌcTpF` i%6˅ڔ endstream endobj 232 0 obj <> stream xڝX TqU vK\JܢF1* 5ȰL + JTpck&*1Dc^bI9?a$^r{9ꫪB880 BYje}'O8?:bL9FvWȢR&9v;D9c׳<`]Ћ8(hI-I IPO4iĉzszzI@`xtr|x: jzrP(!A u>>׼W^3Σ=7'cf2;uf% S~L$1g13¸21#Q df*"3g0 Of,f0˘ js25Zf3`OEO(MY`qXݮJRqξkp18uz33`pǟmtqwIuΏd65$ɃqX{hKV\"e%!CZ \߂mdu&Z)O];Tԣ4%t!˟tG GToTn٭;B@)G/ٿ@/lN+e[`M4gs-6J*3* q8K-޾_`v;:bqQ|C%_Xq.JsU~1r^˅|VKj0J9 ­ux7%geh_yZ F0,4Fﱻ`&1u)[긔؜P&{A"lj-0ۧx-iB .`T:ռVjJUP s !^ӻ,ⳎCMrsgS!c[%+gCp饮lcb,8^[ _`VKy53t˶#?ء%~c*q̏8F)nKwv842l 5.;גbc`Q,PPhAqqHF&%G ʥؑ~{WIR8jU7u.K ciގY8}ܫI‰>4+؏+_ha y$۬) hie$i.^8oY~ewܿxclSD dӰTE14ݰ lǑ\n`c49 ctL >?>INjPu%ĆJjn}XOS^Ww@1+s+#x8sYk΁lf'$@d_! ܸ;.'QM8.fh4yZe|!0"f鮡e~z zmK^7!R(Z(̀! Xk E#G: H)Kmѓ\086|3q"hcm&#Fƅ='C`LDZ ɰ^ڡTV,rsX@ qI:V%#㋬P;2lh_>z5t`v;w|q HΩFN w5gߡP MuI35|' Kd6^K@H8 ƾ#(5Q"yхDq撟\@q6` Bv9ה+jAc4a}(,U'7lp~n4پ;@g&wW(/// C͡a0F%-\(q4q/KZl_Xf rhTwu(QRv}>_>7#NKh U *_h\'Q$"]8ňbA$횤%Ym|>wbsV"#'hLO s"a_{JTp&9R_B"[]$2^V&D2q0\͆OJ/)h:";h]gdA]ͽBUul6)rĎ~41@i"/]3T㢖gE\EgcH0ڌ<8F=~۹!-lDA&\u-5ӎOhHlzHgf`B#׊4wl 4B/?t-,59˵v(e讔Q2'_WtRV4c^e6hs8]KʰR*T^F:7׸ފ7chb#JFm> wh$vGo՘*zK!+9ի4`8| gWaq n UY'^gG1j>۟)Zlm8-@II]gY3|:33o~./MP ͑(+ Q0^6I{y'Y@fuR Oc%nTQ*?mxAMn u ?gћ!+*9.5AhMАYdx]PУGTcx,PXAƉL{y2@fޟCptum`7 Y`Gr݊p:Gȸ==QZ~%K7/bq͇w+³VuO<ⴝO+D^O$mcʿr+܎}Q8D;(!_n PWAA D_o@yICa1A`9&&-wm ,~o"Id8l0w+,o^c8ڽx0 QM΢3Wd2[$9Zn:wxuV_QJ",zkcyŗ҃%{Qf ^PާOذ(=?7XfMX'^Kcq&Md5T.Kퟶnt=d z8޷LێS=f޹͸a_[+P\YF6W%F#$SZ\Ŧ6HH"PWn8wD;g?RUWTT`2׵*5pmly--l^B7-]XKk6Pw|Â0@} a 9i'KN$!|+ޫ''=_o3/RӮhmYΜAfsɌ8'ݎ3E^* Ԅ_܂;#SvWɫIFĽ*X\ kCtk8~砜r @G1R2(E0GUbݿ8p & H M}{eYZU`Ia~?a8;Bٶ/4"s'|'y029Pk ,E9>њ/rM0$ F2H ިl?~8foޙ,o"n2LZc}եgxC&r3sۏBpgХ.u+徭uܚj9r^ɎI .>ꪄ;Oc2; 3T P*U3;kb1 TV.]=C~βR qF;?4iYc9ױx.&v=ųiq- WgJ]Mt>fwB hsM^یu.UvRFP mYt<[{A'dm"5d3ptNmo]\6V((9v3;m2̿$ش4pZj=@ՙ|%LV4ˋm-b`ȦB"%hf> stream xmS{LSW>"sJ-kb2_@\܈" ,D-R(PBAyP^G{ZtlUTteqQ--n>f e9\3vlqN};ǤR qM[2lݖKNJH3Vn// x Fq~-*QpF`FZ#0\TDa$E1)ˋ4$&%dӴ\o"S^JHqYD%\g 5dFD2h7+ȕLґF[V@g[eJ?dZ+ë K6co`[mXaXdcR█%ْBtl1Ѓ88 !*utM]==Ͷ:l`9{|"| ,^`AXќ>+fb\~|>8!~S8:13"X<߿Xn7[H#gy xq mVssN:XKf`hn 
,xu,(ͮ'U `:a^xFף4E{sb & DO9[kaUA 7֣d8;:6u1t1Q.:hځld ȀDwar$S r7[wnkgij UBLBj{J)ǐ2pjEͰ% }U [yG(Pl]m+mE~ݓM` \cX9Q Bc BP>`<_t1 3אStvO)ACCG(ţ 2%E ^9\њ.`]3mx$U=!3(C50{{ObW~}L[cbWpOwM;XrV5PgZXny_1[F>noEXm } ݎ^RknZpv$@ 6V͹W!$Vs4Zb( QcnuvPC.A~XL"ĤZĤRU&t+0}bgËGv_>~~UDi>ތH^ 7~«.A[kPZ* 9s{":..L_A3?tgPC"ƨ\T$D- EN'EqQQHQP40QOŽ\Wܲ?y endstream endobj 236 0 obj <> stream xڭViTWh*cڜ'qA$(q . BB764  fB0% %1qAø h$ƙW)PϜwӯ|}%!<= D)8""ԩ U9QJmB3pթ&u4BLUqwռ Id&y=)pr:,#Y.$0Lq^d\Mu:xvڎx1~>tװ)@4#%zF伝6$#QSx*!FH_>=rl2ʼnƫ) ǠS %Vs"y嗺4X )Ād8D4JT}?#^0$L#YnP,~ůROFTI"f 'kh*" KDإXmNNx괖> bBá0DH o>>k޼f&vʓ nҪ+lZ())ªыX%\*r@ṿPi7[j=9落f6NBM(/-J1?:dkN>V"^d#?*':[9x62 kh**07Nֱ(Sг;ck6DĆMͽJ?ȧ\-4jβp͸(?u&kCX@hC1R=oD[Dt>탨sV&솲bfٰgѪ Њnsvkػx&^`ߟf")x..pYo;_閝3.{+|rP}s0Su'`ܜԦDW- ݤ.?/'[;w5-lQz(4,}MBd;UNpAhJTͥopVص(ؠlgFܪ[#b6oeR@M8ɷWf߆"F'/;+*]ℤ.CW {3-x1nNO|/ux3[.zm=&Ao=ݙԦiԶ;ql$9ϼ,"<8̍31+6sp]\-:TE'~G U!l (oѡꎳPG'{X'Sb;g ZK=ܹڍkYo>9Ir1Fơ!9 A58.AR?8m=-jcbt$l>ġ@^ϝlXиhUS`L4ρ(=@ N(THޗj#߰HvkJrM/N %ž[&)Wp]EG NטZt-횷VFGxˌB!Өl0T6X]r;@?8qf{0}0G:F4y3O1TJ# gQ'b!d-Y2G9+D(2N> stream xڥW \Sg!{ED 3 S[-uRVhdaOr .hSNSתik]18o&%w?rvD"-k_2{߹Qo _S"I' r ?nebRvJLTt %w_lք%fx&Z3o<3ɗ1^3"C^^A^7 ~? ֍fPS+:j(i3ĹS"W*^4b(7jN&PQy|YjL\ TEAeq|YoMx~0@[$#whEyz|@.4t` L( ]A|R,RCpXŸbTt~V8y2F/SnCv骯r<^*ɀ(iCe8M8Wܠ%'^h\)#Cet1V{EsQM.tZ-7*LɴKtZKSH/p.Ȧ>kw:[\8$Hk>A!tWcoo,U#z=4u)V9wl| Ǜ.gBiwgF-z;֫? 
GaYU1C yݿ|`Nx1&I(I8}gQ"J>< "MqZE@PrioKXs۵w`BrTD#>tD#a@ȱ,wZ8S>?aUIUg7tm~`r=Gvm;{_"N*RПVj3k, eK/[̥?NήgiB%Fv;єgWni|b.oY8^8FEYo246fI/~[ϗ/,(F|`8S)F:\%h暽N_80#K &~W<(I3Q_iI&NwKjoCUB2]Ic`SN9 wz'X:ZSJg@4$b_cgY&CNqK UU/S4eru ?%ËKcPUcy:X!:RtԫsL ~@^*F3T ä@?{p?4@M UUa7 Q 9Z4Cmn,6Zf+QX=.- J?M %* qh% |8r-,\1V\r[1S0} FWVFopUK}vEEh[T0 r⤔Z-kt}fnU B&q~-:5AhQF+Rx@  ChKܟfBhF=Xݣ,1j4~akt5[s8'G#݊Q X+2k֔W b*/J  5j߂:*m=u#pJvK:x"hTm1݆Bue'OGI}[4 ,972!fdžHru{xJ]$qҷ<:NH# .ly+;O槤TɌrWH>=agJ a lݜ|)ax/-_d5 ډsg\y!2?G=zK&wƾKÙ7K,ȰkMݟ@vI43 \;,#Bl",zUf=/^as.IgSۮɪEޟInaK#J®ȰG3-)D廒Rvmŵ# |xڑEGyna#=iw~1I EF3j LRcNsc!DU[HIهĂMBo-9f_T勵+jhCT.ҥh f˿Y@ juQQiIa!X"Hq靊%}E}{TM9U,`xC7oVUit7['9Pd挄7k**Î${޿X ]'ˣ?\Ivs9 OHz>ˡvhm"Y=mI?m|3:]q1݂KhÅ7L5_⼊Ee=9/WA#dzhXG2 'xs9FH2 ^$ūZ m6] ur*a jUFl=OPiB  pfDɌU&0[=nsYhN{,D@;hj,.TJz7bg+k%JHWqР3)kAU'sPMvhL,RŒ1zOMЮ6 )AVQ/Pmɾ <-2T:&kz簡qᑅI$ v<-:von-UQAJj/[,+}u셯JLj %͎dM0;X*lYb~aաptjTZ޲tRPjn]}F~^P omžDĩK! AIY7.agd)% U* zjdNB>C,?}SLa٭N5.-%;7);$}ZX"n{qOoVA & 6wǶhu2ɚ{77޺Ro6 Fqs+?MGf^ endstream endobj 240 0 obj <> stream xڅUkTg10e ֙QaYiTn* V"$"E"B&4AKՃveusή2Ek=vk[z=;[tt7?sޙ{y{> "Hui맯LJU:1j !JGL䂞/GD #~%Eɸ@=>/p"$ 1fl\yCQ-Ǥ#9-EDUi5J~216)V[T^ܒҩZ&{2}UVaɼqJ'6&XL,";HK aH#&%2b9H$VID F&Zb"H""I%i%oJGTh0Kj#D gQ6Th`ÈgfBMR!z33xK`?ac$q 7wH(8Hj*/fP' Ndp.!bfEK9^3.\4pa[i~&Ne*X1șaF}^PLAXB(F=DjwYt\WqIT@i(\}:O3눱:ƩYQ=Q>{*5s <yAyWݲ %_l9X{2P鱝G*N3M6˅ֽX:,s}ͼ#.xۨOsg_~F̞Ǘ7̇.ps%{NV~ BU;Ze1M {ORxy(d1EJ%8([m xZ$FBʺ,s.9pj*kIKWeAs>y~8]RG4W3ͳYb%~xh ߐ<]{(ZL< n8PX tXqX1Akѵc 4DQQQ8\q 4QCBX}¿f endstream endobj 242 0 obj <> stream xڥYw\׷eagʸfQ3%=֨I=TĊ EzGD`).{vRGc hc^bILry/&{3ܹ{sGBYZRfՊWmb@׀)' s QpIe?*wl*EIWjRwW;ry}5e%Ȝ#vO4yAQ!;lN}.]jX8h蠀AYW,7 W c6 5as>tWZMJ-gdFTe9yT/ S)̄晤h UEBLZKI0ڌ.^dլ}J{QpF&ܑ =;*(8(+t74h`ʸ8H-Ii^/hQIF*hPԩ b#Ldlv0 {⑬ htH ff6`Je}*XHjDRO/UA%<_x&3~ AOiP/ވۏ> 4Q„FVVp8F@PCq dE?㍳ѳ &맭5=-T #Hil=eK`x+=_斢hƧ2pGZO}6b JʈuSVXLÇwjeh/tz7 TQҸ^G'>:= :!AHt3/L.8_q;m׎L춐M>4X;޵t0q/b%Ò`0Pdm@D~n8Z1Q^T5un'M6EL,  _G Ǯ$'+[#}wY)`î"<Y5&ۇ4(-4F5SFl0~9<3=& nPu q<0jF-D=sOx'^bnTM: >ǑE.­3yݖXF,ᢗ_ mvnO6ʼnП4]~$KQ k玹ɗ_І:&ISl 59|[] `6wqxNikG|Yvr 
+yD|ΰV~T\S%Yf֨1L̅ MRDlT9hsY a?lt,qS8_?Y.|@~w^۵_⭏W]pDvYKeNg4qNc$59s+*DEO5oV;]> <eZډ4Y&'jߗ [\9\qcTͪWnNpsv𥦚cN㏶5|:|Ro١ %BS QxFE%c.m0dҀP/C~,B81ɫR͖o[!ӣq\*N{s~f h!ЬeՋB^2zYu,oCL蕶E#LHc"z[ǘh/omYT]T p廼tmZELւrNeF Tgh%>JGAb>cjzNaofFz&P[֞Λgu9+*v}b*2Ō%b~]_-,ou=4^=nSRn xnT( l8NEmvgݦ=YM\ƒ0,1 W@ ]^[`SDQW_Y aǺZⶅ$:r)aBd>,3f]HyĚ2lN|^C`6wәR,ʘ.;%-ʎ!11XߕCGβ|x2'qepnqʉk 1Z00#~^+7n!YkoVlݣ E[^aW)ϟa88[{a4`zaX fn(c$nʑ ӟNmna0,R~q쨭k]hȬ)C £/=vQ4*߼/c4}E [bT#5VۗWO|Qșa*7ul}^P` L_Tn.ۗ}5);>4 }!3t_sOJ}5b $7HI> UR$dDC4h1ؽEFݎ:¡?-M!J%z!$H 5hl Gr0ʒpIbKuFaP 5Hن>{Pr; ,}Pr|}&uZUȮT^ cB26 k%nɗoI t:Ca Կ^Y 7A2:%DiQˤ[\m81|D#혈bِ˩ԺDb~!$b?L*@'pא1<' Kѝ!|01+R.[IeU[~E6l|k 7tD&!nFˑOwMG\ h@?Xoe&;R K,Î>!4P. daUvgv% }&{a5ݜ X3GF S{<W]qz- -c'wz/2vv2X?4{ e[ Q7ߥ/$# %Es /I0i!Cstˈz6W)=i 69`LA~ }U36?%nOQ_fW6p/-e򣟀Ss. :N=Y tZv2%NKR„8.|(Q G*8Tis+;fEsy"@hO}OӱvEdƣiș/ՙoCE(i]4j!SžJtwOKA-8[*"E780)>CL.C`fo&ˣ&t[2kEԥuJT87~ß xxsWtDӅc߱ `) {z07nV+q0M8m!P8/\wS'<}~Lޙ{ZN/G ]9DýX Yup":%%[]2RueF}BWL#QNQ  `SsOXx1YqD|wE4m!-- U5 ?[J 'C>oୋe70]6HӅX]jJ:9 ]Yg rLX9TdrKRs=Sg˜`σ|0dBݙ}Ӧ.B­3ƻuGdAiqJPc5j<)2t$)T42s.(v2Ugj6Ii>i(9̰vU);s\',٭[tT-#l~ں B1##GX́mNQ B*W26+;7  Z6}6/\9 )~>dtƘ20$9Mά\Qf\ nbwq4ʌ7%{2NC}%|&E08O`Oۏf7[N*ࢮMX&8hϧsÃ$=~41hh-=SVȣD]ޯŚH*ie=I%fU l zb?:a[x;r,yهcLy 52hv[Bv1lm@ ܬ=CFùeSP-y]dCc2dgT ɢDS {@+40Vf2B#QaMCh (!9{ym<;~+$AJQGygEoaR_v.q/9r/NIcrrqh~*!J#/EՍaʵ̤ŋS;/σ ܙ֑/opPs|c5 B`$}S#x%?rS րGOy4$1f맗Qa;{-Isu gZgy5C~҈]:Nnq=ph|W>ezh ۷O[amJ$فҘm =|'\&G-_'`iuq6skEБZg2#J #aGH։\홮9KȨQ_|C6OOƻ&dR?c㴚֣ vc k՟05f4,`6zfיyg#{H@`p=+#}Y/R߉D> stream xڭY XSg>!k+4sm-Amǩպ "jM %! 
I!}[չ3;rs{Xݱw}pICJгlON OZHs\1'u2[ eNo3}=kILpx2K=69%'-.&6g҅ 5|6G  |“|6.ڲgkrgorOāhhv߾g oMNK O͚K<籄xXI&r7۹D8ELbE!hb$1xX@,$~Coe *b XG īFb^o  mvb"vn"C, 8$3/'#UL9h2%`uj*1bfz3vYo<=l"/dy>?G?by>2g˜s{~cɊp{؝MvN Ź\u"]ǑxLt{8^7{Cd+0.A^hq<@=hQ'twnB~u#}{&c/Wd8\ mkе-q"9Axj*ʶ VfoD4 F"#ǟTIw+ٛV|i4 uhc7>@BZA- gpq׆\Ws#=X>^ i*G2R顤qrz 6š fE؀eOپ.% %ĀQoBwT>:8A5})NoE&홫4vw)L6{_`8@۠ P% "2PyprBP-('sj4LEZѠFY'Lw &4=U!uOȻmw܎9.v2b7 gen1/YS VNU}VVM sH>}?uG zn M7'l Y雨B W'Iw\0DgIH÷[%, cI]nv\g/C - +NL(S?oFlώX.PI5ǝ/\KfNxORk!9 \Z#(6Y!/hqbHuRIXjMMJŚB4C$chgi :u=EQH0rǐU7.\yy*tZߤ%6m8dԻVq7&g xϬ'F6}h|XzroGh:*ugGzts]IX͞/ܻ*: y9CS |[p`lTHv2O0j(0n[ܶz.Î;iGCOg}UoRݭPH,&Z GN]u ߆w_Φي_+>Y})D 2 Rv)EMz>/x33r墠Ĥt|~l֙t&] Ty/S9h0?uB>4zS[ ]\Yn wׯ!U#SfhD=\6"{ Tfq*u,X@}י./SQCyfUEUd̀oKVCUk ǭV+OOAiMՙ-V[}3REEb~_&D~3Z-*{THXT<']OTbzC^iwtBAU[((600pR?CpJ58V 1y87*mH*DM)a8)_vt5 }6[r D/  Zm@YRX ̹AwI! ݶu%Y]+ǚԚuc?W\ʨD\l7OW@Q`2\nuzsm^4 ΐ7@llB; Qk*J*>2؍5{qVWRw*F]h&[EqB$7 -0cuoطwl|[.\*NÀP>;§4vP@.XڭbwƳrcCEur58aX^:a+;{~ Sԇ8dj 5o\|1t{U`eX=i wAXx\Wܿ9WC3uŃh?թ @J ˂G:#`oVٻJσs~9E)@*,O=MycG]PPRW^zit{ssǨ۪?qJpU(d ّ҅['pqj ʫUVI*YŞr  ؜WM۠dAϏSqjNc E.Ȗ~ Z, 8*ygmESimw9Dnj]:S'8ɋ[aO vdTmLrf 2c 蹱`1c3{ ,&)L-ܖPFB)iT! 
x&xFoK&n0Z0;|ʔ[c8h:E {6M3G-|mm{G0AFeٵh0 Z#zw:1_L0)}b/t76dBa& Sh׷C2uB;NaD_CFk< Ą3'Xi2*#-<5B(V_`|tv˫ch E\ch1n(N6ƘnN5J`sk+m}ݯS# E#~ 3`A88h#ЭgbBtY7gct~\3ԕK?ͱ+KN;6)4<(إԲ_|pkȡj97Nsg6tFCUZP Y5l?ua~vrR"# on\Ǫ*D#lQ,+͎?*$Eꟺ$OB[smsePwb ONsNCIfզLE VδnxPn,o{e>RYzwJRύ|O-dD Fd "^2D{bma;h@X$@ng)Y&D8a.C TeCع8s\~Uy,',ýoht7䰻Cvnɼ{KA!LmHT8Y)!j}6U:QC"%p`壻ْGqn9 ȧGf3nae(o5dSUz$6`Oat%ܘR 6렌4槥eGNAkWEsz D_=q>կpUx4bs'5pz\RȀO!@uȬՙ52Dwƽ| }UWrVݹ֊> stream x}V TTW}eQ11av41c2vtƕι3KYά+W2Of5Ea2>/0Lq !#s}=rDڈP,X\W,W4(+g=\W:R32cd宅Qߎ7K7K|:j"z%[,Nt} hlTeŵۣB#o 4J1V$fteeՂ?%9DwSri:;d1W)XI2:ea+No[ZmE Cp"D^X$ ɍ7o_>«đ^j^hc\ٺO_v|Lq#p3ڦ\;z}nNnf>FˏH-#œQ ia;xpTI]s|7s61$,*.轧ިx o_X 8mJ#qCtIBv#2U6C{DN4p8GTu |vdB/o5 {, 5B9vu湃3T'gAEdk8FC[ljE;P:(9+lO8HTZctH PF~Mn!);8R;Qd8ܝ氅R)d2g7oAlE!sʣᘚ#Ox7=!(O&քk{l^Жf̖b \ɂ;IL氋%Y -{"OEpUUuqyOӞj5-GƄgDDvg0=|.2CMjB|vdfӂ*p{8$%HU^R!W!%8tUfԕ8/u> sq#\Z&pLDH;DINO,j{̎&aNdWZ/,@v]k`6'7Z?ĩ/ܱ:n=/B<=5o!]c85ֆ.B[tMmkvzM&=Z$Ia7s$m^I 95(kJot)2:7g$2y:;_TgxR:Oշl y-Cu, gQ1Z.},Tul9`aˆBH{<_eBssdE2wK("}ZFV p)'͒p擥OěIȓ--6X\ x_U4B8>.`XbzI~ <~4iglT6Hm:$z=z/Ck 9~M ogB2Z*,x˸ :JI=4b©'W9n;*.2@g6 7ϞjT Qm'vr}WוBFTa"o4B%}B)nKN\_[v&.)/O`FĢW vhJ1,R"S6ߣ+BV;qA^52*xR?A z$à=_AM2d# SI,4۫89 8Җx { +/-qǷM q[~+[or?>б&z+a!":OLۆՇ(!q̰>&E}g*w5)?‹3:d Na-"+T6vg8by 2**?+^F=աmayKHJ\&=gG= dʤ?9g+88 ] tiIk2v:_+w_gשhdܿ!Pu;)g,|F~kKogHޔTf.Koe7ElZn⾛|t&7Gl.\PheGP%܁?C!cPPT!2%<0dq4{{Mwa ]%8B}99Dީ&26 q+38 8 p>JDdԊ8JJ68V6] ؀SD2}lSuRsͻg/%s.aɪߠe[jB=܂^n;D$i(.ȥ{TD6AF{-[ՠRP Ϥ=^-*XQu%ϡJ!JE|8wЁMŸ96mNx5q~Gn"0{,9.;WDJO)0cp^fׁuh>W8a5`[}Y.vgցCUr| <[/ŗ1V$]e5Rf_#HFDاGc{q]/7dKk+;xlklLJd~ER}Qq2iM n.-S ,b-GY(T>e4 yW,-Ž2cq~^r0C7}s endstream endobj 248 0 obj <> stream xڅX \T?̙#"ql9JDiKbႠ/20þ?*.)dꐖ׾MjW}w0 g?yP&&D"Q8{{5kXh؋ E{+{)7#3xD8(JaR\&>3)DB wq?H[{.Km}lWٺx  gbjg1,F0v^X~`_0_[l7us]i_kÔ!]LYj"jdHN9Qde䭠(cJJMLʜ,)+RӨGRzIs "j1ZNVPkE7(gʅZ/Jwoޤޢ(wʃLmRۨ'%LbhIfm7RU]5V3NIS ewTΤI& 2u21y3S@S1onae%E֍KlT~SW^>m9xR"eyɀ ^KQGC jj HƂ/&dvd=Y﫫e&A"Y\-Oi)'`P&pe(qkE Qv4N&=&&/kK.D&%Z24; \cH")w+6S2r)9l۽s}2m9:rJ/+'_B@xxybfև=DAJ8 {w7ݛi-?ț:c/'-G̈IAU^K,KxJ܃ ,1cca'ʡ&$GY^EEyqE11б#j> 
TB5yH.Ɂt`bT<ĖeTMLJÒ,=ʡ9j ==#:ՑqN6w[B7NcZ}GFPpEP)SLHJɎ˄FU_4_bDz!#$rD1~6 i گ  >ř kU51\2ɗrR'1qU\#WUyV8#bVC7Ƨ1Ky !NGΩLN+\݃מ#s9s Z3Cԅl!UVƑTxǏ2콏οw>glMbNL/]۞\TAyfEjq{'>]/-gN6*R22A@5]ezKy ̻Zw6 U n "qysc'DFp lQVffwu]`﷛pe_[ n5GNF ^XY~zed*Ƿ=NoRi%wt+6?rŠ˪Z+xHN\-,m>V{AÉ.lz>(,\g3n1L!pgL͉c)rż ~"aÕ1!AkpXgh܊[rnK^ɍo;1 t+5?^oHx|< 5,v,W+=?ҳm+㛷WpK9YD&;}냁] 1Ta`ń-!z_V4`\}>] -JuX/8 O3;. ACVv%bh[cSAɏ#;Me ;A g?qa˜^-dPCyW"o .Rrc2cw:k8Վ2|:8267t ΂d&KlFO5[IvlPҥ 5[(`8q;|d'_t"zj12E7"x hd;& b G#*Đ 2h0Mt˵^}~JgIpu5k/$I,8 !{ˀMqA^ cV4?NQLc}uXZ>bXB+9B A=%TkF/:Ý 9^ ^+:,r/Wݟy"la|}^,6,S+jMy_ٜv1 _Xb7s\WH`V™SqɄ ۣiᖾ.1iQ{ [pX~ Kd 2_ulB~I厦2Ѕ[ȤZ(>ȯ.hC@6$yV|ۀ'4yң m44;ygik@>J,Gi4%%Y$K-O>ICB6v!Zx}ۈ^b/ZvmtI@oF\zM.Yڐ^g銖?OV3P'PU.%<%VɱDž{?((O\&^+ h#lR)M֚&. z 5us=fSƩX endstream endobj 250 0 obj <> stream xڕyTSW_X^J ´jĊӺ7*.C$ȚH %7,""LXl!Q:S1K3=ul֎sp9ӿ&9s_>/LJS7reܜ5kתU5"–I_"4/<}4N#Ad'ב"L>FxDD{j.]!g߈(qN,\J"MSd)UxUpq:?TgUdD"V7%oo\a8fúgg[KN,!DH)KWeD4XAJbXKFbOl&[`/B@Ӊ^`U%8n7}t^~Ka~ 3aWp3 oB!Rhfq<ׯ O >;Xh~^[P3=;wUbI)ʭ1H.=zz0rCh2bQ Bw_ebZ4is._ȱLS!?qA!1񘡎ɺc,ֻ\QZ78DvQǨ@AEq%k̝?Zhֵ1fUkП^-ݏ&n_TQɲ^@u5Yh65yP M)=,QTEa8s\i_x$Erc (0GP^ygԒBl͍],+ hizW! 
wjfƮ_z/o·med|>i/9$eIʐ";j0vD*)f"KWɷgƜ&[ EǕ蝀"ۍ,Z螭._eK_^зmo 8/{r,P(+g ~`7b}PΫE4 _.3EhY쩯)p>I:GE*E}kP* Avd)GɆF{nNCVcԲA^<[T| @0l1 cۮ)F+Goݴ\A>HԒӳуΔZ Ps-dA9g5PfPO;J" zxڳ[l{%yZA@vi~C0(+܋ [q[~ Sy͞Y I>DlO>q3C>Bes:V?7~m pPŬDt%d]_+Umb6 - YO@Q{@g^cGp.Iav®a@ϩ~Sf)v =u`lb9 t_;!& ^#18erO  'zB*1OFY:1|#%--=]֒Ɍ  B7_=[C"BG:Nj58 6I!Ĭ&u* {,>7C2V:@^+ y2 c^iS-F V|pc ~qs?x>7U2jGʠ_8zUjnLE[buRz/)WsH\ѱ0X`tՎêK]m8T z"YFS΁]H3 E 6,0x_^źPr%\btǨbH#~3@bAj:Ejd*|3Kwep.i@z;Hu z Na毎[t>vHx\by-*,K(DS-[:!-@$&%:Uh2Czk$)@3VkC_\U|" Gh5jfCRC/2*299M76[,&]h,=hq )+ NJ!Daf lPkTٙZ`w+ϔat''q ,[T,I2d0mEـ0 % FZ -R #j~G] 2KjR ZN;6lwO2F_inS".+wࢼ+5'A'`4.PBNʆ p8 Л&Ti" >~DnkbT5V,7Xo_tp7 endstream endobj 252 0 obj <> stream xcd`aa`ddp r74 JM/I,əa!C<?/*UD~@5`tb`cdͮn,,/I-KI+)64300q/,L(Q0200 I Nz ^ٙ y) ^zz ~@L<ԌĜ44`נ` Ѐ`M=t~E9@g1v&FFk,\{c ߏ)ڹr^I7GWUUg n)^}=i9[ ?]X@ߡ z&L0qpN>u{= 0a~Du z'zpT Emy endstream endobj 254 0 obj <> stream xcd`aa`ddp u M3 JM/I,əf!Cy?wȰI*BD>@`\+SReg``_PYQ`d`` $-*ˋ3R|ˁ y I9i i ! A Az΅rs`4dd`bf`fdd} &,Y߽~yIww[ʮ=c}:{} 9O/>}h9ǟIYurlY=> ^luͭvݵ}ؾX.~Wo&ڦ.^1gܖur._~[+n_߭$;9~<{)XBy8'pupŽwR+xxO?r$^9 Ie zu endstream endobj 8 0 obj <> stream x][s7~_o#V_RJnj+lͮ4ղ8)I>n52i[JL6 sppTZJXIS*iUPKQ ZӲRat_PIgM;Yi .V*@PVZWJF]iTlv-] `Dp$Dc+ZUA}*$%4^kH>0j4ꂩ,++'PS*|9&фvMA hXQT U QŁAU>*^TpyQՇPE&a'AHULU  UR*D<8V`  &D!HNQlHrc$ #(!2@hXMa5"XP^\8Ґ)4/E핦 P%i,Poʒ )$N %jԖrB$,ez$}[$f;C D fbKl/Re4_ƐC0$Ei.x"vQO+ ̈́)qAĭPA٬B4X (S>.IC,TNNѭ>*ôĈ3Q n\>. 
REb=/qYGi1 y>fVrdSC_UTO/K;zA?4lXh%|Aï57?H%0tbQ~G1LtN% :\1՜ ?ÂzXNcMt~ 䙖6Ԝ2@GOZۨSugAvx`RhDnq04MmrB1 *P#60MN45),5鈿uW2;яC泫MMHs*,͞\; $k{(2.MsR՝R>K .EF_:ߕ{?mg97 aA[\]npG yi$˔2ncC+"ܓ8ɶxGH'cv9;OsV>orr5q䖠q#f=Z.(^ (n̖hC"6GN;1eR&icAYmG)WӛG'X5  TYwqszTr b~e /nxZW+J1TʱCg1 <7 QX'wI__Ǟ>\0|F1np{c-;tDJ+HT.*O}{iT aXKO%p@RYWsC-_ ) 8Oӳ,Rl F ,șDHfS!?$4&P@ 4pf69EvO7,jcS+tcJ5Bތg5Y}M=(9(n!EisU~}ԫz=?܈t*9Ve"+>;χo~mh/7;DI$,:>dl,59*Էq֪Vvv 6;=d0zt/jIWԷ˽w1$ǜ΀ 6",QiY u|$]1jf3>u CΧJ—09 漕1^\R>b8 9_.+Nj?mA$4Ww[w5zQH+ /Ҝa3|Džr,9].~m3=Irp QiIO;ܽR͘"[LֽyaP2U{&BzZ쮻nx3ځ>g{@GVخ7~Z͑~3u?~34ѪL]~JiOkNk{Z:ЍPcc+> W"_VV\)3pr04NI UX79 Afý'2 |z]=Trrϓ|qoj㴣Uڑ"} ճ S5y\#l xش6٦ ;.&*.YP4"^32DiXmg5]hQ >;@a&ʼnߖSn]0nN;F܀.IEmt^ù[N|icjK"2?ܧڹ{k (_,.q & OųR7m.nk6v`@ K}3X2<%{*s5NO8w*O?<5pS#XH4/x i@m1 HѸ_AxBfݤ\Hlnl./)>⃲7g#Yma)#lG:C# \{u6 P\7\FQZS4* E|# hFnl[dͮg7w +ՑQ8Hq z?z^8h2wTCvlayGճq\LNHhb?|Ec:)qCq>l'_W_s6l=/R<;Y1a}!Ǫ H8H<րcAp5hTtߨQJD}+X [BOb˞B4i% ~:+Ǯ߬~[Fq}'ʧyQVh ݪ3O;5X];#}lQ3n<ؿWqX"x'x!i Ze|)ɗdVy#XBܭ͆=^ʕ^")_Q!͹ʱ|N6霳L*]dzTȗBk/tD Eg(yCKb2&77MnnrsAKo876xӺse"]&e.2>3rVlm@]NŽ.5=Zts*'l9;|N:o֛"}}ʼr/s`^:1Qr@֮yNU9rhc5^4Hs)B(:ct2g=~<˙ZkjxrL6tG@]d!ERT!EF.3jђg2e2l<=ٚſ=ٶ02PFAY,fP(f̆lz(i#M!w(){:frL98XaH`lotvTDz8 p}g-,pErq9+R\,>D'"ōH?*(^Kx.3i~rhZ^Kri&.dzu3ެO^i^[n+Y˪}KUef'Ogm|l[{s}j֧[4듋ߖoO3|ۜ<`}@x1T_>TY}`aW?\4mm+?ͻfrgIz"^mw 4%BWI~6ίoޤ#quZv㙃k D4ֈrT0^fySZkn JyblZx1V5sJ~k~&]BL|ʕ_9r$DoU!C ʥ7b_&I=-Wǡs2)H#ݵ_rwqo޽|ƴ* jrDe&6`]#:1;$JQ,=@[}O [)x\ƯDi!7V6JS _a%$mQW/?]d) ${GG=Ժ~Ġt7_vo=|\u wDn+z9(Vm 8xKShyh1Jo:,&7O0slٿfH`(HdjbUGj&仪Tffav} ?'azO3rDtVB<:,FEw]M7dyOm:A,S|(PrmS}kײ>LAPO++-Lғpz=x%BHʬS ݶvD&6_ΣRsy7i?Lp}_C#3~2.y&Qz]'|mnWy>7!ykYHQ`y }ߝ as M9UsNný!ٗ1.KL endstream endobj 256 0 obj <> stream xcd`aa`dd w M3 JM/I,əf!Cy?aU |" H' ]-&H9(3=DXWHZ*$U*8)x%&ggg*$(x)34R3sBR#B]܃C5Н &h xLR7t9mJ`W鞱F.ݹrު)Kv_=NE.!P@xP;԰ߟgxwS?0X_4ѳժ?wk=9jʕ[u_.oߢfZ-Y1wºrxa[+n'Wߍs%s8~-<{ XBy8'pupoǽwRO<<z7O;W1+ : endstream endobj 259 0 obj <> stream xڝW XSg>I 爠16vbDJqz]jU" !$,ʾ$|!! 
W\Vq\f:qq{$ϟo"t: +|fYQ|ưX˵~Gpo #dV/q[ַ5d6A+[֯&XV/~?@X bE\loUtRl8GOY3g<ǐ$G/w]펫׺;S%?F8NW)Cƒ;U;7mql㊍m^K/SF+xtvo n;!&V >D0NFJBH {p"\W=b6ExKerbXE&눏Mf—#[BB@XfRA0^xVHtjUBk@ry=uLtlжIb};jlĎAn-{Mf wrK~2N@72sA?Y@IH ~m%ZvSiJih/9|\ߟr_ޛ&Ed{7R͆7EVjL O Y>on)[(g|ύ @!?4r9 mf1ڿDʛmtg/9DIIG N>r;UQI&sm]3=Φ3P ԹsJ2嬰 9\ep!^3U¯exI :/&/89Џ 3n3kFa=q>HN"98=u[r>#K|_Vً.cı(i$|&<;wcmJ-RȏZ(i9n-?hgzGӑӍ38 I̼8Hk }T_ziEL,Ex,LGhJ) :&!`"U"O{^ $=bEX7 9vPk,M{O"E7BvCwuFw18_rk>Z~ 9Ԅ7 ZQ |s&[u: !>hQZ)tLP[)x%p#OZY蝶7 =L.mܔ1ˋ45 {'D`):?wA#dxH @I!{X' \h6u;Əp9XBG#ž,a6=M6HS#aw؎2QHHse/8c[NfK&ap0@@ z}o1ɒϞ?XU3k$v,Z耄7mnBO K)6;V* ]k qGR}l12CNohHSs/rߚsO0/WnS]IopGp຾~ M{*apLd?(tДC2dhr2K 2>Mamj q9?)|ӗ5\{yv_ˆۡnm #Fz49l7\ uBrAd~w] CqZ[.tT_gw6ax<$JPgP7Si]f?! g] S ӫar.-ig o3s:!VP`-^g9\)Pev%mRznj MuNZ ؔ242'T'ƥ*waR#m%sޕHU^XmRَ)۱ͱfk }bkkU6늊uz}3gNt{?Lolw endstream endobj 261 0 obj <> stream xڅ \w'$ VXZu&ڪ]K)VTDԋ#rH%KKPC9"ţEEZeVԮںڵ~wvx !U93f/;? @Dݚ%a~HyVlpi B2qOVoqqf\{\9O ؠx]\S$+|+FHEp\b*x1R*NYPW*W(-dei۳]RLj9۽Cx1S:٩Z"Y ԙHBrU5b(&N flb1'@,$D XN ֎2bH)`kKM+k?t#ҩ|\?_nG89SZSrxZO p eX x,-3P P)J?1"TڬڳyJ+ᣟ!Ӗ'3jNkN TU1rk)~׵rr謕`Uh[b!rhb wSn՛c i2%68дz6!-̰" ,bc&zNjEi/>&?*dbH57% ZwDl qB25^4,%+bTvdyh[e 1$={gQa=8cc#x .@2^|UN U'TCU^iV^n^dPtjli:vr0L.8  y/Mtb-Sg\|f4@uw =OA?9t>W}ao'-8=A)vbyr}?\ǎՓW^%zɤqO޸q~.pB/ q^FD9V+uz[t@G:&7l9@= oK|S-Lu&;9[y]YP 8KG;a88zsŊVz#uPWإTBb3G56d,q (Yiv,oHK=s/96+&lJgUB+Uo´xS1ACq/l7vK P_d‡ja KgN0g5(hm@5|lc6R0&hgF@9%oV4 1rwAƬp3>G1{n}'A"&/P(}ti{<\p8}ًg'06797Vz(壐J)N+Y׆f5y׷5S[zq5)@M:߀¬E3Qpz0"}8Z)kt m)l#MuЂN? rapGo0~N= ed3G9 3 sIp'ALnhrF*%NȅEcWd?#;J0WUqE3,l<Ϸ.u^.[s_ƻwW0.;"!kvPnӧ{]Q@#M񠂸2UYE A. ėwݮk^Y mBYaqkΒWfaϦR76MU :.gb'N:p/C03ga$ye𾭓 33B_F&G+LBu: o>٫!C>{ErT=A ^B#b=cb4^% #{{%BЯZ05uMlll33΂k=Htw '<l4 LOD 2+JOZq$[vsI\J\qP$4FaވN(Jpa#)FxfMpu.\&}(_o{b^#F;$' ^O5 >gx. 
s~y |] 閧VI%PE`,H>x"G ?Knwt\1K@Xo)p"$uD҃{]~?xR^6,[VSfBw!$ eڽUio2B+J]{+je q.9,4>]PWj6̿tA 쥪,PmH6FCi==y7@^F~nHU(l'I,z'DVgXX/&Bk̫\ɬ* SU%cQ_DNuӮ2P㬶D2>?,&2uz5E!JZˡWn&S1S&wFR Zvg~|jc`ڣ%/OMSN&w,m|#;8כ 5ʁ҃fE/x'K=՟|j^OקNU%lNI[POG>?sZpHi*– eFe EQ5;WnI;UP EwOXމs9hy-8Ha)b|zk9j1U!H,16Ņ%%PIvQkV( ka!OT)Y) \͉ɱ,]ڼ흵_;Z X(g] W $_rWVǖ3n5Tnwrj)+3 &C o4Y /(' endstream endobj 263 0 obj <> stream xcd`aa`ddws M34,ILa!C<^Iy&Ú',AD@1PQ@44$83/='UH9(3=DXWHZ*$U*8)x%&ggg*$(x)34R3sBR#B]܃C5н&Ũ$Y|hj_e?X +|Rs^Ԫn@ؕ^{D;{Kod3OYz{DI3gr.<{qlr\,!> stream xڵWixTU"c7FiBBzAB!BTR$U=꫺/I* =D ,8QgV~\NœiTôcI{~QӦQ"M|ږ5\.[+M/\4qsvneazyxoBPE!6&ĉC ؏։3~ Ljb(֟WvY? OePq"]2 LiIK.}buI;*NqqDIm?m&?< _ toE|'% mp F;z\!hooΒf L÷OJz+:+*;*z{;:z\ BhVlv ,&+6T`lkC!q-[r <.i<(@o=LŢ5۶ClsמL. f/u$u{4=8,@7{"op *аfn1 aD̍>iv-[ |B @@ge 0ERaj>\C]Iui-zcIZy#3 `y4PSB {@hCҰE{MBTw[X^[4܇@BNa5`2j@'t_ /ZVf̸02VOv՟D};k G@_' NSL<6eVm,xhCSK+lnr$_}KbRjв/Bw.}fv|fRGt"( !'o;e *VU̓hGl=hwKO{I ӡq`W* \,L&!>mۮT^*BF`澻 \hhnj@Zg_~6hkU& jzXYt o︕6sĀh]q. p3f%8zHۯJR>R Ix.~?xz ^NL*8vg 9j&~%@P~4HGƀ>#7wETsPtt@Cm E)&"ViˌZr8EfA7T=2;j!^|22ifsHB<98IR, /s g3X\Ǜ.z|<;jU@D^V}'z6zmB,zחն|+xӤ(KkI(*u-Fx<?썤O ]/?Sá|bM Coe:*]tġ 1)eՍ5}}k7lH%ssiols wHjr|Ȏl@0wbk178U5e`vLY@e`ޱ#h+|l~ۇCBsT|VzkU+E"Ycm{wwW7Klm}&}hǂͷ?7I+mliH@ʫj3ܼPrWnVWfUc/';Om"Fy#Co8Sb*iavkVr8w'qCN5-&0Y;yMӵhtvkW̮:_*^L{6܋\v5흃?ZA6AEe <9ݘԤhޠ'ΉCaek (h=yceRzFp7,6_ mݶc$<?Rn߄kN/-o4!h3[G${{i M]94|o^>ʮ38ÞA D#-/,Ӑ ϲ;`gnA9|<_ ߨ;eHuDzrц[p7FndPAWܓ@n6g(ɯvVyN`Q*ii :@uIzKgBs-kҁRi9|15dXv\ZY9DBUYX i 0<x2٨[EP^*kWU=2_OO`1@z:{;@ե:L+w^SAIgЇ'|f6?C)7ēkӥM5 vKQuN/~,l[^1Ҿ38 p&Z[te"cCWApAm(7H/Iv1MlDWϨ ҷ>`p.utQ]W<8F84Xuu*ocCXnMglRh;h^%s_s?L~PPn6mj9pX6ҳ+^.51k̚113`hn2Ϛi$c2  endstream endobj 267 0 obj <> stream xuT{LSgzƜIMYNm3>uN*j[(UJ{ -A}@ܜ%1N㦋3Yfl.wۯ}'wΥ i[4n1שt6WkTʇ JI D;b QX5+ҎmM?EGHS|*rbLrizte7 ]uWVkFFiTPը)5J7^QT5U<֨R4 ,X\;h)oFv78Vs  䍋#Q2ݎPD^-lz!x{V{{Q6]LU2)sK#yl 6Z'oc Cl'tǑUʐ59yݐ L\嫰҉iwalMFe* fȷSSߊuqN?x ?H.>qT?j+;p3xC:rvN]`6wJ`XC00. 
:b,= ODيs{k8M"o;'P㱑2Pqk(Hr endstream endobj 269 0 obj <> stream xڍV TT>39*ar/3u]B L|$ͪGVZws{o @I$ח.u[=(:&UO'R$IE^&r,DQqv4X?Oڭ1jgJ!(DhrCbS"fzxzZ-SN|LjyDdBR8U(rʤt2N55ijstlz*i*$z*4xqPjIP`iI5,J(O,OPk0*8J-HR VLFS8ʙb)%5)lj."G\NP+@*!:jPT25 yN Kdof9O{_(Rh:c+,# 7{$ī# .iwL@eWDJč7} Ci3׭Le0+lb}mĹm { FtctC]d\CK.!{wm'ݾ7b(SjEf/T`R'NHvrf99˷ӝ瀹qd|=ϯ{!$xogDZ=\`S"N}$|mF(eqyppg1z/Vt '0[h_}:Mu GiAnW: ەŷZ,B;is ԓdJ4f`SVL,^Y 45eERK[VJEBx|< Y<3!+PRVUys8 -(X=b{ԖpS;a؅ `gk|NlA86Mr#E]ȣKrfݮGt;|SNx[E]x憿Ud?ҰxS1_ƾ$=6t&F~:[Ng1LR{L~ɞjk.!pd>ff@M 蛕 sx+ڭQ͍wf2;T!Gz̑~4+"ܕihMr*QU6+Uf#EN[hv];r bB 3gH~cdPPKL{ ZihJ +i.8X4B kYr6:@~`F.hSii-.^u,T웈3$ht U`A5[]x/o Oˌ4ѰֲÆ>nqn".Xٝb&^)-¡ΨCÀaYbݍr_Jgq jSXhz spo:Q$3ː*д/?ŏ$EnSFW^ZˊKfbVNzz[ZXʩi$:Cʺw{Sғ4H޷=lg$g;Q] v+ qeohCZZ3h:M^ 1(k`Sio\MNwD\Hzæ~y7=!Nf{Uq8/܇ ȖmOzZRl= }5V%R8eu1Ks61HjY =Хs}#zE &ΐ lnku"N,>cvn?6O9C)%Jܷ:-""ji c:qH ^&1ʾ@K7@+"G]ܼb]tf&gS`b%~ð,_d"fml7bf\JUe'd؅U\)fm~3⨡1}̏HkڎБj)EZ_#DGߓO [7$:4=2{t3y<O4;oم$XGshdub LSJ+|]<ػvd}"%ai\%XUb'dAV΢yePWzakl4tiJȂx+]6~CPU4d36w:aPPu`imS\*5YPyy #BpȰOj5Ue7!$qThpc=0b4M&c$hLe 8>y/~}R endstream endobj 271 0 obj <> stream xڍ{pSUo6BAQ([tET^0"}@"TiZZZZM6my$w$mHiU*Uq]ݝ;㞔]Fw_&w=swHN&Yox8{f$C!-/-\t{e*1`]">__q&),dM) Bdn"κ Gq5* RQ^+{Z&*=PH[_ԥ_)-,Jӥ3m^EK* JEw>=7}[s\pꐴ|XK<%@d$m!Ҥ BagKzi""b17:B+X(x;d.eC?E܌$;s>sCq :"[V +˺*_:nv6ef5C]E972}\oP ΉRqH3k'S2i ,Ylj (w$^B;;U~,oiB-4we[YM "}€qHW DZe ;@v)4":̯a i@:AƝjuZۀ zA/CP1-]bH0уa\4N|`bgt75PeM6j#*$:!<6hh5LR%}pFEҍҁ ] |̃$(>;)vV*v!wH$x5FrjXQU\ HcN.8X> 1mP̖B5lsViڠnj P &xO5h81ʔ*ePXd8EI]^z~ V͂{_ҳS31J]sf#8Ty SRYVjC`ؠԄ pBBՠdo" ӏCC۪0~Gg6iNQ C3O Imᖳό-^}He/rbzx+؝'-BR-m]RLFnz>L?k?sѪxdeY xZ $n>g@! r0D-A*s= s˒U{$,9Ϡ+gZ#@=3y,-Փ6ff^#w -mnmri {X endstream endobj 273 0 obj <> stream xm_HSqmӫ͑Aa$]mi5i9͊^Ns׶2p>U!hJ{ŊaASQ= fп߽VvD ||΁ePhnhJj/WB}f7$E2՚4jȤ1vVgkiBvQ,hƌn * 1]3VO pP![!IUX$ѩ"9V! 
rzKv>^'jTb*a;P)"Bfmnj=+sJ[  .Xm߁Ex d, B%w֘7I_ d`Zq!UNqQ[Ƨ𔘷Frg*N2HX@-nONK*(O-H"R;}K jSDwoaQLk"y `udvaݖw=e|N*;I$.%FFy&]sS endstream endobj 275 0 obj <> stream xڅT{lTU?3sd%3 ei BeQjхvŬii:tڙc:mg~t2-TB]Q!GͦͪYw71圙skLix1?|~{1jxѦ-+M7kqZ+YjZMԦta<>+fNb_z.BTx >#*A! aGKhGmȋF8::,xunZښ^[bqnp1Ϝp޳yli[8crrT}b A o @9c:)&]%y_~yT(ACa?\v&O j1Tee||f+qӲI }Cb-z+̢-'ǺFbWҰ7bEH+Ul]g߲5kL&!oșt'b1+1-%7I\)$&sv*Y6ۿ}Z"UsU@0>qc0/UʇkRb=P0IZol$v!l-1x#\O@ xjۄ]`Xqi @`'T/ƾ/J_^C}tsҠYH%f'I2hk{1f,g,)}b:Xa=,XYK2;{r¨a_v0$$d$8E#IwJ s^JSmPFz^Pz Zc\'p}ʣ %[ouL4 A8JݖMe@z!tO]IX #oNw WCgkIXJǫztFԿsrH(;V'g?|vCvft'(OA tSQGwwn.)Qq2~oSoeD3cKHԟBȡH¹(DԌ}MҼN_lE`VI/%wsBw7]J~r9Y!8BC`-ẻf̅y'^7>P4~<#D(<O<#22Xi` endstream endobj 277 0 obj <> stream x]kHSa߳Iy[Hvde LͦDB5=)ILa:)@E].&$FHH}I޷`K_~~R(EQ' 3 Nfe*3!l.B$"2bƣT;e4B{&nR&5!%EN;.Xhk:&eĔ3i&RZ*y4dɔX-2+cELAQsa>^Vj,(+3BQ GZ! )P Smx4xCT3~>jj-I5-uLd?< Ґ79=ۣTHm W)C080CˁG*Oз L`>AaX>NvVu pg;^):M=`G hYm*QWVDڗU9qL]uՒ~Y+c7vKRH@K݃#}bűBwoOc*%YJ.FB'p4aU[8B$N+d'jEIN\f耖FiiL O&М XNQ+p)y/1bg J4Mhŀ!u-{##x{M.1S` 7"# endstream endobj 279 0 obj <> stream xcd`aa`ddst u v54TC Yr?Yd { TX9j *23J45 --u ,sS2|K2RsKԒJ +}rbt;M̒ ԢT kiNj1A@g30110]gZ}?˺D׳}]3zqʴI}݋9˱͝=W{r뤦޴QgWL<ss|7slݕU')=%rv\FͩnjG{Ţ7޽@-Mrպr#T*8~~]н䃭~T6-˻peڽy.w6?ٻ &Wr|ZtROY {g*_~'Mg̵{ b1v endstream endobj 281 0 obj <> stream xڅT{LSW-na 25n XԂ ByYh<_i@PuXE!2 E671sr.vʒ-Y|'}|@829%&%55  ?>tmskTgc)2uTg?[tcd- ׋H>%{ߚ>v1t! 
rxd`$f(Zb4Z*c^:PZ;8'VOB]CT#+Z&bd X':`!8♎l*f3;Cgw Uf |r(.+V+⠎9}) hS5k#1Aolt4Z:.!<B+ƮJC!Jwk9s5=y\Ұy3[Q~ҙc=*0A O©*"[\Je?J`Wx& 6e]f3SUJby ͪ8 fKTMw+߃RJqgQخz uJ@ջO[tQe쮄rjp[ڡV6\3%\F'9`I5U4&892>6LaG4g썽@%_9minb:RFZ:;=\tZtj JjTNe$g^3Oζ?^u U I_AR*G7MM&M$1u7MFssh9-zܟ Ab endstream endobj 283 0 obj <> stream xcd`aa`ddtvvw M34 JM/I,If!Cy?Nɰv*RD>@`b`edd-1300q/,L(Q0200 I Nz ^ٙ y) ^zz ~@L<ԌĜ44`נ` Ѐ`M= rsa:E̜e}+W:ccecs֭3gn.<{޴ XBy8'pupgRoo %[&N0Wsp endstream endobj 285 0 obj <> stream x]Pn0[^@TR E w`X:-c >ؠ+Rj_3YtL1hH€!VVG3|H)-GRwvmW8.:G`gcX݀Oi4CnM'-Ru+w5iT$MDQ ڶfH߮è~e`<~ɸ"1$j aӐ_%!-|Bo= endstream endobj 297 0 obj <> stream x]Pn0 ӡ]@|,ю$yR HGc L'85bِ„!V7JGVҳKZ Imoq~T/Gh|8[Lh}XLaӇvd;h 8] -Rڶ(1z0HZ [}2$v0Yd`y;9C=Ҟ¾/rC|w>(l endstream endobj 305 0 obj <> stream x]P=o0 +<^*[" E8Q$n`K~~3o'00lhQ嬬7IZo&^/]yvؖɁ 68k7K¾`hӵ22"E(X]q"4.^* fd(j]W3$w(Id`3=g&Ur4QZn_'U?nz endstream endobj 308 0 obj <> stream x]Pn0+*^ M$ qC%c/R1ؠz5;&sj֢^(nqԓeB$.ۦp3I OjDPv8:ts;ez EAotv݉_> stream x]j0>YNEE/ԙ L.kEw[j΍ogxzL#t8H BrE$>d !EJ1& _.E[ܜy( Yp8 Sо@'kFB!K2#F-0)]omYFra„:9KȧodSプۤIpa[m.I筱*?P~ endstream endobj 258 0 obj <> stream x͚kO8ZrfѼ(zJҰMIIF;9? cI˙gL8kfٲ <Ξ 3a@A2aBA3%8<'8BiFRrZV thL1&=S F#Fb/SN T69$ii0\Lk ӹJA1a"0$ HDfHa 3R_H1gh,@Tb8gVK(b\`ElP7$saGWiqŜP9T0,='~P+(kL0z0@gS y^`0HOC?CU`!PR.еKKJp &s4i^QM#bބ ap )AFpi+jQ͏՟n]ϯrV[NYtvr]VʰuWi3kR@vQEȏjֳ}zWVP3)ݩ^WHZ/V X.Mkɗs~zH|f/|>6Uh)hy <-Ղ\ƫ\D Z[JB#y4bZ3`9ړQ>0%D3s%mT\R}śb"iI_=Tҟ>t(^F~ljLk^ZyK&Z5W [&UmY"c5Z ق g5.%U(^LĕK.)\$Uh:2ƹFD}da M%q+wl9/PdΆƉ@y%?y8M [M2.A'{ Y:ə̝h~rVy-;Ն:e,EqgPJ~>h%ݗ.Ho1ְ4?͇3H{Ӳ"ތn|DMF>c;8 urA7  ns)=+g< ӧaM: .ޠ1',}2&;Ng-;$֯U2f=RucZC$lJ e­raUAߗt{okBL2 <'P=PhӒj=gJl:cHs:z \;M_E5XV2Uk eؿіS =VU*&5Zu\_fᣀ^T+RɁ*P_ڠEK!PvG`45 5N!f8QRtlvN>Z Mͫ4uhSYIطOx3Nap<1"CEwQbjR%89I yyI΋@{]z+[d'S$|+v` *(95hӒĴ*͋NITPPq^- !S3JKJ$ 0w__Og ^? 
AT(%5(Ҕ$J8OOgi~&b}MD:4yT4QJסL$0Ue1,w/N?wZLDA4tK4 _2;Eq4?ljibi,T"( 5hy?'''GUm©8(n!YHŤn,ԠEKKLJݓn!oaOgk{P3DYE;is֧4,)?loԈ4:;nE;kuh S_":9:>xu!O_|ÅUk}^mι*WcK&R4y|t-mXu:ëlM4jIFy 0wenvJl]nO| goH/_W•KOh="uіi6-~[Z?7TXnu/7^?BQjR>({%U5S$N:pb]l}+Wrݪ,>C-XSvRvL*(e-U%Z$ Svm#KhiHhkp)P<s(YFKqҥ]@BD{u)Go#uQVjТ%8R>Byjc|σn@EB+|3m-L}+zMc.AF15_|moOvu7U;c2K3[ 6O8oC?g?t> ^42F`8hE4eY&FO 2 oL2~ݤy,_NaL39{H괞z(0E. ڨ>{*fOҳYl^(U?v@q17nA'}J{DCBb5' endstream endobj 321 0 obj <]/Size 322/W[1 3 2]/Filter/FlateDecode/Length 750>> stream x5YPWe7UJ rAAP45LE2 sCiS4rI-қf; oHﯛْ??*Ozq)>፞BVOjӲm)ެŸe * # * All changes from version 1.1.24 on: © Google Inc. # # Test script for the R interface seed = as.integer(runif(1, 0, 1e9)) set.seed(seed) cat(sprintf("Random seed: %d\n",seed)) print_seed <- function() { return(sprintf(' Please send a report to the author of the \'fastcluster\' package, Daniel Müllner. For contact details, see . To make the error reproducible, you must include the following number (the random seed value) in your error report: %d.\n\n', seed)) } hasWardD2 = getRversion() >= '3.1.0' # Compare two dendrograms and check whether they are equal, except that # ties may be resolved differently. compare <- function(dg1, dg2) { h1 <- dg1$height h2 <- dg2$height # "height" vectors may have small numerical errors. rdiffs <- abs(h1-h2)/pmax(abs(h1),abs(h2)) rdiffs = rdiffs[complete.cases(rdiffs)] rel_error <- max(rdiffs) # We allow a relative error of 1e-13. if (rel_error>1e-13) { print(h1) print(h2) cat(sprintf('Height vectors differ! The maximum relative error is %e.\n', rel_error)) return(FALSE) } # Filter the indices where consecutive merging distances are distinct. 
d = diff(dg1$height) b = (c(d,1)!=0 & c(1,d)!=0) #cat(sprintf("Percentage of indices where we can test: %g.\n",100.0*length(b[b])/length(b))) if (any(b)) { m1 = dg1$merge[b,] m2 = dg2$merge[b,] r = function(i) { if (i<0) { return(1) } else { return(b[i]) } } f = sapply(m1,r) fm1 = m1*f fm2 = m2*f # The "merge" matrices must be identical whereever indices are not ambiguous # due to ties. if (!identical(fm1,fm2)) { cat('Merge matrices differ!\n') return(FALSE) } # Compare the "order" vectors only if all merging distances were distinct. if (all(b) && !identical(dg1$order,dg2$order)) { cat('Order vectors differ!\n') return(FALSE) } } return(TRUE) } # Generate uniformly distributed random data generate.uniform <- function() { n = sample(10:1000,1) range_exp = runif(1,min=-10, max=10) cat(sprintf("Number of sample points: %d\n",n)) cat(sprintf("Dissimilarity range: [0,%g]\n",10^range_exp)) d = runif(n*(n-1)/2, min=0, max=10^range_exp) # Fake a compressed distance matrix attributes(d) <- NULL attr(d,"Size") <- n attr(d, "call") <- 'N/A' class(d) <- "dist" return(d) } # Generate normally distributed random data generate.normal <- function() { n = sample(10:1000,1) dim = sample(2:20,1) cat (sprintf("Number of sample points: %d\n",n)) cat (sprintf("Dimension: %d\n",dim)) pcd = matrix(rnorm(n*dim), c(n,dim)) d = dist(pcd) return(d) } # Test the clustering functions when a distance matrix is given. 
test.dm <- function(d) { d2 = d if (hasWardD2) { methods = c('single','complete','average','mcquitty','ward.D','ward.D2','centroid','median') } else { methods = c('single','complete','average','mcquitty','ward','centroid','median') } for (method in methods) { cat(paste('Method :', method, '\n')) dg_stats = stats::hclust(d, method=method) if (method == 'ward') { method = 'ward.D' } dg_fastcluster = fastcluster::hclust(d, method=method) if (!identical(d,d2)) { cat('Input array was corrupted!\n') stop(print_seed()) } if (!compare(dg_stats, dg_fastcluster)) { stop(print_seed()) } } cat('Passed.\n') } # Test the clustering functions for vector input in Euclidean space. test.vector <- function() { # generate test data n = sample(10:1000,1) dim = sample(2:20,1) cat (sprintf("Number of sample points: %d\n",n)) cat (sprintf("Dimension: %d\n",dim)) range_exp = runif(1,min=-10, max=10) pcd = matrix(rnorm(n*dim, sd=10^range_exp), c(n,dim)) pcd2 = pcd # test method='single' cat(paste('Method:', method, '\n')) for (metric in c('euclidean', 'maximum', 'manhattan', 'canberra', 'minkowski')) { cat(paste(' Metric:', metric, '\n')) if (metric=='minkowski') { p = runif(1, min=1.0, max=10.0) cat (sprintf(" p: %g\n",p)); dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric, p=p) d = dist(pcd, method=metric, p=p) } else { dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric) d = dist(pcd, method=metric) } d2 = d dg_fastcluster_dist = fastcluster::hclust(d, method=method) if (!identical(d,d2) || !identical(pcd,pcd2)) { cat('Input array was corrupted!\n') stop(print_seed()) } if (!compare(dg_fastcluster_dist, dg_fastcluster)) { stop(print_seed()) } } for (method in c('ward','centroid','median') ) { cat(paste('Method:', method, '\n')) dg_fastcluster = fastcluster::hclust.vector(pcd, method=method) if (!identical(pcd,pcd2)) { cat('Input array was corrupted!\n') stop(print_seed()) } d = dist(pcd) if(method == "ward" && hasWardD2) { method = 
"ward.D2" } else { # Workaround: fastcluster::hclust expects _squared_ euclidean distances. d = d^2 } d2 = d dg_fastcluster_dist = fastcluster::hclust(d, method=method) if (!identical(d,d2)) { cat('Input array was corrupted!\n') stop(print_seed()) } if(method != "ward.D2") { dg_fastcluster_dist$height = sqrt(dg_fastcluster_dist$height) } # The Euclidean methods may have small numerical errors due to squaring/ # taking the root in the Euclidean distances. if (!compare(dg_fastcluster_dist, dg_fastcluster)) { stop(print_seed()) } } cat('Passed.\n') } # Test the single linkage function with the "binary" metric test.vector.binary <- function() { # generate test data cat (sprintf("Uniform sampling for the 'binary' metric:\n")) n = sample(10:400,1) dim = sample(n:(2*n),1) cat (sprintf("Number of sample points: %d\n",n)) cat (sprintf("Dimension: %d\n",dim)) pcd = matrix(sample(-1:2, n*dim, replace=T), c(n,dim)) pcd2 = pcd # test method='single' metric='binary' cat(paste('Method:', method, '\n')) cat(paste(' Metric:', metric, '\n')) dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric) d = dist(pcd, method=metric) d2 = d dg_fastcluster_dist = fastcluster::hclust(d, method=method) if (!identical(d,d2) || !identical(d,d2)) { cat('Input array was corrupted!\n') stop(print_seed()) } if (!compare(dg_fastcluster_dist, dg_fastcluster)) { stop(print_seed()) } cat('Passed.\n') } N = 15 for (i in (1:N)) { if (i%%2==1) { cat(sprintf('Random test %d of %d (uniform distribution of distances):\n',i,2*N)) d = generate.uniform() } else { cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N)) d = generate.normal() } test.dm(d) } for (i in (N+1:N)) { cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N)) test.vector() test.vector.binary() } cat('Done.\n') fastcluster/src/0000755000176200001440000000000013146376104013405 5ustar liggesusersfastcluster/src/fastcluster.cpp0000644000176200001440000014702413146376104016460 0ustar liggesusers/* 
fastcluster: Fast hierarchical clustering routines for R and Python Copyright: * Until package version 1.1.23: © 2011 Daniel Müllner * All changes from version 1.1.24 on: © Google Inc. This library implements various fast algorithms for hierarchical, agglomerative clustering methods: (1) Algorithms for the "stored matrix approach": the input is the array of pairwise dissimilarities. MST_linkage_core: single linkage clustering with the "minimum spanning tree algorithm (Rohlfs) NN_chain_core: nearest-neighbor-chain algorithm, suitable for single, complete, average, weighted and Ward linkage (Murtagh) generic_linkage: generic algorithm, suitable for all distance update formulas (Müllner) (2) Algorithms for the "stored data approach": the input are points in a vector space. MST_linkage_core_vector: single linkage clustering for vector data generic_linkage_vector: generic algorithm for vector data, suitable for the Ward, centroid and median methods. generic_linkage_vector_alternative: alternative scheme for updating the nearest neighbors. This method seems faster than "generic_linkage_vector" for the centroid and median methods but slower for the Ward method. All these implementation treat infinity values correctly. They throw an exception if a NaN distance value occurs. */ // Older versions of Microsoft Visual Studio do not have the fenv header. #ifdef _MSC_VER #if (_MSC_VER == 1500 || _MSC_VER == 1600) #define NO_INCLUDE_FENV #endif #endif // NaN detection via fenv might not work on systems with software // floating-point emulation (bug report for Debian armel). #ifdef __SOFTFP__ #define NO_INCLUDE_FENV #endif #ifdef NO_INCLUDE_FENV #pragma message("Do not use fenv header.") #else #pragma message("Use fenv header. 
If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.") #pragma STDC FENV_ACCESS on #include #endif #include // for std::pow, std::sqrt #include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::fill_n #include // for std::runtime_error #include // for std::string #include // also for DBL_MAX, DBL_MIN #ifndef DBL_MANT_DIG #error The constant DBL_MANT_DIG could not be defined. #endif #define T_FLOAT_MANT_DIG DBL_MANT_DIG #ifndef LONG_MAX #include #endif #ifndef LONG_MAX #error The constant LONG_MAX could not be defined. #endif #ifndef INT_MAX #error The constant INT_MAX could not be defined. #endif #ifndef INT32_MAX #ifdef _MSC_VER #if _MSC_VER >= 1600 #define __STDC_LIMIT_MACROS #include #else typedef __int32 int_fast32_t; typedef __int64 int64_t; #endif #else #define __STDC_LIMIT_MACROS #include #endif #endif #define FILL_N std::fill_n #ifdef _MSC_VER #if _MSC_VER < 1600 #undef FILL_N #define FILL_N stdext::unchecked_fill_n #endif #endif // Suppress warnings about (potentially) uninitialized variables. #ifdef _MSC_VER #pragma warning (disable:4700) #endif #ifndef HAVE_DIAGNOSTIC #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #endif #ifndef HAVE_VISIBILITY #if __GNUC__ >= 4 #define HAVE_VISIBILITY 1 #endif #endif /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. 
* See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif typedef int_fast32_t t_index; #ifndef INT32_MAX #define MAX_INDEX 0x7fffffffL #else #define MAX_INDEX INT32_MAX #endif #if (LONG_MAX < MAX_INDEX) #error The integer format "t_index" must not have a greater range than "long int". #endif #if (INT_MAX > MAX_INDEX) #error The integer format "int" must not have a greater range than "t_index". #endif typedef double t_float; /* Method codes. These codes must agree with the METHODS array in fastcluster.R and the dictionary mthidx in fastcluster.py. */ enum method_codes { // non-Euclidean methods METHOD_METR_SINGLE = 0, METHOD_METR_COMPLETE = 1, METHOD_METR_AVERAGE = 2, METHOD_METR_WEIGHTED = 3, METHOD_METR_WARD = 4, METHOD_METR_WARD_D = METHOD_METR_WARD, METHOD_METR_CENTROID = 5, METHOD_METR_MEDIAN = 6, METHOD_METR_WARD_D2 = 7, MIN_METHOD_CODE = 0, MAX_METHOD_CODE = 7 }; enum method_codes_vector { // Euclidean methods METHOD_VECTOR_SINGLE = 0, METHOD_VECTOR_WARD = 1, METHOD_VECTOR_CENTROID = 2, METHOD_VECTOR_MEDIAN = 3, MIN_METHOD_VECTOR_CODE = 0, MAX_METHOD_VECTOR_CODE = 3 }; // self-destructing array pointer template class auto_array_ptr{ private: type * ptr; auto_array_ptr(auto_array_ptr const &); // non construction-copyable auto_array_ptr& operator=(auto_array_ptr const &); // non copyable public: auto_array_ptr() : ptr(NULL) { } template auto_array_ptr(index const size) : ptr(new type[size]) { } template auto_array_ptr(index const size, value const val) : ptr(new type[size]) { FILL_N(ptr, size, val); } ~auto_array_ptr() { delete [] ptr; } void free() { delete [] ptr; ptr = NULL; } template void init(index const size) { ptr = new type [size]; } template void init(index const size, value const val) { init(size); FILL_N(ptr, size, val); } inline operator type *() const { return ptr; } }; struct node { t_index node1, node2; t_float dist; }; inline bool operator< (const node a, const node b) { return (a.dist < 
b.dist); } class cluster_result { private: auto_array_ptr Z; t_index pos; public: cluster_result(const t_index size) : Z(size) , pos(0) {} void append(const t_index node1, const t_index node2, const t_float dist) { Z[pos].node1 = node1; Z[pos].node2 = node2; Z[pos].dist = dist; ++pos; } node * operator[] (const t_index idx) const { return Z + idx; } /* Define several methods to postprocess the distances. All these functions are monotone, so they do not change the sorted order of distances. */ void sqrt() const { for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = std::sqrt(ZZ->dist); } } void sqrt(const t_float) const { // ignore the argument sqrt(); } void sqrtdouble(const t_float) const { // ignore the argument for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = std::sqrt(2*ZZ->dist); } } #ifdef R_pow #define my_pow R_pow #else #define my_pow std::pow #endif void power(const t_float p) const { t_float const q = 1/p; for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = my_pow(ZZ->dist,q); } } void plusone(const t_float) const { // ignore the argument for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist += 1; } } void divide(const t_float denom) const { for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist /= denom; } } }; class doubly_linked_list { /* Class for a doubly linked list. Initially, the list is the integer range [0, size]. We provide a forward iterator and a method to delete an index from the list. Typical use: for (i=L.start; L succ; private: auto_array_ptr pred; // Not necessarily private, we just do not need it in this instance. public: doubly_linked_list(const t_index size) // Initialize to the given size. : start(0) , succ(size+1) , pred(size+1) { for (t_index i=0; i(2*N-3-(r_))*(r_)>>1)+(c_)-1] ) // Z is an ((N-1)x4)-array #define Z_(_r, _c) (Z[(_r)*4 + (_c)]) /* Lookup function for a union-find data structure. The function finds the root of idx by going iteratively through all parent elements until a root is found. An element i is a root if nodes[i] is zero. 
To make subsequent searches faster, the entry for idx and all its parents is updated with the root element. */ class union_find { private: auto_array_ptr parent; t_index nextparent; public: union_find(const t_index size) : parent(size>0 ? 2*size-1 : 0, 0) , nextparent(size) { } t_index Find (t_index idx) const { if (parent[idx] != 0 ) { // a → b t_index p = idx; idx = parent[idx]; if (parent[idx] != 0 ) { // a → b → c do { idx = parent[idx]; } while (parent[idx] != 0); do { t_index tmp = parent[p]; parent[p] = idx; p = tmp; } while (parent[p] != idx); } } return idx; } void Union (const t_index node1, const t_index node2) { parent[node1] = parent[node2] = nextparent++; } }; class nan_error{}; #ifdef FE_INVALID class fenv_error{}; #endif static void MST_linkage_core(const t_index N, const t_float * const D, cluster_result & Z2) { /* N: integer, number of data points D: condensed distance matrix N*(N-1)/2 Z2: output data structure The basis of this algorithm is an algorithm by Rohlf: F. James Rohlf, Hierarchical clustering using the minimum spanning tree, The Computer Journal, vol. 16, 1973, p. 93–95. 
*/ t_index i; t_index idx2; doubly_linked_list active_nodes(N); auto_array_ptr d(N); t_index prev_node; t_float min; // first iteration idx2 = 1; min = std::numeric_limits::infinity(); for (i=1; i tmp) d[i] = tmp; else if (fc_isnan(tmp)) throw (nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (d[i] < min) { min = d[i]; idx2 = i; } } Z2.append(prev_node, idx2, min); } } /* Functions for the update of the dissimilarity array */ inline static void f_single( t_float * const b, const t_float a ) { if (*b > a) *b = a; } inline static void f_complete( t_float * const b, const t_float a ) { if (*b < a) *b = a; } inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) { *b = s*a + t*(*b); #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_weighted( t_float * const b, const t_float a) { *b = (a+*b)*.5; #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) { *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v); //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v); #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) { *b = s*a - stc + t*(*b); #ifndef FE_INVALID if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif 
#endif } inline static void f_median( t_float * const b, const t_float a, const t_float c_4) { *b = (a+(*b))*.5 - c_4; #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } template static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { /* N: integer D: condensed distance matrix N*(N-1)/2 Z2: output data structure This is the NN-chain algorithm, described on page 86 in the following book: Fionn Murtagh, Multidimensional Clustering Algorithms, Vienna, Würzburg: Physica-Verlag, 1985. */ t_index i; auto_array_ptr NN_chain(N); t_index NN_chain_tip = 0; t_index idx1, idx2; t_float size1, size2; doubly_linked_list active_nodes(N); t_float min; for (t_float const * DD=D; DD!=D+(static_cast(N)*(N-1)>>1); ++DD) { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*DD)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } #ifdef FE_INVALID if (feclearexcept(FE_INVALID)) throw fenv_error(); #endif for (t_index j=0; jidx2) { t_index tmp = idx1; idx1 = idx2; idx2 = tmp; } if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD) { size1 = static_cast(members[idx1]); size2 = static_cast(members[idx2]); members[idx2] += members[idx1]; } // Remove the smaller index from the valid indices (active_nodes). active_nodes.remove(idx1); switch (method) { case METHOD_METR_SINGLE: /* Single linkage. Characteristic: new distances are never longer than the old distances. */ // Update the distance matrix in the range [start, idx1). for (i=active_nodes.start; i(members[i]); for (i=active_nodes.start; i(members[i]) ); // Update the distance matrix in the range (idx1, idx2). for (; i(members[i]) ); // Update the distance matrix in the range (idx2, N). 
for (i=active_nodes.succ[idx2]; i(members[i]) ); break; default: throw std::runtime_error(std::string("Invalid method.")); } } #ifdef FE_INVALID if (fetestexcept(FE_INVALID)) throw fenv_error(); #endif } class binary_min_heap { /* Class for a binary min-heap. The data resides in an array A. The elements of A are not changed but two lists I and R of indices are generated which point to elements of A and backwards. The heap tree structure is H[2*i+1] H[2*i+2] \ / \ / ≤ ≤ \ / \ / H[i] where the children must be less or equal than their parent. Thus, H[0] contains the minimum. The lists I and R are made such that H[i] = A[I[i]] and R[I[i]] = i. This implementation is not designed to handle NaN values. */ private: t_float * const A; t_index size; auto_array_ptr I; auto_array_ptr R; // no default constructor binary_min_heap(); // noncopyable binary_min_heap(binary_min_heap const &); binary_min_heap & operator=(binary_min_heap const &); public: binary_min_heap(t_float * const A_, const t_index size_) : A(A_), size(size_), I(size), R(size) { // Allocate memory and initialize the lists I and R to the identity. This // does not make it a heap. Call heapify afterwards! for (t_index i=0; i>1); idx>0; ) { --idx; update_geq_(idx); } } inline t_index argmin() const { // Return the minimal element. return I[0]; } void heap_pop() { // Remove the minimal element from the heap. --size; I[0] = I[size]; R[I[0]] = 0; update_geq_(0); } void remove(t_index idx) { // Remove an element from the heap. --size; R[I[size]] = R[idx]; I[R[idx]] = I[size]; if ( H(size)<=A[idx] ) { update_leq_(R[idx]); } else { update_geq_(R[idx]); } } void replace ( const t_index idxold, const t_index idxnew, const t_float val) { R[idxnew] = R[idxold]; I[R[idxnew]] = idxnew; if (val<=A[idxold]) update_leq(idxnew, val); else update_geq(idxnew, val); } void update ( const t_index idx, const t_float val ) const { // Update the element A[i] with val and re-arrange the indices to preserve // the heap condition. 
if (val<=A[idx]) update_leq(idx, val); else update_geq(idx, val); } void update_leq ( const t_index idx, const t_float val ) const { // Use this when the new value is not more than the old value. A[idx] = val; update_leq_(R[idx]); } void update_geq ( const t_index idx, const t_float val ) const { // Use this when the new value is not less than the old value. A[idx] = val; update_geq_(R[idx]); } private: void update_leq_ (t_index i) const { t_index j; for ( ; (i>0) && ( H(i)>1) ); i=j) heap_swap(i,j); } void update_geq_ (t_index i) const { t_index j; for ( ; (j=2*i+1)=H(i) ) { ++j; if ( j>=size || H(j)>=H(i) ) break; } else if ( j+1 static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { /* N: integer, number of data points D: condensed distance matrix N*(N-1)/2 Z2: output data structure */ const t_index N_1 = N-1; t_index i, j; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(N_1); // array of nearest neighbors auto_array_ptr mindist(N_1); // distances to the nearest neighbors auto_array_ptr row_repr(N); // row_repr[i]: node number that the // i-th row represents doubly_linked_list active_nodes(N); binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for // the distance to the nearest neighbor of each point t_index node1, node2; // node numbers in the output t_float size1, size2; // and their cardinalities t_float min; // minimum and row index for nearest-neighbor search t_index idx; for (i=0; ii} D(i,j) for i in range(N-1) t_float const * DD = D; for (i=0; i::infinity(); for (idx=j=i+1; ji} D(i,j) Normally, we have equality. However, this minimum may become invalid due to the updates in the distance matrix. The rules are: 1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct minimum and n_nghbr[i] is a nearest neighbor. 2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the correct minimum. 
The minimum needs to be recomputed. 3) mindist[i] is never bigger than the true minimum. Hence, we never miss the true minimum if we take the smallest mindist entry, re-compute the value if necessary (thus maybe increasing it) and looking for the now smallest mindist entry until a valid minimal entry is found. This step is done in the lines below. The update process for D below takes care that these rules are fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are re-calculated when necessary but re-calculation is avoided whenever possible. The re-calculation of the minima makes the worst-case runtime of this algorithm cubic in N. We avoid this whenever possible, and in most cases the runtime appears to be quadratic. */ idx1 = nn_distances.argmin(); if (method != METHOD_METR_SINGLE) { while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) { // Recompute the minimum mindist[idx1] and n_nghbr[idx1]. n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1 min = D_(idx1,j); for (j=active_nodes.succ[j]; j(members[idx1]); size2 = static_cast(members[idx2]); members[idx2] += members[idx1]; } Z2.append(node1, node2, mindist[idx1]); // Remove idx1 from the list of active indices (active_nodes). active_nodes.remove(idx1); // Index idx2 now represents the new (merged) node with label N+i. row_repr[idx2] = N+i; // Update the distance matrix switch (method) { case METHOD_METR_SINGLE: /* Single linkage. Characteristic: new distances are never longer than the old distances. */ // Update the distance matrix in the range [start, idx1). for (j=active_nodes.start; j(members[j]) ); if (n_nghbr[j] == idx1) n_nghbr[j] = idx2; } // Update the distance matrix in the range (idx1, idx2). for (; j(members[j]) ); if (D_(j, idx2) < mindist[j]) { nn_distances.update_leq(j, D_(j, idx2)); n_nghbr[j] = idx2; } } // Update the distance matrix in the range (idx2, N). 
if (idx2(members[j]) ); min = D_(idx2,j); for (j=active_nodes.succ[j]; j(members[j]) ); if (D_(idx2,j) < min) { min = D_(idx2,j); n_nghbr[idx2] = j; } } nn_distances.update(idx2, min); } break; case METHOD_METR_CENTROID: { /* Centroid linkage. Shorter and longer distances can occur, not bigger than max(d1,d2) but maybe smaller than min(d1,d2). */ // Update the distance matrix in the range [start, idx1). t_float s = size1/(size1+size2); t_float t = size2/(size1+size2); t_float stc = s*t*mindist[idx1]; for (j=active_nodes.start; j static void MST_linkage_core_vector(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure The basis of this algorithm is an algorithm by Rohlf: F. James Rohlf, Hierarchical clustering using the minimum spanning tree, The Computer Journal, vol. 16, 1973, p. 93–95. */ t_index i; t_index idx2; doubly_linked_list active_nodes(N); auto_array_ptr d(N); t_index prev_node; t_float min; // first iteration idx2 = 1; min = std::numeric_limits::infinity(); for (i=1; i tmp) d[i] = tmp; else if (fc_isnan(tmp)) throw (nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (d[i] < min) { min = d[i]; idx2 = i; } } Z2.append(prev_node, idx2, min); } } template static void generic_linkage_vector(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure This algorithm is valid for the distance update methods "Ward", "centroid" and "median" only! 
*/ const t_index N_1 = N-1; t_index i, j; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(N_1); // array of nearest neighbors auto_array_ptr mindist(N_1); // distances to the nearest neighbors auto_array_ptr row_repr(N); // row_repr[i]: node number that the // i-th row represents doubly_linked_list active_nodes(N); binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for // the distance to the nearest neighbor of each point t_index node1, node2; // node numbers in the output t_float min; // minimum and row index for nearest-neighbor search for (i=0; ii} D(i,j) for i in range(N-1) for (i=0; i::infinity(); t_index idx; for (idx=j=i+1; j(i,j); } if (tmp(idx1,j); for (j=active_nodes.succ[j]; j(idx1,j); if (tmp(j, idx2); if (tmp < mindist[j]) { nn_distances.update_leq(j, tmp); n_nghbr[j] = idx2; } else if (n_nghbr[j] == idx2) n_nghbr[j] = idx1; // invalidate } // Find the nearest neighbor for idx2. if (idx2(idx2,j); for (j=active_nodes.succ[j]; j(idx2, j); if (tmp < min) { min = tmp; n_nghbr[idx2] = j; } } nn_distances.update(idx2, min); } } } } template static void generic_linkage_vector_alternative(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure This algorithm is valid for the distance update methods "Ward", "centroid" and "median" only! */ const t_index N_1 = N-1; t_index i, j=0; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(2*N-2); // array of nearest neighbors auto_array_ptr mindist(2*N-2); // distances to the nearest neighbors doubly_linked_list active_nodes(N+N_1); binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap // structure for the distance to the nearest neighbor of each point t_float min; // minimum for nearest-neighbor searches // Initialize the minimal distances: // Find the nearest neighbor of each point. 
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1) for (i=1; i::infinity(); t_index idx; for (idx=j=0; j(i,j); } if (tmp All changes from version 1.1.24 on: © Google Inc. This module provides fast hierarchical clustering routines. The "linkage" method is designed to provide a replacement for the “linkage” function and its siblings in the scipy.cluster.hierarchy module. You may use the methods in this module with the same syntax as the corresponding SciPy functions but with the benefit of much faster performance. The method "linkage_vector" performs clustering of vector data with memory- saving algorithms. Refer to the User's manual "fastcluster.pdf" for comprehensive details. It is located in the directory inst/doc/ in the source distribution and may also be obtained at . """ __all__ = ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median', 'linkage', 'linkage_vector'] __version_info__ = ('1', '1', '24') __version__ = '.'.join(__version_info__) from numpy import double, empty, array, ndarray, var, cov, dot, bool, \ expand_dims, ceil, sqrt from numpy.linalg import inv try: from scipy.spatial.distance import pdist except ImportError: def pdist(*args, **kwargs): raise ImportError('The fastcluster.linkage function cannot process ' 'vector data since the function ' 'scipy.partial.distance.pdist could not be ' 'imported.') from _fastcluster import linkage_wrap, linkage_vector_wrap def single(D): '''Single linkage clustering (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='single') def complete(D): '''Complete linkage clustering (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='complete') def average(D): '''Hierarchical clustering with the “average” distance update formula (alias). 
See the help on the “linkage” function for further information.''' return linkage(D, method='average') def weighted(D): '''Hierarchical clustering with the “weighted” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='weighted') def ward(D): '''Hierarchical clustering with the “Ward” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='ward') def centroid(D): '''Hierarchical clustering with the “centroid” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='centroid') def median(D): '''Hierarchical clustering with the “median” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='median') # This dictionary must agree with the enum method_codes in fastcluster.cpp. mthidx = {'single' : 0, 'complete' : 1, 'average' : 2, 'weighted' : 3, 'ward' : 4, 'centroid' : 5, 'median' : 6 } def linkage(X, method='single', metric='euclidean', preserve_input=True): r'''Hierarchical, agglomerative clustering on a dissimilarity matrix or on Euclidean data. Apart from the argument 'preserve_input', the method has the same input parameters and output format as the functions of the same name in the module scipy.cluster.hierarchy. The argument X is preferably a NumPy array with floating point entries (X.dtype==numpy.double). Any other data format will be converted before it is processed. If X is a one-dimensional array, it is considered a condensed matrix of pairwise dissimilarities in the format which is returned by scipy.spatial.distance.pdist. It contains the flattened, upper- triangular part of a pairwise dissimilarity matrix. 
That is, if there are N data points and the matrix d contains the dissimilarity between the i-th and j-th observation at position d(i,j), the vector X has length N(N-1)/2 and is ordered as follows: [ d(0,1), d(0,2), ..., d(0,n-1), d(1,2), ..., d(1,n-1), ..., d(n-2,n-1) ] The 'metric' argument is ignored in case of dissimilarity input. The optional argument 'preserve_input' specifies whether the method makes a working copy of the dissimilarity vector or writes temporary data into the existing array. If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying 'preserve_input=False'. Note that the input array X contains unspecified values after this procedure. It is therefore safer to write linkage(X, method="...", preserve_input=False) del X to make sure that the matrix X is not accessed accidentally after it has been used as scratch memory. (The single linkage algorithm does not write to the distance matrix or its copy anyway, so the 'preserve_input' flag has no effect in this case.) If X contains vector data, it must be a two-dimensional array with N observations in D dimensions as an (N×D) array. The preserve_input argument is ignored in this case. The specified metric is used to generate pairwise distances from the input. The following two function calls yield the same output: linkage(pdist(X, metric), method="...", preserve_input=False) linkage(X, metric=metric, method="...") The general scheme of the agglomerative clustering procedure is as follows: 1. Start with N singleton clusters (nodes) labeled 0,...,N−1, which represent the input points. 2. Find a pair of nodes with minimal distance among all pairwise distances. 3. Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively N, N+1, ... 4. The distances from the new node to all other nodes is determined by the method parameter (see below). 5. 
Repeat N−1 times from step 2, until there is one big node, which contains all original input points. The output of linkage is stepwise dendrogram, which is represented as an (N−1)×4 NumPy array with floating point entries (dtype=numpy.double). The first two columns contain the node indices which are joined in each step. The input nodes are labeled 0,...,N−1, and the newly generated nodes have the labels N,...,2N−2. The third column contains the distance between the two nodes at each step, ie. the current minimal distance at the time of the merge. The fourth column counts the number of points which comprise each new node. The parameter method specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes. Denote the dissimilarities by d, the nodes to be joined by I, J, the new node by K and any other node by L. The symbol |I| denotes the size of the cluster I. method='single': d(K,L) = min(d(I,L), d(J,L)) The distance between two clusters A, B is the closest distance between any two points in each cluster: d(A,B) = min{ d(a,b) | a∈A, b∈B } method='complete': d(K,L) = max(d(I,L), d(J,L)) The distance between two clusters A, B is the maximal distance between any two points in each cluster: d(A,B) = max{ d(a,b) | a∈A, b∈B } method='average': d(K,L) = ( |I|·d(I,L) + |J|·d(J,L) ) / (|I|+|J|) The distance between two clusters A, B is the average distance between the points in the two clusters: d(A,B) = (|A|·|B|)^(-1) · \sum { d(a,b) | a∈A, b∈B } method='weighted': d(K,L) = (d(I,L)+d(J,L))/2 There is no global description for the distance between clusters since the distance depends on the order of the merging steps. The following three methods are intended for Euclidean data only, ie. when X contains the pairwise (non-squared!) distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. 
method='centroid': d(K,L) = ( (|I|·d(I,L) + |J|·d(J,L)) / (|I|+|J|) − |I|·|J|·d(I,J)/(|I|+|J|)^2 )^(1/2) There is a geometric interpretation: d(A,B) is the distance between the centroids (ie. barycenters) of the clusters in Euclidean space: d(A,B) = ‖c_A−c_B∥, where c_A denotes the centroid of the points in cluster A. method='median': d(K,L) = ( d(I,L)/2 + d(J,L)/2 − d(I,J)/4 )^(1/2) Define the midpoint w_K of a cluster K iteratively as w_K=k if K={k} is a singleton and as the midpoint (w_I+w_J)/2 if K is formed by joining I and J. Then we have d(A,B) = ∥w_A−w_B∥ in Euclidean space for all nodes A,B. Notice however that this distance depends on the order of the merging steps. method='ward': d(K,L) = ( ((|I|+|L)d(I,L) + (|J|+|L|)d(J,L) − |L|d(I,J)) / (|I|+|J|+|L|) )^(1/2) The global cluster dissimilarity can be expressed as d(A,B) = ( 2|A|·|B|/(|A|+|B|) )^(1/2) · ‖c_A−c_B∥, where c_A again denotes the centroid of the points in cluster A. The clustering algorithm handles infinite values correctly, as long as the chosen distance update formula makes sense. If a NaN value occurs, either in the original dissimilarities or as an updated dissimilarity, an error is raised. The linkage method does not treat NumPy's masked arrays as special and simply ignores the mask.''' X = array(X, copy=False, subok=True) if X.ndim==1: if method=='single': preserve_input = False X = array(X, dtype=double, copy=preserve_input, order='C', subok=True) NN = len(X) N = int(ceil(sqrt(NN*2))) if (N*(N-1)//2) != NN: raise ValueError(r'The length of the condensed distance matrix ' r'must be (k \choose 2) for k data points!') else: assert X.ndim==2 N = len(X) X = pdist(X, metric) X = array(X, dtype=double, copy=False, order='C', subok=True) Z = empty((N-1,4)) if N > 1: linkage_wrap(N, X, Z, mthidx[method]) return Z # This dictionary must agree with the enum metric_codes in fastcluster_python.cpp. 
mtridx = {'euclidean' : 0, 'minkowski' : 1, 'cityblock' : 2, 'seuclidean' : 3, 'sqeuclidean' : 4, 'cosine' : 5, 'hamming' : 6, 'jaccard' : 7, 'chebychev' : 8, 'canberra' : 9, 'braycurtis' : 10, 'mahalanobis' : 11, 'yule' : 12, 'matching' : 13, 'sokalmichener' : 13, # an alias for 'matching' 'dice' : 14, 'rogerstanimoto' : 15, 'russellrao' : 16, 'sokalsneath' : 17, 'kulsinski' : 18, 'USER' : 19, } booleanmetrics = ('yule', 'matching', 'dice', 'kulsinski', 'rogerstanimoto', 'sokalmichener', 'russellrao', 'sokalsneath', 'kulsinski') def linkage_vector(X, method='single', metric='euclidean', extraarg=None): r'''Hierarchical (agglomerative) clustering on Euclidean data. Compared to the 'linkage' method, 'linkage_vector' uses a memory-saving algorithm. While the linkage method requires Θ(N^2) memory for clustering of N points, this method needs Θ(ND) for N points in R^D, which is usually much smaller. The argument X has the same format as before, when X describes vector data, ie. it is an (N×D) array. Also the output array has the same format. The parameter method must be one of 'single', 'centroid', 'median', 'ward', ie. only for these methods there exist memory-saving algorithms currently. If 'method', is one of 'centroid', 'median', 'ward', the 'metric' must be 'euclidean'. For single linkage clustering, any dissimilarity function may be chosen. Basically, every metric which is implemented in the method scipy.spatial.distance.pdist is reimplemented here. However, the metrics differ in some instances since a number of mistakes and typos (both in the code and in the documentation) were corrected in the fastcluster package. Therefore, the available metrics with their definitions are listed below as a reference. The symbols u and v mostly denote vectors in R^D with coordinates u_j and v_j respectively. See below for additional metrics for Boolean vectors. 
Unless otherwise stated, the input array X is converted to a floating point array (X.dtype==numpy.double) if it does not have already the required data type. Some metrics accept Boolean input; in this case this is stated explicitly below. If a NaN value occurs, either in the original dissimilarities or as an updated dissimilarity, an error is raised. In principle, the clustering algorithm handles infinite values correctly, but the user is advised to carefully check the behavior of the metric and distance update formulas under these circumstances. The distance formulas combined with the clustering in the 'linkage_vector' method do not have specified behavior if the data X contains infinite or NaN values. Also, the masks in NumPy’s masked arrays are simply ignored. metric='euclidean': Euclidean metric, L_2 norm d(u,v) = ∥u−v∥ = ( \sum_j { (u_j−v_j)^2 } )^(1/2) metric='sqeuclidean': squared Euclidean metric d(u,v) = ∥u−v∥^2 = \sum_j { (u_j−v_j)^2 } metric='seuclidean': standardized Euclidean metric d(u,v) = ( \sum_j { (u_j−v_j)^2 / V_j } )^(1/2) The vector V=(V_0,...,V_{D−1}) is given as the 'extraarg' argument. If no 'extraarg' is given, V_j is by default the unbiased sample variance of all observations in the j-th coordinate: V_j = Var_i (X(i,j) ) = 1/(N−1) · \sum_i ( X(i,j)^2 − μ(X_j)^2 ) (Here, μ(X_j) denotes as usual the mean of X(i,j) over all rows i.) metric='mahalanobis': Mahalanobis distance d(u,v) = ( transpose(u−v) V (u−v) )^(1/2) Here, V=extraarg, a (D×D)-matrix. If V is not specified, the inverse of the covariance matrix numpy.linalg.inv(numpy.cov(X, rowvar=False)) is used. metric='cityblock': the Manhattan distance, L_1 norm d(u,v) = \sum_j |u_j−v_j| metric='chebychev': the supremum norm, L_∞ norm d(u,v) = max_j { |u_j−v_j| } metric='minkowski': the L_p norm d(u,v) = ( \sum_j |u_j−v_j|^p ) ^(1/p) This metric coincides with the cityblock, euclidean and chebychev metrics for p=1, p=2 and p=∞ (numpy.inf), respectively. 
The parameter p is given as the 'extraarg' argument. metric='cosine' d(u,v) = 1 − ⟨u,v⟩ / (∥u∥·∥v∥) = 1 − (\sum_j u_j·v_j) / ( (\sum u_j^2)(\sum v_j^2) )^(1/2) metric='correlation': This method first mean-centers the rows of X and then applies the 'cosine' distance. Equivalently, the correlation distance measures 1 − (Pearson’s correlation coefficient). d(u,v) = 1 − ⟨u−μ(u),v−μ(v)⟩ / (∥u−μ(u)∥·∥v−μ(v)∥) metric='canberra' d(u,v) = \sum_j ( |u_j−v_j| / (|u_j|+|v_j|) ) Summands with u_j=v_j=0 contribute 0 to the sum. metric='braycurtis' d(u,v) = (\sum_j |u_j-v_j|) / (\sum_j |u_j+v_j|) metric=(user function): The parameter metric may also be a function which accepts two NumPy floating point vectors and returns a number. Eg. the Euclidean distance could be emulated with fn = lambda u, v: numpy.sqrt(((u-v)*(u-v)).sum()) linkage_vector(X, method='single', metric=fn) This method, however, is much slower than the build-in function. metric='hamming': The Hamming distance accepts a Boolean array (X.dtype==bool) for efficient storage. Any other data type is converted to numpy.double. d(u,v) = |{j | u_j≠v_j }| metric='jaccard': The Jaccard distance accepts a Boolean array (X.dtype==bool) for efficient storage. Any other data type is converted to numpy.double. d(u,v) = |{j | u_j≠v_j }| / |{j | u_j≠0 or v_j≠0 }| d(0,0) = 0 Python represents True by 1 and False by 0. In the Boolean case, the Jaccard distance is therefore: d(u,v) = |{j | u_j≠v_j }| / |{j | u_j ∨ v_j }| The following metrics are designed for Boolean vectors. The input array is converted to the 'bool' data type if it is not Boolean already. Use the following abbreviations to count the number of True/False combinations: a = |{j | u_j ∧ v_j }| b = |{j | u_j ∧ (¬v_j) }| c = |{j | (¬u_j) ∧ v_j }| d = |{j | (¬u_j) ∧ (¬v_j) }| Recall that D denotes the number of dimensions, hence D=a+b+c+d. 
metric='yule' d(u,v) = 2bc / (ad+bc) metric='dice': d(u,v) = (b+c) / (2a+b+c) d(0,0) = 0 metric='rogerstanimoto': d(u,v) = 2(b+c) / (b+c+D) metric='russellrao': d(u,v) = (b+c+d) / D metric='sokalsneath': d(u,v) = 2(b+c)/ ( a+2(b+c)) d(0,0) = 0 metric='kulsinski' d(u,v) = (b/(a+b) + c/(a+c)) / 2 metric='matching': d(u,v) = (b+c)/D Notice that when given a Boolean array, the 'matching' and 'hamming' distance are the same. The 'matching' distance formula, however, converts every input to Boolean first. Hence, the vectors (0,1) and (0,2) have zero 'matching' distance since they are both converted to (False, True) but the Hamming distance is 0.5. metric='sokalmichener' is an alias for 'matching'.''' if method=='single': assert metric!='USER' if metric in ('hamming', 'jaccard'): X = array(X, copy=False, subok=True) dtype = bool if X.dtype==bool else double else: dtype = bool if metric in booleanmetrics else double X = array(X, dtype=dtype, copy=False, order='C', subok=True) else: assert metric=='euclidean' X = array(X, dtype=double, copy=(method=='ward'), order='C', subok=True) assert X.ndim==2 N = len(X) Z = empty((N-1,4)) if metric=='seuclidean': if extraarg is None: extraarg = var(X, axis=0, ddof=1) elif metric=='mahalanobis': if extraarg is None: extraarg = inv(cov(X, rowvar=False)) # instead of the inverse covariance matrix, pass the matrix product # with the data matrix! 
extraarg = array(dot(X,extraarg),dtype=double, copy=False, order='C', subok=True) elif metric=='correlation': X = X-expand_dims(X.mean(axis=1),1) metric='cosine' elif not isinstance(metric, str): assert extraarg is None metric, extraarg = 'USER', metric elif metric!='minkowski': assert extraarg is None if N > 1: linkage_vector_wrap(X, Z, mthidx[method], mtridx[metric], extraarg) return Z fastcluster/src/python/tests/0000755000176200001440000000000013144412414016061 5ustar liggesusersfastcluster/src/python/tests/__init__.py0000644000176200001440000000053612610011707020173 0ustar liggesusersimport unittest class fastcluster_test(unittest.TestCase): def test(self): from tests.test import test self.assertTrue(test(10)) def test_nan(self): from tests.nantest import test self.assertTrue(test()) def test_vector(self): from tests.vectortest import test self.assertTrue(test(10)) fastcluster/src/python/tests/test.py0000644000176200001440000001376213144412405017423 0ustar liggesusers#!/usr/bin/env python # -*- coding: utf-8 -*- print(''' Test program for the 'fastcluster' package. Copyright: * Until package version 1.1.23: (c) 2011 Daniel Müllner * All changes from version 1.1.24 on: (c) Google Inc. ''') import sys import fastcluster as fc import numpy as np from scipy.spatial.distance import pdist, squareform import math version = '1.1.24' if fc.__version__ != version: raise ValueError('Wrong module version: {} instead of {}.'.format(fc.__version__, version)) import atexit def print_seed(): print("Seed: {0}".format(seed)) atexit.register(print_seed) seed = np.random.randint(0,1e9) np.random.seed(seed) #abstol = 1e-14 # absolute tolerance rtol = 1e-14 # relative tolerance # NaN values are used in computations. Do not warn about them. 
np.seterr(invalid='ignore') def test_all(D): D2 = D.copy() for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']: Z2 = fc.linkage(D, method) if np.any(D2!=D): raise AssertionError('Input array was corrupted.') check(Z2, D, method) def check(Z2, D, method): sys.stdout.write("Method: " + method + "...") I = np.array(Z2[:,:2], dtype=int) Ds = squareform(D) n = len(Ds) row_repr = np.arange(2*n-1) row_repr[n:] = -1 size = np.ones(n, dtype=np.int) np.fill_diagonal(Ds, np.nan) mins = np.empty(n-1) for i in range(n-1): for j in range(n-1): # Suppress warning if all distances are NaN. if np.all(np.isnan(Ds[j,j+1:])): mins[j] = np.nan else: mins[j] = np.nanmin(Ds[j,j+1:]) gmin = np.nanmin(mins) if (Z2[i,2]-gmin) > max(abs(Z2[i,2]),abs(gmin))*rtol: raise AssertionError('Not the global minimum in step {2}: {0}, {1}'.\ format(Z2[i,2], gmin, i)) i1, i2 = row_repr[I[i,:]] if (i1<0): raise AssertionError('Negative index i1.') if (i2<0): raise AssertionError('Negative index i2.') if I[i,0]>=I[i,1]: raise AssertionError('Convention violated.') if i1>i2: i1, i2 = i2, i1 if (Ds[i1,i2]-gmin) > max(abs(Ds[i1,i2]),abs(gmin))*rtol: raise AssertionError('The global minimum is not at the right place: ' '({0}, {1}): {2} != {3}. 
Difference: {4}'.\ format(i1, i2, Ds[i1, i2], gmin, Ds[i1, i2]-gmin)) s1 = size[i1] s2 = size[i2] S = float(s1+s2) if method=='single': if i1>0: # mostly unnecessary; workaround for a bug/feature in NumPy 1.7.0.dev # see http://projects.scipy.org/numpy/ticket/2078 Ds[:i1,i2] = np.min( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.minimum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.min( Ds[(i1,i2),i2:],axis=0) elif method=='complete': if i1>0: Ds[:i1,i2] = np.max( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.maximum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.max( Ds[(i1,i2),i2:],axis=0) elif method=='average': Ds[:i1,i2] = ( Ds[:i1,i1]*s1 + Ds[:i1,i2]*s2 ) / S Ds[i1:i2,i2] = ( Ds[i1,i1:i2]*s1 + Ds[i1:i2,i2]*s2 ) / S Ds[i2,i2:] = ( Ds[i1,i2:]*s1 + Ds[i2,i2:]*s2 ) / S elif method=='weighted': if i1>0: Ds[:i1,i2] = np.mean( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = ( Ds[i1,i1:i2] + Ds[i1:i2,i2] ) *.5 Ds[i2,i2:] = np.mean( Ds[(i1,i2),i2:],axis=0) elif method=='ward': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*(s1+size[:i1]) -gmin*gmin*size[:i1]+np.square(Ds[:i1,i2]) *(s2+size[:i1]))/(S+size[:i1])) Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*(s1+size[i1:i2]) -gmin*gmin*size[i1:i2]+np.square(Ds[i1:i2,i2]) *(s2+size[i1:i2]))/(S+size[i1:i2])) Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*(s1+size[i2:]) -gmin*gmin*size[i2:]+np.square(Ds[i2,i2:]) *(s2+size[i2:]))/(S+size[i2:])) elif method=='centroid': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*s1 +np.square(Ds[:i1,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*s1 +np.square(Ds[i1:i2,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*s1 +np.square(Ds[i2,i2:])*s2)*S-gmin*gmin*s1*s2) / S elif method=='median': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])+\ np.square(Ds[:i1,i2]))*2-gmin*gmin)*.5 Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])+\ np.square(Ds[i1:i2,i2]))*2-gmin*gmin)*.5 Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])+\ np.square(Ds[i2,i2:]))*2-gmin*gmin)*.5 else: raise 
ValueError('Unknown method.') Ds[i1, i1:n] = np.nan Ds[:i1, i1] = np.nan row_repr[n+i] = i2 size[i2] = S print('OK.') def test(repeats): if repeats: iterator = range(repeats) else: import itertools iterator = itertools.repeat(None) print(''' If everything is OK, the test program will run forever, without an error message. ''') for _ in iterator: dim = np.random.randint(2,20) n = np.random.randint(2,100) print('Dimension: {0}'.format(dim)) print('Number of points: {0}'.format(n)) D = pdist(np.random.randn(n,dim)) try: print('Real distance values:') test_all(D) D = np.round(D*n/4) print('Integer distance values:') test_all(D) except AssertionError as E: print(E) print(squareform(D)) return False return True if __name__ == "__main__": test(None) fastcluster/src/python/tests/nantest.py0000644000176200001440000000420313144412414020106 0ustar liggesusers#!/usr/bin/env python # -*- coding: utf-8 -*- '''Test whether the fastcluster package correctly recognizes NaN values and raises a FloatingPointError.''' print(''' Test program for the 'fastcluster' package. Copyright: * Until package version 1.1.23: (c) 2011 Daniel Müllner * All changes from version 1.1.24 on: (c) Google Inc. 
''') import numpy as np import fastcluster version = '1.1.24' if fastcluster.__version__ != version: raise ValueError('Wrong module version: {} instead of {}.'.format(fastcluster.__version__, version)) import atexit def print_seed(): print("Seed: {0}".format(seed)) atexit.register(print_seed) seed = np.random.randint(0,1e9) np.random.seed(seed) def test(): n = np.random.randint(2,100) # Part 1: distance matrix input N = n*(n-1)//2 D = np.random.rand(N) # Insert a single NaN value pos = np.random.randint(N) D[pos] = np.nan for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage(D, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Next: the original array does not contain a NaN, but a NaN occurs # as an updated distance. for method in ['average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage([np.inf,-np.inf,-np.inf], method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Part 2: vector input dim = np.random.randint(2,13) X = np.random.rand(n,dim) pos = (np.random.randint(n), np.random.randint(dim)) # Insert a single NaN coordinate X[pos] = np.nan for method in ['single', 'ward', 'centroid', 'median']: try: fastcluster.linkage_vector(X, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass return True if __name__ == "__main__": test() print('OK.') fastcluster/src/python/tests/vectortest.py0000644000176200001440000002142013144412374020641 0ustar liggesusers#!/usr/bin/env python # -*- coding: utf-8 -*- # TBD test single on integer matrices for hamming/jaccard print(''' Test program for the 'fastcluster' package. Copyright: * Until package version 1.1.23: (c) 2011 Daniel Müllner * All changes from version 1.1.24 on: (c) Google Inc. 
''') import sys import fastcluster as fc import numpy as np from scipy.spatial.distance import pdist, squareform import math version = '1.1.24' if fc.__version__ != version: raise ValueError('Wrong module version: {} instead of {}.'.format(fc.__version__, version)) import atexit def print_seed(): print("Seed: {0}".format(seed)) atexit.register(print_seed) seed = np.random.randint(0,1e9) print_seed() np.random.seed(seed) abstol = 1e-14 # absolute tolerance rtol = 1e-13 # relative tolerance # NaN values are used in computations. Do not warn about them. np.seterr(invalid='ignore') def correct_for_zero_vectors(D, pcd, metric): # Correct some metrics: we want the distance from the zero vector # to itself to be 0, not NaN. if metric in ('jaccard', 'dice', 'sokalsneath'): z = np.flatnonzero(np.all(pcd==0, axis=1)) if len(z): DD = squareform(D) DD[np.ix_(z, z)] = 0 D = squareform(DD) return D def test_all(n,dim): method = 'single' # metrics for boolean vectors pcd = np.random.randint(0, 2, size=(n,dim), dtype=np.bool) pcd2 = pcd.copy() for metric in ('hamming', 'jaccard', 'yule', 'matching', 'dice', 'rogerstanimoto', #'sokalmichener', # exclude, bug in Scipy # http://projects.scipy.org/scipy/ticket/1486 'russellrao', 'sokalsneath', #'kulsinski' # exclude, bug in Scipy # http://projects.scipy.org/scipy/ticket/1484 ): sys.stdout.write("Metric: " + metric + "...") D = pdist(pcd, metric) D = correct_for_zero_vectors(D, pcd, metric) try: Z2 = fc.linkage_vector(pcd, method, metric) except FloatingPointError: # If linkage_vector reported a NaN dissimilarity value, # check whether the distance matrix really contains NaN. 
if np.any(np.isnan(D)): print("Skip this test: NaN dissimilarity value.") continue else: raise AssertionError('"linkage_vector" erroneously reported NaN.') if np.any(pcd2!=pcd): raise AssertionError('Input array was corrupted.', pcd) check(Z2, method, D) # metrics for real vectors bound = math.sqrt(n) pcd = np.random.randint(-bound, bound + 1, (n,dim)) for metric in ['euclidean', 'sqeuclidean', 'cityblock', 'chebychev', 'minkowski', 'cosine', 'correlation', 'hamming', 'jaccard', 'canberra', # canberra: see bug in older Scipy versions # http://projects.scipy.org/scipy/ticket/1430 'braycurtis', 'seuclidean', 'mahalanobis', 'user']: sys.stdout.write("Metric: " + metric + "...") if metric=='minkowski': p = np.random.uniform(1.,10.) sys.stdout.write("p: " + str(p) + "...") D = pdist(pcd, metric, p) Z2 = fc.linkage_vector(pcd, method, metric, p) elif metric=='user': # Euclidean metric as a user function fn = (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum())) D = pdist(pcd, fn) Z2 = fc.linkage_vector(pcd, method, fn) else: D = pdist(pcd, metric) D = correct_for_zero_vectors(D, pcd, metric) try: Z2 = fc.linkage_vector(pcd, method, metric) except FloatingPointError: if np.any(np.isnan(D)): print("Skip this test: NaN dissimilarity value.") continue else: raise AssertionError( '"linkage_vector" erroneously reported NaN.') check(Z2, method, D) D = pdist(pcd) for method in ['ward', 'centroid', 'median']: Z2 = fc.linkage_vector(pcd, method) check(Z2, method, D) def check(Z2, method, D): sys.stdout.write("Method: " + method + "...") I = np.array(Z2[:,:2], dtype=int) Ds = squareform(D) n = len(Ds) row_repr = np.arange(2*n-1) row_repr[n:] = -1 size = np.ones(n, dtype=np.int) np.fill_diagonal(Ds, np.nan) mins = np.empty(n-1) for i in range(n-1): for j in range(n-1): mins[j] = np.nanmin(Ds[j,j+1:]) gmin = np.nanmin(mins) if abs(Z2[i,2]-gmin) > max(abs(Z2[i,2]),abs(gmin))*rtol and \ abs(Z2[i,2]-gmin)>abstol: raise AssertionError( 'Not the global minimum in step {2}: {0}, {1}'. 
format(Z2[i,2], gmin,i), squareform(D)) i1, i2 = row_repr[I[i,:]] if (i1<0): raise AssertionError('Negative index i1.', squareform(D)) if (i2<0): raise AssertionError('Negative index i2.', squareform(D)) if I[i,0]>=I[i,1]: raise AssertionError('Convention violated.', squareform(D)) if i1>i2: i1, i2 = i2, i1 if abs(Ds[i1,i2]-gmin) > max(abs(Ds[i1,i2]),abs(gmin))*rtol and \ abs(Ds[i1,i2]-gmin)>abstol: raise AssertionError( 'The global minimum is not at the right place in step {5}: ' '({0}, {1}): {2} != {3}. Difference: {4}' .format(i1, i2, Ds[i1, i2], gmin, Ds[i1, i2]-gmin, i), squareform(D)) s1 = size[i1] s2 = size[i2] S = float(s1+s2) if method=='single': if i1>0: # mostly unnecessary; workaround for a bug/feature in NumPy # 1.7.0.dev, see http://projects.scipy.org/numpy/ticket/2078 Ds[:i1,i2] = np.min( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.minimum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.min( Ds[(i1,i2),i2:],axis=0) elif method=='complete': if i1>0: Ds[:i1,i2] = np.max( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.maximum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.max( Ds[(i1,i2),i2:],axis=0) elif method=='average': Ds[:i1,i2] = ( Ds[:i1,i1]*s1 + Ds[:i1,i2]*s2 ) / S Ds[i1:i2,i2] = ( Ds[i1,i1:i2]*s1 + Ds[i1:i2,i2]*s2 ) / S Ds[i2,i2:] = ( Ds[i1,i2:]*s1 + Ds[i2,i2:]*s2 ) / S elif method=='weighted': if i1>0: Ds[:i1,i2] = np.mean( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = ( Ds[i1,i1:i2] + Ds[i1:i2,i2] )*.5 Ds[i2,i2:] = np.mean( Ds[(i1,i2),i2:],axis=0) elif method=='ward': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*(s1+size[:i1]) -gmin*gmin*size[:i1] +np.square(Ds[:i1,i2])*(s2+size[:i1]))/(S+size[:i1])) Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*(s1+size[i1:i2]) -gmin*gmin*size[i1:i2] +np.square(Ds[i1:i2,i2])*(s2+size[i1:i2])) /(S+size[i1:i2])) Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*(s1+size[i2:]) -gmin*gmin*size[i2:] +np.square(Ds[i2,i2:])*(s2+size[i2:]))/(S+size[i2:])) elif method=='centroid': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*s1 
+np.square(Ds[:i1,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*s1 +np.square(Ds[i1:i2,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*s1 +np.square(Ds[i2,i2:])*s2)*S-gmin*gmin*s1*s2) / S elif method=='median': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1]) +np.square(Ds[:i1,i2]))*2-gmin*gmin)*.5 Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2]) +np.square(Ds[i1:i2,i2]))*2-gmin*gmin)*.5 Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:]) +np.square(Ds[i2,i2:]))*2-gmin*gmin)*.5 else: raise ValueError('Unknown method.') Ds[i1, i1:n] = np.inf Ds[:i1, i1] = np.inf row_repr[n+i] = i2 size[i2] = S print('OK.') def test(repeats): if repeats: iterator = range(repeats) else: import itertools iterator = itertools.repeat(None) print(''' If everything is OK, the test program will run forever, without an error message. ''') for _ in iterator: dim = np.random.randint(2, 13) n = np.random.randint(max(2*dim,5),200) print('Dimension: {0}'.format(dim)) print('Number of points: {0}'.format(n)) try: test_all(n,dim) except AssertionError as E: print(E.args[0]) print(E.args[1]) return False return True if __name__ == "__main__": test(None) fastcluster/src/python/setup.py0000644000176200001440000001566413146374205016454 0ustar liggesusers#!/usr/bin/env python # -*- coding: utf-8 -*- import os import sys #import distutils.debug #distutils.debug.DEBUG = 'yes' from setuptools import setup, Extension if sys.hexversion < 0x03000000: # uniform unicode handling for both Python 2.x and 3.x def u(x): return x.decode('utf-8') def textfileopen(filename): return open(filename, mode='r') else: def u(x): return x def textfileopen(filename): return open(filename, mode='r', encoding='utf_8') u(''' fastcluster: Fast hierarchical clustering routines for R and Python Copyright: * Until package version 1.1.23: © 2011 Daniel Müllner * All changes from version 1.1.24 on: © Google Inc. 
''') with textfileopen('fastcluster.py') as f: for line in f: if line.find('__version_info__ =') == 0: version = '.'.join(line.split("'")[1:-1:2]) break print('Version: ' + version) def get_include_dirs(): """ Avoid importing numpy until here, so that users can run "setup.py install" without having numpy installed yet. """ def is_special_command(): special_list = ('--help-commands', 'egg_info', '--version', 'clean') return ('--help' in sys.argv[1:] or sys.argv[1] in special_list) if len(sys.argv) >= 2 and is_special_command(): return [] import numpy return [numpy.get_include()] setup(name='fastcluster', version=version, py_modules=['fastcluster'], description='Fast hierarchical clustering routines for R and Python.', long_description=u(""" This library provides Python functions for hierarchical clustering. It generates hierarchical clusters from distance matrices or from vector data. Part of this module is intended to replace the functions :: linkage, single, complete, average, weighted, centroid, median, ward in the module ``scipy.cluster.hierarchy`` with the same functionality but much faster algorithms. Moreover, the function ``linkage_vector`` provides memory-efficient clustering for vector data. The interface is very similar to MATLAB's Statistics Toolbox API to make code easier to port from MATLAB to Python/NumPy. The core implementation of this library is in C++ for efficiency. **User manual:** `fastcluster.pdf `_. Installation files for Windows are provided on `PyPI `_ and on `Christoph Gohlke's web page `_. **The fastcluster package is considered stable and will undergo few changes from now on. If some years from now there have not been any updates, this does not necessarily mean that the package is unmaintained but maybe it just was not necessary to correct anything. Of course, please still report potential bugs and incompatibilities to daniel@danifold.net. 
You may also use** `my GitHub repository `_ **for bug reports, pull requests etc.** Note that PyPI and my GitHub repository host the source code for the Python interface only. The archive with both the R and the Python interface is available on `CRAN `_ and the GitHub repository `“cran/fastcluster” `_. Even though I appear as the author also of this second GitHub repository, this is just an automatic, read-only mirror of the CRAN archive, so please do not attempt to report bugs or contact me via this repository. Reference: Daniel Müllner, *fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python*, Journal of Statistical Software, **53** (2013), no. 9, 1–18, http://www.jstatsoft.org/v53/i09/. """), requires=['numpy'], install_requires=["numpy>=1.9"], provides=['fastcluster'], ext_modules=[Extension('_fastcluster', ['fastcluster_python.cpp'], extra_compile_args=['/EHsc'] if os.name == 'nt' else [], include_dirs=get_include_dirs(), # Feel free to uncomment the line below if you use the GCC. # This switches to more aggressive optimization and turns # more warning switches on. No warning should appear in # the compilation process. # # Also, the author's Python distribution generates debug # symbols by default. This can be turned off, resulting a in # much smaller compiled library. 
# # Optimization #extra_compile_args=['-O2', '-g0', '-march=native', '-mtune=native', '-fno-math-errno'], # # List of all warning switches, somewhere from stackoverflow.com #extra_compile_args=['-Wall', '-Weffc++', '-Wextra', '-Wall', '-Wcast-align', '-Wchar-subscripts', '-Wcomment', '-Wconversion', '-Wsign-conversion', '-Wdisabled-optimization', '-Wfloat-equal', '-Wformat', '-Wformat=2', '-Wformat-nonliteral', '-Wformat-security', '-Wformat-y2k', '-Wimport', '-Winit-self', '-Winline', '-Winvalid-pch', '-Wunsafe-loop-optimizations', '-Wmissing-braces', '-Wmissing-field-initializers', '-Wmissing-format-attribute', '-Wmissing-include-dirs', '-Wmissing-noreturn', '-Wpacked', '-Wparentheses', '-Wpointer-arith', '-Wredundant-decls', '-Wreturn-type', '-Wsequence-point', '-Wshadow', '-Wsign-compare', '-Wstack-protector', '-Wstrict-aliasing', '-Wstrict-aliasing=2', '-Wswitch', '-Wswitch-enum', '-Wtrigraphs', '-Wuninitialized', '-Wunknown-pragmas', '-Wunreachable-code', '-Wunused', '-Wunused-function', '-Wunused-label', '-Wunused-parameter', '-Wunused-value', '-Wunused-variable', '-Wvariadic-macros', '-Wvolatile-register-var', '-Wwrite-strings', '-Wlong-long', '-Wpadded', '-Wcast-qual', '-Wswitch-default', '-Wnon-virtual-dtor', '-Wold-style-cast', '-Woverloaded-virtual', '-Waggregate-return', '-Werror'], # # Linker optimization #extra_link_args=['-Wl,--strip-all'], )], keywords=['dendrogram', 'linkage', 'cluster', 'agglomerative', 'hierarchical', 'hierarchy', 'ward'], author=u("Daniel Müllner"), author_email="daniel@danifold.net", license="BSD ", classifiers=[ "Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Scientific/Engineering :: Mathematics", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3", "Programming Language :: C++", "Operating System :: OS Independent", "License :: OSI 
Approved :: BSD License", "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", "Intended Audience :: Science/Research", "Development Status :: 5 - Production/Stable"], url='http://danifold.net', test_suite='tests.fastcluster_test', ) fastcluster/src/python/fastcluster_python.cpp0000644000176200001440000010667213146376104021406 0ustar liggesusers/* fastcluster: Fast hierarchical clustering routines for R and Python Copyright: * Until package version 1.1.23: © 2011 Daniel Müllner * All changes from version 1.1.24 on: © Google Inc. */ // for INT32_MAX in fastcluster.cpp // This must be defined here since Python.h loads the header file pyport.h, // and from this stdint.h. INT32_MAX is defined in stdint.h, but only if // __STDC_LIMIT_MACROS is defined. #define __STDC_LIMIT_MACROS #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wswitch-default" #pragma GCC diagnostic ignored "-Wpadded" #pragma GCC diagnostic ignored "-Wlong-long" #pragma GCC diagnostic ignored "-Wformat" #endif #include #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wlong-long" #pragma GCC diagnostic ignored "-Wpedantic" #pragma GCC diagnostic ignored "-Wpadded" #pragma GCC diagnostic ignored "-Wcast-qual" #endif #include #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif /* It's complicated, but if I do not include the C++ math headers, GCC will complain about conversions from 'double' to 'float', whenever 'isnan' is called in a templated function (but not outside templates). The '#include ' seems to cure the problem. */ //#include #define fc_isnan(X) ((X)!=(X)) // There is Py_IS_NAN but it is so much slower on my x86_64 system with GCC! 
#include // for std::abs, std::pow, std::sqrt #include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::stable_sort #include // for std::bad_alloc #include // for std::exception #include "../fastcluster.cpp" // backwards compatibility #ifndef NPY_ARRAY_CARRAY_RO #define NPY_ARRAY_CARRAY_RO NPY_CARRAY_RO #endif /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. * See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif /* Convenience class for the output array: automatic counter. */ class linkage_output { private: t_float * Z; public: linkage_output(t_float * const Z_) : Z(Z_) {} void append(const t_index node1, const t_index node2, const t_float dist, const t_float size) { if (node1(node1); *(Z++) = static_cast(node2); } else { *(Z++) = static_cast(node2); *(Z++) = static_cast(node1); } *(Z++) = dist; *(Z++) = size; } }; /* Generate the SciPy-specific output format for a dendrogram from the clustering output. The list of merging steps can be sorted or unsorted. */ // The size of a node is either 1 (a single point) or is looked up from // one of the clusters. #define size_(r_) ( ((r_ static void generate_SciPy_dendrogram(t_float * const Z, cluster_result & Z2, const t_index N) { // The array "nodes" is a union-find data structure for the cluster // identities (only needed for unsorted cluster_result input). union_find nodes(sorted ? 0 : N); if (!sorted) { std::stable_sort(Z2[0], Z2[N-1]); } linkage_output output(Z); t_index node1, node2; for (node const * NN=Z2[0]; NN!=Z2[N-1]; ++NN) { // Get two data points whose clusters are merged in step i. 
if (sorted) { node1 = NN->node1; node2 = NN->node2; } else { // Find the cluster identifiers for these points. node1 = nodes.Find(NN->node1); node2 = nodes.Find(NN->node2); // Merge the nodes in the union-find data structure by making them // children of a new node. nodes.Union(node1, node2); } output.append(node1, node2, NN->dist, size_(node1)+size_(node2)); } } /* Python interface code */ static PyObject * linkage_wrap(PyObject * const self, PyObject * const args); static PyObject * linkage_vector_wrap(PyObject * const self, PyObject * const args); // List the C++ methods that this extension provides. static PyMethodDef _fastclusterWrapMethods[] = { {"linkage_wrap", linkage_wrap, METH_VARARGS, NULL}, {"linkage_vector_wrap", linkage_vector_wrap, METH_VARARGS, NULL}, {NULL, NULL, 0, NULL} /* Sentinel - marks the end of this structure */ }; /* Tell Python about these methods. Python 2.x and 3.x differ in their C APIs for this part. */ #if PY_VERSION_HEX >= 0x03000000 static struct PyModuleDef fastclustermodule = { PyModuleDef_HEAD_INIT, "_fastcluster", NULL, // no module documentation -1, /* size of per-interpreter state of the module, or -1 if the module keeps state in global variables. */ _fastclusterWrapMethods, NULL, NULL, NULL, NULL }; /* Make the interface initalization routines visible in the shared object * file. */ #if HAVE_VISIBILITY #pragma GCC visibility push(default) #endif PyMODINIT_FUNC PyInit__fastcluster(void) { PyObject * m; m = PyModule_Create(&fastclustermodule); if (!m) { return NULL; } import_array(); // Must be present for NumPy. Called first after above line. return m; } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif # else // Python 2.x #if HAVE_VISIBILITY #pragma GCC visibility push(default) #endif PyMODINIT_FUNC init_fastcluster(void) { (void) Py_InitModule("_fastcluster", _fastclusterWrapMethods); import_array(); // Must be present for NumPy. Called first after above line. 
} #if HAVE_VISIBILITY #pragma GCC visibility pop #endif #endif // PY_VERSION class GIL_release { private: // noncopyable GIL_release(GIL_release const &); GIL_release & operator=(GIL_release const &); public: inline GIL_release(bool really = true) : _save(really ? PyEval_SaveThread() : NULL) { } inline ~GIL_release() { if (_save) PyEval_RestoreThread(_save); } private: PyThreadState * _save; }; /* Interface to Python, part 1: The input is a dissimilarity matrix. */ static PyObject *linkage_wrap(PyObject * const, PyObject * const args) { PyArrayObject * D, * Z; long int N_ = 0; unsigned char method; try{ #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif // Parse the input arguments if (!PyArg_ParseTuple(args, "lO!O!b", &N_, // signed long integer &PyArray_Type, &D, // NumPy array &PyArray_Type, &Z, // NumPy array &method)) { // unsigned char return NULL; // Error if the arguments have the wrong type. } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (N_ < 1 ) { // N must be at least 1. PyErr_SetString(PyExc_ValueError, "At least one element is needed for clustering."); return NULL; } /* (1) The biggest index used below is 4*(N-2)+3, as an index to Z. This must fit into the data type used for indices. (2) The largest representable integer, without loss of precision, by a floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we make sure that all cluster labels from 0 to 2N-2 in the output can be accurately represented by a floating point number. Conversion of N to 64 bits below is not really necessary but it prevents a warning ("shift count >= width of type") on systems where "long int" is 32 bits wide. */ if (N_ > MAX_INDEX/4 || static_cast(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) { PyErr_SetString(PyExc_ValueError, "Data is too big, index overflow."); return NULL; } t_index N = static_cast(N_); // Allow threads! 
GIL_release G; t_float * const D_ = reinterpret_cast(PyArray_DATA(D)); cluster_result Z2(N-1); auto_array_ptr members; // For these methods, the distance update formula needs the number of // data points in a cluster. if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.init(N, 1); } // Operate on squared distances for these methods. if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) { for (t_float * DD = D_; DD!=D_+static_cast(N)*(N-1)/2; ++DD) *DD *= *DD; } switch (method) { case METHOD_METR_SINGLE: MST_linkage_core(N, D_, Z2); break; case METHOD_METR_COMPLETE: NN_chain_core(N, D_, NULL, Z2); break; case METHOD_METR_AVERAGE: NN_chain_core(N, D_, members, Z2); break; case METHOD_METR_WEIGHTED: NN_chain_core(N, D_, NULL, Z2); break; case METHOD_METR_WARD: NN_chain_core(N, D_, members, Z2); break; case METHOD_METR_CENTROID: generic_linkage(N, D_, members, Z2); break; case METHOD_METR_MEDIAN: generic_linkage(N, D_, NULL, Z2); break; default: throw std::runtime_error(std::string("Invalid method index.")); } if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) { Z2.sqrt(); } t_float * const Z_ = reinterpret_cast(PyArray_DATA(Z)); if (method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) { generate_SciPy_dendrogram(Z_, Z2, N); } else { generate_SciPy_dendrogram(Z_, Z2, N); } } // try catch (const std::bad_alloc&) { return PyErr_NoMemory(); } catch(const std::exception& e){ PyErr_SetString(PyExc_EnvironmentError, e.what()); return NULL; } catch(const nan_error&){ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value."); return NULL; } #ifdef FE_INVALID catch(const fenv_error&){ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value in intermediate results."); return NULL; } #endif catch(...){ PyErr_SetString(PyExc_EnvironmentError, "C++ exception (unknown reason). 
Please send a bug report."); return NULL; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif Py_RETURN_NONE; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } /* Part 2: Clustering on vector data */ /* Metric codes. These codes must agree with the dictionary mtridx in fastcluster.py. */ enum metric_codes { // metrics METRIC_EUCLIDEAN = 0, METRIC_MINKOWSKI = 1, METRIC_CITYBLOCK = 2, METRIC_SEUCLIDEAN = 3, METRIC_SQEUCLIDEAN = 4, METRIC_COSINE = 5, METRIC_HAMMING = 6, METRIC_JACCARD = 7, METRIC_CHEBYCHEV = 8, METRIC_CANBERRA = 9, METRIC_BRAYCURTIS = 10, METRIC_MAHALANOBIS = 11, METRIC_YULE = 12, METRIC_MATCHING = 13, METRIC_DICE = 14, METRIC_ROGERSTANIMOTO = 15, METRIC_RUSSELLRAO = 16, METRIC_SOKALSNEATH = 17, METRIC_KULSINSKI = 18, METRIC_USER = 19, METRIC_INVALID = 20, // sentinel METRIC_JACCARD_BOOL = 21, // separate function for Jaccard metric on }; // Boolean input data /* Helper class: Throw this if calling the Python interpreter from within C returned an error. */ class pythonerror {}; /* This class handles all the information about the dissimilarity computation. */ class python_dissimilarity { private: t_float * Xa; std::ptrdiff_t dim; // size_t saves many statis_cast<> in products t_index N; auto_array_ptr Xnew; t_index * members; void (cluster_result::*postprocessfn) (const t_float) const; t_float postprocessarg; t_float (python_dissimilarity::*distfn) (const t_index, const t_index) const; // for user-defined metrics PyObject * X_Python; PyObject * userfn; auto_array_ptr precomputed; t_float * precomputed2; PyArrayObject * V; const t_float * V_data; // noncopyable python_dissimilarity(); python_dissimilarity(python_dissimilarity const &); python_dissimilarity & operator=(python_dissimilarity const &); public: // Ignore warning about uninitialized member variables. I know what I am // doing here, and some member variables are only used for certain metrics. 
#if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Weffc++" #endif python_dissimilarity (PyArrayObject * const Xarg, t_index * const members_, const method_codes method, const metric_codes metric, PyObject * const extraarg, bool temp_point_array) : Xa(reinterpret_cast(PyArray_DATA(Xarg))), dim(PyArray_DIM(Xarg, 1)), N(static_cast(PyArray_DIM(Xarg, 0))), Xnew(temp_point_array ? (N-1)*dim : 0), members(members_), postprocessfn(NULL), V(NULL) { switch (method) { case METHOD_METR_SINGLE: postprocessfn = NULL; // default switch (metric) { case METRIC_EUCLIDEAN: set_euclidean(); break; case METRIC_SEUCLIDEAN: if (extraarg==NULL) { PyErr_SetString(PyExc_TypeError, "The 'seuclidean' metric needs a variance parameter."); throw pythonerror(); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif V = reinterpret_cast(PyArray_FromAny(extraarg, PyArray_DescrFromType(NPY_DOUBLE), 1, 1, NPY_ARRAY_CARRAY_RO, NULL)); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (PyErr_Occurred()) { throw pythonerror(); } if (PyArray_DIM(V, 0)!=dim) { PyErr_SetString(PyExc_ValueError, "The variance vector must have the same dimensionality as the data."); throw pythonerror(); } V_data = reinterpret_cast(PyArray_DATA(V)); distfn = &python_dissimilarity::seuclidean; postprocessfn = &cluster_result::sqrt; break; case METRIC_SQEUCLIDEAN: distfn = &python_dissimilarity::sqeuclidean; break; case METRIC_CITYBLOCK: set_cityblock(); break; case METRIC_CHEBYCHEV: set_chebychev(); break; case METRIC_MINKOWSKI: set_minkowski(extraarg); break; case METRIC_COSINE: distfn = &python_dissimilarity::cosine; postprocessfn = &cluster_result::plusone; // precompute norms precomputed.init(N); for (t_index i=0; i(dim); break; case METRIC_JACCARD: distfn = &python_dissimilarity::jaccard; break; case METRIC_CANBERRA: distfn = &python_dissimilarity::canberra; break; case METRIC_BRAYCURTIS: distfn = &python_dissimilarity::braycurtis; 
break; case METRIC_MAHALANOBIS: if (extraarg==NULL) { PyErr_SetString(PyExc_TypeError, "The 'mahalanobis' metric needs a parameter for the inverse covariance."); throw pythonerror(); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif V = reinterpret_cast(PyArray_FromAny(extraarg, PyArray_DescrFromType(NPY_DOUBLE), 2, 2, NPY_ARRAY_CARRAY_RO, NULL)); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (PyErr_Occurred()) { throw pythonerror(); } if (PyArray_DIM(V, 0)!=N || PyArray_DIM(V, 1)!=dim) { PyErr_SetString(PyExc_ValueError, "The inverse covariance matrix has the wrong size."); throw pythonerror(); } V_data = reinterpret_cast(PyArray_DATA(V)); distfn = &python_dissimilarity::mahalanobis; postprocessfn = &cluster_result::sqrt; break; case METRIC_YULE: distfn = &python_dissimilarity::yule; break; case METRIC_MATCHING: distfn = &python_dissimilarity::matching; postprocessfn = &cluster_result::divide; postprocessarg = static_cast(dim); break; case METRIC_DICE: distfn = &python_dissimilarity::dice; break; case METRIC_ROGERSTANIMOTO: distfn = &python_dissimilarity::rogerstanimoto; break; case METRIC_RUSSELLRAO: distfn = &python_dissimilarity::russellrao; postprocessfn = &cluster_result::divide; postprocessarg = static_cast(dim); break; case METRIC_SOKALSNEATH: distfn = &python_dissimilarity::sokalsneath; break; case METRIC_KULSINSKI: distfn = &python_dissimilarity::kulsinski; postprocessfn = &cluster_result::plusone; precomputed.init(N); for (t_index i=0; i(sum); } break; case METRIC_USER: X_Python = reinterpret_cast(Xarg); this->userfn = extraarg; distfn = &python_dissimilarity::user; break; default: // case METRIC_JACCARD_BOOL: distfn = &python_dissimilarity::jaccard_bool; } break; case METHOD_METR_WARD: postprocessfn = &cluster_result::sqrtdouble; break; default: postprocessfn = &cluster_result::sqrt; } } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif ~python_dissimilarity() { #if HAVE_DIAGNOSTIC 
#pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif Py_XDECREF(V); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } inline t_float operator () (const t_index i, const t_index j) const { return (this->*distfn)(i,j); } inline t_float X (const t_index i, const t_index j) const { return Xa[i*dim+j]; } inline bool Xb (const t_index i, const t_index j) const { return reinterpret_cast(Xa)[i*dim+j]; } inline t_float * Xptr(const t_index i, const t_index j) const { return Xa+i*dim+j; } void merge(const t_index i, const t_index j, const t_index newnode) const { t_float const * const Pi = i(members[i]) + Pj[k]*static_cast(members[j])) / static_cast(members[i]+members[j]); } members[newnode] = members[i]+members[j]; } void merge_weighted(const t_index i, const t_index j, const t_index newnode) const { t_float const * const Pi = i(members[i]) + Pj[k]*static_cast(members[j])) / static_cast(members[i]+members[j]); } members[j] += members[i]; } void merge_inplace_weighted(const t_index i, const t_index j) const { t_float const * const Pi = Xa+i*dim; t_float * const Pj = Xa+j*dim; for(t_index k=0; k(members[i]); t_float mj = static_cast(members[j]); return sqeuclidean(i,j)*mi*mj/(mi+mj); } inline t_float ward_initial(const t_index i, const t_index j) const { // alias for sqeuclidean // Factor 2!!! return sqeuclidean(i,j); } // This method must not produce NaN if the input is non-NaN. inline static t_float ward_initial_conversion(const t_float min) { return min*.5; } inline t_float ward_extended(const t_index i, const t_index j) const { t_float mi = static_cast(members[i]); t_float mj = static_cast(members[j]); return sqeuclidean_extended(i,j)*mi*mj/(mi+mj); } /* We need two variants of the Euclidean metric: one that does not check for a NaN result, which is used for the initial distances, and one which does, for the updated distances during the clustering procedure. 
*/ template t_float sqeuclidean(const t_index i, const t_index j) const { t_float sum = 0; /* for (t_index k=0; k::infinity()) { set_chebychev(); } else if (postprocessarg==1.0){ set_cityblock(); } else if (postprocessarg==2.0){ set_euclidean(); } else { distfn = &python_dissimilarity::minkowski; postprocessfn = &cluster_result::power; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } void set_euclidean() { distfn = &python_dissimilarity::sqeuclidean; postprocessfn = &cluster_result::sqrt; } void set_cityblock() { distfn = &python_dissimilarity::cityblock; } void set_chebychev() { distfn = &python_dissimilarity::chebychev; } t_float seuclidean(const t_index i, const t_index j) const { t_float sum = 0; for (t_index k=0; kmax) { max = diff; } } return max; } t_float cosine(const t_index i, const t_index j) const { t_float sum = 0; for (t_index k=0; k(sum1) / static_cast(sum2); } t_float canberra(const t_index i, const t_index j) const { t_float sum = 0; for (t_index k=0; k(dim)-NTT-NXO); // NFFTT } void nbool_correspond_xo(const t_index i, const t_index j) const { NXO = 0; for (t_index k=0; k(2*NTFFT) / static_cast(NTFFT + NFFTT); } // Prevent a zero denominator for equal vectors. t_float dice(const t_index i, const t_index j) const { nbool_correspond(i, j); return (NXO==0) ? 0 : static_cast(NXO) / static_cast(NXO+2*NTT); } t_float rogerstanimoto(const t_index i, const t_index j) const { nbool_correspond_xo(i, j); return static_cast(2*NXO) / static_cast(NXO+dim); } t_float russellrao(const t_index i, const t_index j) const { nbool_correspond_tt(i, j); return static_cast(dim-NTT); } // Prevent a zero denominator for equal vectors. t_float sokalsneath(const t_index i, const t_index j) const { nbool_correspond(i, j); return (NXO==0) ? 
0 : static_cast(2*NXO) / static_cast(NTT+2*NXO); } t_float kulsinski(const t_index i, const t_index j) const { nbool_correspond_tt(i, j); return static_cast(NTT) * (precomputed[i] + precomputed[j]); } // 'matching' distance = Hamming distance t_float matching(const t_index i, const t_index j) const { nbool_correspond_xo(i, j); return static_cast(NXO); } // Prevent a zero denominator for equal vectors. t_float jaccard_bool(const t_index i, const t_index j) const { nbool_correspond(i, j); return (NXO==0) ? 0 : static_cast(NXO) / static_cast(NXO+NTT); } }; static PyObject *linkage_vector_wrap(PyObject * const, PyObject * const args) { PyArrayObject * X, * Z; unsigned char method, metric; PyObject * extraarg; try{ // Parse the input arguments #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif if (!PyArg_ParseTuple(args, "O!O!bbO", &PyArray_Type, &X, // NumPy array &PyArray_Type, &Z, // NumPy array &method, // unsigned char &metric, // unsigned char &extraarg )) { // Python object return NULL; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (PyArray_NDIM(X) != 2) { PyErr_SetString(PyExc_ValueError, "The input array must be two-dimensional."); } npy_intp const N_ = PyArray_DIM(X, 0); if (N_ < 1 ) { // N must be at least 1. PyErr_SetString(PyExc_ValueError, "At least one element is needed for clustering."); return NULL; } npy_intp const dim = PyArray_DIM(X, 1); if (dim < 1 ) { PyErr_SetString(PyExc_ValueError, "Invalid dimension of the data set."); return NULL; } /* (1) The biggest index used below is 4*(N-2)+3, as an index to Z. This must fit into the data type used for indices. (2) The largest representable integer, without loss of precision, by a floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we make sure that all cluster labels from 0 to 2N-2 in the output can be accurately represented by a floating point number. 
Conversion of N to 64 bits below is not really necessary but it prevents a warning ("shift count >= width of type") on systems where "int" is 32 bits wide. */ if (N_ > MAX_INDEX/4 || dim > MAX_INDEX || static_cast(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) { PyErr_SetString(PyExc_ValueError, "Data is too big, index overflow."); return NULL; } t_index N = static_cast(N_); cluster_result Z2(N-1); auto_array_ptr members; if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.init(2*N-1, 1); } if ((method!=METHOD_METR_SINGLE && metric!=METRIC_EUCLIDEAN) || metric>=METRIC_INVALID) { PyErr_SetString(PyExc_IndexError, "Invalid metric index."); return NULL; } if (PyArray_ISBOOL(X)) { if (metric==METRIC_HAMMING) { metric = METRIC_MATCHING; // Alias } if (metric==METRIC_JACCARD) { metric = METRIC_JACCARD_BOOL; } } if (extraarg!=Py_None && metric!=METRIC_MINKOWSKI && metric!=METRIC_SEUCLIDEAN && metric!=METRIC_MAHALANOBIS && metric!=METRIC_USER) { PyErr_SetString(PyExc_TypeError, "No extra parameter is allowed for this metric."); return NULL; } /* temp_point_array must be true if the alternative algorithm is used below (currently for the centroid and median methods). */ bool temp_point_array = (method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN); python_dissimilarity dist(X, members, static_cast(method), static_cast(metric), extraarg, temp_point_array); if (method!=METHOD_METR_SINGLE && method!=METHOD_METR_WARD && method!=METHOD_METR_CENTROID && method!=METHOD_METR_MEDIAN) { PyErr_SetString(PyExc_IndexError, "Invalid method index."); return NULL; } // Allow threads if the metric is not "user"! 
GIL_release G(metric!=METRIC_USER); switch (method) { case METHOD_METR_SINGLE: MST_linkage_core_vector(N, dist, Z2); break; case METHOD_METR_WARD: generic_linkage_vector(N, dist, Z2); break; case METHOD_METR_CENTROID: generic_linkage_vector_alternative(N, dist, Z2); break; default: // case METHOD_METR_MEDIAN: generic_linkage_vector_alternative(N, dist, Z2); } if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.free(); } dist.postprocess(Z2); t_float * const Z_ = reinterpret_cast(PyArray_DATA(Z)); if (method!=METHOD_METR_SINGLE) { generate_SciPy_dendrogram(Z_, Z2, N); } else { generate_SciPy_dendrogram(Z_, Z2, N); } } // try catch (const std::bad_alloc&) { return PyErr_NoMemory(); } catch(const std::exception& e){ PyErr_SetString(PyExc_EnvironmentError, e.what()); return NULL; } catch(const nan_error&){ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value."); return NULL; } catch(const pythonerror){ return NULL; } catch(...){ PyErr_SetString(PyExc_EnvironmentError, "C++ exception (unknown reason). Please send a bug report."); return NULL; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif Py_RETURN_NONE; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif fastcluster/src/fastcluster_R.cpp0000644000176200001440000006612413146376104016742 0ustar liggesusers/* fastcluster: Fast hierarchical clustering routines for R and Python Copyright: * Until package version 1.1.23: © 2011 Daniel Müllner * All changes from version 1.1.24 on: © Google Inc. 
*/ #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wredundant-decls" #pragma GCC diagnostic ignored "-Wpadded" #endif #include #include #include // for R_pow #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #define fc_isnan(X) ((X)!=(X)) // There is ISNAN but it is so much slower on my x86_64 system with GCC! #include // for std::abs #include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::stable_sort #include // for std::runtime_error #include // for std::string #include // for std::bad_alloc #include // for std::exception #include "fastcluster.cpp" /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. * See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif /* Helper function: order the nodes so that they can be displayed nicely in a dendrogram. This is used for the 'order' field in the R output. */ #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpadded" #endif struct pos_node { t_index pos; int node; }; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif void order_nodes(const int N, const int * const merge, const t_index * const node_size, int * const order) { /* Parameters: N : number of data points merge : (N-1)×2 array which specifies the node indices which are merged in each step of the clustering procedure. Negative entries -1...-N point to singleton nodes, while positive entries 1...(N-1) point to nodes which are themselves parents of other nodes. 
node_size : array of node sizes - makes it easier order : output array of size N Runtime: Θ(N) */ auto_array_ptr queue(N/2); int parent; int child; t_index pos = 0; queue[0].pos = 0; queue[0].node = N-2; t_index idx = 1; do { --idx; pos = queue[idx].pos; parent = queue[idx].node; // First child child = merge[parent]; if (child<0) { // singleton node, write this into the 'order' array. order[pos] = -child; ++pos; } else { /* compound node: put it on top of the queue and decompose it in a later iteration. */ queue[idx].pos = pos; queue[idx].node = child-1; // convert index-1 based to index-0 based ++idx; pos += node_size[child-1]; } // Second child child = merge[parent+N-1]; if (child<0) { order[pos] = -child; } else { queue[idx].pos = pos; queue[idx].node = child-1; ++idx; } } while (idx>0); } #define size_(r_) ( ((r_ void generate_R_dendrogram(int * const merge, double * const height, int * const order, cluster_result & Z2, const int N) { // The array "nodes" is a union-find data structure for the cluster // identites (only needed for unsorted cluster_result input). union_find nodes(sorted ? 0 : N); if (!sorted) { std::stable_sort(Z2[0], Z2[N-1]); } t_index node1, node2; auto_array_ptr node_size(N-1); for (t_index i=0; inode1; node2 = Z2[i]->node2; } else { node1 = nodes.Find(Z2[i]->node1); node2 = nodes.Find(Z2[i]->node2); // Merge the nodes in the union-find data structure by making them // children of a new node. nodes.Union(node1, node2); } // Sort the nodes in the output array. if (node1>node2) { t_index tmp = node1; node1 = node2; node2 = tmp; } /* Conversion between labeling conventions. 
Input: singleton nodes 0,...,N-1 compound nodes N,...,2N-2 Output: singleton nodes -1,...,-N compound nodes 1,...,N */ merge[i] = (node1(node1)-1 : static_cast(node1)-N+1; merge[i+N-1] = (node2(node2)-1 : static_cast(node2)-N+1; height[i] = Z2[i]->dist; node_size[i] = size_(node1) + size_(node2); } order_nodes(N, merge, node_size, order); } /* R interface code */ enum { METRIC_R_EUCLIDEAN = 0, METRIC_R_MAXIMUM = 1, METRIC_R_MANHATTAN = 2, METRIC_R_CANBERRA = 3, METRIC_R_BINARY = 4, METRIC_R_MINKOWSKI = 5, METRIC_R_CANBERRA_OLD = 6 }; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpadded" #endif class R_dissimilarity { private: t_float * Xa; std::ptrdiff_t dim; // std::ptrdiff_t saves many statis_cast<> in products t_float * members; void (cluster_result::*postprocessfn) (const t_float) const; t_float postprocessarg; t_float (R_dissimilarity::*distfn) (const t_index, const t_index) const; auto_array_ptr row_repr; int N; // no default constructor R_dissimilarity(); // noncopyable R_dissimilarity(R_dissimilarity const &); R_dissimilarity & operator=(R_dissimilarity const &); public: // Ignore warning about uninitialized member variables. I know what I am // doing here, and some member variables are only used for certain metrics. 
#if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Weffc++" #endif R_dissimilarity (t_float * const X_, const int N_, const int dim_, t_float * const members_, const unsigned char method, const unsigned char metric, const t_float p, bool make_row_repr) : Xa(X_), dim(dim_), members(members_), postprocessfn(NULL), postprocessarg(p), N(N_) { switch (method) { case METHOD_VECTOR_SINGLE: switch (metric) { case METRIC_R_EUCLIDEAN: distfn = &R_dissimilarity::sqeuclidean; postprocessfn = &cluster_result::sqrt; break; case METRIC_R_MAXIMUM: distfn = &R_dissimilarity::maximum; break; case METRIC_R_MANHATTAN: distfn = &R_dissimilarity::manhattan; break; case METRIC_R_CANBERRA: distfn = &R_dissimilarity::canberra; break; case METRIC_R_BINARY: distfn = &R_dissimilarity::dist_binary; break; case METRIC_R_MINKOWSKI: distfn = &R_dissimilarity::minkowski; postprocessfn = &cluster_result::power; break; case METRIC_R_CANBERRA_OLD: distfn = &R_dissimilarity::canberra_old; break; default: throw std::runtime_error(std::string("Invalid method.")); } break; case METHOD_VECTOR_WARD: postprocessfn = &cluster_result::sqrtdouble; break; default: postprocessfn = &cluster_result::sqrt; } if (make_row_repr) { row_repr.init(2*N-1); for (t_index i=0; i*distfn)(i,j); } inline t_float X (const t_index i, const t_index j) const { // "C-style" array alignment return Xa[i*dim+j]; } inline t_float * Xptr(const t_index i, const t_index j) const { // "C-style" array alignment return Xa+i*dim+j; } void merge(const t_index i, const t_index j, const t_index newnode) const { merge_inplace(row_repr[i], row_repr[j]); row_repr[newnode] = row_repr[j]; } void merge_inplace(const t_index i, const t_index j) const { for(t_index k=0; k(i1,i2)*members[i1]*members[i2]/ \ (members[i1]+members[i2]); } inline double ward_initial(t_index const i1, t_index const i2) const { /* In the R interface, ward_initial is the same as ward. Only the Python interface has two different functions here. 
*/ return ward(i1,i2); } // This method must not produce NaN if the input is non-NaN. inline static t_float ward_initial_conversion(const t_float min) { // identity return min; } double ward_extended(t_index i1, t_index i2) const { return ward(row_repr[i1], row_repr[i2]); } /* The following definitions and methods have been taken directly from the R source file /src/library/stats/src/distance.c in the R release 2.13.0. The code has only been adapted very slightly. (Unfortunately, the methods cannot be called directly in the R libraries since the functions are declared "static" in the above file.) Note to maintainers: If the code in distance.c changes in future R releases compared to 2.13.0, please update the definitions here, if necessary. */ // translation of variable names #define nc dim #define nr N #define x Xa #define p postprocessarg // The code from distance.c starts here #define both_FINITE(a,b) (R_FINITE(a) && R_FINITE(b)) #ifdef R_160_and_older #define both_non_NA both_FINITE #else #define both_non_NA(a,b) (!ISNAN(a) && !ISNAN(b)) #endif /* We need two variants of the Euclidean metric: one that does not check for a NaN result, which is used for the initial distances, and one which does, for the updated distances during the clustering procedure. 
*/ // still public template double sqeuclidean(t_index const i1, t_index const i2) const { double dev, dist; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = (*p1 - *p2); if(!ISNAN(dev)) { dist += dev * dev; ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); //return sqrt(dist); // we take the square root later if (check_NaN) { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(dist)) throw(nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } return dist; } inline double sqeuclidean_extended(t_index const i1, t_index const i2) const { return sqeuclidean(row_repr[i1], row_repr[i2]); } private: double maximum(t_index i1, t_index i2) const { double dev, dist; int count, j; count = 0; dist = -DBL_MAX; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = std::abs(*p1 - *p2); if(!ISNAN(dev)) { if(dev > dist) dist = dev; ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; return dist; } double manhattan(t_index i1, t_index i2) const { double dev, dist; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = std::abs(*p1 - *p2); if(!ISNAN(dev)) { dist += dev; ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); return dist; } double canberra(t_index i1, t_index i2) const { double dev, dist, sum, diff; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { sum = std::abs(*p1) + std::abs(*p2); diff = std::abs(*p1 - *p2); if (sum > DBL_MIN || diff > DBL_MIN) { dev = diff/sum; if(!ISNAN(dev) || (!R_FINITE(diff) && diff == sum && /* use Inf = lim 
x -> oo */ (dev = 1., true))) { dist += dev; ++count; } } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); return dist; } double canberra_old(t_index i1, t_index i2) const { double dev, dist, sum, diff; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { sum = std::abs(*p1 + *p2); diff = std::abs(*p1 - *p2); if (sum > DBL_MIN || diff > DBL_MIN) { dev = diff/sum; if(!ISNAN(dev) || (!R_FINITE(diff) && diff == sum && /* use Inf = lim x -> oo */ (dev = 1., true))) { dist += dev; ++count; } } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); return dist; } double dist_binary(t_index i1, t_index i2) const { int total, count, dist; int j; total = 0; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { if(!both_FINITE(*p1, *p2)) { // warning(_("treating non-finite values as NA")); } else { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if(*p1 || *p2) { ++count; if( ! 
(*p1 && *p2) ) { ++dist; } } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif ++total; } } ++p1; ++p2; } if(total == 0) return NA_REAL; if(count == 0) return 0; return static_cast(dist) / static_cast(count); } double minkowski(t_index i1, t_index i2) const { double dev, dist; int count, j; count= 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = (*p1 - *p2); if(!ISNAN(dev)) { dist += R_pow(std::abs(dev), p); ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); //return R_pow(dist, 1.0/p); // raise to the (1/p)-th power later return dist; } }; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif extern "C" { SEXP fastcluster(SEXP const N_, SEXP const method_, SEXP D_, SEXP members_) { SEXP r = NULL; // return value try{ /* Input checks */ // Parameter N: number of data points if (!IS_INTEGER(N_) || LENGTH(N_)!=1) Rf_error("'N' must be a single integer."); const int N = INTEGER_VALUE(N_); if (N<2) Rf_error("N must be at least 2."); const R_xlen_t NN = static_cast(N)*(N-1)/2; // Parameter method: dissimilarity index update method if (!IS_INTEGER(method_) || LENGTH(method_)!=1) Rf_error("'method' must be a single integer."); const int method = INTEGER_VALUE(method_) - 1; // index-0 based; if (methodMAX_METHOD_CODE) { Rf_error("Invalid method index."); } // Parameter members: number of members in each node auto_array_ptr members; if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD_D || method==METHOD_METR_WARD_D2 || method==METHOD_METR_CENTROID) { members.init(N); if (Rf_isNull(members_)) { for (t_index i=0; i D__; if (method!=METHOD_METR_SINGLE) { D__.init(NN); for (R_xlen_t i=0; i(N)*(N-1)/2; ++DD) *DD *= *DD; } /* Clustering step */ cluster_result Z2(N-1); switch (method) { case METHOD_METR_SINGLE: MST_linkage_core(N, D, Z2); break; case METHOD_METR_COMPLETE: NN_chain_core(N, D__, NULL, Z2); break; case 
METHOD_METR_AVERAGE: NN_chain_core(N, D__, members, Z2); break; case METHOD_METR_WEIGHTED: NN_chain_core(N, D__, NULL, Z2); break; case METHOD_METR_WARD_D: case METHOD_METR_WARD_D2: NN_chain_core(N, D__, members, Z2); break; case METHOD_METR_CENTROID: generic_linkage(N, D__, members, Z2); break; case METHOD_METR_MEDIAN: generic_linkage(N, D__, NULL, Z2); break; default: throw std::runtime_error(std::string("Invalid method.")); } D__.free(); // Free the memory now members.free(); // (not strictly necessary). SEXP m; // return field "merge" PROTECT(m = NEW_INTEGER(2*(N-1))); int * const merge = INTEGER_POINTER(m); SEXP dim_m; // Specify that m is an (N-1)×2 matrix PROTECT(dim_m = NEW_INTEGER(2)); INTEGER(dim_m)[0] = N-1; INTEGER(dim_m)[1] = 2; SET_DIM(m, dim_m); SEXP h; // return field "height" PROTECT(h = NEW_NUMERIC(N-1)); double * const height = NUMERIC_POINTER(h); SEXP o; // return fiels "order' PROTECT(o = NEW_INTEGER(N)); int * const order = INTEGER_POINTER(o); if (method==METHOD_METR_WARD_D2) { Z2.sqrt(); } if (method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) generate_R_dendrogram(merge, height, order, Z2, N); else generate_R_dendrogram(merge, height, order, Z2, N); SEXP n; // names PROTECT(n = NEW_CHARACTER(3)); SET_STRING_ELT(n, 0, COPY_TO_USER_STRING("merge")); SET_STRING_ELT(n, 1, COPY_TO_USER_STRING("height")); SET_STRING_ELT(n, 2, COPY_TO_USER_STRING("order")); PROTECT(r = NEW_LIST(3)); // field names in the output list SET_ELEMENT(r, 0, m); SET_ELEMENT(r, 1, h); SET_ELEMENT(r, 2, o); SET_NAMES(r, n); UNPROTECT(6); // m, dim_m, h, o, r, n } // try catch (const std::bad_alloc&) { Rf_error( "Memory overflow."); } catch(const std::exception& e){ Rf_error( e.what() ); } catch(const nan_error&){ Rf_error("NaN dissimilarity value."); } #ifdef FE_INVALID catch(const fenv_error&){ Rf_error( "NaN dissimilarity value in intermediate results."); } #endif catch(...){ Rf_error( "C++ exception (unknown reason)." 
); } return r; } SEXP fastcluster_vector(SEXP const method_, SEXP const metric_, SEXP X_, SEXP members_, SEXP p_) { SEXP r = NULL; // return value try{ /* Input checks */ // Parameter method: dissimilarity index update method if (!IS_INTEGER(method_) || LENGTH(method_)!=1) Rf_error("'method' must be a single integer."); int method = INTEGER_VALUE(method_) - 1; // index-0 based; if (methodMAX_METHOD_VECTOR_CODE) { Rf_error("Invalid method index."); } // Parameter metric if (!IS_INTEGER(metric_) || LENGTH(metric_)!=1) Rf_error("'metric' must be a single integer."); int metric = INTEGER_VALUE(metric_) - 1; // index-0 based; if (metric<0 || metric>6 || (method!=METHOD_VECTOR_SINGLE && metric!=0) ) { Rf_error("Invalid metric index."); } // data array PROTECT(X_ = AS_NUMERIC(X_)); SEXP dims_ = PROTECT( Rf_getAttrib( X_, R_DimSymbol ) ) ; if( dims_ == R_NilValue || LENGTH(dims_) != 2 ) { Rf_error( "Argument is not a matrix."); } const int * const dims = INTEGER(dims_); const int N = dims[0]; const int dim = dims[1]; if (N<2) Rf_error("There must be at least two data points."); // Make a working copy of the dissimilarity array // for all methods except "single". double * X__ = NUMERIC_POINTER(X_); // Copy the input array and change it from Fortran-contiguous style // to C-contiguous style. 
auto_array_ptr X(LENGTH(X_)); for (std::ptrdiff_t i=0; i members; if (method==METHOD_VECTOR_WARD || method==METHOD_VECTOR_CENTROID) { members.init(N); if (Rf_isNull(members_)) { for (t_index i=0; i(method), static_cast(metric), p, make_row_repr); cluster_result Z2(N-1); /* Clustering step */ switch (method) { case METHOD_VECTOR_SINGLE: MST_linkage_core_vector(N, dist, Z2); break; case METHOD_VECTOR_WARD: generic_linkage_vector(N, dist, Z2); break; case METHOD_VECTOR_CENTROID: generic_linkage_vector_alternative(N, dist, Z2); break; case METHOD_VECTOR_MEDIAN: generic_linkage_vector_alternative(N, dist, Z2); break; default: throw std::runtime_error(std::string("Invalid method.")); } X.free(); // Free the memory now members.free(); // (not strictly necessary). dist.postprocess(Z2); SEXP m; // return field "merge" PROTECT(m = NEW_INTEGER(2*(N-1))); int * const merge = INTEGER_POINTER(m); SEXP dim_m; // Specify that m is an (N-1)×2 matrix PROTECT(dim_m = NEW_INTEGER(2)); INTEGER(dim_m)[0] = N-1; INTEGER(dim_m)[1] = 2; SET_DIM(m, dim_m); SEXP h; // return field "height" PROTECT(h = NEW_NUMERIC(N-1)); double * const height = NUMERIC_POINTER(h); SEXP o; // return fiels "order' PROTECT(o = NEW_INTEGER(N)); int * const order = INTEGER_POINTER(o); if (method==METHOD_VECTOR_SINGLE) generate_R_dendrogram(merge, height, order, Z2, N); else generate_R_dendrogram(merge, height, order, Z2, N); SEXP n; // names PROTECT(n = NEW_CHARACTER(3)); SET_STRING_ELT(n, 0, COPY_TO_USER_STRING("merge")); SET_STRING_ELT(n, 1, COPY_TO_USER_STRING("height")); SET_STRING_ELT(n, 2, COPY_TO_USER_STRING("order")); PROTECT(r = NEW_LIST(3)); // field names in the output list SET_ELEMENT(r, 0, m); SET_ELEMENT(r, 1, h); SET_ELEMENT(r, 2, o); SET_NAMES(r, n); UNPROTECT(6); // m, dim_m, h, o, r, n } // try catch (const std::bad_alloc&) { Rf_error( "Memory overflow."); } catch(const std::exception& e){ Rf_error( e.what() ); } catch(const nan_error&){ Rf_error("NaN dissimilarity value."); } catch(...){ 
Rf_error( "C++ exception (unknown reason)." ); } return r; } #if HAVE_VISIBILITY #pragma GCC visibility push(default) #endif void R_init_fastcluster(DllInfo * const dll) { R_CallMethodDef callMethods[] = { {"fastcluster", (DL_FUNC) &fastcluster, 4}, {"fastcluster_vector", (DL_FUNC) &fastcluster_vector, 5}, {NULL, NULL, 0} }; R_registerRoutines(dll, NULL, callMethods, NULL, NULL); R_useDynamicSymbols(dll, FALSE); R_forceSymbols(dll, TRUE); } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif } // extern "C" #if HAVE_VISIBILITY #pragma GCC visibility pop #endif fastcluster/src/Makevars.win0000644000176200001440000000003213146376104015670 0ustar liggesusersOBJECTS = fastcluster_R.o fastcluster/NAMESPACE0000644000176200001440000000011711727523223014033 0ustar liggesusersuseDynLib(fastcluster, .registration=TRUE) export('hclust', 'hclust.vector') fastcluster/INSTALL0000644000176200001440000000777113144412206013653 0ustar liggesusersfastcluster: Fast hierarchical clustering routines for R and Python Copyright: * Until package version 1.1.23: © 2011 Daniel Müllner * All changes from version 1.1.24 on: © Google Inc. Installation ‾‾‾‾‾‾‾‾‾‾‾‾ Installation procedures were tested under 64-bit Ubuntu. CRAN also hosts precompiled binaries (of the R library, not the Python module) for Windows and OS X. In principle, it should be possible to install the fastcluster package on any system that has a C++ compiler and R respectively Python with NumPy. There are no unusual libraries needed to compile the package, only the STL library, which every C++ compiler should have by default. Please send me feedback if you accomplish to install the fastcluster package on a certain platform but needed to tweak the configuration! I will update the installation instructions and modify the package if needed (eg. include the right compiler flags for various operating systems). 
Installation for R ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Enter the command install.packages("fastcluster") in R, and R will download the package automatically, then install it. That's it! If this does not work, please consult R's help function by typing ?INSTALL from within R or read the “R installation and administration” manual: http://cran.r-project.org/doc/manuals/R-admin.html#Installing-packages For manual download, you can get the fastcluster package from the download page at CRAN: http://cran.r-project.org/web/packages/fastcluster/ You may need to start R with administrator rights to be able to install packages. There are ways to install R packages without administrator privileges in your user directories. See this help page for example: http://csg.sph.umich.edu/docs/R/localpackages.html Installation for Python ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Make sure that you have both Python and NumPy installed. 1. On all platforms ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ If pip is installed, type pip install --upgrade --user fastcluster in a terminal, which automatically downloads the latest version from PyPI, compiles the C++ library and installs the package for a single user without administrator rights. If this works, there is no need to follow the alternative steps below. 2. Microsoft Windows ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Installation files for Windows are stored on PyPI: https://pypi.python.org/pypi/fastcluster Christoph Gohlke also provides installation files for Windows on his web page: http://www.lfd.uci.edu/~gohlke/pythonlibs/#fastcluster 3. With setuptools ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ If pip is not available but setuptools, type easy_install --upgrade --user fastcluster in a terminal. 4. 
From the source package ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ If you have not done so already, download the fastcluster package from PyPI here: http://pypi.python.org/pypi/fastcluster/ Open a terminal, go to the directory with the downloaded file and extract the contents of the archive with: tar -xvf fastcluster-(version).tar.gz Alternatively, use your favorite archive manager for unpacking, eg. on Windows. This will generate a new directory “fastcluster-(version)”. Switch to this subdirectory: cd fastcluster-(...) The source distribution on CRAN also contains the complete source files. See the directory src/python there. Now compile and install the Python module by: python setup.py install You may need to precede this command with sudo or install the package in your home directory, like this: python setup.py install --user See the chapter “Installing Python modules” in the Python documentation for further help: http://docs.python.org/install/index.html fastcluster/NEWS0000644000176200001440000001463013144412336013315 0ustar liggesusersfastcluster: Fast hierarchical clustering routines for R and Python Copyright: • Until package version 1.1.23: © 2011 Daniel Müllner • All changes from version 1.1.24 on: © Google Inc. Version history ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Version 1.0.0, 03/14/2011: • Initial release, dependent on Rcpp. Not available on CRAN. Version 1.0.1, 03/15/2011: • Removed the dependence on Rcpp; only R's original C interface is used. Version 1.0.2, 03/17/2011: • File DESCRIPTION: Fixed a typo Version 1.0.3, 03/20/2011: • File README: Removed the warning about false results from the flashClust package since the new flashClust version 1.01 has this error corrected. • Cleaned the test file fastcluster_test.R up. (No dependence on the MASS package any more) Version 1.0.4, 03/21/2011: • Changed the name of the external function from the outdated "Rcpp_linkage" to "fastcluster". • Registered the external function "fastcluster" in R. 
• Configured the C header inclusions to work on Fedora (thanks to Peter Langfelder). Version 1.1.0, 08/21/2011 • Routines for clustering vector data. • Added a User's manual • Revision of all files Version 1.1.1, 10/08/2011 • Fixed test scripts, which indicated an error on some architectures, even if results were correct. (The assumption was that ties in single linkage clustering are resolved in the same way, both for dissimilarity input and for vector input. This is not necessarily true if the floating point unit uses "excess precision". Now the test scripts are content with arbitrary resolution of ties and do not assume a specific scheme.) • Bug fix: uninitialized function pointer in Version 1.1.0 Version 1.1.2, 10/11/2011 • Fix for Solaris: replaced ssize_t by ptrdiff_t in the C++ code. • Removed the NN-chain algorithm for vector input: it was not clear that it would work under all circumstances with the intricacies of floating- point arithmetic. Especially the effects of the excess precision on the x87 are impossible to control in a portable way. Now, the memory-saving routines for the “Ward” linkage use the generic algorithm, as “centroid” and “median” linkage do. Version 1.1.3, 12/10/2011 • Replaced ptrdiff_t by std::ptrdiff_t, as GCC 4.6.1 complains about this. Version 1.1.4, 02/01/2012 • Release the GIL in the Python package, so that it can be used efficiently in multithreaded applications. • Improved performance for the "Ward" method with vector input. • The "members" parameter in the R interface is now treated as a double array, not an integer array as before. This was a slight incompatibility with the stats::hclust function. Thanks to Matthias Studer, University of Geneva, for pointing this out. Version 1.1.5, 02/14/2012 • Updated the "members" specification in the User's manual to reflect the recent change. Version 1.1.6, 03/12/2012 • Bug fix related to GIL release in the Python wrapper. Thanks to Massimo Di Stefano for the bug report. 
• Small compatibility changes in the Python test scripts (again thanks to Massimo Di Stefano for the report). Version 1.1.7, 09/17/2012 • Scipy import is now optional (suggested by Forest Gregg) • Compatibility fix for NumPy 1.7. Thanks to Semihcan Doken for the bug report. Version 1.1.8, 08/28/2012 • Test for NaN dissimilarity values: Now the algorithms produce an error message instead of silently giving false results. The documentation was updated accordingly. This is the final design as intended: the fastcluster package handles infinity values correctly but complains about NaNs. • The Python interface now works with both Python 2 and Python 3. • Changed the license to BSD. Version 1.1.9, 03/15/2013 • Compatibility fix for the MSVC compilers on Windows. • Simplified GIL release in the Python interface. Version 1.1.10, 05/22/2013 • Updated citation information (JSS paper). • Suppress warnings where applicable. Compilation with GCC should not produce any warning at all, even if all compiler warnings are enabled. (The switch -pedantic still does not work, but this is due to the Python headers.) • Optimization: Hidden symbols. Only the interface functions are exported to the symbol table with GCC. Version 1.1.11, 05/23/2013 • Compatibility fix for Solaris. Version 1.1.12, 12/10/2013 • Tiny maintenance updates: new author web page and e-mail address, new location for R vignette. Version 1.1.13, 12/17/2013 • Moved the "python" directory due to CRAN requirements. Version 1.1.14, 01/02/2015 • Updated the DESCRIPTION file according to CRAN rules. • Renamed the “ward” method for dissimilarity input to “ward.D” in the R interface and created a new method “ward.D2”, following changes in R's hclust package. Version 1.1.15, 01/05/2015 • Fixed the unit test to work with old and new R versions (see the changes in stats::hclust in R 3.1.0). Version 1.1.16, 01/07/2015 • Support for large distance matrices (more than 2^31 entries, R's long vector support since version 3.0.0). 
Version 1.1.17, 07/03/2015 • Resolved MSVC compiler warnings. Version 1.1.18, 07/16/2015 • Fixed missing NumPy header include path. Version 1.1.19, 07/19/2015 • Fixed unit tests. They can be run with "python setup.py test" now. Version 1.1.20, 07/19/2015 • New version number due to PyPI upload error. Version 1.1.21, 09/18/2016 • Appropiate use of std namespace, as required by CRAN. Version 1.1.22, 06/12/2016 • No fenv header usage if software floating-point emulation is used (bug report: NaN test failed on Debian armel). Version 1.1.23, 03/24/2017 • setup.py: Late NumPy import for better dependency management. Version 1.1.24, 08/04/2017 • R 3.5 corrects the formula for the “Canberra” metric. See https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17285. The formula in the fastcluster package was changed accordingly. This concerns only the R interface. SciPy and fastcluster's Python interface always had the correct formula. fastcluster/R/0000755000176200001440000000000013144415553013017 5ustar liggesusersfastcluster/R/fastcluster.R0000644000176200001440000000453613144415553015511 0ustar liggesusers# fastcluster: Fast hierarchical clustering routines for R and Python # # Copyright: # * Until package version 1.1.23: © 2011 Daniel Müllner # * All changes from version 1.1.24 on: © Google Inc. hclust <- function(d, method="complete", members=NULL) { # Hierarchical clustering, on raw input data. if(method == "ward") { message("The \"ward\" method has been renamed to \"ward.D\"; note new \"ward.D2\"") method <- "ward.D" } # This array must agree with the enum method_codes in fastcluster.cpp. 
METHODS <- c("single", "complete", "average", "mcquitty", "ward.D", "centroid", "median", "ward.D2") method <- pmatch(method, METHODS) if (is.na(method)) stop("Invalid clustering method.") if (method == -1) stop("Ambiguous clustering method.") dendrogram <- c( .Call(fastcluster, attr(d, "Size"), method, d, members), list( labels = attr(d, "Labels") ,method = METHODS[method] ,call = match.call() ,dist.method = attr(d, "method") ) ) class(dendrogram) <- "hclust" return (dendrogram) } hclust.vector <- function(X, method='single', members=NULL, metric='euclidean', p=NULL) { # Hierarchical clustering, on vector data. METHODS <- c("single", "ward", "centroid", "median") methodidx <- pmatch(method, METHODS) if (is.na(methodidx)) stop(paste("Invalid clustering method '", method, "' for vector data.", sep='')) if (methodidx == -1) stop("Ambiguous clustering method.") METRICS <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowski") metric = pmatch(metric, METRICS) if (is.na(metric) || metric > 6) stop("Invalid metric.") if (metric == -1) stop("Ambiguous metric.") if (metric == 4 && getRversion() < "3.5.0") metric <- as.integer(7) # special metric code for backwards compatibility if (methodidx!=1 && metric!=1) stop("The Euclidean methods 'ward', 'centroid' and 'median' require the 'euclidean' metric.") X <- as.matrix(X) dendrogram <- c( .Call(fastcluster_vector, methodidx, metric, X, members, p), list( labels = dimnames(X)[[1L]] ,method = METHODS[methodidx] ,call = match.call() ,dist.method = METRICS[metric] ) ) class(dendrogram) <- "hclust" return (dendrogram) } fastcluster/vignettes/0000755000176200001440000000000013146376104014626 5ustar liggesusersfastcluster/vignettes/Makefile0000644000176200001440000000032113146376104016262 0ustar liggesusersall: latex fastcluster.Rtex latex fastcluster.Rtex latex fastcluster.Rtex dvipdfmx fastcluster.dvi mkdir keep mv fastcluster.pdf keep mv fastcluster.Rtex keep rm fastcluster.* mv keep/* . 
rmdir keep fastcluster/vignettes/fastcluster.Rtex0000644000176200001440000012147113144414201020024 0ustar liggesusers\def\fastclusterversion{1.1.24} \documentclass[fontsize=10pt,paper=letter,BCOR=-6mm]{scrartcl} \usepackage[utf8]{inputenc} \usepackage{lmodern} \normalfont \usepackage[T1]{fontenc} \usepackage{textcomp} \newcommand*\q{\textquotesingle} \usepackage{amsmath} \usepackage{amsfonts} \usepackage{xcolor} \usepackage{ifpdf} \ifpdf \newcommand*\driver{} \else \newcommand*\driver{dvipdfmx} \fi \usepackage[% pdftitle={fastcluster manual}, pdfauthor={Daniel Müllner}, % pdfsubject={}, pdfdisplaydoctitle=true, % pdfduplex=DuplexFlipLongEdge, pdfstartview=FitH, colorlinks=True, pdfhighlight=/I, % pdfborder={0 0 1}, % linkbordercolor={1 .8 .8}, % citebordercolor={.5 .9 .5}, % urlbordercolor={.5 .7 1}, % linkcolor={blue}, % citecolor={blue}, urlcolor={blue!80!black}, linkcolor={red!80!black}, % runcolor={blue}, % filecolor={blue}, pdfpagemode=UseOutlines, bookmarksopen=true, bookmarksopenlevel=1, bookmarksdepth=2, breaklinks=true, unicode=true, \driver ]{hyperref} % Optimize the PDF targets and make the PDF file smaller \ifpdf\RequirePackage{hypdestopt}\fi \renewcommand*\sectionautorefname{Section} \usepackage{typearea} \DeclareMathOperator\size{size} \DeclareMathOperator\Var{Var} \newcommand*\linkage{\href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html}{\texttt{linkage}}} \newcommand*\hierarchy{\href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html}{\texttt{scipy.\hskip0pt cluster.\hskip0pt hierarchy}}} \newcommand*\hclust{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html}{\texttt{hclust}}} \newcommand*\stats{\href{http://stat.ethz.ch/R-manual/R-devel/library/stats/html/00Index.html}{\texttt{stats}}} \newcommand*\flashClustPack{\href{http://cran.r-project.org/web/packages/flashClust/index.html}{\texttt{flashClust}}} 
\newcommand*\dist{\href{http://stat.ethz.ch/R-manual/R-devel/library/stats/html/dist.html}{\texttt{dist}}} \newcommand*\print{\href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/print.html}{\texttt{print}}} \newcommand*\plot{\href{http://stat.ethz.ch/R-manual/R-patched/library/graphics/html/plot.html}{\texttt{plot}}} \newcommand*\identify{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/identify.hclust.html}{\texttt{identify}}} \newcommand*\rect{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/rect.hclust.html}{\texttt{rect.hclust}}} \newcommand*\NA{\href{http://stat.ethz.ch/R-manual/R-devel/library/base/html/NA.html}{\texttt{NA}}} %\usepackage{showframe} \makeatletter \newenvironment{methods}{% \list{}{\labelwidth\z@ \itemindent-\leftmargin \let\makelabel\methodslabel}% }{% \endlist } \newcommand*{\methodslabel}[1]{% %\hspace{\labelsep}% \hbox to \textwidth{\hspace{\labelsep}% \normalfont\bfseries\ttfamily #1\hskip-\labelsep\hfill}% } \makeatother \setkomafont{descriptionlabel}{\normalfont\ttfamily\bfseries} \begin{document} %\VignetteIndexEntry{User's manual} \title{The \textit{fastcluster} package: User's manual} \author{\href{http://danifold.net}{Daniel Müllner}} \date{August 14, 2017} \subtitle{Version \fastclusterversion} \maketitle \makeatletter \renewenvironment{quotation}{% \list{}{\listparindent 1em% \itemindent \listparindent \leftmargin2.5em \rightmargin \leftmargin \parsep \z@ \@plus\p@ }% \item\relax }{% \endlist } \makeatother \begin{abstract}\noindent\small The fastcluster package is a C++ library for hierarchical, agglomerative clustering. It efficiently implements the seven most widely used clustering schemes: single, complete, average, weighted/mcquitty, Ward, centroid and median linkage. The library currently has interfaces to two languages: R and Python/SciPy. 
Part of the functionality is designed as drop-in replacement for existing routines: \linkage{} in the SciPy package \hierarchy{}, \hclust{} in R's \stats{} package, and the \flashClustPack{} package. Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain. Moreover, there are memory-saving routines for clustering of vector data, which go beyond what the existing packages provide. \end{abstract} \noindent This document describes the usage for the two interfaces for R and Python and is meant as the reference document for the end user. Installation instructions are given in the file INSTALL in the source distribution and are not repeated here. The sections about the two interfaces are independent and in consequence somewhat redundant, so that users who need a reference for one interface need to consult only one section. If you use the fastcluster package for scientific work, please cite it as: \begin{quote} Daniel Müllner, \textit{fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python}, Journal of Statistical Software, \textbf{53} (2013), no.~9, 1--18, \url{http://www.jstatsoft.org/v53/i09/}. \end{quote} \textbf{The fastcluster package is considered stable and will undergo few changes from now on. If some years from now there have not been any updates, this does not necessarily mean that the package is unmaintained but maybe it just was not necessary to correct anything. Of course, please still report potential bugs and incompatibilities to \texttt{daniel@danifold.net}.} \pagebreak \tableofcontents \section{The R interface} Load the package with the following command: \begin{quote} \texttt{library(\q fastcluster\q)} \end{quote} The package overwrites the function \hclust{} from the \stats{} package (in the same way as the \flashClustPack{} package does). 
Please remove any references to the \flashClustPack{} package in your R files to not accidentally overwrite the \hclust{} function with the \flashClustPack{} version. The \hyperref[hclust]{new \texttt{hclust} function} has exactly the same calling conventions as the old one. You may just load the package and immediately and effortlessly enjoy the performance improvements. The function is also an improvement to the \texttt{flashClust} function from the \flashClustPack{} package. Just replace every call to \texttt{flashClust} by \hyperref[hclust]{\texttt{hclust}} and expect your code to work as before, only faster.\footnote{If you are using flashClust prior to version 1.01, update it! See the change log for \flashClustPack{} at \url{http://cran.r-project.org/web/packages/flashClust/ChangeLog}.} In case the data includes infinite or NaN values, see \autoref{sec:infnan}. If you need to access the old function or make sure that the right function is called, specify the package as follows: \begin{quote} \texttt{\hyperref[hclust]{fastcluster::hclust}(…)}\\ \texttt{flashClust::hclust(…)}\\ \texttt{stats::hclust(…)} \end{quote} Vector data can be clustered with a memory-saving algorithm with the command: \begin{quote} \texttt{\hyperref[hclust.vector]{hclust.vector}(…)} \end{quote} The following sections contain comprehensive descriptions of these methods. \begin{methods} \item [\normalfont\texttt{\textbf{hclust}}\,(\textit{d, method=\q complete\q, members=NULL})] \phantomsection\label{hclust} \addcontentsline{toc}{subsection}{\texttt{hclust}} Hierarchical, agglomerative clustering on a condensed dissimilarity matrix. This method has the same specifications as the method \hclust{} in the package \stats{} and \texttt{hclust} alias \texttt{flashClust} in the package \flashClustPack{}. In particular, the \print{}, \plot{}, \rect{} and \identify{} methods work as expected. The argument $d$ is a condensed distance matrix, as it is produced by \dist. 
The argument \textit{method} is one of the strings \textit{\q single\q}, \textit{\q complete\q}, \textit{\q average\q}, \textit{\q mcquitty\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward.D\q}, \textit{\q ward.D2\q} or an unambiguous abbreviation thereof. The argument \textit{members} specifies the sizes of the initial nodes, ie.\ the number of observations in the initial clusters. The default value \texttt{NULL} says that all initial nodes are singletons, ie.\ have size 1. Otherwise, \textit{members} must be a vector whose size is the number of input points. The vector is processed as a \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/double.html}{\texttt{double}} array so that not only integer cardinalities of nodes can be accounted for but also weighted nodes with real weights. The general scheme of the agglomerative clustering procedure is as follows: \begin{enumerate} \item Start with $N$ singleton clusters (nodes) labeled $-1,\ldots, -N$, which represent the input points. \item Find a pair of nodes with minimal distance among all pairwise distances. \item Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively $1,2,\ldots$ \item The distances from the new node to all other nodes is determined by the \textit{method} parameter (see below). \item Repeat $N-1$ times from step 2, until there is one big node, which contains all original input points. \end{enumerate} The output of \texttt{hclust} is an object of class \texttt{\q hclust\q} and represents a \emph{stepwise dendrogram}. It contains the following fields: \begin{description} \item[\normalfont\textit{merge}] This is an $(N-1)\times 2$ array. Row $i$ specifies the labels of the nodes which are joined step $i$ of the clustering. \item[\normalfont\textit{height}] This is a vector of length $N-1$. It contains the sequence of dissimilarities at which every pair of nearest nodes is joined. 
\item[\normalfont\textit{order}] This is a vector of length $N$. It contains a permutation of the numbers $1,\ldots N$ for the \plot{} method. When the dendrogram is plotted, this is the order in which the singleton nodes are plotted as the leaves of a rooted tree. The order is computed so that the dendrogram is plotted without intersections (except the case when there are inversions for the \textit{\q centroid\q} and \textit{\q median\q} methods). The choice of the \textit{\q order\q} sequence follows the same scheme as the \texttt{stats} package does, only with a faster algorithm. Note that there are many valid choices to order the nodes in a dendrogram without intersections. Also, subsequent points in the \textit{\q order\q} field are not always close in the ultrametric given by the dendrogram. \item[\normalfont\textit{labels}] This copies the attribute \textit{\q Labels\q} from the first input parameter $d$. It contains the labels for the objects being clustered. \item[\normalfont\textit{method}] The (unabbreviated) string for the \textit{\q method\q} parameter. See below for a specification of all available methods. \item[\normalfont\textit{call}] The full command that produced the result. See \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/match.call.html}{\texttt{match.call}}. \item[\normalfont\textit{dist.method}] This \textit{\q method\q} attribute of the first input parameter $d$. This specifies which metric was used in the \texttt{dist} method which generated the first argument. \end{description} The parameter \textit{method} specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes. Denote the dissimilarities by $d$, the nodes to be joined by $I,J$, the new node by $K$ and any other node by $L$. The symbol $|I|$ denotes the size of the cluster $I$. 
\begin{description} \item [\normalfont\textit{method=\q single\q}:] $\displaystyle d(K,L) = \min(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the closest distance between any two points in each cluster: \[ d(A,B)=\min_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q complete\q}:] $\displaystyle d(K,L) = \max(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the maximal distance between any two points in each cluster: \[ d(A,B)=\max_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q average\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}$ The distance between two clusters $A,B$ is the average distance between the points in the two clusters: \[ d(A,B)=\frac1{|A||B|}\sum_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q mcquitty\q}:] $\displaystyle d(K,L) = \tfrac12(d(I,L)+d(J,L))$ There is no global description for the distance between clusters since the distance depends on the order of the merging steps. \end{description} The following three methods are intended for Euclidean data only, ie.\ when $X$ contains the pairwise \textbf{squared} distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. \begin{description} \item [\normalfont\textit{method=\q centroid\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}-\frac{|I|\cdot|J|\cdot d(I,J)}{(|I|+|J|)^2}$ There is a geometric interpretation: $d(A,B)$ is the distance between the centroids (ie.\ barycenters) of the clusters in Euclidean space: \[ d(A,B) = \|\vec c_A-\vec c_B\|^2, \] where $\vec c_A$ denotes the centroid of the points in cluster $A$. 
\item [\normalfont\textit{method=\q median\q}:] $\displaystyle d(K,L) = \tfrac12 d(I,L)+\tfrac12 d(J,L)-\tfrac14 d(I,J)$ Define the midpoint $\vec w_K$ of a cluster $K$ iteratively as $\vec w_K=k$ if $K=\{k\}$ is a singleton and as the midpoint $\frac12(\vec w_I+\vec w_J)$ if $K$ is formed by joining $I$ and $J$. Then we have \[ d(A,B)=\|\vec w_A-\vec w_B\|^2 \] in Euclidean space for all nodes $A,B$. Notice however that this distance depends on the order of the merging steps. \item [\normalfont\textit{method=\q ward.D\q}:] $\displaystyle d(K,L) = \frac{(|I|+|L|)\cdot d(I,L)+(|J|+|L|)\cdot d(J,L)-|L|\cdot d(I,J)}{|I|+|J|+|L|}$ The global cluster dissimilarity can be expressed as \[ d(A,B)=\frac{2|A||B|}{|A|+|B|}\cdot\|\vec c_A-\vec c_B\|^2, \] where $\vec c_A$ again denotes the centroid of the points in cluster $A$. \item [\normalfont\textit{method=\q ward.D2\q}:] This is the equivalent of \textit{\q ward.D\q}, but for input consisting of untransformed (in particular: \textbf{non-squared}) Euclidean distances. Internally, all distances are squared first, then method \textit{ward.D} is applied, and finally the square root of all heights in the dendrogram is taken. Thus, global cluster dissimilarity can be expressed as the square root of that for \textit{ward.D}, namely \[ d(A,B)=\sqrt{\frac{2|A||B|}{|A|+|B|}}\cdot\|\vec c_A-\vec c_B\|. \] \end{description} \item [\normalfont\texttt{\textbf{hclust.vector}}\,(\textit{X, method=\q single\q, members=NULL, metric=\q euclidean\q, p=NULL})] \phantomsection\label{hclust.vector} \addcontentsline{toc}{subsection}{\texttt{hclust.vector}} This performs hierarchical, agglomerative clustering on vector data with memory-saving algorithms. While the \hyperref[hclust]{\texttt{hclust}} method requires $\Theta(N^2)$ memory for clustering of $N$ points, this method needs $\Theta(ND)$ for $N$ points in $\mathbb R^D$, which is usually much smaller. 
The argument $X$ must be a two-dimensional matrix with \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/double.html}{\texttt{double}} precision values. It describes $N$ data points in $\mathbb R^D$ as an $(N\times D)$ matrix. The parameter \textit{\q members\q} is the same as for \hyperref[hclust]{\texttt{hclust}}. The parameter \textit{\q method\q} is one of the strings \textit{\q single\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, or an unambiguous abbreviation thereof. If \textit{method} is \textit{\q single\q}, single linkage clustering is performed on the data points with the metric which is specified by the \textit{metric} parameter. The choices are the same as in the \href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/dist.html}{\texttt{dist}} method: \textit{\q euclidean\q}, \textit{\q maximum\q}, \textit{\q manhattan\q}, \textit{\q canberra\q}, \textit{\q binary\q} and \textit{\q minkowski\q}. Any unambiguous substring can be given. The parameter \textit{p} is used for the \textit{\q minkowski\q} metric only. The call \begin{quote} \texttt{hclust.vector(X, method=\q single\q, metric=[...])} \end{quote} is equivalent to \begin{quote} \texttt{hclust(dist(X, metric=[...]), method=\q single\q)} \end{quote} but uses less memory and is equally fast. Ties may be resolved differently, ie.\ if two pairs of nodes have equal, minimal dissimilarity values at some point, in the specific computer's representation for floating point numbers, either pair may be chosen for the next merging step in the dendrogram. Note that the formula for the \textit{\q canberra\q} metric changed in R 3.5.0: Before R version 3.5.0, the \textit{\q canberra\q} metric was computed as \[ d(u,v) = \sum_j\frac{|u_j-v_j|}{|u_j+v_j|}. \] Starting with R version 3.5.0, the formula was corrected to \[ d(u,v) = \sum_j\frac{|u_j-v_j|}{|u_j|+|v_j|}. \] Summands with $u_j=v_j=0$ always contribute 0 to the sum. 
The second, newer formula equals SciPy's definition. The fastcluster package detects the R version at runtime and chooses the formula accordingly, so that fastcluster and the \href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/dist.html}{\texttt{dist}} method always use the same formula for a given R version. If \textit{method} is one of \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, clustering is performed with respect to Euclidean distances. In this case, the parameter \textit{metric} must be \textit{\q euclidean\q}. Notice that \texttt{hclust.vector} operates on Euclidean distances for compatibility reasons with the \dist{} method, while \hyperref[hclust]{\texttt{hclust}} assumes \textbf{squared} Euclidean distances for compatibility with the \href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html}{\texttt{stats::hclust}} method! Hence, the call \phantomsection\label{squared} \begin{quote} \texttt{hc = hclust.vector(X, method=\q centroid\q)} \end{quote} is, aside from the lesser memory requirements, equivalent to \begin{quote} \texttt{d = dist(X)}\\ \texttt{hc = hclust(d\textasciicircum 2, method=\q centroid\q)}\\ \texttt{hc\$height = sqrt(hc\$height)} \end{quote} The same applies to the \textit{\q median\q} method. The \textit{\q ward\q} method in \hyperref[hclust.vector]{\texttt{hclust.vector}} is equivalent to \hyperref[hclust]{\texttt{hclust}} with method \textit{\q ward.D2\q}, but to method \textit{\q ward.D\q} only after squaring as above. Differences in these algebraically equivalent methods may arise only from floating-point inaccuracies and the resolution of ties (which may, however, in extreme cases affect the entire clustering result due to the inherently unstable nature of the clustering schemes). 
\end{methods} \section{The Python interface} The fastcluster package is imported as usual by: \begin{quote} \texttt{import fastcluster} \end{quote} It provides the following functions: \begin{quote} \hyperref[linkage]{\texttt{linkage}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, preserve\_input=True})\\ \hyperref[single]{\texttt{single}}\,($X$)\\ \hyperref[complete]{\texttt{complete}}\,($X$)\\ \hyperref[average]{\texttt{average}}\,($X$)\\ \hyperref[weighted]{\texttt{weighted}}\,($X$)\\ \hyperref[ward]{\texttt{ward}}\,($X$)\\ \hyperref[centroid]{\texttt{centroid}}\,($X$)\\ \hyperref[median]{\texttt{median}}\,($X$)\\ \hyperref[linkage_vector]{\texttt{linkage\_vector}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, extraarg=None}) \end{quote} The following sections contain comprehensive descriptions of these methods. \begin{methods} \item [\normalfont\texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, preserve\_input=\q True\q})] \phantomsection\label{linkage} \addcontentsline{toc}{subsection}{\texttt{linkage}} Hierarchical, agglomerative clustering on a condensed dissimilarity matrix or on vector data. Apart from the argument \textit{preserve\_input}, the method has the same input parameters and output format as the function of the same name in the module \hierarchy. The argument $X$ is preferably a \href{http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html}{NumPy array} with floating point entries (\texttt{X.dtype\hskip0pt==\hskip0pt numpy.double}). Any other data format will be converted before it is processed. NumPy's \href{http://docs.scipy.org/doc/numpy/reference/maskedarray.html}{masked arrays} are not treated as special, and the mask is simply ignored. 
If $X$ is a one-dimensional array, it is considered a condensed matrix of pairwise dissimilarities in the format which is returned by \href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html}{\texttt{scipy.spatial.distance.pdist}}. It contains the flattened, upper-triangular part of a pairwise dissimilarity matrix. That is, if there are $N$ data points and the matrix $d$ contains the dissimilarity between the $i$-th and $j$-th observation at position $d_{i,j}$, the vector $X$ has length $\binom N2$ and is ordered as follows: \[ d = \begin{pmatrix} 0&d_{0,1}&d_{0,2}&\ldots&d_{0,n-1}\\ & 0&d_{1,2} & \ldots\\ &&0&\ldots\\ &&&\ddots\\ &&&&0 \end{pmatrix} = \begin{pmatrix} 0&X[0] &X[1]&\ldots&X[n-2]\\ & 0&X[n-1] & \ldots\\ &&0&\ldots\\ &&&\ddots\\ &&&&0 \end{pmatrix} \] The \textit{metric} argument is ignored in case of dissimilarity input. The optional argument \textit{preserve\_input} specifies whether the method makes a working copy of the dissimilarity vector or writes temporary data into the existing array. If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying \textit{preserve\_input=False}. Note that the input array $X$ contains unspecified values after this procedure. It is therefore safer to write \begin{verbatim} linkage(X, method="...", preserve_input=False) del X \end{verbatim} to make sure that the matrix $X$ is not accessed accidentally after it has been used as scratch memory. (The single linkage algorithm does not write to the distance matrix or its copy anyway, so the \textit{preserve\_input} flag has no effect in this case.) If $X$ contains vector data, it must be a two-dimensional array with $N$ observations in $D$ dimensions as an $(N\times D)$ array. The \textit{preserve\_input} argument is ignored in this case. The specified \textit{metric} is used to generate pairwise distances from the input. 
The following two function calls yield equivalent output: \begin{verbatim} linkage(pdist(X, metric), method="...", preserve_input=False) linkage(X, metric=metric, method="...") \end{verbatim} The two results are identical in most cases, but differences occur if ties are resolved differently: if the minimum in step 2 below is attained for more than one pair of nodes, either pair may be chosen. It is not guaranteed that both \texttt{linkage} variants choose the same pair in this case. The general scheme of the agglomerative clustering procedure is as follows: \begin{enumerate} \item Start with $N$ singleton clusters (nodes) labeled $0,\ldots, N-1$, which represent the input points. \item Find a pair of nodes with minimal distance among all pairwise distances. \item Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively $N,N+1,\ldots$ \item The distances from the new node to all other nodes is determined by the \textit{method} parameter (see below). \item Repeat $N-1$ times from step 2, until there is one big node, which contains all original input points. \end{enumerate} The output of \texttt{linkage} is \emph{stepwise dendrogram}, which is represented as an $(N-1)\times 4$ \href{http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html}{NumPy array} with floating point entries (\texttt{dtype=numpy.double}). The first two columns contain the node indices which are joined in each step. The input nodes are labeled $0,\ldots,N-1$, and the newly generated nodes have the labels $N,\ldots, 2N-2$. The third column contains the distance between the two nodes at each step, ie.\ the current minimal distance at the time of the merge. The fourth column counts the number of points which comprise each new node. The parameter \textit{method} specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes. 
Denote the dissimilarities by $d$, the nodes to be joined by $I,J$, the new node by $K$ and any other node by $L$. The symbol $|I|$ denotes the size of the cluster $I$. \begin{description} \item [\normalfont\textit{method=\q single\q}:] $\displaystyle d(K,L) = \min(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the closest distance between any two points in each cluster: \[ d(A,B)=\min_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q complete\q}:] $\displaystyle d(K,L) = \max(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the maximal distance between any two points in each cluster: \[ d(A,B)=\max_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q average\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}$ The distance between two clusters $A,B$ is the average distance between the points in the two clusters: \[ d(A,B)=\frac1{|A||B|}\sum_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q weighted\q}:] $\displaystyle d(K,L) = \tfrac12(d(I,L)+d(J,L))$ There is no global description for the distance between clusters since the distance depends on the order of the merging steps. \end{description} The following three methods are intended for Euclidean data only, ie.\ when $X$ contains the pairwise (non-squared!)\ distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. 
\begin{description} \item [\normalfont\textit{method=\q centroid\q}:] $\displaystyle d(K,L) = \sqrt{\frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}-\frac{|I|\cdot|J|\cdot d(I,J)}{(|I|+|J|)^2}}$ There is a geometric interpretation: $d(A,B)$ is the distance between the centroids (ie.\ barycenters) of the clusters in Euclidean space: \[ d(A,B) = \|\vec c_A-\vec c_B\|, \] where $\vec c_A$ denotes the centroid of the points in cluster $A$.\pagebreak[2] \item [\normalfont\textit{method=\q median\q}:] $\displaystyle d(K,L) = \sqrt{\tfrac12 d(I,L)+\tfrac12 d(J,L)-\tfrac14 d(I,J)}$ Define the midpoint $\vec w_K$ of a cluster $K$ iteratively as $\vec w_K=k$ if $K=\{k\}$ is a singleton and as the midpoint $\frac12(\vec w_I+\vec w_J)$ if $K$ is formed by joining $I$ and $J$. Then we have \[ d(A,B)=\|\vec w_A-\vec w_B\| \] in Euclidean space for all nodes $A,B$. Notice however that this distance depends on the order of the merging steps. \item [\normalfont\textit{method=\q ward\q}:] $\displaystyle d(K,L) = \sqrt{\frac{(|I|+|L|)\cdot d(I,L)+(|J|+|L|)\cdot d(J,L)-|L|\cdot d(I,J)}{|I|+|J|+|L|}}$ The global cluster dissimilarity can be expressed as \[ d(A,B)=\sqrt{\frac{2|A||B|}{|A|+|B|}}\cdot\|\vec c_A-\vec c_B\|, \] where $\vec c_A$ again denotes the centroid of the points in cluster $A$. \end{description} \item [\normalfont\texttt{fastcluster.\textbf{single}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{single}}\label{single} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q single\q}). \item [\normalfont\texttt{fastcluster.\textbf{complete}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{complete}}\label{complete} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q complete\q}). 
\item [\normalfont\texttt{fastcluster.\textbf{average}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{average}}\label{average} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q average\q}). \item [\normalfont\texttt{fastcluster.\textbf{weighted}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{weighted}}\label{weighted} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q weighted\q}). \item [\normalfont\texttt{fastcluster.\textbf{centroid}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{centroid}}\label{centroid} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q centroid\q}). \item [\normalfont\texttt{fastcluster.\textbf{median}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{median}}\label{median} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q median\q}). \item [\normalfont\texttt{fastcluster.\textbf{ward}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{ward}}\label{ward} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q ward\q}). \item [\normalfont\texttt{fastcluster.\textbf{linkage\_vector}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, extraarg=\q None\q})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{linkage\_vector}}\label{linkage_vector} This performs hierarchical, agglomerative clustering on vector data with memory-saving algorithms. While the \hyperref[linkage]{\texttt{linkage}} method requires $\Theta(N^2)$ memory for clustering of $N$ points, this method needs $\Theta(ND)$ for $N$ points in $\mathbb R^D$, which is usually much smaller. The argument $X$ has the same format as before, when $X$ describes vector data, ie.\ it is an $(N\times D)$ array. Also the output array has the same format. 
The parameter \textit{method} must be one of \textit{\q single\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, ie.\ only for these methods there exist memory-saving algorithms currently. If \textit{method}, is one of \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, the \textit{metric} must be \textit{\q euclidean\q}. Like the \texttt{linkage} method, \texttt{linkage\_vector} does not treat NumPy's \href{http://docs.scipy.org/doc/numpy/reference/maskedarray.html}{masked arrays} as special and simply ignores the mask. For single linkage clustering, any dissimilarity function may be chosen. Basically, every metric which is implemented in the method \href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html}{\texttt{scipy.spatial.distance.pdist}} is reimplemented here. However, the metrics differ in some instances since a number of mistakes and typos (both in the code and in the documentation) were corrected in the \textit{fastcluster} package.\footnote{Hopefully, the SciPy metric will be corrected in future versions and some day coincide with the \textit{fastcluster} definitions. See the bug reports at \url{http://projects.scipy.org/scipy/ticket/1484}, \url{http://projects.scipy.org/scipy/ticket/1486}.} Therefore, the available metrics with their definitions are listed below as a reference. The symbols $u$ and $v$ mostly denote vectors in $\mathbb R^D$ with coordinates $u_j$ and $v_j$ respectively. See below for additional metrics for Boolean vectors. Unless otherwise stated, the input array $X$ is converted to a floating point array (\texttt{X.dtype==numpy.double}) if it does has have already the required data type. Some metrics accept Boolean input; in this case this is stated explicitly below. 
\begin{description} \item[\normalfont\textit{\q euclidean\q}:] Euclidean metric, $L_2$ norm \[ d(u,v) = \| u-v\|_2 = \sqrt{\sum_j (u_j-v_j)^2} \] \item[\normalfont\textit{\q sqeuclidean\q}:] squared Euclidean metric \[ d(u,v) = \| u-v\|^2_2 = \sum_j (u_j-v_j)^2 \] \item[\normalfont\textit{\q seuclidean\q}:] standardized Euclidean metric \[ d(u,v) = \sqrt{\sum_j (u_j-v_j)^2 /V_j} \] The vector $V=(V_0,\ldots,V_{D-1})$ is given as the \textit{extraarg} argument. If no \textit{extraarg} is given, $V_j$ is by default the unbiased sample variance of all observations in the $j$-th coordinate, $V_j = \Var_i(X_{i,j})=\frac1{N-1}\sum_i(X_{i,j}^2-\mu(X_j)^2)$. (Here, $\mu(X_j)$ denotes as usual the mean of $X_{i,j}$ over all rows $i$.) \item[\normalfont\textit{\q mahalanobis\q}:] Mahalanobis distance \[ d(u,v) = \sqrt{(u-v)^{\mkern-3mu\top}V (u-v)} \] Here, $V=\textit{extraarg}$, a $(D\times D)$-matrix. If $V$ is not specified, the inverse of the covariance matrix \texttt{numpy.linalg.inv(numpy.cov(X, rowvar=False))} is used: \[ (V^{-1})_{j,k} = \frac1{N-1} \sum_i (X_{i,j}-\mu(X_j))(X_{i,k}-\mu(X_k)) \] \item[\normalfont\textit{\q cityblock\q}:] the Manhattan distance, $L_1$ norm \[ d(u,v) = \sum_j |u_j-v_j| \] \item[\normalfont\textit{\q chebychev\q}:] the supremum norm, $L_\infty$ norm \[ d(u,v) = \max_j |u_j-v_j| \] \item[\normalfont\textit{\q minkowski\q}:] the $L_p$ norm \[ d(u,v) = \left(\sum_j |u_j-v_j|^p\right)^{1/p} \] This metric coincides with the \textit{cityblock}, \textit{euclidean} and \textit{chebychev} metrics for $p=1$, $p=2$ and $p=\infty$ (\texttt{numpy.inf}), respectively. The parameter $p$ is given as the \textit{\q extraarg\q} argument. \item[\normalfont\textit{\q cosine\q}] \[ d(u,v) = 1 - \frac{\langle u,v\rangle}{\|u\|\cdot\|v\|} = 1 - \frac{\sum_j u_jv_j}{\sqrt{\sum_j u_j^2\cdot \sum_j v_j^2}} \] \item[\normalfont\textit{\q correlation\q}:] This method first mean-centers the rows of $X$ and then applies the \textit{cosine} distance. 
Equivalently, the \textit{correlation} distance measures $1-{}$\textrm{(Pearson's correlation coefficient)}. \[ d(u,v) = 1 - \frac{\langle u-\mu(u),v-\mu(v)\rangle}{\|u-\mu(u)\|\cdot\|v-\mu(v)\|}, \] \item[\normalfont\textit{\q canberra\q}] \[ d(u,v) = \sum_j\frac{|u_j-v_j|}{|u_j|+|v_j|} \] Summands with $u_j=v_j=0$ contribute 0 to the sum. \item[\normalfont\textit{\q braycurtis\q}] \[ d(u,v) = \frac{\sum_j |u_j-v_j|}{\sum_j |u_j+v_j|} \] \item[\textnormal{(user function):}] The parameter \textit{metric} may also be a function which accepts two NumPy floating point vectors and returns a number. Eg.\ the Euclidean distance could be emulated with \begin{quote} \texttt{fn = lambda u, v: numpy.sqrt(((u-v)*(u-v)).sum())}\\ \texttt{linkage\_vector(X, method=\q single\q, metric=fn)} \end{quote} This method, however, is much slower than the built-in function. \item[\normalfont\textit{\q hamming\q}:] The Hamming distance accepts a Boolean array (\texttt{X.dtype==bool}) for efficient storage. Any other data type is converted to \texttt{numpy.double}. \[ d(u,v) = |\{j\mid u_j\neq v_j\}| \] \item[\normalfont\textit{\q jaccard\q}:] The Jaccard distance accepts a Boolean array (\texttt{X.dtype\hskip0pt ==\hskip0pt bool}) for efficient storage. Any other data type is converted to \texttt{numpy.double}. \[ d(u,v) = \frac{|\{j\mid u_j\neq v_j\}|}{|\{j\mid u_j\neq 0\text{ or } v_j\neq 0\}|} \] \[ d(0,0) = 0 \] Python represents \texttt{True} by 1 and \texttt{False} by 0. In the Boolean case, the Jaccard distance is therefore: \[ d(u,v) = \frac{|\{j\mid u_j\neq v_j\}|}{|\{j\mid u_j\lor v_j\}|} \] \end{description} The following metrics are designed for Boolean vectors. The input array is converted to the \texttt{bool} data type if it is not Boolean already. 
Use the following abbreviations for the entries of a contingency table: \begin{align*} a &= |\{j\mid u_j\land v_j \}| & b &= |\{j\mid u_j\land(\lnot v_j)\}|\\ c &= |\{j\mid (\lnot u_j)\land v_j \}| & d &= |\{j\mid (\lnot u_j)\land(\lnot v_j)\}| \end{align*} Recall that $D$ denotes the number of dimensions, hence $D=a+b+c+d$. \begin{description} \item[\normalfont\textit{\q yule\q}] \[ d(u,v) = \frac{2bc}{ad+bc} \] \item[\normalfont\textit{\q dice\q}] \begin{gather*} d(u,v) = \frac{b+c}{2a+b+c}\\ d(0,0) = 0 \end{gather*} \item[\normalfont\textit{\q rogerstanimoto\q}] \[ d(u,v) = \frac{2(b+c)}{b+c+D} \] \item[\normalfont\textit{\q russellrao\q}] \[ d(u,v) = \frac{b+c+d}{D} \] \item[\normalfont\textit{\q sokalsneath\q}] \begin{gather*} d(u,v) = \frac{2(b+c)}{a+2(b+c)}\\ d(0,0) = 0 \end{gather*} \item[\normalfont\textit{\q kulsinski\q}] \[ d(u,v) = \frac 12\cdot\left(\frac b{a+b} + \frac c{a+c}\right) \] \item[\normalfont\textit{\q matching\q}] \[ d(u,v) = \frac{b+c}{D} \] Notice that when given a Boolean array, the \textit{matching} and \textit{hamming} distance are the same. The \textit{matching} distance formula, however, converts every input to Boolean first. Hence, the vectors $(0,1)$ and $(0,2)$ have zero \textit{matching} distance since they are both converted to $(\mathrm{False}, \mathrm{True})$ but the \textit{hamming} distance is $0.5$. \item[\normalfont\textit{\q sokalmichener\q}] is an alias for \textit{\q matching\q}. \end{description} \end{methods} \section{Behavior for NaN and infinite values}\label{sec:infnan} Whenever the fastcluster package encounters a NaN value as the distance between nodes, either as the initial distance or as an updated distance after some merging steps, it raises an error. This was designed intentionally, even if there might be ways to propagate NaNs through the algorithms in a more or less sensible way. 
Indeed, since the clustering result depends on every single distance value, the presence of NaN values usually indicates a dubious clustering result, and therefore NaN values should be eliminated in preprocessing.\pagebreak[1] In the R interface for vector input, coordinates with {\NA} value are interpreted as missing data and treated in the same way as R's {\dist} function does. This results in valid output whenever the resulting distances are not NaN. The Python interface does not provide any way of handling missing coordinates, and data should be processed accordingly and given as pairwise distances to the clustering algorithms in this case. The fastcluster package handles node distances and coordinates with infinite values correctly, as long as the formulas for the distance updates and the metric (in case of vector input) make sense. In concordance with the statement above, an error is produced if a NaN value results from performing arithmetic with infinity. Also, the usual proviso applies: internal formulas in the code are mathematically equivalent to the formulas as stated in the documentation only for finite, real numbers but might produce different results for $\pm\infty$. Apart from obvious cases like single or complete linkage, it is therefore recommended that users think about how they want infinite values to be treated by the distance update and metric formulas and then check whether the fastcluster code does exactly what they want in these special cases. \section{Differences between the two interfaces} \begin{itemize} \item The \textit{\q mcquitty\q} method in R is called \textit{\q weighted\q} in Python. \item R and SciPy use different conventions for the ``Euclidean'' methods \textit{\q centroid\q}, \textit{\q median\q}! R assumes that the dissimilarity matrix consists of squared Euclidean distances, while SciPy expects non-squared Euclidean distances. 
The fastcluster package respects these conventions and uses different formulas in the two interfaces. The \textit{\q ward\q} method in the Python interface is identical to \textit{\q ward.D2\q} in the R interface. If the same results in both interfaces ought to be obtained, then the \hyperref[hclust]{\texttt{hclust}} function in R must be input the entry-wise square of the distance matrix, \verb!d^2!, for the \textit{\q ward.D\q}, \textit{\q centroid\q} and \textit{\q median\q} methods, and later the square root of the height field in the dendrogram must be taken. The \hyperref[hclust.vector]{\texttt{hclust.vector}} method calculates non-squared Euclidean distances, like R's \dist{} method and identically to the Python interface. See the \hyperref[squared]{example} in the \hyperref[hclust.vector]{\texttt{hclust.vector}} documentation above. For the \textit{\q average\q} and \textit{\q weighted\q} alias \textit{\q mcquitty\q} methods, the same, non-squared distance matrix \texttt{d} as in the Python interface must be used for the same results. The \textit{\q single\q} and \textit{\q complete\q} methods only depend on the relative order of the distances, hence it does not make a difference whether the method operates on the distances or the squared distances. The code example in the R documentation (enter \texttt{?hclust} or \texttt{example(hclust)} in R) contains another instance where the squared distance matrix is generated from Euclidean data. \item The Python interface is not designed to deal with missing values, and NaN values in the vector data raise an error message. The \hyperref[hclust.vector]{\texttt{hclust.vector}} method in the R interface, in contrast, deals with NaN and the (R specific) {\NA} values in the same way as the \dist{} method does. Confer the documentation for \dist{} for details. \end{itemize} \section{References} \begin{trivlist} \item \textit{NumPy: Scientific computing tools for Python}, \url{http://numpy.scipy.org/}. 
\item Eric Jones, Travis Oliphant, Pearu Peterson et al., \textit{SciPy: Open Source Scientific Tools for Python}, 2001, \url{http://www.scipy.org}. \item \textit{R: A Language and Environment for Statistical Computing}, R Foundation for Statistical Computing, Vienna, 2011, \url{http://www.r-project.org}. \end{trivlist} \end{document} %%% Local variables: %%% mode: latex %%% TeX-master: "fastcluster.Rtex" %%% TeX-PDF-mode: t %%% End: fastcluster/MD50000644000176200001440000000272413146533700013130 0ustar liggesusersc003d3dcbd395ef849e3af680c14ea04 *DESCRIPTION f42049b61f5700e04db55fea48c3f172 *INSTALL f4abec074fd2a5f5df26d4ea11206493 *LICENSE da8e9d68585993250a9c29c3e9bff50b *NAMESPACE 4be4155d9678f4ddfb76071de259a2ac *NEWS e17871d8f0d7650d3d588c7d4fd7cdc0 *R/fastcluster.R e1e421b365b092b958761b9aa4542751 *README 787b94de9a3092c7dd7763d3a5e64414 *build/vignette.rds 459081fd7078ab4eadf2e3ce7e45bab1 *inst/CITATION 1c31e2352078833f8d2f664fa4d92222 *inst/doc/fastcluster.Rtex 504632b4c6500b0994acaf07ec3bd865 *inst/doc/fastcluster.pdf 3eed5fa276cbf58077d5304bb8ed0eb7 *man/fastcluster.Rd 14abdf33b799d6d48057f19d1974a6bc *man/hclust.Rd a6ca386b8617952d163ef83abb8b6819 *man/hclust.vector.Rd 97bb0f9bf046e498c47423129fc3691a *src/Makevars 7b8a328733afe582986d5292e9c91278 *src/Makevars.win 60cb0a90da9ab22ad5871ae71b434a2f *src/fastcluster.cpp 08eeb0c1683b6dea8fbb7840c7aaf2f1 *src/fastcluster_R.cpp 3206c9ffac28920af50a7d8049ee302b *src/python/fastcluster.py b72007eb0c73f20ef901f84bdd32f1ae *src/python/fastcluster_python.cpp 2d4ab7ae984ecc57fe6448a6bb2f83d1 *src/python/setup.py 0553f404a601c5830f33d7f2216c7530 *src/python/tests/__init__.py 68604314cc18b0aa691934edd94eebff *src/python/tests/nantest.py c8c9a929ee8a22b8219e376de6677020 *src/python/tests/test.py 3b1cf8f33d62292394f1ae56b37b2022 *src/python/tests/vectortest.py 7862ca89f826da64aedcc585521795c4 *tests/test_fastcluster.R 9cbb544a7574e9d55aed550e5f3608a4 *vignettes/Makefile 1c31e2352078833f8d2f664fa4d92222 
*vignettes/fastcluster.Rtex fastcluster/README0000644000176200001440000001431213144412357013476 0ustar liggesusersfastcluster: Fast hierarchical clustering routines for R and Python Copyright: * Until package version 1.1.23: © 2011 Daniel Müllner * All changes from version 1.1.24 on: © Google Inc. The fastcluster package is a C++ library for hierarchical, agglomerative clustering. It efficiently implements the seven most widely used clustering schemes: single, complete, average, weighted/McQuitty, Ward, centroid and median linkage. The library currently has interfaces to two languages: R and Python/NumPy. Part of the functionality is designed as drop-in replacement for existing routines: “linkage” in the SciPy package “scipy.cluster.hierarchy”, “hclust” in R's “stats” package, and the “flashClust” package. Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain. Moreover, there are memory-saving routines for clustering of vector data, which go beyond what the existing packages provide. See the author's home page for more information, in particular a performance comparison with other clustering packages. The User's manual is the file inst/doc/fastcluster.pdf in the source distribution. The fastcluster package is distributed under the BSD license. See the file LICENSE in the source distribution or . Installation ‾‾‾‾‾‾‾‾‾‾‾‾ See the file INSTALL in the source distribution. Usage ‾‾‾‾‾ 1. R ‾‾‾‾ In R, load the package with the following command: library('fastcluster') The package overwrites the function hclust from the “stats” package (in the same way as the flashClust package does). Please remove any references to the flashClust package in your R files to not accidentally overwrite the hclust function with the flashClust version. The new hclust function has exactly the same calling conventions as the old one. 
You may just load the package and immediately and effortlessly enjoy the performance improvements. The function is also an improvement to the flashClust function from the “flashClust” package. Just replace every call to flashClust by hclust and expect your code to work as before, only faster. (If you are using flashClust prior to version 1.01, update it! See the change log for flashClust: http://cran.r-project.org/web/packages/flashClust/ChangeLog ) If you need to access the old function or make sure that the right function is called, specify the package as follows: fastcluster::hclust(…) flashClust::hclust(…) stats::hclust(…) Vector data can be clustered with a memory-saving algorithm with the command hclust.vector(…) See the User's manual inst/doc/fastcluster.pdf for further details. WARNING ‾‾‾‾‾‾‾ R and Matlab/SciPy use different conventions for the “Ward”, “centroid” and “median” methods. R assumes that the dissimilarity matrix consists of squared Euclidean distances, while Matlab and SciPy expect non-squared Euclidean distances. The fastcluster package respects these conventions and uses different formulas in the two interfaces. If you want the same results in both interfaces, then feed the hclust function in R with the entry-wise square of the distance matrix, D^2, for the “Ward”, “centroid” and “median” methods and later take the square root of the height field in the dendrogram. For the “average” and “weighted” alias “mcquitty” methods, you must still take the same distance matrix D as in the Python interface for the same results. The “single” and “complete” methods only depend on the relative order of the distances, hence it does not make a difference whether the method operates on the distances or the squared distances. The code example in the R documentation (enter ?hclust or example(hclust) in R) contains an instance where the squared distance matrix is generated from Euclidean data. 2. 
Python ‾‾‾‾‾‾‾‾‾ The fastcluster package is imported as usual by import fastcluster It provides the following functions: linkage(X, method='single', metric='euclidean', preserve_input=True) single(X) complete(X) average(X) weighted(X) ward(X) centroid(X) median(X) linkage_vector(X, method='single', metric='euclidean', extraarg=None) The argument X is either a compressed distance matrix or a collection of n observation vectors in d dimensions as an (n×d) array. Apart from the argument preserve_input, the methods have the same input and output as the functions of the same name in the package scipy.cluster.hierarchy. The additional, optional argument preserve_input specifies whether the fastcluster package first copies the distance matrix or writes into the existing array. If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying preserve_input=False. Note that the input array X contains unspecified values after this procedure. You may want to write linkage(X, method='…', preserve_input=False) del X to make sure that the matrix X is not accessed accidentally after it has been used as scratch memory. The method linkage_vector(X, method='single', metric='euclidean', extraarg=None) provides memory-saving clustering for vector data. It also accepts a collection of n observation vectors in d dimensions as an (n×d) array as the first parameter. The parameter 'method' is either 'single', 'ward', 'centroid' or 'median'. The 'ward', 'centroid' and 'median' methods require the Euclidean metric. In case of single linkage, the 'metric' parameter can be chosen from all metrics which are implemented in scipy.spatial.distance.pdist. There may be differences between linkage(scipy.spatial.distance.pdist(X, metric='…')) and linkage_vector(X, metric='…') since a few corrections have been made compared to the pdist function. 
Please consult the the User's manual inst/doc/fastcluster.pdf for comprehensive details. fastcluster/build/0000755000176200001440000000000013146376104013715 5ustar liggesusersfastcluster/build/vignette.rds0000644000176200001440000000031613146376104016254 0ustar liggesusersb```b`fad`b2 1# 'X\SZ\ZTZ&ZZ^W&ɏ % Md0&$yּb4M.y) 3GZY_Ӄ -3'foHf e2|s mMI,F(WJbI^ZP?oefastcluster/DESCRIPTION0000644000176200001440000000275613146533700014333 0ustar liggesusersPackage: fastcluster Encoding: UTF-8 Type: Package Version: 1.1.24 Date: 2017-08-14 Title: Fast Hierarchical Clustering Routines for R and Python Authors@R: person("Daniel", "Müllner", email = "daniel@danifold.net", role = c("aut", "cph", "cre")) Copyright: Until package version 1.1.23: © 2011 Daniel Müllner . All changes from version 1.1.24 on: © Google Inc. . Enhances: stats, flashClust Depends: R (>= 3.0.0) Description: This is a two-in-one package which provides interfaces to both R and Python. It implements fast hierarchical, agglomerative clustering routines. Part of the functionality is designed as drop-in replacement for existing routines: linkage() in the SciPy package 'scipy.cluster.hierarchy', hclust() in R's 'stats' package, and the 'flashClust' package. It provides the same functionality with the benefit of a much faster implementation. Moreover, there are memory-saving routines for clustering of vector data, which go beyond what the existing packages provide. For information on how to install the Python files, see the file INSTALL in the source distribution. 
License: FreeBSD | GPL-2 | file LICENSE URL: http://danifold.net/fastcluster.html NeedsCompilation: yes Packaged: 2017-08-20 21:16:52 UTC; muellner Author: Daniel Müllner [aut, cph, cre] Maintainer: Daniel Müllner Repository: CRAN Date/Publication: 2017-08-21 10:36:48 UTC fastcluster/man/0000755000176200001440000000000013022112552013355 5ustar liggesusersfastcluster/man/hclust.Rd0000644000176200001440000000440413022112527015152 0ustar liggesusers\name{hclust} \alias{hclust} \title{Fast hierarchical, agglomerative clustering of dissimilarity data} \description{ This function implements hierarchical clustering with the same interface as \code{\link[stats:hclust]{hclust}} from the \pkg{\link{stats}} package but with much faster algorithms. } \usage{hclust(d, method="complete", members=NULL)} \arguments{ \item{d}{a dissimilarity structure as produced by \code{dist}.} \item{method}{the agglomeration method to be used. This must be (an unambiguous abbreviation of) one of \code{"single"}, \code{"complete"}, \code{"average"}, \code{"mcquitty"}, \code{"ward.D"}, \code{"ward.D2"}, \code{"centroid"} or \code{"median"}.} \item{members}{\code{NULL} or a vector with length the number of observations.} } \value{An object of class \code{'hclust'}. It encodes a stepwise dendrogram.} \details{See the documentation of the original function \code{\link[stats:hclust]{hclust}} in the \pkg{\link{stats}} package. A comprehensive User's manual \href{https://CRAN.R-project.org/package=fastcluster/vignettes/fastcluster.pdf}{fastcluster.pdf} is available as a vignette. Get this from the R command line with \code{vignette('fastcluster')}. } \references{\url{http://danifold.net/fastcluster.html}} \author{Daniel Müllner} \seealso{\code{\link{fastcluster}}, \code{\link{hclust.vector}}, \code{\link[stats:hclust]{stats::hclust}}} \examples{# Taken and modified from stats::hclust # # hclust(...) # new method # stats::hclust(...) 
# old method require(fastcluster) require(graphics) hc <- hclust(dist(USArrests), "ave") plot(hc) plot(hc, hang = -1) ## Do the same with centroid clustering and squared Euclidean distance, ## cut the tree into ten clusters and reconstruct the upper part of the ## tree from the cluster centers. hc <- hclust(dist(USArrests)^2, "cen") memb <- cutree(hc, k = 10) cent <- NULL for(k in 1:10){ cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE])) } hc1 <- hclust(dist(cent)^2, method = "cen", members = table(memb)) opar <- par(mfrow = c(1, 2)) plot(hc, labels = FALSE, hang = -1, main = "Original Tree") plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters") par(opar) } \keyword{multivariate} \keyword{cluster} fastcluster/man/fastcluster.Rd0000644000176200001440000000571313022112552016211 0ustar liggesusers\name{fastcluster} \alias{fastcluster} \alias{fastcluster-package} \docType{package} \title{Fast hierarchical, agglomerative clustering routines for R and Python} \description{The \pkg{fastcluster} package provides efficient algorithms for hierarchical, agglomerative clustering. In addition to the R interface, there is also a Python interface to the underlying C++ library, to be found in the source distribution. } \details{The function \code{\link{hclust}} provides clustering when the input is a dissimilarity matrix. A dissimilarity matrix can be computed from vector data by \code{\link{dist}}. The \code{\link{hclust}} function can be used as a drop-in replacement for existing routines: \code{\link[stats:hclust]{stats::hclust}} and \code{\link[flashClust:hclust]{flashClust::hclust}} alias \code{\link[flashClust:flashClust]{flashClust::flashClust}}. Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain. When the package is loaded, it overwrites the function \code{\link{hclust}} with the new code. 
The function \code{\link{hclust.vector}} provides memory-saving routines when the input is vector data. Further information: \itemize{ \item R documentation pages: \code{\link{hclust}}, \code{\link{hclust.vector}} \item A comprehensive User's manual: \href{https://CRAN.R-project.org/package=fastcluster/vignettes/fastcluster.pdf}{fastcluster.pdf}. Get this from the R command line with \code{vignette('fastcluster')}. \item JSS paper: \url{https://www.jstatsoft.org/v53/i09/}. \item See the author's home page for a performance comparison: \url{http://danifold.net/fastcluster.html}. } } \references{\url{http://danifold.net/fastcluster.html}} \author{Daniel Müllner} \seealso{\code{\link{hclust}}, \code{\link{hclust.vector}}} \examples{# Taken and modified from stats::hclust # # hclust(...) # new method # hclust.vector(...) # new method # stats::hclust(...) # old method require(fastcluster) require(graphics) hc <- hclust(dist(USArrests), "ave") plot(hc) plot(hc, hang = -1) ## Do the same with centroid clustering and squared Euclidean distance, ## cut the tree into ten clusters and reconstruct the upper part of the ## tree from the cluster centers. 
hc <- hclust.vector(USArrests, "cen") # squared Euclidean distances hc$height <- hc$height^2 memb <- cutree(hc, k = 10) cent <- NULL for(k in 1:10){ cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE])) } hc1 <- hclust.vector(cent, method = "cen", members = table(memb)) # squared Euclidean distances hc1$height <- hc1$height^2 opar <- par(mfrow = c(1, 2)) plot(hc, labels = FALSE, hang = -1, main = "Original Tree") plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters") par(opar) } \keyword{multivariate} \keyword{cluster} fastcluster/man/hclust.vector.Rd0000644000176200001440000000701013022112460016443 0ustar liggesusers\name{hclust.vector} \alias{hclust.vector} \title{Fast hierarchical, agglomerative clustering of vector data} \description{ This function implements hierarchical, agglomerative clustering with memory-saving algorithms.} \usage{hclust.vector(X, method="single", members=NULL, metric='euclidean', p=NULL)} \arguments{ \item{X}{an \eqn{(N\times D)}{(N×D)} matrix of '\link{double}' values: \eqn{N}{N} observations in \eqn{D}{D} variables.} \item{method}{the agglomeration method to be used. This must be (an unambiguous abbreviation of) one of \code{"single"}, \code{"ward"}, \code{"centroid"} or \code{"median"}.} \item{members}{\code{NULL} or a vector with length the number of observations.} \item{metric}{the distance measure to be used. This must be one of \code{"euclidean"}, \code{"maximum"}, \code{"manhattan"}, \code{"canberra"}, \code{"binary"} or \code{"minkowski"}. Any unambiguous substring can be given.} \item{p}{parameter for the Minkowski metric.} } \details{The function \code{\link{hclust.vector}} provides clustering when the input is vector data. It uses memory-saving algorithms which allow processing of larger data sets than \code{\link{hclust}} does. The \code{"ward"}, \code{"centroid"} and \code{"median"} methods require \code{metric="euclidean"} and cluster the data set with respect to Euclidean distances. 
For \code{"single"} linkage clustering, any dissimilarity measure may be chosen. Currently, the same metrics are implemented as the \code{\link[stats:dist]{dist}} function provides. The call\preformatted{ hclust.vector(X, method='single', metric=[...])} gives the same result as\preformatted{ hclust(dist(X, metric=[...]), method='single')} but uses less memory and is equally fast. For the Euclidean methods, care must be taken since \code{\link{hclust}} expects \bold{squared} Euclidean distances. Hence, the call\preformatted{ hclust.vector(X, method='centroid')} is, aside from the lesser memory requirements, equivalent to\preformatted{ d = dist(X) hc = hclust(d^2, method='centroid') hc$height = sqrt(hc$height)} The same applies to the \code{"median"} method. The \code{"ward"} method in \code{\link{hclust.vector}} is equivalent to \code{\link{hclust}} with method \code{"ward.D2"}, but to method \code{"ward.D"} only after squaring as above. More details are in the User's manual \href{https://CRAN.R-project.org/package=fastcluster/vignettes/fastcluster.pdf}{fastcluster.pdf}, which is available as a vignette. Get this from the R command line with \code{vignette('fastcluster')}. } \references{\url{http://danifold.net/fastcluster.html}} \author{Daniel Müllner} \seealso{\code{\link{fastcluster}}, \code{\link{hclust}}} \examples{# Taken and modified from stats::hclust ## Perform centroid clustering with squared Euclidean distances, ## cut the tree into ten clusters and reconstruct the upper part of the ## tree from the cluster centers. 
hc <- hclust.vector(USArrests, "cen") # squared Euclidean distances hc$height <- hc$height^2 memb <- cutree(hc, k = 10) cent <- NULL for(k in 1:10){ cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE])) } hc1 <- hclust.vector(cent, method = "cen", members = table(memb)) # squared Euclidean distances hc1$height <- hc1$height^2 opar <- par(mfrow = c(1, 2)) plot(hc, labels = FALSE, hang = -1, main = "Original Tree") plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters") par(opar) } \keyword{multivariate} \keyword{cluster} fastcluster/LICENSE0000644000176200001440000000262413144412154013621 0ustar liggesusersCopyright: * Until package version 1.1.23: © 2011 Daniel Müllner * All changes from version 1.1.24 on: © Google Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.