fastcluster/0000755000176200001440000000000012470743142012615 5ustar liggesusersfastcluster/inst/0000755000176200001440000000000012470717051013572 5ustar liggesusersfastcluster/inst/CITATION0000644000176200001440000000133312147514400014721 0ustar liggesuserscitHeader("To cite fastcluster in publications use:") citEntry(entry = "Article", title = "{fastcluster}: Fast Hierarchical, Agglomerative Clustering Routines for {R} and {Python}", author = personList(as.person("Daniel M\\\"ullner")), journal = "Journal of Statistical Software", year = "2013", volume = "53", number = "9", pages = "1--18", url = "http://www.jstatsoft.org/v53/i09/", textVersion = paste("Daniel Müllner (2013).", "fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python.", "Journal of Statistical Software, 53(9), 1-18.", "URL http://www.jstatsoft.org/v53/i09/.") ) fastcluster/inst/doc/0000755000176200001440000000000012470717051014337 5ustar liggesusersfastcluster/inst/doc/fastcluster.pdf0000644000176200001440000033746512470717051017413 0ustar liggesusers%PDF-1.5 % 22 0 obj <> stream xڕX[۶~[ K/yj6}ı7C\"\>| m>lM&SmvFE2M~ѩ4jch bj!$2/ %f0RY$ATVve_Nhn*ZY81R." #:v]1D,/!74 7q"upVHe(;gTAgIth" 0&e.gDaUYM.py09(v]`О[ )K%^~ 88<Gg 9 M˻ ag$u֤+'3N+g;7#k:*0Ri7̳ᾈRzJ4#@;U(i뽾D<) )H1燱r q!CP/;B\H+_8Yix`K `ҵ1|*-MV QL(͙D2Zfߔ'eOUp~*1el*׵9O)'v(eA3-@d>XuXHP_SA"ejR 2g4$Ppp[AF-C_y8wYKM\KS@u $83C*٬(ĉ7 @[RQ4GP5~/X#yڶ. 
-bW,Z>37j5:E]k]GVĩɏtcDts5"yu#nK7Ex,b@ӦB⨻oa6@c58T?iidfTrkq{-i6/+ȁ 9s828Pe( +'&,uPJ,F6؄k2!d^Y3 L3r_ eUOY7|}3_azD(D)_R77 Tգo;8G\T=P-V Z5=;<3db D:q&Jx j ?6g4Cr#4|zף'?D$)3-1Q K܂sy+σG_aЯJ^x2 =1l@xQ0i`!(1D~R.8A`u:%N] <|)wq -'E4K{U :~52Ϭ텧zxWA&̀?PiϬ w &$7*`~ \le@9ܑsR9f-üP8) ?egmsŕ2ѤH6esG endstream endobj 59 0 obj <> stream xZo6BT-/QcޡEQ_p\JP+q*r 9#oGG9PɀX96|u_ [L\n[> OBk[ߕ]?W$)2IMM pv@A&D{ZkpGr,I^&XB}&Y(ȉXtn9/)Oa8eT% )bar[+հQnWZsCG^Ͼ->s-B7ڇbGϴ+2cBmvц\r55Øpӹ0dJd:p#) *a τf4Zy!9˳kxcñ/C1N!!))h.8&]v͵;2JiL̻~rʶ!\TMj][qgfԿȻ)šrJ.TBaz-`ՐI(=C%j1柚܃P06@,uz~yq<p8Ss 3ƒCRt"m)IOf.=*v=,!?UaDE}8fT+\M+X-4r@(PY 0<m3APwt Lwhu:=XH5yh`lGcCcf."b^O{fw鋁xNPnU7;io/` [m7M4I%SZ#=r!{tEQ)WA߮Z`DC{8>oUd`/nx-a:BKw3>5 ]$N~\ܛۮB-#ĉ5wW?Wk%̿rIvEscOclûOp'S|$RR݀>4Xhaowp슆uk;5v7Ѓ/^qq"'?y% ^p=Jg endstream endobj 77 0 obj <> stream xڽ˒T &^|˗<6T6\29`$DeӍHPf8S+h4_rj۬RV~O/?\}zXi &VvJL?ls?՟>@11vv7edp>QNS/A}!p6fu!nR$;3dk8Pz#҇c5hdhM=mz=КzvԴm8X\4{?-Ri@x!Lw%Y!g]!Xb} ?ueU{厸<8ك=u.pۮ:;U>|i-ȤHv= 7t7D(L`:8s}1)Y=t{G(sONGuv|/~u<Vv dΔ|ɲ]%DzƽZb/L]u0 cd$Nv z2"쪾6`!䉐4f誟hՏeI`PoR fzӄAmuJ5A`qWxY".(38K8*p[Ctt , %sȒg~`wBύH!2:c׵%L`H'ç9s܆; t_Oy=9i0nOtl˳R /u9+uH|K<4b*`ndD?`b clA(zw7JbDg!!_!xACXӹ%"2e8)G7qpliElIױ:[Z O! 
Ʊa FX9*ժ}V}>}?g}g?W.Kζ!`%*f`\kUTWsoA9A/a]*i:VCej8~Ies+_w#f9A{Zd, $RMxJ Ks!]vZ  EIfOόMte &ft*rc[c@Os(Pa-q6R=V}w(pMh $Np9ImPC?Do=D 0@bTR_T J4VYrrmEFѻ F$9~퓠"ڞr)TP)SLF;73j(Z4[?h_!F{6Q&[#wCe{}o-{kPo/,@$vA?~QՖBq8|݄%w]h7wo#ĨmU[lȋֈTdwlCK[ֿe$(ٜ=kWM:EIdĮBBSڀ<0<$}E[\mvQ,׻4MH<1|xV_7<˒BL||xxgJSN@ztm/qʉg"ٞhQ'|ɱ;n|tMy,31d>fz /^^Q]\.5SH΅wb G[=MrxrYX|PdeT {|@#<3}e $#j{5cZ?@~y3!&Y>߱cub5gB z+_Gx_-!$jFg+)o-B endstream endobj 85 0 obj <> stream xZ[~[)4p.ȃ6E @Z4KHy3r(.I5@r83 5{?L1ϖٛߪYIJgw-f .avW6Ӻw0w[|YkgLefٷ{lY}v$2R1tq¸yR]JAHxAB l(n_Ήۯ{ՙ]Uun}&ܩmA&y Wsuyfre޻nzmM7]|cWvxNzkVdbSJArfKٟ5.##qY)2w-иxQn^0nN<2  Mé5 .55 .eY_DkCjg8lЂC&}MDJ‹r)%)GvGT 큱=߆R9U*H\9pӉm),׭Bscm(ެvo=+x*%t:Lܻ 8ECYWoӣYq !n0տfiL5g]sK8w3$PJAĀwdžtO7:pƲ)޵xgⳛ@s$pZ9r}f~Y]4$Pzlmw*TƭTz)fJJԇ}(`8C/{̶֫l\QMVDmMg9%aHXW۔tQusɵL8'okL&'PVnQߴ|#0"/F mLt c4-:su ڄ%l~0}O [ K;i"UK\?Փ}t_yjE6Q1|&,zCL ׬N,pB:L: JF4; 04'GKjw;.=/urJ49O(s6ao>96+ν \r༝p]tJ;[N?$9#}zc5lͳ <-rۣ8vĐ:XjXS5 V—(xjݏC̈]=Zs'fm8-uKHð@ O 'T: [vG5SB0A Tq$Lt-qPS&2s'^d(NW :D)Vܫ)h=R8 !uZ=ёT ^t,,==qI!ϨQċiV/(90vDŰ S|L5s1Y"1@{̬tP:.z!ӥb=ko7 ^ I1]u>iRA$i6 (E'*0T11}XQ*&I9H)OC?#nK%$Ѕ]jYbp9;oRb|_CJw?fLb\k@pm+vkLl`L/m? }10"~zr< ?h@M`1b h.I6 -Fc O晿Z]o$ź( A4$*&'rn'W%1;̖ٖ:F祍KMMW-$CDꃱ(RbSO'%Y,oډh ,ŋm>SbWNV?v/C+[.пǴ#oH;#/-'Rg+bϳ#/ ȑ%/@-MRPDJ݅$INOp)L.䪢g/W At! t[ v;*ՙ#(MQ/}ߞ+'$FH0116 c\|O H /BI c. 
d`ι%HK]*!MpHH.cRЉ?U endstream endobj 89 0 obj <> stream xZKW a{Xc7!@-qfhKLRK~{A6GMIcd?YdG/[W׋]-4lq},|q$,Ne,I?-sZ96g:Ėuh,ͤݮpbTGթ]KS%Rx _`)CM|f\o+"J~ET3?ryEN<([v:eFΩiJ>.R= Z`Uӎw)ܠny@;8 P[T=||Ҥ+L'Nɲ[|6:6OQ[ϰĊ K4ccIÂRSii`rG06/sm8rʜ3--[K.S]z{}Np߷ T JHU_DRIi)U >_UX @i(v6.3堽ʯp [l8K),SbƬYŌ)<*fy,lTG)y|w#ӹ6)>@BL#Lt @9ܖ ѶZR][1iݫ"{7`c~L=|^Pr(1tX[- <:go-Eh|`,EhdbӔqŌs=jn"ywɹ8m24`~P͏'L^EO$ >DΜ<%dO nhcX ˈ)gstjB`]M(=(Μ7@UO;&9C=N!l)MEc*0dlـh؄EӢcZ e  UC\|l)=+sSp# Bd)ݾSCDϤC5&(Fk7ӅHT0UM0.kxzy9yus`:G:HD;-4zpAJ2̝uMr˨l4˹T C%Sz]T1Fk>#J=rpӴ҉;[vGoAMq"C@,#gXF}a` ;{yVi xG'Ü7S''䉢59yɊNʟ:ɵ8= R evѽNrP93{PV`Kj*c(162sʓ4}g$Z,6M;PT#ۤIA` 1N M=;H4TS.G&oNԗ^2@?jJ2L =zS#N9sH'N XVJp/)S զhރp@.9sז]gP;|Fke" U>,Yj`iVU}?򯢰8 ą,e3Y"7}lVuk9B6o<+fIqOzZ+_0#N$zl}q[T΍R>A>q||쐂/.T=M>>^&Sگ/ȥͻsX/}i)h67ck:dUw޾w_̵CKꪩ;pqUtaeouCP$)D+Z`_^$LĺfotJrM|z?+'} .6x]_!70h^Š /. W.>IN;L{\¬ʵq> stream xڽZm۸_N D" -rmp=rhYk gKI$C56b,Rp^yf͇ dG7)On7m RH0M Iݯɡ<jD+?˫*9UáɳdϧsFDxzYx]_7cY "%r?$ ikͻ-]]ƄzEu.˛&uw"^ZBɂ!nh߹vb 2sSou &eTpv'w&LW09B>c{~c.*t'q`pc= ,vS'vI)%EF67 Ճ[ SO0yC NƆC|翏Q̨&U6fEF "握K[PpTqR26F 8g|&VŽ$+7$u3X5†Q:N.؆"<0ʮ;k(A)SzC{]d_!8ϩ뭇k v>ɕzWv1Rh4+4-7n{r"PvAƳ%%0fy"u>gs<~/';\H3x`/(h09. d2Mg. . w؅"2KDJ ]+1&`ʄ#)R鍼2jJ2x0̗ڄ6G_JR7efו2AAO 7՘tA_,(eDYS\_Uܮ:Q*HZq~M@aMJ9JDknAocMxnˀznvU5cڻHJr3ǒ-`\rrHq 闯ྨ3ӯ,Ku/j'է)679a@ŷexrYD^礚' [֍:9X5r0SĒ=7j!OP^L#Lc %4mD,-rU$KE0y'gTW͙=ZjVI1QҔ3N8<}|~gzHm*A 7D1-{e͙@,:t^+rD4AE^|J^K[~J)_VJJmMeĤ.,t4̯)O)Zb98ki{DQձ{c[%ެ4Ыo;>Q[ 9X(fӻo;Yp"EBWwV$uEpYw{oG:MR2J ם?h| ?!vU{m&- YM˥+s?&Τ60C&y(-USQ4;H4_U*?>*ȓTW .=@HA%WhN/lq"YL6bt+(T(׊-DAמ:xpOaX.P6~]._ mzITp0ŸU? 
:87k>J l X] 3~m PAWs׹4 қ؄t|Ԏ/Q> stream xZ[~[d fy)%X'A/AZJlɕ %lO-ZLq8f>Č?1l}w?_,eQYXg ecv)*B[念u2kW*_ME˹Q[݊i(Λ/Z򟄢`M^{+P6mYщZfSJE}:(l PJGQ7pa؟Ib&4Sڠ>dR5[$ 6Enąe>hף .Y;NiԱ@B  |!|E]YC:4lK]xoF{"e$X@I=FXv@0,npb7'[7o-,qϿz}VX2M-JfgSɄ-|q8k4.;í|1t~W!< w !5"GbFK[r 75E$-5(o9(~nmT ΙaS^ ΘTqr[Ôuju۵H#y0ED=]`n2q/y uަ\H&5IW:'(BDJ&m]7} W2X>r=m~/ +r n&>.).bвcU"u OD\Pu/N&|'fԱ&/G|Z ڲ.J=̝4f)}v+vGNHf;MsZTd7]ڑ]&_l6_ vA7Rg:=1;L'w4fho 5II)%z[` ʨPYa&eee%~]MSmV+dEj f=ڔ+ej “s<&6ě .3_ o܆=% d38Vǃ#mٖڬ\lk楹a&o"wIruHc;K\r QOֶ9Jߓ# ~VEGںe7!Ʉ&U7qB8|(:Ǡ튮#B1J~|*/0u. ?>&O@-ʩ0 5#/B9 ;%Pk H3cOG汎2p0 4;e7 (Sc% $ JA]&vR}KAA%~CM'x/_45*ڱH_-9iCGv+q`1peLY,A{Dqzȹ&&n$!kNoZϵaCQSǁ(K `Q]Y`2791h6󧀼Hs$<'i骳u,c F;a478#C2t (70>'J& >.Z`x}-=p.Iu-/RΌ\,tHvJG:*xS1Lb3bP)z Ƣli'c.D01AtZ :"'.9 C@h8N\igYSbOEsȖRfހ!a=FBЊc,@#%kSާ'/p!rK2yE'"*x)AuIsPEJI0o^7qHbt;n?o< endstream endobj 123 0 obj <> stream xڝYY~ϯ0IFƊx`^d cѶ2::}XEmutRX#m'v]*?_v0OᴋUA%б{(<\?T:K(hB]yd&%8eӝ6{ :(M_TP66^ i=G:!(EFy2p)JP]y2yNekMej,࡙~5D0<|l)AM4ؼʮf|Cj,vPZQcg! -fJr1m-Ę%ɩ&F=>]@X Ju&7 )9{,M(P|! \W wwq("<@*- zBzǎ[]}k#e*FPr;\Dx}v,M)dxYUÊ0m~8ۨ1ZL|:fʂGJAUqw^Luna/5((\(//pMryx0͑]k!V=WCw.y cY?`a9lm!POWȸ؜I9m4Qd]9v<>,F^4-m2oGx+߄ *m% Q8BER8-Gn ]w(J !.6 7fc+n2$+ZX \̏.JXd?nvc/Q+ 1ؘbjQ"[M)l" Qy L8lVKKC%&b^6cažUkB5p^uLc |RWT@Q 9 nڍ>ݖtA3c$'z{.{~ha " 1zv4́Mq#'UŠRڪ`\m,㝲+S[*W\/\ybYA'zlߊ۪M1v?``X XjiyDr] =[=a_G,J@L8v4Kyb[J/J((6zj )k2Aƣxa~!D̀htGlԎǡp6 4؞nw S8i(#r[#Xl)8rāK3LqMghKl+mjzyrbI(ж0fA٪u"`NHq-v_QŸ>8R*%Œ/sjkɳYdqsX?607a ׀p0sJ0!/·dAg!t32Oi(;4v_08v: l>^0 8xq}2Օe:RmjyI|\OtEtqd"X`ԭts rA]ȑqfٟ[ϑb`p_|‘PқGcy=Nls{2i╺a7N o7ujNvUё=8& (O9OC30S2j6#915 {}wgYLJ`J,QBmMY&BvъE]Dr]tYtO]Y}UP\+C݊ [,+ՃB<-g_:`~M(p"w<*a"3l:x%#-ށ,Tu!GDҰtnV$.HH(v|-Ԙuvt|2]ACDquWRˆQЮcW )~?#O fN=r>opyJ@h wg쎧؟ j!)-'_uM2AGcP՝s^E!(+Iqw/6D /y+G!1,-gnڳWN'*"d&r6jЯ fǛӹӔ(;!M<_ /WKXj6_L:d"hz6`Y)=c[KCвhr S$4 m zZq:IJ<j~it?-$C+˼??~0 endstream endobj 126 0 obj <> stream xZݏ۸_>UFkVIM4}PleW9Yr$9_~HM{aDp73'dG?K6]ǿȤ ݇$Dd%4$wM~f*j[ L8+f_uiRA+p"]e&hFp$(XTZ[|/m<+M﫶˱r$mCq|^A[u47CDJȏu9D.K;&JJ׋]%1> in롲#̦>9X|IsF7ߜ8a~jG%;ǔ<#M,xQe~^{ VPn-<ڼWR3Z9r}FfG6RS9`ݎ 
'%E(@6u$;X;!'\NOobd,ЂK'1ف;2x2 n bDR!/D"^_kW̾م$R``lCC[`8fJ\bx2/RR.3g&}k!HlN4{}(^ٌGFe1_!V](bvB6#/Fl@Bвoّ[:MxIIf7eXWve~]62\7_kבWx g!ジeM9(׽ dңxbU0&Xbi"XV6#G̯7 fT> stream xZK۶WhWYMA[Ӣ@YtؚMjfs{!E)?'$uH~*{Ѭ4 lξIe%)Ϯo3-Vٔ+hȮu[OU^x~Wu63ڬ&L]>uWuz+OV>p@sߡv~7yB'FϿv2Jb(y9CNEXԭ}nͦVpqOWN 3ha*u3ء,4p^7ySɔËon70=ۼk?-` -y]\c5f(M%3k{ʴοLOM 'OH-\'Di%1DgMQJ"X0. ~/ em4`[ 1täLha.V eޛi4|45Mlw~2K=A zY_[f"#--i) í*4 J ɾSIX )dth8&J썲YBUWѭ\@?O8e봲:T6c+p0 K>&&P R׎*EU9jBA*DN$}h1lYGnUBC Abhxm.Y1tWd.S&$QfB~o .V蚍})e3t*r"Vt~J_g[$6xa x̅|پX^O@$l-$*ȺEsdu]*m6n&h 'Yģ[&ĭW mfDK:iVw[cPZaPzL&#s ;wB%2w0E1{xrŸ=C%{I^ԙ]%$6#!I*u{~`,1uEȒ^ R 0h)yHҀ\*v$Ѳ?ǘR!8QEQ- gxk=K(Q^HHf!gfxZ,\jmӼNE>4!/[w؊+'\m 9ҔhNlL[ E0¦ LER^끱S3<$?^%ddlu~M:mAgO%k`iJ]fq޶gvCIJsH/>؛g` B1Bo+e5lUK'1.}?>ѸI2ԧ.Y[Tks?6ͲYTmDO=@?!q?=fc3%c^9Ȣ'd;j$&OSfq,y ='=Vp^{GGZS &r 'ZeH榜 ,OZ^!6+WzKt:д3U{}[r?o\4U8$Njh;a%fKcϯ`^R{Z L=⫂y9("$Ђ\DE% v\4HPFÅa gn?Ti(pjG1yE`x pX`elwT7w]=>w! `JJBmƅIƑzGPX? R2=:L9NQJ0h,/NG#sCI&cASbB2M/^RڱP 9}A /:# endstream endobj 140 0 obj <> stream xZݏ_'~C$M Hr6@(t2wW9ړPesNSq5M|C^}XU իoU -W7+B6Rz쇪9rZ0c`O$m룭z5幧}u3yF<]g<پ68K3F)uv WB!G]]ŸQedUSo-^G602$iYeF˙(.>*)q*lP޸۵[+TPx)#Fp@p^sT )]AM6}2t+Sms'R3ՏsBBxqLfA%b)Lw%S] ,KFiȵn[Ą-m}$t#,)XO1_. 9g@֊l&XECgi7-g#djg\Cpn`U`Ë)&x,z9dUraW3Im# 5TzEBu!#)VG>n%L# '>"xs"śl XĶL`V{+&rs: xެެ~$k *NB\F?aJ΄XĦg/ƖHp]b>cѧ?Ix)/iȝ|!\%KQ˶"ԘWn`2?\ @ԣ= }yh(0yoRUs)^ǜxx.lUqƅU&/pg=XqΟ`2` ʛ)Q4k=bŔ<3n_|yܢ v2ǭ6JK`OE1z[ ya`NjˢYb9DoNLqB{5U䘦XvoZlj@Pgmso6YaȗHjBʜZm){sp֜~5B̛ᑆcuӐ h:hV38}s' ȻC8zv8#%R= ğ:-8tC^i.{ˌb ,ƕJbf:Kk; ΔMc>z6Dx__@=&rݾrWC]+rt,6uLaq=_"\PhŒ=(tZGS endstream endobj 143 0 obj <> stream xZo޿Bv_%hCpWܡ!@Dnj%JNFA 73>E"#"+W_2QrȦ̚(QnDWWw,Q\3ir_ogM,?y\pfY"cbGH5VIK NQ´og0&~l Oi/1̪k9Aa]! 輞5+MF5AR4!v 4v(?2&fupXpҞT- >Ū,j"/a2gۼ𖈮B2+ܱAXxзlڮ "lPQԭ/FDg\2$,Ců26bY뛪#inդ4p!'H'EۻH0qRzƓ2]h=P40ƻ5!ӊU.|t["Upmb~x|E@@oTËf-ͼq֑G"%H$P`?c\^2IqI;d6f2 7RvNN&*kYW"*[\Fw_#(N[rF0- NՄ#NV~[}B )z$c3a!d$!{Pn@n9reFr+a ;L?&ȇ^<4D̴~UV/l {Uusg@a*5dLJbb+g r2`xC($D&Ry.Ix?8l-)յIɌȺBȠ \. ڠЈ\@*X_A 0>߾f-b%aiх_ [L4G'k. 
Ҫ'yH嫤!mgw,)# ¸!S<) ;M驼'qEJ.jfIIHS+b8IUJ/DqL,xh'z]@qb7nNVdCۉЂ>|ues!&bv QGhągrr1}Fy7ܕ7_M'g=Ac:m†aRk痳HZ_^9XvU /ڻwBo]&BU߯we<vBzNWBBCP=hVh;00}W\`"[w2c=&3$^>?"  G*< ]!$m]pZ2;Oڻ ՟輝<\ݹ~'TsPVZ;cSz"(G=!q> stream xZYܶ~ϯ[8ȵqY)9T*8$w ⱪ@#Kvʵ*@ >x z oo'w}E x /W0up4_EUDmW33^L 6uLHʽ b^#m1%xKbwibƇoVcѱ g:UĿ2tYt Q'ZF%B%0iޏRkAģ)$I;$^@w[,G0 {K{9|o g9PXHXREӰfuѭΙJ8 NV@0\9XАDg?Bb/ptd7%~V/Bpo*/aKw2FL;wVS>}9N%'!",cPSB]Jx|:˖#FR^uB:Cqq#lW.h'8F*P#BD5f- /z"65TG%+) I-!!f)kpJxzvfnkseFA%`OD &AbWZ3QvX>WVnPr7wLƫ}u,z6}PKg>KF;QHƋ:V%L [F#Kar4x1.j7@jm'H)XwCyunyJDLF^ai;h @UYzNuz~!fg8Ԁ2 YDL[4z|VN*ID$Z/u-*˔$nÐILT *"v44'Lޟą4A38 3սȱwEˆ0kžB@8}f !p(ӗ%B(*1!"T#X[B &FhCWk`.@sBOIإI±N0~ HrL",JD@"U1QQ œFR)I|-5΂_g4ht|.@[iZƟG\KsI3H#Ua36=XWkoB1,lt! Ro V o EC[vX}tKEsn1D,`yhzP[Ͳ!{$D> O8g]ʄt$yQ..0xN uwJ %,,U5c6gQk< IȦ@[Ϊ)mPƠ5yd`8P̩)!d~S:=(V%z;!L̪auyV\1&&k*ƿ iL}#IP%KPaVUvZ7&99 e̊U b*uYE+o|2lD }[k籝R_FJin5f{e ]vaksiv"u!8OX|Kx/eЄ?tJyѩrjLb%J;[H&Sbcѐu~f]J./)ّ)䘄cv3>+9N=G}/ؿ\sPggޅVTr1z.\Lږuͦ?vIơgN{?qi|9{}%Xm=r?ĉJPK3S> WY endstream endobj 149 0 obj <> stream xXK6WV]|KlKm.CӃʒ I^  ]X"9|󠒏 K($O.y}Er{(Ar,vW߁(;BCe ˈ36Ț@Rv=2~M>KualX,y?;:M2yD$fq1!?%\IB38#T'|W&M\N8:!dg]jV׽.W EӨgY V$y\l6ӳijY Q+Mg dN(xh%0³q$> stream xڝYYo~ϯPVFW^ v'< 6^ @m>U]dӢey1oVћܤ G/&y77,t?ߚv[l$k͕Q%ܷ=-'q>a;ۖNe۶lO+aߛm~q$F"\sjvDh>HD&pr9i%ឧ8Zo;;؊g{<}_7k;NJfU ,mT:[11r=Ga/zƯWOO##GDe 7ݏM9mCT$;ۋ],f_3}ЇL[WPZbm,)NG6Hє:Ymځ_#~>ܜ._itcڧDS]" fm{f%8:%}u2$Z'E2~ߎw$ ne|8@351 'ncQwuo`- d<=0nQm|Ӝ& 5Qt!IV*@4H`:`!T;$Yy܁Q;naF4Lݎtw&X3H/l{h_t!Aw42xkMh~|k2YW w%oYNcN'7EvX}{75mPx@`P(Y؀k hTOBȻخC`UWc4U*4BTع#C8uI[2ģχ\oix|<Զۚ 9YiVA]J啥q8~,` ‰X3sP6-uhBN[N `QA^D Rd"C6 oJ̐&'cnqtz5xmY!a٤{6,R%DTt @ϏE7j4;RvdBiPsK@w1D`7O~4Pgኼ=X*?qb8;V_4oaSgij&C ;z~nϱ-mi'K?Qѳ  EFk$2W*7WT`<2tY{*hR^7 J, lg+*jٰަ{byZHz~eL#V2X$![Ѿ`T׎"J:8P8Aue"I c˺D$bل,P8bR^b! 
m=I)c>+rY;!ڇ8~m ׀t|ȕL\3ȟM`iAڧW55r<Z810|4&>^ 'r߽{mR!h,P28I"Id"cWkr^ҡgM )-&޻THbg~Ld g,-Z ^h\۔^9pspc/CbSx)DC*8v@;aɭm#.y`{,D"AtrT"Z޲Z59*pMw/0`CK4J p3> P_dPQQf85s`^m+Z7N]NPJ 80O2-BZ endstream endobj 170 0 obj <> stream xڕUn6}W"W]mX41!1 [r%*F3dMae9gC$ڑ_WHL т ARYg)x녱QK%}xqN69մ |Ь[WǙUoڮ]kb 7rj::&! NR!pДԇ3|Ǎzwz ..6.v0ܹakǒ:0!Y08S.X)Cdj;XBhPڠ}$dLá,%E̘bf PU Heo?m!Eg >::p%܋g9l gt+ldP$`?Lϩ~0(3%z )fnm.+X&dUI%T79u4 PZk_\ riT;B7غJI,E"Wd Qf /H_:PS@;Hյo.7e,q)6SոX n??Bk`Æ$xY P렕 JŻ~w76T v"&cI#AR%3Q ~ JpJ "ꮸ߷~cdLfA`m?F,8b3vn&IT"Y:ȃ U+S78 UNUgC8{<JVr.,G>`fIp8zQJ<]L1~?NGlQ+a v-q cMxcӼo RW'5<)@h.WnlI4=y8P/?Z{$I\K3H0-KSp$ȝ %goNeNd s endstream endobj 230 0 obj <> stream xڝX XTe-ۭ`xIW[jf*5D߯" }f00]PR)J]'J씙<?e| ~o0bpmOsO79::}_lnMwN.t«enpRzCoE YD@ITI!|$*4Ule[I)UY Cl* 5Y59^m6ꊀ+K?9TS?Ґ|N~$USE de myWؾdG?? XW! 4h(7暭>VIgT Lq kLc}酿|otw49.W G(1J&W5 6Vb bp7&mVsf&`mCAMKk>; T覔7 pwåi{x p&[򜘝k)8 .f((,2~IÆmqHF&%G ʥؐ~{gβR8lU7u.K cEή8~-XݫM)>4%t=JZwZBD>&@˶hʂEvZZ $gsvh!*&aym&ym6v,fq&4US l/,;q8G&gaξ?)bgq8׫ {~D@\Tmڣohb*9xixi~5iM9 +_,іww^$1޴| ͓F8Y'/FĬ.k<}\-9A[ӜKP *VJ3`)x&C1 fR-}LS _LY/jvݮ3ak~,IwȨq~,@4(Vd TA*lPKin,I sI:V G$#e舓-P 2lh_cN rG^t3p8HuAokϾM@O "ѳ fjdr\&yx] ,q"0RZƩs6YF';ƙK~r%ڀ#/]ܢ\Q :V ӄCa%mѐ.k8 Ϡ/GΕhVoZh0^~\Xf0@7XtOW(SCW!` ?=򴠁jC!(6{*K QJqx寥p~Kx\'6 ?dn20mpA2aWzQB7_Yɩ~5pÑ~g04=K/ q{5V9 OB}q~ސVx4^WY*;9k8!YV0^|>xfO[?V ?|c{>*޷Cӯi+FdK'DtvN6FvXC,O0^gɯm .~b`+vSXmLZX.G@ESh H`* wTB nhq {!PVi5cBYgdHxWC{9׍|G=*ͰGD_OleFN-B{YK{{?cϊnqiz43n,.{Ò^$M3tnJZCQ_µZ?)T|Nǁ]UqӮ(ƭw *:r;;-+5إ6j6CBGF:6I#e;Չ`+hyEd=(<~Z0Pn]{G6mg^:|4i~#}?ΘyrNE3nJLpμ.2[07W\|ZU 5wdm 6@$4A[L!&v|x ޮ. 
v KdjT?#Z'⬈'٨UHl%OT h}rHh4Iˊ@9Os99u%Mhntc\L<#v4Cq$;U@߳k@j\d M_79³u9{6K$@zBVfr8[3jJL$F3̹|cmң{n̼m08"KC ,eTiIdEb1vl*$BZ6o.lީ->rNpV+A1 WtNja;60HXZlm٦wtUTo)t(rsQ9:yS56 `8:u&`,4G7gW0FH endstream endobj 232 0 obj <> stream xmS{LSW>"sJ-kw1/ ̂s.D#.D-R(PGAyP^G{ZtlUTteqQ--n>%.rfv2=|wpL*pפgmڒz6&9)!TQ$|58I%֨9ͼi,pbφ0nAVIq\^!%1)iMi_RC-rBkF2-Mc*&3%Mfq\a" K =iғ9%d6#[Knޑ]tXVW`%m6amr<,21)փKqKDLD~B>` WqaR A)B< ߕfmmn`;9p˰cpd *{򦌉7L]2lFcSq' AҜݠQש SY{} 4pNI{g(=j57i 2<]&/0 ߭t[h`fYW hmr <  j8Q_ K$\@C D>0RﮁF! Z|Ġн4ǼC蠝j+fA} C4!R(F _e}X;KS=PDŽPl,RoGIVq )#V䘜@Xt(:NU@EwVjӴy!@6)Qpr D|!{Ӡ>}mBA:0~6ˤ?4G@M;D ~;/^^ Mv=g-,6. .2HR,Os6kvoz8ǷTigQ9& Ů|_G/*{/՞l; ]yk=q5Cp@w9'`YXHUnJaMef8fhmV { gDt9zBW[wGgsMvz p>`hE؛{Bb3G(FXb^qVI1v>`;۠v[GKN5`9r9dQn.1(B Y 1UU+B؀\Yұ@ࢿq}Kt,V|7h9Ҡ#M*oߟ%n%XU[ 7y/\FgTsEs;1U-yWqnwu~T6PHi)j> Q Bh)IQd(oTrE1M3L3qU:i_ endstream endobj 234 0 obj <> stream xڭVyPga`Ut Pcݣkj, <`P ÛFagAQ#*auMvf+۠&$^ի~E%&fPl/o2:YP7}qDl s*eUj60-$xq*IM6H$VvvtrZꦌIؗ wvrZpWʃ宎rϠJuHy*tvoVH'JF/(*\ o%L#Pl37|YجMw,a.;ycAXEa %bؘ@x*f-&>`+aہaa&,1/H&*8'gX-X͵ªK!;P4S!rZuU[(cLl*HQaA^0hh%p6'WZBS)(|pjϢ%@5u A1ݗI`[lw&|+x \[މ ڣR֦mN-JB0o+/? 7LxٛjRsl$h+]@cv:Fb I]BE1aZL`wћȊ"8/Z(9=PV%k7 r@.xI !O'vƊfsQIerr|w]hX^UxN 2 _FjDlV6L2-p#ES>}9R@`\@!89 ;r2Py @ ZAZb: d+ هhOTG;V L02 k&FF<|@W[D9X\rlFH^~mqsQ+DIoU+ɫ*^vp$sxP;~[gAHh+:q@q_⇾>`GX}&݈ȗ-WQkZr//=39nA%S!G޴^ > m vb$PWBsgC9' x RYh#1sttn~X 19A[})l-m4UA+Q!2!6%ܽϯ" [#N_]*t=x-rJOќ?WzfEDL♱[23E$c߁bFC7T,L4wVMO,%`((@i2؀]$u`=а_PjwQp`-Y1K{}pE|fJtWP +ض9<^1#ug.VaߓF݂Ƌi?q4%_pE 9j'hhڦie^'q i&?FuGM96Hj"yzh"4 +86\f;=ׯL{RPq*?>j~D@HiSy/EID.KHwB _ ix߱8#4Z?{Hs3!`ҎUW(m.zx"}̢K4p~ί<Vj.tSw;L8xCD؎P pe뎝ba[X^oR 2qPii8;a 0 cSH F1_ #bԐ]E3'bX(~^(֔Ec9f "* h8>,&+ҙStt\jskd z#GO[ ȻtHގ{#5* :Sem fms@ 5rzRdNb |ںTn. 
|메+6<͊ endstream endobj 236 0 obj <> stream xڥW \Sg!{ED o*ZZlGQXRhd=I@ (kH"V(n1T8u{ڙvߥ}l}3Λywݳ9#)H$۴ws׭O IX%<%v8 wހr:Ӱ%Lh<~_En,nH$١HZp #R=.\4>{{fyZwo$d/=f'{G("<"<=6oZfM.[s}~#!9.DA+G͢:.VSkV*uDTh^閣h]}> UHf,} mVKT)B}FK4C4;Ρs&t0•Q&a$%8xjQFpWkԐ vE??0RKRךbmj -]6{pE!?Txc&~n7CFr/{{DFGX0x\To95' *p(cfC1#gIfz4|<\fDzEzB2pP[QctZtErXƛAj5jSf(G‘0wQjHpe!9r8|CCZ:q0>L瑷i P/G$2C!ZWYK IO88߮./2bݜ[א31GŴ AV1:K/`zzT* JHS,qq)q)QtSnh@.v )悪".ʕPJ\=TTЃVF]T0?A?&H;RZd3hPfDj0q%b<<൧~ԉ|O$~XيזSMrOcW*UiQJY,Γ`w ^_A8-^(-=]!/BT>^6 A&g%*E.3 }SW)n#˸d Q1?7䅵?Mʧ+Ԥݺk\ KҡJp%N7jt+Z$tpo$8?Ӟ`֜L8n V4*+L^ #ȴtZKUH/ p.QȦu =o:]9\8$Hok!BL`"OO, ~}4}9Vٓw|| [<`Bwg,z;ѧ?Ga9S1B 2y<| x)gfIyH8'Q".J>4"Mq:EbAXJi_X| w:5p gBrLD8c tWwD@.Acw[8C?aUKMgl0: ;,ޕ4[m(6ms%uF դa 픎]gj|͋d.3>NͭgjLAfرєgUlnzr>o[p^:feY_24>VQ?~>W.,,F|`8C)f:M\hU}O]p07K &qO< J3P_hINsIwc; }B2]I!nBrPF;H~]6Zg}@4aocG9&]NiC U,Y/S5-q eru ^8)K'CS" +tjA)~hPd}TJzx /rä@?4Bm8 UU`KAV ٺ:p7Amn}V;Zf+-PyY.- J?K-2sh5 t$bm,^)T rkqc;01fʲbWGm0UzO}vUeH{d/0rꔔY#$ 'o}n鶠;U B&p>#*%Ai<[餠KˇlO&w49|_JaG閤P֔&BimM}Ji iۥXhy %/D]wPoxI *N .N}uSR 0{B›kጺ;9<#♤ƌ5)s=d,Yh1G7[YǍ誾 Q9B|Nȅivcht%2d&zaeBO X43=NV5o')oW?r4{nT}6(HYFicˇ|[L7NLo 3հ=$aK;a2ؾ5}'J_ZJ?.['ϟݷ{?J/GX< `׫y 澄mm6aH;HM&iӿ1A:g[4{(%٘sgFlKiB+ar Gbbo< gkmn;ѱ7ؙ C4p]Ao-+=‹Xh)Gf][ʁFBȬKYߝm1%-b!, y*-t-G59+'1'{]YVX&iw䒎qMkVE5f 2\Ʒju0-w..6eOT4L^hZh0fM endstream endobj 238 0 obj <> stream xڅUkTgf ֙QaYi7\XH$&fBH oMD MХA9gK5kk=ڞ珝-{9L<= #$/S.0=5-ssWn3TЫY+D8J`w?rO$I/H<LJo0('%2Kk,bPdl3)g&&&NjqrKrQ2E,ZRߪLIHKPsF9+tj%V]\t+%ǰtθCg@,&TD a#&%2b9H!R4"XE!:bA"$DK4Hdx>0}XOx\KȢeL[+ztڠӪ}gJyx `rZ(Do)`S ;_Ǿf/7sI(p6@@G X_.Q^e)=TV{l2P[|r淕hidq (2ׯ z{ϝd~"giEz:WM]$D͆@Q Z)Wg6# hÙ9Ll*Nwdf yyHf %sdpV6dQޓA-9`^C7ȡFx2iĢpQ/ ҉xЦ:8deKA&J(Yґx2Z"PXq8qQ7gphx#:9̼$fSõĺŢȡ-8YV|-8o8ĂGCF?3 nˇ,?w-/у!G1cS708OO_RY?68Qt($})-zsXVծh{UyB&O+lw|Ulb9֝,8T~l;ѽJ]JϛΤCNBE(5,l?W"Ds;:1dgAB&R(C)c}־1Ǭ*,*DIKC@7{bI޵{}5x:H^$ݚcP;&`ɚ[zG+45,|3/%'*3$,rﭱD<-{TSi]WM+n8ƚpSUYܪ}^2{M8% Llzshk7,+*4ϳW!pV/88]uNE6[*Ɍ [E`ćMqy 2YUSlr9VhoTQ endstream endobj 240 0 obj <> stream xڥYw\׷eagʸffKk$FwTĊ" EzGD`).{v&.,=vC$D,Xs\;ߋc3w=ܑPD"Yn1˖s8ajw0_`4a % 
oYTPZY%ޢ(If?J!W~_:\mMYI$2']O0en@`dvOPI&L7\gغFo' "g6%㗏]Annoaa} W;t5Ox_XK+Sc-&ZdrŦNEP/AYP ՍNzS}~5R)95zRP6ԛ@j5J ަS#Qhj 5OCM&R)T]j5zCͥQBjZB-Q˩=ZE5#ZG6PM]j1%t(%-¥VC,-ZXXldnt1SmLou?z\蹦^v>Sj]zYoL}#G1B/7#loR :[eΤV^hg`jz 9FH>3&)Z#8wVxrV뒓u $(:˗F&=)%I5}n_@maE9(b\u"p/TXCnfZ`Ћ5JIlyl,$ILդf|o/($A#4(CnjׄѽaxG& 26Iۘ>:JpvO;330IJ|7-*XHjDRO/R!%<^y&3#_kÇvYuJy1t#8#A^b&22Vɣ1j}pE6!4ގG>>V8~qz'zdMg]x)tlu o%c۟uqšbcJK_d7X%ܿRv}8=UK_Ya*0 ̳} =5\WP3KKz3+oa{ßsP8Ȍ]j?v/>1I:iۡWM;$B6{pa[Z ݈ KCmےC ² <@ԗ~zdbLW(߄rB +{gq5\xY<;Yxc:!S4i\&<cy,/9nWbb#H MVvLr8Ƃr/.At{3,kB"CsDU^G Vt,AJhn/5vŹkF'4f,YަU|c9u4 UA~ku DH% QCwPv~($'HK Yzx)nlb:eJ5>׾IӦ&*n,Iڈa-(7؜T`Itv8HRct$$r`0eןNysFdal/q S;3FQLMl;+q-QL:Y"Җ v⻭rNK>bSF4Է&i; 2ptr3{%{IPNalg9)u^ [Opp6q ݬ&nGb(M! 9"&X]2c9i XM-x7V6qa8ɝL{/hbv%Lm}̒%ǑȡNuȽiGIT; Ig'xn<-MNY杫:Hfϋ~EV?C?aM^׊{4{qWJrfL{sWA}f**wP)lhy膒)0$n lܺuK fpoG!K"46c|h3t*n'823 (mNEr B>wʧo=cPEfYAd#%_l/° 퓲 6k(AP|/jZ'_0|Q\>ːNzK(q}w[< 9B2!:]U}۬I9QZuR4vkAmpFD~C\ F}p@ ;Ǒ&H ?$ex vq-:+v0لxklAyjSr? ,~Tt |6[nT2c>B|A26@V*G ,a1"p PGw\ ;1Sa&A*E{ex :RMivXȩU"=yᦢ4;ZO f|1q~G{;U>*m:?PFnreKk7o݁(O|NB~|!/dA.OW`^x $.E-mץ EJ]G !vaPXytaϤmmc, 8<b4O@ 4MF[ar\ߚ(e៶GCKW.ֵrxf)G~?_{Pk +3ep]U.]PQzdo ZGɨ<3dO$/fh*vr7\zj~FTtVÇ;OF"Yzfr/U Qm&YS ǔ/4ؗZ(EF^H$S^C{b ɪ iKV Z jԞ%`Bա~_i]dCc4$lA]*'D ^&BfcPd'H¼3\u4QCHZ\l8NF1Y&M<śpxʝ Y6?/Emm}5k翻v0>H}9C{8G']KsJKƖZ_2KHa zjW*c0,P &wsx̮How>~Vpc>YCAmk+W I 硚\iAqC@^Һu$ pNIRA,3'6?(0%@Ϯ7`R%T {E]Z0"T,8oè NFʗG6]," D:VA4xߺU0 <I>z5 ^ʈ,8^Fr+Ws85V]c&$yn}Y3"{&Ae+)@X ܻ^ˣ&6[?[ؾ]eolIʞsDLHjgeKFdBлԨ&/IU^qn^HBN:̍3(~WB^+Wgh 1'#3d4=5h<`bW@Y̜>MS֊@#}k+PB?yoVnx. 
ё9υa<͎|x` V AL@.S9Z)u&C0Ⱥ(r=z4jNޖᐵ(L8d ,`csH"3&Vh+]a)ͽ9(+?":RY40inG\fLnxqWK{1a(:'T닊 vaAC>'u}"eֿ\ 67.+?sXpGCyYQ3Us:&($}/hMg/C: HmHrwJ&zR.-8,N 4FDJ<^Rf'msFxQh 䤘 xb$d^̡Z?Yə1,Sqb,*NS^%Ƌqf k"5ژ b3?x&"nEq>oG8d:[гZ!]*&SٷXY9_טwD,0A~d򃵆%1Qq1I334]g҆>PuG`VoZ29\WnVA3&TUs/G(2T9Qhu (FР#p1[aۼ.'8]rRpS`)2i2S5x8zEʰ5 {S; 3;C#b4y.Cٲh0KQ~>:[0 ݤ^,Ý1#62 =wgu84'"9|CSɰsQ=I}dW1&:D-?S7fs5> stream xڭY xSe>i V5z*xNEŢpuYҲRҝiڤ%i&ކPZDvupqexG{: wδOCLBp8jݮ6/^/D[%H>$H_mlxߔdȃq1>)1> ڵq.M;w-Zc=%=)"6g>2Eb5XLl%{%A$91ēSBibG,%&V/kuzbxDl&^%-xmv"$v;]n"C{bԃS9c6W1eb^ȴi71Vκ3osy8o=x~%{ig==(Q˼m.ͯz,BmsX6^;v=A~}y~y=HAFF#d41` Sӻ JkԙH hVʫ?K/9T .(i7Uu_gg@:͒6:}FF.uFI[i6 jPPfn^5@oƶpQ%]X"HHJHkhw#O˼f#94ˎx봡_)EFo )mǚKA Ej9В۟_'ѴէW/aYUErBNR6@!BuK]E+}R#Eo\)ހ(N:[rD̤([O)uz!zXXB{' 3*8Ȼ-DӜ N-W"Z̯Ʌ|z'ʆ(c*aN=lozUP]Vo5 pfsιfs]R`P .kBR&F?_WƩM.v_-ЬoZ4B5V)Hrm`+|yR(ge_}0\\z\nNU;FT1yh;`*~UõA溆\bh,3¯dl&[ȮۓF[$`*^v5;w* W.?0p-y|HS@5Y+ŕ.|p/ݟfv4B*P+E^J/nR8پ<@PNVX;>vE\z|k!h1n,.QszWQu7b2 KIPKU02?]q(uP[!(<+gCN@Ȼ465h.IHUi"B+e-UF)}I 5gAZ*GG~!^ hOQ -c={ ;|́.纂i|ޣk" G0wTցb_BJI>RRes20TXk9Ш鸞V.#dfO|CUFH!n]tvck#n;dsh2͗-*L'cB*E>HyOn;Fy$Iź }>U&ܛ6MTOFޫl?~los\Q$.{;aw!^'N J& BUA$Dϛ"[ó"3VdqgK2Yi(!A%T%fXm5W pV+4Z\ܻ`: RwTmDc ^SR&Ȩ21-Ю8 *8dBfNcq;m1ke Wq^*.7kIM[9 U-G#FQgP2^qcY=~,wY v}f20X|9|m {;}{n@0= וlt½ HK_'4YȇEE`(䍠ƒNwm(m۶gڱuhEϼ*]_Mjc5¼]I$Uc+ :}(™#{ЂΑWٹ4[y'7kOWPޑazD".qR9W K7[59c W. 
JJ/L-fImҵB5q9Я#\g*C/5Q)ǐXŗB?uiZ42eFDe(@nܠ~i M5r 5oVUZEL%V?l͚oh8lh[FZrj2?=HaTaFZb5ZJ.c]䉸QɌ;8,nrMⷠMbDŽ IsMnI%[<h:Vki8oE| yXL.R1SEqELq)m3C0 UCc|Pc@>BN)zE1~ęD:;(wRZ*hXH1[ -^?8Y`®,IX1$,,c?3`7_9b&xvaJ*L쬛 pnD;=~ %wpUuNh PZeLtB+QR۪\Er:T1lGSzƤiVC*HU,֗-WTSݥ6X>Da!nY,^OCĜ5JѬ$p.U{Bm*uR }3cň呡 j,Z`J`, ƾ)|JJT ʍŅY|rL|R `}Z8.;3+LR .gH-5@0tfr|B۝?ᾴpsfq谆3Hh5~A>.\1: Gk{OXueuf6vS_*n CZ8@ӫ_<LAZNqbpд7P\[ g IW"KiH4**Ԉ58'u2J%wѨ$p{M)h:z=Mc/L߸ -hF,b=WݔCR\^M)ǂrOgu@1u^C,҂Sk50wYOujե$dU؝ u9c+q帡2dQsjS8vWt~͖* gGFA>@"Zk̶Z$KrEQ]7»Q.)D۶9 Rkex ^{KVJhꏋ֙ 01JSS2KS˭`9ՇY}fKLz 5j[\_MBiڏR`/ZkZ/]Ƞ b`v( ^UY䁦_Ezfq3xjs!02EH5䶇{ <">jBa\_LM!tR\΂ JBec]QYe@NJlwp碶I BNm̉H;M GPXZ_Qvyl{ǩ׍ퟛ;jBr jpud'[!3I9Q'pqj k'*YÞr  ؜_C۠dEOS jO g U.gKs?ۆn?X7~ rnG "7ۮqF!m7Ѓ`񰱇O w dtmL z '2 {`1c3 <69l-ܖPF(iT!x&MxFoK&^0Z0+:BʔWc8h&E{-3G-| {G0AFekɭh0 Z#z:1_l09cbt545fBo YSVзS:eB;NaB_C5~bCS Ba4rcc<5A(N_`|r˫hI%g\ch)n,I1ƚn.5Zb*l=# WDM ԗ3aA84dҬgcBt9gct~Z3WLHߡԭ+K.;>+t<(Ԋ_|pkj9Wpg6tFCUZP YuuQANJrm \h~\Ǫjd#|I+͎?* $U1m-ϟM>M~y A~Πr,~G#9#=#(K 0 6ζn|Pa,o{Wd@(w9 {;hi'_z̭ h\׶ "3";@z e[zx< KUKw9,E"RjF]) ~TmCxs|á֔Ue<7#}-o%pݏ#6x9n#0!@fچcTp@P!waKwC>IN}t bp[͸u^ 3Յb2(KCWb.r!X}$c>mݕ-5;Oz=~Lʭ׸3k*֨j=Fɒ=u:҃ k_mVg: F k\P-: |u9vz+:Sv̚n5fʹ83Ӯ{*fuWM2Y;k6lC w endstream endobj 244 0 obj <> stream x}V XזR jb%*7]mdEYEYiYEEP:JiL$1K2fQ'Se>6b^׷{9? ʊQ(K&/_=4=7Wmᖙی24a c80$ƍy0 'Y,+2˘ f%ݳYx3>t0Ze`@fbAփvZ/ERu¸གྷס3>fcxxbne2?MQ^El8GMLXꟐC^^xx(.%@5}e)1[.7un3NjE`8ոDQ=dl[; q!C,RtcJ.?L˶Cz'+`I1hyb8,Mi5*`bƾ#B@޺'jj/0g"C#b{2)1Ю!"p4V.  
Bu:dzK q?PWCs"(Ôcv UB)6w\(nZC/r|9( )+?pFI~ )p!Pul)t)9Ut磫E md,&q$dD6 q=}Bx؝W2o@\5׭_\WztGM>Ŝ͋Vkh83% HY{_@TUuW p>炞}wxV1Qs6Yz@W1u'G5թ*4B)Ts>XMl7&olDR>x~pC[؅zRd,ǧMF· Vqm?YpT$AjM xZdpZ'& ,`՝Q @O0^ήXD'rLRb=TQ~PUyg?(GЁolxTXPZ$xBue6@y/Gfȶ-M8 r֔t!r ٩%GD:Flr}}w {SW |%Y<#ΏTrnd%=84 K姎,ʿ|NJ5[iFvq NGk,?V0߹t oD8 jڤz-UQm5~Y-$4+OhRΠ=.0dgNXrA"vY|x>} NF< i)8BC2չƶʓ--C.yA+78$1+(h?_h630D%M-ұhoj`;K;E5~$HBjO4TZz% 2 ?5Ǜ͛[L)hw5 )Gp|mL4c|xv$aGtDVH-zXvCzv_@KNq^0bi65a(;_1 5fc:i")N-1 f;$&<@"hwqkgazU}4#^qK(pUbHPMʣ[2qd\s#'`F{Ȣܗ* Քj@$nߣAfGi8dz<XbࠡjB®IY%ܔpk=Yl,ޗ|ӧ(y^z;% vo7GiCdnvB޲k,Šs&LLLtX LM8S.?RK읊J=A%Lб'rN2pQdI#G;h _0okɋ@^^%>՗?^%Y` %>|״:GZ{箜JW)k. ޱkoRPI3߫r3FipĻ ęp^Cf:sO3pTm vWETn_X7 <_B3H)Rdˀ=-U  Gd0Hє3&eg!VGZ{tz9I=EąՅ=T_H+#Tp.iPa4~_ܵW{W_&۵(A)?%i(q8qcKt)hHtأdЧ$lX^]茣]|5.}:Z6ц 4a"3Ho-zM I*[,ufVCY~'@{bc9 ]ymhkL1fWwW߲P,'ŖFon; -69%)sTX{v .{ko{vY6.vD&Aar k;"?#\SYt}*/%IjC$lvtpZ:'qcsƯ0Am|Ov޸s-a@?NPZa@, JOk.3{Xw;Ka_/>s endstream endobj 246 0 obj <> stream xڅX \T?rv<6X\J%4%1EABpAPvaߟa}w ̔\2uHk_ٽ%]Ex_gγ(J&)VmڹB^v/*[[(>KĿ( m%'aԘd'm)=%C1͆2v NisOJ{;WAa1Avޡ\6H kkM7٭q߸M/_kG”!]tyj<GjlDI9S le䭠(cʔ@QIeIYQKMPSg)fS󩗨Bj1Z.I[A䭢VK2Pk)ʕZ'IwoޢަܩMBmQ)O5#JFPЮM3n26j0.5cRo: =ΠQ+yIeNȘpn 3gL,0734:*&˩YYYX_Y`ȾL67)Kē2Q/s'&B,&{U/F-PXa"',.\F#-.a6{>PWoKILD&&RO^ -!LˉQ7 "K.iT.*(=ML{DML_.]XL2+OeW'ThQwkƐD|s/S2|޹.P'w6tƢf ʉv- <:NGuYqT13C %<ܻ[i-?(X;:c/rw}+G̈I@U^,WKJ܃ ,-0cq'rPTQ֬ Dꢢ7Z5@m B`*pB}P@:01q*b2*QPo%'f@m곜\M葜$U[ͻ-!S_Q[%XZp'2Ƈ-v!A;=|kTn;R2!QUt?>F+d^Hu:{\ty/>mmp}!A݇[x lVUGA*#|/gEzB.@W_7BqUQ~YQ\8X$kŃZ:<"e5/pa&>Cl2 3h&p*eJ̖-sE3to޸[VڜD.TfǸVy4ClORio?ʰ?>eAY1[xo{rMCSM;U;w"jϷ]{yI]Z\a`t~Z#1+'AwLW@gW$LO H<"Z((+(hAk30x_u#p}o~Iyl jV5-H*,ji?Y|GJM!3#eAҢC|NVttGu_a⡍\{TcXXTTXXcT{{cc?ki16=n43:9,M*ȏIb"Z ɤ"!#'9##EVӵbOhM3Lb 8jtM+t dn㈂?5ڣ:(!+r=̧JsB_x=7 ]%t  ̔,IT pC R9 ImݐweCtR2!u ߊ ? 
>o,9A-Ca4Y/f'Z}K[j1bCչ`YXZzT秺{ү9.ZҁiiJSϢzt6QI3%5\p*\>mu3NyU+tu7=ZS^]#۴Yn[Ibz'9IiXiHK aS4 Ȭ8˴Iҫ4lmќD[RcE OSf,-E@pL92b J)1[ $yca#.B:ypQ!]hC^F' IE޿fy.#0]ģyS!=ۖ?y*[{Z%NN3Zo8}xoH)Lg0X~}{fʐf K4}pPϫT{_lb2lǮ>(>Ű+OQJ(YߜD /~7s_F2ojT+AE`-q ´r+CEt DK\> ~Yyծ=ߐ_pVn0bP?ʸP %'=v0 ߼S(g@SK#csCMǍH{?6(f$W`+;ўMKUwE 't`OdsV`շ/ʟw;v7r nڽhvMzw_/d {@7NeFmۜ+` _3xn>HsgK#?B'mWtѕ7.~x yFOZ^7-.-%wo-vH_?Ycc9 b\MX?:}MR#q/B.t\չp.B % )k3 ֌J(U+ &*aeTrBj$aӭvHfA"&2wu%{7 j: !-OTɰӓ+s 4?pd?0n|6(+3#BSt)%Ɯ'{̹wwn"Zg㡶n3`]ޔxiZ?@klЕK_p5g+6$f!OmSO |dVXcVZq4%dҸz@.6PBbaHgSw2Nֈ0D M7cpij`[IRn;Ob?sd!gt!827Qx~Iq' ҰCf#-bd9"hs7eH-űzKxHyb$)x̔I ۧ:VAb/s:c  C GPp$F5rcME6X籞9$goi4/_9tњfp{k"jbZinMgZ^hτk*\S-XSG2$?\5r)IjX%x{/a Me3kE?:֔iթaJ((z3p纹Dδ]ICt ձnwdsݸA7X`>f_~=)h)t{f'%\] +kg؝fdʹf~#2a,S4[DLǵ ctI/n5`[MP#M=#WQkxH)/(27m+(iJO'+B endstream endobj 248 0 obj <> stream xڕyTSW_X^J ´jĊӺw+b $&,b H [HԢhk;S1K3=ul֎sp9ӿ&9s_>/LJ3&.ذqZސ("lZU9^o~wWO,M+qs{{ֽ.TM #I9GDD22yaax'ī$ u^VB,Qׅo oR fh`y8 xzM]e1*PTUZEAJ 4,W:37~DF̬ڂVGS@(E/glMsP=v+ZM/Ej[Zg vPX v{/^ U\QU^'Y7b/^?NgiQj)!e셪$r z! gjf&7&_z/o·mgd|!i/;$eIʐ&;j0vD*iR"KW7Ɯ[EoT;5B6wY .}o^i _^<7co 8/{tP$'o; ~`/fPΫY4.sEhx쩯)Y) tr8\3oO.VzE̮EBB&44? 
gm;h&:6:i[Q"ylM'S]P֎6e°( 츢qrA Q[^F ;:Sz.@-X<ɇ3_P@Y]WD?k_+ phnXj M%&C3x{/7 lkLu{ZLjhe6DSI }x7OI*J-dXhq4u6Ye@n=Xӕ䒹w#|}_m2r"0lYZ .SB6Y](򇡾S)v =ua|b'9 ɐt_a8CL4A&bp +CNTctbzQWf[FFf2-<aov2D >|<+u,zjp`l&1|7B,!YOT"=P"}/Mnet| V e@ 'b!ӺͧOZ>=ԇ4 N-1WsnXCvPnpz0lGa`饂\m8T z"yF3n/mH3KkEK:|_^Pr9\b|bH#~3@rQz&Ehd*|3KwOep!i@zέ;Hwu z `ߴhO?4fJ.խsȟ"Od)>y]}; $BtMf9I{ $=8HTQttUnm OA"ʹm s@cV[ZcWACVU1*V+;4W˗/G-Y"cA[Ee\1xR5Q}C@!+ j*;SÞA噊lӒ$.\=;):Z%)<$PPDWHMg3o)@oP>9o8ull6_&B3  T(ϼR1l l>Q99oduNj=1mxbsyX4l$N'Wd( }O SJB/tT+#xzkœ;arst_+sCh VAJAiէb IE] -{*T$srE.\h:$lv3؁4@ PT~wTzM$3g߯ MqMVݟÕFc-޵-3 pE endstream endobj 250 0 obj <> stream xcd`aa`ddpuq74 JM/I,əa!C<?/*UD~@5`tb`cdͮn,,/I-KI+)64300q/,L(Q0200 I Nz ^ٙ y) ^zz ~@L<ԌĜ44`נ` Ѐ`M=t~E9@g1v&FFk,\{c ߏ)ڹr^I7GWUUg n)^}=i9[ ?]X@ߡ z&L0qpN>u{= 0a~Du z'zpT EB endstream endobj 252 0 obj <> stream xcd`aa`dd  M3 JM/I,əf!Cy?wȰI*BD>@`\+SReg``_PYQ`d`` $-*ˋ3R|ˁ y I9i i ! A Az΅rs`4dd`bf`fdd} &,Y߽~yIww[ʮ=c}:{} 9O/>}h9ǟIYurlY=> ^luͭvݵ}ؾX.~Wo&ڦ.^1gܖur._~[+n_߭$;9~<{)XBy8'pupŽwR+xxO?r$^9 Ie r# endstream endobj 254 0 obj <> stream xcd`aa`ddwuw M3 JM/I,əf!Cy?aU |" H' ]-&H9(3=DXWHZ*$U*8)x%&ggg*$(x)34R3sBR#B]܃C5Н &h xLR7t9mJ`W鞱F.ݹrު)Kv_=NE.!P@xP;԰ߟgxwS?0X_4ѳժ?wk=9jʕ[u_.oߢfZ-Y1wºrxa[+n'Wߍs%s8~-<{ XBy8'pupoǽwRO<<z7O;W1+ dv endstream endobj 256 0 obj <> stream xڝW XSg>!sdQX@"VoNUpZDzB ;ʾ$|!! 
ZqZqtb;8'I?anNek}6Zn"*XMadkpoq?}oY׈!Ж_ <ɴ~=ٴzma!e)y+c#v;zzx̝=_8$;z; T$EF8w8q_^袐;v:*v:ng&Ǖ>޲p_~_ lL#wCMl |`Jfk"aCL"l ;‰p%fs/›XF,'V+Ujb XO|Ll 6o&FL7#9a, řVx֜4ϵXdqE/:G~LP\&tY[nza]j3f`"=Q>ޤs;Nr7H8b {($ode䁪`@{H z׮L4 $;$ҐXQz~IWx L)٭9 5P[IGCh y$=X^H5oƐ'P/*t m|@/~s)fgN%'r IoPFf;u1Hɐ|W.VWSSݨG+dͱmvf^|PBY$>tiBp3 U p%ޣ|3,Ԙsϒ(gI7ey;ILHoPR¹ySƄKE5wޅãb;Ȏ8\Ipˠ4e^Af6vǓ5o:Q#ZuoUDΩJ$(4& Ҷ.uսGxǯh!"d/Y ,dQ;+@3E׿d۳ *M-U4" i#y0c(嗮>%w2>d"AJ]ǺWZN3QB#)Ԕf7%v=ۚ00e"&favtQmh!ax-~LoNNsQS":CJKLHhO5$n?=0^`7T/(,9<ŀӝr64M0x =$Z:14l~L2ș(` o Al3'5q'1hm ƺ{j]ͧgw,ȍUa)AJ󾏹 ACTf/EHbe悘P C;>(3Ћzڙe0;dbdOd)ĮR}9$Ͽi'\¾ࣤC4EC<,+?M @hsLwzB~$AaUmkױ>wj+4o.3{uv(|>q됵< A+Pf$*98rANOIJt&+!K22aCCmŕmd6b+L?vAvhZ', :&!^` գ 2_{_ "=b3On B֚Ms'$nQC㺢+=ԩ36C5aTp4"%ߜQ9O u4(|m:&j,mO4]qÅNȐW(jدNw(}!xQcN | uΖP; / [An9e:z?q(ޣɪdN[u8<ڴZ EeE:d96B39+h_IY0VUe*r rqPUNESFRr;JUMr|9P ))lA+eLJHO< gEeh-7ϡӮ2yW!EپciYa=XVFժ }jmmThK4Zm 3oNtS Jm endstream endobj 8 0 obj <> stream x][oG~ϯHA` `dZ'2%Tϯ﫪lʲҶ UVΥ+QIYIҕԕrUVJ0spePX5RΚJwh\PUFҨJ*ZJ2"&IV ** ύ(mA!TI:J h U}a@h]eQSYVVN*;X[94UrM< vMA hq?)z U QAAW>DUV^_T5qAM|Uq" =$VР/*XtF(J!PB!nD+~R᪨440ZȖ$Eg"8  eKSXOx8tx@:bJO=1X>%H]"A2QX*CH謤 <AD^@؞TBJEb *jktPCkL)x2-qXr-i'6, ͖ڟ%WFQ)#cqYbIȫKx)FJDJ"#G'$IEI mA)rfSQ#OPN%7R#!Q#5;,,nJčq9p%JP|J+Yc"=vR i K'_ISX&^xJ لQRiRW5F8Z,ᰊeQɲbf1$4F2)+.`dbL0aĐˡ@(sPD=Ρl ]ȶIYZAu!R`6hbH9BT9MI \&R(˚Rr 2Xe(C;+*4mI诌Ҥ0E^=[e"~ 9_}5M3z}կoT(b:Wvl{ 5n#kmtM[1]+g^>yT6ly^y5_7ͺY5ͪY/y~R]nͺ\]}M}}skf=x0Wj?05l b](R.uW}v97 d_-_wfN"痉])Cx[SqXՂ'##WV{JB-`O;zV^Q'lX/o(oW}3Zl.R!0/HDX3 ^o߾v}Ry D@HZZ1\f+u KWhz"|ğ` tDߚKrN($5f{:wo ҧM|"E?_ϟ^BQ3$9ua\%!m8q5u|M/ଭLK!R@kD&҇"Aӕ9 (V!*:<.{ )XB[6z'qy`b}%?[8_VE=pB%Ā/6x pC\Pq-* 6ȋCT"cߪS!.%Pr;9[gE0!1"F7S&9k=gzTn\!. 
4t1 B#]g,˜ y>fVrRkm/Kc;~8yRݯ57IH%0xbQ9$|G1tN :Z|r?ÂzXNcM@:U qAKTfgj2k0FLZۨXNpIe 2t`nKQ Jbc?䤎l C.^ָǣ:q KT4rz<~=0Ҝ 0 \H=kJI"vJQ#,Ey|WdP䷜Db=!9h i5C4pK#G_"1ֈHu"=hplKw\|y[\]=g}2R;}F}}};Xw5B*V8\TU0QCi T adːև8M$C.ΐv\}ϲEvѨHDx`4S!AnWݑ Z;صii\;KO;;C \3TiUlǴJ3*͆| O0;/Iqc 7Dɴ63 8CA8n%A]Kq^r3%WzK[I!hzE%[̉&Y }uK1m(Kib *qgK@M@8AL>br1q#cӕAdSC30܆T̶${]K^Վۺq^Pͭ^"~\VoԫzxV)Nd!\xRf˔⬵#"cE*-W[[Ӎ bfD@06dmlZ 5apF; TprS}1/sL3LؒsEȩ^Hx4P)h7F=)Q̛-`|"HWDaXAI\SPWU"XA`XzBǞǜ&EH> S:w:L@6iŧIJ˳_\A-j"҆! Ӓvf-mOV4^vV_ :k}tu>rkRkL^ CNH<Q;${*F=.d}T9VVt>އY[U`71]E;Z5FdzW EQx 6FÎ]<39>\"-h&ƿY7ey6 /@whHFOMvsz"fL^XiO!FJirw#~b{0؄G`xKvCɢf2G'~&s8vf<8a,-f< ګEՌEٌEt>^x`K3SctNE|z9\^]1 m-+Wf,M#s䨪`s31xCv\r놯X;*hC0*sΚ!`LO=A{Nar(fݤPOkjh..pϙ⃲7OFuڦ3G؍tB5:ܷ99HTź-2ҚA*_]ħ)H ! hxrٶ[C#9Iq z? 8/Zz4ہ>X;6 p#QG8Z>Yǥ$1Qx;"7_1?q\v!?kbrC\k,NQxdcWa:T0/i@Ύp_"xWAJw54*7*FEoT cTK ؊v&5-_ܠ{YǷ|[䡟 6ۮo*ie^T;h ݪ7O0}]ܷGxl(wxe]WYOV<'Jյ? _S*<2]r'1V1GcBߓ)MD4%epB,oxRx c-}(n+ձPL>hs_ @֜PGa(9<21>mn+ԭY.Vl,FOs'Kl)C3%YCIɫ2/g9;n#cŽ! zxY\n[R>h8y_nw:w!(H Y|,NCQRI2/R鏪rrsF'}DgղE{suj3εߞ5ߖ7o~~9Ak,#^^-ɳxR!"_fƿ4X/2fMrϲy۬_tt6 Znn]%FA&;ABIJ$uJ=bW?-lmܱg@kFg:~5爂lɨg!Es|;~Nas McNn\yH%ؘB5NۑtsH>i> stream xڅ TSg_Hx<cKmօVKI "6!7AE (I(.-*j kժAتssf^ m'9;{˽w 0/Zzʹ bJYTJ|zk;c 6,%diѳN0=e]ob]=EL9a>N}"1M,uws{Օ_ߓI}fHCjU\4T!8CPco)0Yth|T) ^p*eӌ vpP&]mzϮ^=a], \9"W:qw~!^Ɓ;&z nc:,Bf} PY89çg\hܡɟ ^% 3hA0婣6m69sB>8?dM@#t8bAr {XWz#AJ*fU]vVdrȤsT;,^ؑ|>Z\1KƩӂR6teQ2-l'n7aA1D٠tJ} @pd0->Trh2B 8n,#HCE@4leH #detZv+}JIdcBA 0Yo|/A#Ή BO4eϴx(R{ G$9/`w玝xo&eu؇B6a=e| 79mSxNs~}D?yPSC[i!W/Oodl-Y)h ƚPD"GFp:6z:hMOguuFhG~?rnTpGoh‘;M|4Y3p\g F,cI<2 uB G T.<+W)iwx.x-]Pr~МgxOg!$Z ?nܖ^4:zҲo1^uUas^tRNYZ \=hO"IXDPܠ0dP<Bw =t& %5sA ^r}[f*x;U9F F J> ݴ"˱u:uÛпpH:"LA8#Nw޷v:c̉Ld 祑 C)uGߣp@*lj?YUrSڑE ^BF\ y SU7iwF _Kt&u1!AJHhTL&J̩A4 a<]f"$I8aF?CȒb hCVM )1CtWh-^y.ȟT d#G$5sa M&ڢ xjp~7^Ư}lɶI<ŬDwH`nѬן\8ACt. SQU |p] T@a@Sx`MyJ? 
tvOϥK=K=x f%| ~t !%LݸC?Vr̐+|֐<-4u̧ކkJdG^(`H,\0M'uEkRj k[Y"l `L69I1o5s"uG](ӥ㚠bpvLd(.* d~.HMÌ'wNe%K!>m92Pvd~&(T9ԖN~hחQj'4bM)bCX'rȥ GU$[(N7(y i 2#j!Uۂs%"E2= .k/Y:,d0L0#q> stream xcd`aa`ddpq M34,ILa!C<^Iy&Ú',AD@1PQ@44$83/='UH9(3=DXWHZ*$U*8)x%&ggg*$(x)34R3sBR#B]܃C5н&Ũ$Y|hj_e?X +|Rs^Ԫn@ؕ^{D;{Kod3OYz{DI3gr.<{qlr\,!> stream xڵWixTU"c7FiBBzA!{RdO%R=꫺/I* =D ,8QgV~\NœiTôcI{~QӦQ"ݚ6e˚_ONN孕g.[9; ,7ЃġiG?̊ه&5Rh+;קrϲ{8YUsev%K>DQ'KxH֧2 $KW,/ߗ^nI rc~⢄􂜄✄Snya57nMؒ>n(.+L/ PzFjyjxJ1ET U$}nj. j>ZDZL%RVQuKzj#{/H箞 !f<^rQ"/qkg`j)R2.8x:zM&[:W-kR6Ȕ.0v.}NqqDIm?mjb&?< _ toE|'%% mp F;z\!hooΒf L÷OJz;ˋ:{{;:z\% BhT^V\V:zguLS+6J*u?zR&$K@zLf J07\ac- tRɕrdvģ% {7L26VC 4jo& @l7!hZ=Yʝ ^Dgl. W5!U,I6oE Vġ4t=8;At!Fͨ@mzco.Msx? ~[.(x 2jnkdz@3rG'v1GBfאE8H=d ?WGכf Nbk)_[Wf ^q3HUr4 /TAU-&hAZ9a"$b#!O-1E9I9@؟ a⾼ 3xUB [*- V8мPWpB`0x!Vr{8ѨO 3^M 4^8tN/O[L&*|B Z 6eM=y ]\;;xQg⻦0l]sB0uk]Q&䮫ߣ\9`qxӆ3T"V0+]C跌Ih-M'Zgn\ N -wh (!\u*:0.( SI߇NKks^K"zG)~wͣbTpBZƠh𖄕/kҐ8 Ф Oݢ>r2 AW=8:\th$l V0pxa 2 euz´tϯ$ _KhCSK+lnrX}K"Rjв/Bw.}fv|fRGtBa̷2P u*VU̓hGl=hwKO{I ӡ q`W* \,L&!~Ƕ]529T> (?}w8=Ͼ kmujW4&=`e=B'IFVޫ w!_(XÁϘa X#mj+ HQ+$]QHW6x:3}Hz6s4jP:~5@՝~4HGƀ>#7wETsPtt@Cm E)&" ViKZ 8EfA7T>2i 5G/>|49$!f$;7 |ieZlssxS݅/q"w)èFNYU4X@eqhwޡgq&-"z} ^mgN:MJ뀲b]]Mttyc8-{#k4oCˏ8~p(XSM@'>xFJ+](*6qhBF&c%muͲҒ>5UyDjRI*tl57h^n0[l7vaLU*$Yv 1}8=8$D)?GEgš:PV(oify AϤ-|.0oyƖtd:qj:S[@p9fUkIYQenX1|qtX&n47/:_C;%Jom6[`%7{{-N9t{XL^#Nz L#dA=] ^aavگCx /k3Ƚe\(9s 4jcTtX X́Þ3؍IMJ ~8ΟN\FeǏ:\֓Q ;];^^PVץ@}[=I6RJUp e`&2$8mf$=`}1M +<[W#3k((#ΰgP1B 4$³l̕fOݟQ(thmQG'mwCodH  L挺4p8DO::}E=yf~Ara4vr=RGwwOKPz#/XLR[P<؝R&Jo௘çSCU@oWpzծ޾uqΝ@MZUEڜɰ^|{OP??=~ {wC ђS͊ 6G_+\:}V驲,~?HW{oHD؃inKcꓚ#( D8 Fp5s(xr p,wi=R En#UwGDփB]xTWh rj6ȶ޴.9跍Es<}F)ґ/FC-S!^vإ.8|kbJ?h2+]%mlKlMmNԒƹƟU+@.o?ޏSJIlj7 Ff68nxjLJ qi/Ϛ5f ̘0[ein47g41Nnu3 endstream endobj 265 0 obj <> stream xuT{LSgzƜIMYNm36| )ZXZm Jiop> TӸda1itq&31KL-%n5T1ι4A4-ߴ:NcX`޶zRJPA 3iaHxG,>(抴;ƏeEQ7R-J,GӨ\2]1wڒC7njhV(j֠Qj5>FMWjkTzUFMU?O5*͢ 'cEě7 YILjVl,PlҙHP*TUJMBWQ[smm޹e܂+M:}RCK(QZJ*L:M(uv.*LvA<5UDTBϠ/h Ox3Øq3#=OF1:^<ҾFYF0z!"gEzO^UϞ5 
hV;`Fl*B^T&CĄB<]!{ƶi2\&r-qvZ΀M{9G-h]*K w!?_BfLK#hLOr: 'L&01<*yAyU|\7 ^Fd&Opaspd@^inUE Ap/\!9ޓwolBc 1hnCLlX] G,KH[99p~_po #B-Nl#&VYz3]ҳ p؞!m;[HokLnCl8[ A`B zhXlkw:؂dq6P$ru/Lm{-D^RjA%CHu38N^TڬYQQRz0%)4pP)k `wHc ހ7b!K`sbbI+%%;ha'VJ,HF7¦[?*$#& Ivnhb'H!5z%BZd`6t?ϝҙ 9XN?tyP/6Ht endstream endobj 267 0 obj <> stream xڍV TT>39*rski,AE4 1 2=3Oq`rQ155%7*jv}EvGͪGVZws{o @I$EA+-LL150*:EOQxIE^&}qrc,OQv4Xߣև8gJ!(Fk2scç{xL_-C5u*sT3T>eicU[#UU+XդĭQ1--ਵŁA%!+^q^H$ ?g)5IFyHKST0 ¨p*R$J'nE0j8H)RRc(H͠fQQS>e? `J) @%Q}W%Zc"{d:LshOȻWyt*36jnDNʆKyZ$g빻(mnh&^x:ތÑ;@hrC^h#7y ./͞: JPoy^\MJ0! S< y W{:-$umU1\=w.i?s`\Z2id hML!8LGsb(\=.jdNG\Z)#dNcK["H'҄ܮuµR-B3#e?(0%1Lq)3:Akhi/hovu$gQ@rקJ+А'Cr /tpe`K+w N5f@u 7;3y+ڣQʉsfRJC 4A1GP쯉pW6A+*GVـVn8EnUԠ @#T@ ^:_""D/C^KC=)43kj i א:0&16Z0$V.j! !OW/o TPHeX]誡L%%5hxå}_'FPg:O鐠2TeSvdtA,<>/S&GDZ sewg5A*M6X 6ФGG^;9&hlSCUakq_[w%siznny‚Xxasj>zxa}#waW<5NYq涓Sj_YwR֊Et D/4BYg*$M/L" '2Ea~]<3N vɈFNǐ/EJ(E\ b$?$"M}I9yn/kE0F/-*9̋2=UZ9)yl7jSH*Au (%7&Ħ%j@}{= !H˶r맅c vOupewHZZ3(8Cր DMfկ"[;oyWn$aS?f/=Ő A' ZNUr8/܇ Ȗm[OzZRd= MV%Rhey~ٛ$,B.й>;N}AgHILbCe^Td_2;@O$`gAN'UY^TW 癶ΛBIm̭PbuP Qs('HմC})-IMg3EKѿ2a!6&b>7 ϥUEe'g؅\ fc~71|HkN'j*Ҳ'EZW#DGߓKO]3:494{tx<h wmj;I Hv*j]f<y?t{o`gmIK&jN#J}NȄ̜]?ȭ (uڷ [m1K m&WBX!CWԧ/%&׹ RjKs뇟_R΄f`X%ÆzWm,(h2L&CGGkPj*9k( endstream endobj 269 0 obj <> stream xڍ{pSUo6\(qo-"*/uX\`_( Jhii&iڴIqnI#mӴE`+變eU\tWfwgu}θ'f=iѝdɝs# @0o}0/?_,VJ+JV.ټwBZ6q'HOKN__Ly龉!X<'gނ# hWyJ&._z[_ee=4+GZR.S+˲{r/zV˲*J+eemٻ=kkӛ 6l.غ'UV`k'g ,cYĭ=Ľăbb$CyDFtb-,|By:."nZִ!^8c}]P}4"Gb򮪗#@C\ZB|&O+h&=zZ/k jA5Q&ni X]rVRf؀%CMtAw(1Jׅ;іV`LE&ݏ$MY;mZsS6)LKn-:5gt ]p@[ q,c;J#n~5k0o^ *i1}07Td 1|9b|@+q%q,s.,hF}Wɏ־A' QT t0f9l`Kqf\ F5 Ahc_#1 }"^|'ͯ]@V<Xg2[ċg Q%jz)T*b8A{oˆ\X-F iZɚ!ʴ u/N?6MP5*CEY<(>^-@~ 'NPS$+B_0ZS8bD@6ة *~@Fڡ68vjxHTV>tE;kHNxXj%.+c0/$bȞ tbڙ]YD$ÂH84Fk6L XjQ LguZA&Z~ZEܼBS*9m+ZTUa΅| HvϷLP89 IٯOL2{\:0uPi񏡫99Z%ߍ#N{$\gr~UVȔtZA{ȆeP隒+Q=$xYc`*I4P N)S3̺.aD%;=识N/02Ct yu`D)֥GxwpJ맟P[&{,>}eM. 
̨=`[ {cĆJ7tBw&QM{5XI֮n^fF\C cl75< pZ܄B/'wXJ 1Tpn7dY  &.0Gҁւ5;k7J2E{_zbQpѺ@=nJYV3Xۉ6_߭ʼi/3`zm#7pW5]|IRj%Mƒ" 폍3>u+gZ?k3٭ %=3IO }y1s9Oݕ.s8À[*jXeZ;Ԑ0OHPbV*Q|E:K6hyhq1=Iہ!;&#8𴃣=za UnH )2>>pI.Od[QRұdMl{T[BtyCWW΀3h-^itapaՓ#@r<~,zА}ۓn'+R(s8_"͇D.Ɗ:r>h)C #0g-=cgȯ)gѦ[LNio۴1% ݵ!uE|y@Ӈl C[x2y賏%LQY,"e{ROs" 3Rg_`̩ӣ !Pdj*RiTl7swIKTkIf5 oS:`&JBuNu %< lm$׍-m|Xypbv6 kp'γ;rppjmd ,1]-"PV蔪}r#Noo˛2U63UQ\VgDfh۠5CNlUOuw5sŏŧ"^X/mMdoFRM6\y0t^./O(Oߤlf^Ed6#޾N 53u fg{X z 1W ltܴd)K`rjC7H<CDH"˜͜33.͌sN!'9p rqY> stream xm_HSqmӫ͑Aa$]mi濬mٝmWe|-SC(*ҋ { §z0 ̠{.m9Z#]X_ E8rg_Hd:5h"ԐI+byc>Xb5Tbbf-b CrVC;dUu;+(HKEn#s8 Bq&[v>'jVb*a䗻P)tR>|u^j𶴵ʜjB9 Vwx^Y& Pgݼufyҟ .Qq\HrqGxBVi<-歳ܡY-gJ#* P@rSg*Rl"S*bDS-.-~#l]U[)8N>[Yz?ϻb:WlwחaOo߉C)k6Sd4P/kߠucO@o#$)!Cmhu"'H%HH\̭~}[ jSDwmlaQLk"y budnݖw}e|N*;M$.'FFx&_sڤE endstream endobj 273 0 obj <> stream xڅTklSG $i)rBXR-"#D Mĉmtؾǯ8NhI Uh-]jt!j*F}/eqxjљsf}{1jxgחt-&NKVpvK>ɞڴ.A.MOh2CU/؟=7JsyЭxKCeh1*BkfڍP+a4x9^Yo[ꬖ*z:Tﹻi<عrk'w-[-͎}{\1ϝw5znmU((,|Y[}y_-[clnțLkmMTW)o*R o5b[l1Yd[\f)X}qK j5Y? ZM)ډ؇X8C8#GRhQ- oyҾl4hhgiOcYSByBx&O\!CoDV:~2zl&X op-bX 0ì< v^A0hO+rVHT[@{ ¾^)䫅P݋X\znrxO1_HER!fM}t{QG$J۟:Qr VD֑qqr/<}: h7\.-;TDه86 BxtZtBC_Xhr*(U)a? 
n4 ߪTr!RĆ>eꁍU}'j}169-9{Z,f%FI7=M4zο^YfVT6J`nRC !n &^#N|8.%% FbgV7@.OJ w ?&~3A?LMO{ckj^L$x/3 J' *!%_fvad53J@qtڒp2 z j$s=w^/FeGo MLOP:zTq$L3u=;9zEi0t u@H uJk wיdIl}m֧68b5AƂ!@X ۳,HkUM<,ɫ)bUaD}VAC⪐5~᜔}#iyK鵿_x/t{42xCp"=jUbn6}W #7^> stream x]kHSa߳{:BJ'+ql% t`$д9sL;:efktJSfNz/+lVgV%2εن:!JH A(b%B2T&$mߔTjzHI/:'Jlqhя[p `@dMT ?-,{73r<隁[F0@pј3Av 6e"HW7=fbT*]9`&5N W5)> stream xcd`aa`ddsr v54TC Yr?Yd { TX9j *23J45 --u ,sS2|K2RsKԒJ +}rbt;M̒ ԢT kiNj1A@g30110]gZ}?˺D׳}]3zqʴI}݋9˱͝=W{r뤦޴QgWL<ss|7slݕU')=%rv\FͩnjG{Ţ7޽@-Mrպr#T*8~~]н䃭~T6-˻peڽy.w6?ٻ &Wr|ZtROY {g*_~'Mg̵{ b1 z endstream endobj 279 0 obj <> stream xڅT{LSW 而Y 4 D])˘ZP(/ -@WZ(Ipa$ Pt..&Ne˹)KdYNG}c>>+w"+m+K%ޥ7gP>x~bﱊ[;;p _@/Fs{7qAfAbsdTԫ{dŊRѼr樨-ne)Dёxqv@"G&DeܤD^V$KsE\QZQzjLJ(6%)=9uCT/ eb)'Wec9+sX,biX:0q>^ i|Jȳb,gQhMr<R`:-Է,JtJR42"֏]ήgCg_##As;\Cʨ ϋۑ DfC;LJ@|nyjVXž(C\GwvCtZM]zi@do+h$5 DEgh2s0}]͵-컏n޷K$(F8oi?xV]4b $@IJJ–:JC l$Y ⴧ2ô&SAZ/n3GA!{/BaŅ1e Tl. b?jkIʫS8VmnDYkmѱ2Oȇ0vѲ9tYxGA]vz y uUB eDQ_儻o3u1M-e tMMUwqG2_ QՅj*}4$46 h:"q\:N`?5h#td3;hX"dvEBYl ʊ<:  Ns]NACI͎ ]~[k켄B @뺫 j%PO #w uM~uKyvlEQ!Ö|ӡ yNZy("8C0=Q sH(\JwE;v7Md#lY0*B:umF~^Gv.Q>B*h"Aܨ>.B Di,E? 
V Q-кUC@}Vs($P, > stream xcd`aa`ddt rrt M34 JM/I,If!Cy?Nɰv*RD>@`b`edd-1300q/,L(Q0200 I Nz ^ٙ y) ^zz ~@L<ԌĜ44`נ` Ѐ`M= rsa:E̜e}+W:ccecs֭3gn.<{޴ XBy8'pupgRoo %[&N0We|p endstream endobj 283 0 obj <> stream x]Pn0[^@TR E w`X:-c >ؠ+Rj_3YtL1hH€!VVG3|H)-GRwvmW8.:G`gcX݀Oi4CnM'-Ru+w5iT$MDQ ڶfH߮è~e`<~ɸ"1$j aӐ_%!-|Bo= endstream endobj 295 0 obj <> stream x]Pn0 ӡ]@|,ю$yR HGc L'85bِ„!V7JGVҳKZ Imoq~T/Gh|8[Lh}XLaӇvd;h 8] -Rڶ(1z0HZ [}2$v0Yd`y;9C=Ҟ¾/rC|w>(l endstream endobj 303 0 obj <> stream x]P=o0 +<^*[" E8Q$n`K~~3o'00lhQ嬬7IZo&^/]yvؖɁ 68k7K¾`hӵ22"E(X]q"4.^* fd(j]W3$w(Id`3=g&Ur4QZn_'U?nz endstream endobj 306 0 obj <> stream x]Pn0+*^ M$ qC%c/R1ؠz5;&sj֢^(nqԓeB$.ۦp3I OjDPv8:ts;ez EAotv݉_> stream x]j0>YNEE/ԙ L.kEw[j΍ogxzL#t8H BrE$>d !EJ1& _.E[ܜy( Yp8 Sо@'kFB!K2#F-0)]omYFra„:9KȧodSプۤIpa[m.I筱*?P~ endstream endobj 260 0 obj <> stream x͚[OHW*i4000` B[NBQnꪶ&0Τ㨙YIǙޡ =f\, :LztR3գg`qC!=CW%QN1O@xA#2L -SJ`(4 㘲 sN0-L+AdZPKŴZ09YXO) 3:T1rN=XYfn058Z D^ine"0]P?ptNRE`35/e^4o0&n*0p# ,H*, B`uq,x4GcB Ԫ Ah +z*R\q7H}8Az}x7ONv=HL7vG?uC pX`{ѺdHv@a ۀRe(n%{GW"QD;BΘj7ej}*xNz^UV.Ѧ%[zV˭mHs2zHBvNAֶJj͌j*bK(U*&5Z5zD @G VdH)*P[ڠEK.Pv;[_!h8Zӷ) N_pJ)kq:]۽|UJ^0$}p'g0wu8U(ɠx: LMj#$ͳ{nD4z+;NfةUXTFA\Un].$MhOծ@ZDgJT*(,*f/<9ݝ?K>ȏZD~Z,bqQHjP%Υ!Ip~9?2d?o4N4 QGS&$4TCKqHRaSY &󣃝i#4Q]-M?K>[$E-ZTblgLیZO'TXP-qwL.6{ff}:[v7 u<@ԠEK CíӈSqa{FJrG^01ms{j"O~Dފ(YצV+cesvUF.Owh~~"EsK:6zJ<^eljԒ`ʀ`= 9.zSqN>\eW/OH _ +K/O(rQRmZҵ͗h8=>}mXy- 归\.*Ҙҿ,|;Ygb‰EQDvxo}+Wr㓋vY,ԀyY!뢐TheU[`"ol~#φhْ Z.ܥkԴzJ;~"QzM`xK#G;G_?#=v.J Z#_G(|koσVxgZ{W=_G# |E"tK 绱]퉇iX֧>̶΢RL;>+?sʮ\Iw"wʄ|TԘKΠpf6{ʆɿϣ< P lxmrOn(iZt-}_q>aYǢūgȰͰ{xf'ft\Rl]5&l?|OK3 6CYBxI6hN]#*MD]: G9eD4Y>zdi[:GgSzS I:ey2?4_L3|SIgLJk|GA=,ҵy|ސ(5d?v@q37 IoHwIHQ;G endstream endobj 319 0 obj <]/Size 320/W[1 3 2]/Filter/FlateDecode/Length 726>> stream x5WUQE  [EE [Q,΁Crȑ3oL|w}νwamO 0EZyJ&eT*,XXkb-!בYk7Qx,Xc"beðx0 {u.|9ؐ6dF(>Ob&3߄!7f[`KC [ӴV#~Taq,e0^!gc,{7Cl\78G318V6-We99ތ"ٚo&]ٖL8 'sTqlw0,sp.@W-/-Wjuz܀p#n͸ b\+qnN<1wQ܃{qxa<0cg =?ݨ7RtW<}Vo;)R~K?Jmgx${,Es^F1#1KI=GJ{z.-z.#O%|-]x﹬t9s9L>"bc5 Xb73_'N$L]c9 a/H gfUzQ~Q 
endstream endobj startxref 113543 %%EOF fastcluster/inst/doc/fastcluster.Rtex0000644000176200001440000012016112470717051017543 0ustar liggesusers\def\fastclusterversion{1.1.16} \documentclass[fontsize=10pt,paper=letter,BCOR=-6mm]{scrartcl} \usepackage[utf8]{inputenc} \usepackage{lmodern} \normalfont \usepackage[T1]{fontenc} \usepackage{textcomp} \newcommand*\q{\textquotesingle} \usepackage{amsmath} \usepackage{amsfonts} \usepackage{xcolor} \usepackage{ifpdf} \ifpdf \newcommand*\driver{} \else \newcommand*\driver{dvipdfmx} \fi \usepackage[% pdftitle={fastcluster manual}, pdfauthor={Daniel Müllner}, % pdfsubject={}, pdfdisplaydoctitle=true, % pdfduplex=DuplexFlipLongEdge, pdfstartview=FitH, colorlinks=True, pdfhighlight=/I, % pdfborder={0 0 1}, % linkbordercolor={1 .8 .8}, % citebordercolor={.5 .9 .5}, % urlbordercolor={.5 .7 1}, % linkcolor={blue}, % citecolor={blue}, urlcolor={blue!80!black}, linkcolor={red!80!black}, % runcolor={blue}, % filecolor={blue}, pdfpagemode=UseOutlines, bookmarksopen=true, bookmarksopenlevel=1, bookmarksdepth=2, breaklinks=true, unicode=true, \driver ]{hyperref} % Optimize the PDF targets and make the PDF file smaller \ifpdf\RequirePackage{hypdestopt}\fi \renewcommand*\sectionautorefname{Section} \usepackage{typearea} \DeclareMathOperator\size{size} \DeclareMathOperator\Var{Var} \newcommand*\linkage{\href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html}{\texttt{linkage}}} \newcommand*\hierarchy{\href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html}{\texttt{scipy.\hskip0pt cluster.\hskip0pt hierarchy}}} \newcommand*\hclust{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html}{\texttt{hclust}}} \newcommand*\stats{\href{http://stat.ethz.ch/R-manual/R-devel/library/stats/html/00Index.html}{\texttt{stats}}} \newcommand*\flashClustPack{\href{http://cran.r-project.org/web/packages/flashClust/index.html}{\texttt{flashClust}}} 
\newcommand*\dist{\href{http://stat.ethz.ch/R-manual/R-devel/library/stats/html/dist.html}{\texttt{dist}}} \newcommand*\print{\href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/print.html}{\texttt{print}}} \newcommand*\plot{\href{http://stat.ethz.ch/R-manual/R-patched/library/graphics/html/plot.html}{\texttt{plot}}} \newcommand*\identify{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/identify.hclust.html}{\texttt{identify}}} \newcommand*\rect{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/rect.hclust.html}{\texttt{rect.hclust}}} \newcommand*\NA{\href{http://stat.ethz.ch/R-manual/R-devel/library/base/html/NA.html}{\texttt{NA}}} %\usepackage{showframe} \makeatletter \newenvironment{methods}{% \list{}{\labelwidth\z@ \itemindent-\leftmargin \let\makelabel\methodslabel}% }{% \endlist } \newcommand*{\methodslabel}[1]{% %\hspace{\labelsep}% \hbox to \textwidth{\hspace{\labelsep}% \normalfont\bfseries\ttfamily #1\hskip-\labelsep\hfill}% } \makeatother \setkomafont{descriptionlabel}{\normalfont\ttfamily\bfseries} \begin{document} %\VignetteIndexEntry{User's manual} \title{The \textit{fastcluster} package: User's manual} \author{\href{http://danifold.net}{Daniel Müllner}} \date{January 7, 2015} \subtitle{Version \fastclusterversion} \maketitle \makeatletter \renewenvironment{quotation}{% \list{}{\listparindent 1em% \itemindent \listparindent \leftmargin2.5em \rightmargin \leftmargin \parsep \z@ \@plus\p@ }% \item\relax }{% \endlist } \makeatother \begin{abstract}\noindent\small The fastcluster package is a C++ library for hierarchical, agglomerative clustering. It efficiently implements the seven most widely used clustering schemes: single, complete, average, weighted/mcquitty, Ward, centroid and median linkage. The library currently has interfaces to two languages: R and Python/SciPy. 
Part of the functionality is designed as drop-in replacement for existing routines: \linkage{} in the SciPy package \hierarchy{}, \hclust{} in R's \stats{} package, and the \flashClustPack{} package. Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain. Moreover, there are memory-saving routines for clustering of vector data, which go beyond what the existing packages provide. \end{abstract} \noindent This document describes the usage for the two interfaces for R and Python and is meant as the reference document for the end user. Installation instructions are given in the file INSTALL in the source distribution and are not repeated here. The sections about the two interfaces are independent and in consequence somewhat redundant, so that users who need a reference for one interface need to consult only one section. If you use the fastcluster package for scientific work, please cite it as: \begin{quote} Daniel Müllner, \textit{fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python}, Journal of Statistical Software, \textbf{53} (2013), no.~9, 1--18, \url{http://www.jstatsoft.org/v53/i09/}. \end{quote} \textbf{The fastcluster package is considered stable and will undergo few changes from now on. If some years from now there have not been any updates, this does not necessarily mean that the package is unmaintained but maybe it just was not necessary to correct anything. Of course, please still report potential bugs and incompatibilities to \texttt{daniel@danifold.net}.} \tableofcontents \section{The R interface} Load the package with the following command: \begin{quote} \texttt{library(\q fastcluster\q)} \end{quote} The package overwrites the function \hclust{} from the \stats{} package (in the same way as the \flashClustPack{} package does). 
Please remove any references to the \flashClustPack{} package in your R files to not accidentally overwrite the \hclust{} function with the \flashClustPack{} version. The \hyperref[hclust]{new \texttt{hclust} function} has exactly the same calling conventions as the old one. You may just load the package and immediately and effortlessly enjoy the performance improvements. The function is also an improvement to the \texttt{flashClust} function from the \flashClustPack{} package. Just replace every call to \texttt{flashClust} by \hyperref[hclust]{\texttt{hclust}} and expect your code to work as before, only faster.\footnote{If you are using flashClust prior to version 1.01, update it! See the change log for \flashClustPack{} at \url{http://cran.r-project.org/web/packages/flashClust/ChangeLog}.} In case the data includes infinite or NaN values, see \autoref{sec:infnan}. If you need to access the old function or make sure that the right function is called, specify the package as follows: \begin{quote} \texttt{\hyperref[hclust]{fastcluster::hclust}(…)}\\ \texttt{flashClust::hclust(…)}\\ \texttt{stats::hclust(…)} \end{quote} Vector data can be clustered with a memory-saving algorithm with the command: \begin{quote} \texttt{\hyperref[hclust.vector]{hclust.vector}(…)} \end{quote} The following sections contain comprehensive descriptions of these methods. \begin{methods} \item [\normalfont\texttt{\textbf{hclust}}\,(\textit{d, method=\q complete\q, members=NULL})] \phantomsection\label{hclust} \addcontentsline{toc}{subsection}{\texttt{hclust}} Hierarchical, agglomerative clustering on a condensed dissimilarity matrix. This method has the same specifications as the method \hclust{} in the package \stats{} and \texttt{hclust} alias \texttt{flashClust} in the package \flashClustPack{}. In particular, the \print{}, \plot{}, \rect{} and \identify{} methods work as expected. The argument $d$ is a condensed distance matrix, as it is produced by \dist. 
The argument \textit{method} is one of the strings \textit{\q single\q}, \textit{\q complete\q}, \textit{\q average\q}, \textit{\q mcquitty\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward.D\q}, \textit{\q ward.D2\q} or an unambiguous abbreviation thereof. The argument \textit{members} specifies the sizes of the initial nodes, ie.\ the number of observations in the initial clusters. The default value \texttt{NULL} says that all initial nodes are singletons, ie.\ have size 1. Otherwise, \textit{members} must be a vector whose size is the number of input points. The vector is processed as a \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/double.html}{\texttt{double}} array so that not only integer cardinalities of nodes can be accounted for but also weighted nodes with real weights. The general scheme of the agglomerative clustering procedure is as follows:
\begin{enumerate}
\item Start with $N$ singleton clusters (nodes) labeled $-1,\ldots, -N$, which represent the input points.
\item Find a pair of nodes with minimal distance among all pairwise distances.
\item Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively $1,2,\ldots$
\item The distances from the new node to all other nodes are determined by the \textit{method} parameter (see below).
\item Repeat $N-1$ times from step 2, until there is one big node, which contains all original input points.
\end{enumerate}
The output of \texttt{hclust} is an object of class \texttt{\q hclust\q} and represents a \emph{stepwise dendrogram}. It contains the following fields:
\begin{description}
\item[\normalfont\textit{merge}] This is an $(N-1)\times 2$ array. Row $i$ specifies the labels of the nodes which are joined in step $i$ of the clustering.
\item[\normalfont\textit{height}] This is a vector of length $N-1$. It contains the sequence of dissimilarities at which every pair of nearest nodes is joined.
\item[\normalfont\textit{order}] This is a vector of length $N$. It contains a permutation of the numbers $1,\ldots, N$ for the \plot{} method. When the dendrogram is plotted, this is the order in which the singleton nodes are plotted as the leaves of a rooted tree. The order is computed so that the dendrogram is plotted without intersections (except the case when there are inversions for the \textit{\q centroid\q} and \textit{\q median\q} methods). The choice of the \textit{\q order\q} sequence follows the same scheme as the \texttt{stats} package does, only with a faster algorithm. Note that there are many valid choices to order the nodes in a dendrogram without intersections. Also, subsequent points in the \textit{\q order\q} field are not always close in the ultrametric given by the dendrogram.
\item[\normalfont\textit{labels}] This copies the attribute \textit{\q Labels\q} from the first input parameter $d$. It contains the labels for the objects being clustered.
\item[\normalfont\textit{method}] The (unabbreviated) string for the \textit{\q method\q} parameter. See below for a specification of all available methods.
\item[\normalfont\textit{call}] The full command that produced the result. See \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/match.call.html}{\texttt{match.call}}.
\item[\normalfont\textit{dist.method}] This is the \textit{\q method\q} attribute of the first input parameter $d$. This specifies which metric was used in the \texttt{dist} method which generated the first argument.
\end{description}
The parameter \textit{method} specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes. Denote the dissimilarities by $d$, the nodes to be joined by $I,J$, the new node by $K$ and any other node by $L$. The symbol $|I|$ denotes the size of the cluster $I$.
\begin{description} \item [\normalfont\textit{method=\q single\q}:] $\displaystyle d(K,L) = \min(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the closest distance between any two points in each cluster: \[ d(A,B)=\min_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q complete\q}:] $\displaystyle d(K,L) = \max(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the maximal distance between any two points in each cluster: \[ d(A,B)=\max_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q average\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}$ The distance between two clusters $A,B$ is the average distance between the points in the two clusters: \[ d(A,B)=\frac1{|A||B|}\sum_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q mcquitty\q}:] $\displaystyle d(K,L) = \tfrac12(d(I,L)+d(J,L))$ There is no global description for the distance between clusters since the distance depends on the order of the merging steps. \end{description} The following three methods are intended for Euclidean data only, ie.\ when $X$ contains the pairwise \textbf{squared} distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. \begin{description} \item [\normalfont\textit{method=\q centroid\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}-\frac{|I|\cdot|J|\cdot d(I,J)}{(|I|+|J|)^2}$ There is a geometric interpretation: $d(A,B)$ is the distance between the centroids (ie.\ barycenters) of the clusters in Euclidean space: \[ d(A,B) = \|\vec c_A-\vec c_B\|^2, \] where $\vec c_A$ denotes the centroid of the points in cluster $A$. 
\item [\normalfont\textit{method=\q median\q}:] $\displaystyle d(K,L) = \tfrac12 d(I,L)+\tfrac12 d(J,L)-\tfrac14 d(I,J)$ Define the midpoint $\vec w_K$ of a cluster $K$ iteratively as $\vec w_K=k$ if $K=\{k\}$ is a singleton and as the midpoint $\frac12(\vec w_I+\vec w_J)$ if $K$ is formed by joining $I$ and $J$. Then we have \[ d(A,B)=\|\vec w_A-\vec w_B\|^2 \] in Euclidean space for all nodes $A,B$. Notice however that this distance depends on the order of the merging steps. \item [\normalfont\textit{method=\q ward.D\q}:] $\displaystyle d(K,L) = \frac{(|I|+|L|)\cdot d(I,L)+(|J|+|L|)\cdot d(J,L)-|L|\cdot d(I,J)}{|I|+|J|+|L|}$ The global cluster dissimilarity can be expressed as \[ d(A,B)=\frac{2|A||B|}{|A|+|B|}\cdot\|\vec c_A-\vec c_B\|^2, \] where $\vec c_A$ again denotes the centroid of the points in cluster $A$. \item [\normalfont\textit{method=\q ward.D2\q}:] This is the equivalent of \textit{\q ward.D\q}, but for input consisting of untransformed (in particular: \textbf{non-squared}) Euclidean distances. Internally, all distances are squared first, then method \textit{ward.D} is applied, and finally the square root of all heights in the dendrogram is taken. Thus, global cluster dissimilarity can be expressed as the square root of that for \textit{ward.D}, namely \[ d(A,B)=\sqrt{\frac{2|A||B|}{|A|+|B|}}\cdot\|\vec c_A-\vec c_B\|. \] \end{description} \item [\normalfont\texttt{\textbf{hclust.vector}}\,(\textit{X, method=\q single\q, members=NULL, metric=\q euclidean\q, p=NULL})] \phantomsection\label{hclust.vector} \addcontentsline{toc}{subsection}{\texttt{hclust.vector}} This performs hierarchical, agglomerative clustering on vector data with memory-saving algorithms. While the \hyperref[hclust]{\texttt{hclust}} method requires $\Theta(N^2)$ memory for clustering of $N$ points, this method needs $\Theta(ND)$ for $N$ points in $\mathbb R^D$, which is usually much smaller. 
The argument $X$ must be a two-dimensional matrix with \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/double.html}{\texttt{double}} precision values. It describes $N$ data points in $\mathbb R^D$ as an $(N\times D)$ matrix. The parameter \textit{\q members\q} is the same as for \hyperref[hclust]{\texttt{hclust}}. The parameter \textit{\q method\q} is one of the strings \textit{\q single\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, or an unambiguous abbreviation thereof. If \textit{method} is \textit{\q single\q}, single linkage clustering is performed on the data points with the metric which is specified by the \textit{metric} parameter. The choices are the same as in the \href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/dist.html}{\texttt{dist}} method: \textit{\q euclidean\q}, \textit{\q maximum\q}, \textit{\q manhattan\q}, \textit{\q canberra\q}, \textit{\q binary\q} and \textit{\q minkowski\q}. Any unambiguous substring can be given. The parameter \textit{p} is used for the \textit{\q minkowski\q} metric only. The call \begin{quote} \texttt{hclust.vector(X, method=\q single\q, metric=[...])} \end{quote} is equivalent to \begin{quote} \texttt{hclust(dist(X, metric=[...]), method=\q single\q)} \end{quote} but uses less memory and is equally fast. Ties may be resolved differently, ie.\ if two pairs of nodes have equal, minimal dissimilarity values at some point, in the specific computer's representation for floating point numbers, either pair may be chosen for the next merging step in the dendrogram. If \textit{method} is one of \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, clustering is performed with respect to Euclidean distances. In this case, the parameter \textit{metric} must be \textit{\q euclidean\q}. 
Notice that \texttt{hclust.vector} operates on Euclidean distances for compatibility reasons with the \dist{} method, while \hyperref[hclust]{\texttt{hclust}} assumes \textbf{squared} Euclidean distances for compatibility with the \href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html}{\texttt{stats::hclust}} method! Hence, the call \phantomsection\label{squared} \begin{quote} \texttt{hc = hclust.vector(X, method=\q centroid\q)} \end{quote} is, aside from the lesser memory requirements, equivalent to \begin{quote} \texttt{d = dist(X)}\\ \texttt{hc = hclust(d\textasciicircum 2, method=\q centroid\q)}\\ \texttt{hc\$height = sqrt(hc\$height)} \end{quote} The same applies to the \textit{\q median\q} method. The \textit{\q ward\q} method in \hyperref[hclust.vector]{\texttt{hclust.vector}} is equivalent to \hyperref[hclust]{\texttt{hclust}} with method \textit{\q ward.D2\q}, but to method \textit{\q ward.D\q} only after squaring as above. Differences in these algebraically equivalent methods may arise only from floating-point inaccuracies and the resolution of ties (which may, however, in extreme cases affect the entire clustering result due to the inherently unstable nature of the clustering schemes). 
\end{methods} \section{The Python interface} The fastcluster package is imported as usual by: \begin{quote} \texttt{import fastcluster} \end{quote} It provides the following functions: \begin{quote} \hyperref[linkage]{\texttt{linkage}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, preserve\_input=True})\\ \hyperref[single]{\texttt{single}}\,($X$)\\ \hyperref[complete]{\texttt{complete}}\,($X$)\\ \hyperref[average]{\texttt{average}}\,($X$)\\ \hyperref[weighted]{\texttt{weighted}}\,($X$)\\ \hyperref[ward]{\texttt{ward}}\,($X$)\\ \hyperref[centroid]{\texttt{centroid}}\,($X$)\\ \hyperref[median]{\texttt{median}}\,($X$)\\ \hyperref[linkage_vector]{\texttt{linkage\_vector}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, extraarg=None}) \end{quote} The following sections contain comprehensive descriptions of these methods. \begin{methods} \item [\normalfont\texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, preserve\_input=\q True\q})] \phantomsection\label{linkage} \addcontentsline{toc}{subsection}{\texttt{linkage}} Hierarchical, agglomerative clustering on a condensed dissimilarity matrix or on vector data. Apart from the argument \textit{preserve\_input}, the method has the same input parameters and output format as the function of the same name in the module \hierarchy. The argument $X$ is preferably a \href{http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html}{NumPy array} with floating point entries (\texttt{X.dtype\hskip0pt==\hskip0pt numpy.double}). Any other data format will be converted before it is processed. NumPy's \href{http://docs.scipy.org/doc/numpy/reference/maskedarray.html}{masked arrays} are not treated as special, and the mask is simply ignored. 
If $X$ is a one-dimensional array, it is considered a condensed matrix of pairwise dissimilarities in the format which is returned by \href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html}{\texttt{scipy.spatial.distance.pdist}}. It contains the flattened, upper-triangular part of a pairwise dissimilarity matrix. That is, if there are $N$ data points and the matrix $d$ contains the dissimilarity between the $i$-th and $j$-th observation at position $d_{i,j}$, the vector $X$ has length $\binom N2$ and is ordered as follows: \[ d = \begin{pmatrix} 0&d_{0,1}&d_{0,2}&\ldots&d_{0,n-1}\\ & 0&d_{1,2} & \ldots\\ &&0&\ldots\\ &&&\ddots\\ &&&&0 \end{pmatrix} = \begin{pmatrix} 0&X[0] &X[1]&\ldots&X[n-2]\\ & 0&X[n-1] & \ldots\\ &&0&\ldots\\ &&&\ddots\\ &&&&0 \end{pmatrix} \] The \textit{metric} argument is ignored in case of dissimilarity input. The optional argument \textit{preserve\_input} specifies whether the method makes a working copy of the dissimilarity vector or writes temporary data into the existing array. If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying \textit{preserve\_input=False}. Note that the input array $X$ contains unspecified values after this procedure. It is therefore safer to write \begin{verbatim} linkage(X, method="...", preserve_input=False) del X \end{verbatim} to make sure that the matrix $X$ is not accessed accidentally after it has been used as scratch memory. (The single linkage algorithm does not write to the distance matrix or its copy anyway, so the \textit{preserve\_input} flag has no effect in this case.) If $X$ contains vector data, it must be a two-dimensional array with $N$ observations in $D$ dimensions as an $(N\times D)$ array. The \textit{preserve\_input} argument is ignored in this case. The specified \textit{metric} is used to generate pairwise distances from the input. 
The following two function calls yield equivalent output:
\begin{verbatim}
linkage(pdist(X, metric), method="...", preserve_input=False)
linkage(X, metric=metric, method="...")
\end{verbatim}
The two results are identical in most cases, but differences occur if ties are resolved differently: if the minimum in step 2 below is attained for more than one pair of nodes, either pair may be chosen. It is not guaranteed that both \texttt{linkage} variants choose the same pair in this case. The general scheme of the agglomerative clustering procedure is as follows:
\begin{enumerate}
\item Start with $N$ singleton clusters (nodes) labeled $0,\ldots, N-1$, which represent the input points.
\item Find a pair of nodes with minimal distance among all pairwise distances.
\item Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively $N,N+1,\ldots$
\item The distances from the new node to all other nodes are determined by the \textit{method} parameter (see below).
\item Repeat $N-1$ times from step 2, until there is one big node, which contains all original input points.
\end{enumerate}
The output of \texttt{linkage} is a \emph{stepwise dendrogram}, which is represented as an $(N-1)\times 4$ \href{http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html}{NumPy array} with floating point entries (\texttt{dtype=numpy.double}). The first two columns contain the node indices which are joined in each step. The input nodes are labeled $0,\ldots,N-1$, and the newly generated nodes have the labels $N,\ldots, 2N-2$. The third column contains the distance between the two nodes at each step, ie.\ the current minimal distance at the time of the merge. The fourth column counts the number of points which comprise each new node. The parameter \textit{method} specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes.
Denote the dissimilarities by $d$, the nodes to be joined by $I,J$, the new node by $K$ and any other node by $L$. The symbol $|I|$ denotes the size of the cluster $I$. \begin{description} \item [\normalfont\textit{method=\q single\q}:] $\displaystyle d(K,L) = \min(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the closest distance between any two points in each cluster: \[ d(A,B)=\min_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q complete\q}:] $\displaystyle d(K,L) = \max(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the maximal distance between any two points in each cluster: \[ d(A,B)=\max_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q average\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}$ The distance between two clusters $A,B$ is the average distance between the points in the two clusters: \[ d(A,B)=\frac1{|A||B|}\sum_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q weighted\q}:] $\displaystyle d(K,L) = \tfrac12(d(I,L)+d(J,L))$ There is no global description for the distance between clusters since the distance depends on the order of the merging steps. \end{description} The following three methods are intended for Euclidean data only, ie.\ when $X$ contains the pairwise (non-squared!)\ distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. 
\begin{description} \item [\normalfont\textit{method=\q centroid\q}:] $\displaystyle d(K,L) = \sqrt{\frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}-\frac{|I|\cdot|J|\cdot d(I,J)}{(|I|+|J|)^2}}$ There is a geometric interpretation: $d(A,B)$ is the distance between the centroids (ie.\ barycenters) of the clusters in Euclidean space: \[ d(A,B) = \|\vec c_A-\vec c_B\|, \] where $\vec c_A$ denotes the centroid of the points in cluster $A$.\pagebreak[2] \item [\normalfont\textit{method=\q median\q}:] $\displaystyle d(K,L) = \sqrt{\tfrac12 d(I,L)+\tfrac12 d(J,L)-\tfrac14 d(I,J)}$ Define the midpoint $\vec w_K$ of a cluster $K$ iteratively as $\vec w_K=k$ if $K=\{k\}$ is a singleton and as the midpoint $\frac12(\vec w_I+\vec w_J)$ if $K$ is formed by joining $I$ and $J$. Then we have \[ d(A,B)=\|\vec w_A-\vec w_B\| \] in Euclidean space for all nodes $A,B$. Notice however that this distance depends on the order of the merging steps. \item [\normalfont\textit{method=\q ward\q}:] $\displaystyle d(K,L) = \sqrt{\frac{(|I|+|L|)\cdot d(I,L)+(|J|+|L|)\cdot d(J,L)-|L|\cdot d(I,J)}{|I|+|J|+|L|}}$ The global cluster dissimilarity can be expressed as \[ d(A,B)=\sqrt{\frac{2|A||B|}{|A|+|B|}}\cdot\|\vec c_A-\vec c_B\|, \] where $\vec c_A$ again denotes the centroid of the points in cluster $A$. \end{description} \item [\normalfont\texttt{fastcluster.\textbf{single}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{single}}\label{single} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q single\q}). \item [\normalfont\texttt{fastcluster.\textbf{complete}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{complete}}\label{complete} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q complete\q}). 
\item [\normalfont\texttt{fastcluster.\textbf{average}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{average}}\label{average} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q average\q}). \item [\normalfont\texttt{fastcluster.\textbf{weighted}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{weighted}}\label{weighted} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q weighted\q}). \item [\normalfont\texttt{fastcluster.\textbf{centroid}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{centroid}}\label{centroid} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q centroid\q}). \item [\normalfont\texttt{fastcluster.\textbf{median}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{median}}\label{median} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q median\q}). \item [\normalfont\texttt{fastcluster.\textbf{ward}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{ward}}\label{ward} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q ward\q}). \item [\normalfont\texttt{fastcluster.\textbf{linkage\_vector}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, extraarg=\q None\q})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{linkage\_vector}}\label{linkage_vector} This performs hierarchical, agglomerative clustering on vector data with memory-saving algorithms. While the \hyperref[linkage]{\texttt{linkage}} method requires $\Theta(N^2)$ memory for clustering of $N$ points, this method needs $\Theta(ND)$ for $N$ points in $\mathbb R^D$, which is usually much smaller. The argument $X$ has the same format as before, when $X$ describes vector data, ie.\ it is an $(N\times D)$ array. Also the output array has the same format. 
The parameter \textit{method} must be one of \textit{\q single\q}, \textit{\q centroid\q},
\textit{\q median\q}, \textit{\q ward\q}, ie.\ memory-saving algorithms currently exist
only for these methods. If \textit{method} is one of \textit{\q centroid\q},
\textit{\q median\q}, \textit{\q ward\q}, the \textit{metric} must be \textit{\q euclidean\q}.
Like the \texttt{linkage} method, \texttt{linkage\_vector} does not treat NumPy's
\href{http://docs.scipy.org/doc/numpy/reference/maskedarray.html}{masked arrays} as special
and simply ignores the mask. For single linkage clustering, any dissimilarity function may
be chosen. Basically, every metric which is implemented in the method
\href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html}{\texttt{scipy.spatial.distance.pdist}}
is reimplemented here. However, the metrics differ in some instances since a number of
mistakes and typos (both in the code and in the documentation) were corrected in the
\textit{fastcluster} package.\footnote{Hopefully, the SciPy metric will be corrected in
future versions and some day coincide with the \textit{fastcluster} definitions. See the
bug reports at \url{http://projects.scipy.org/scipy/ticket/1484},
\url{http://projects.scipy.org/scipy/ticket/1486}.} Therefore, the available metrics with
their definitions are listed below as a reference. The symbols $u$ and $v$ mostly denote
vectors in $\mathbb R^D$ with coordinates $u_j$ and $v_j$ respectively. See below for
additional metrics for Boolean vectors. Unless otherwise stated, the input array $X$ is
converted to a floating point array (\texttt{X.dtype==numpy.double}) if it does not already
have the required data type. Some metrics accept Boolean input; in this case this is stated
explicitly below.
\begin{description} \item[\normalfont\textit{\q euclidean\q}:] Euclidean metric, $L_2$ norm \[ d(u,v) = \| u-v\|_2 = \sqrt{\sum_j (u_j-v_j)^2} \] \item[\normalfont\textit{\q sqeuclidean\q}:] squared Euclidean metric \[ d(u,v) = \| u-v\|^2_2 = \sum_j (u_j-v_j)^2 \] \item[\normalfont\textit{\q seuclidean\q}:] standardized Euclidean metric \[ d(u,v) = \sqrt{\sum_j (u_j-v_j)^2 /V_j} \] The vector $V=(V_0,\ldots,V_{D-1})$ is given as the \textit{extraarg} argument. If no \textit{extraarg} is given, $V_j$ is by default the unbiased sample variance of all observations in the $j$-th coordinate, $V_j = \Var_i(X_{i,j})=\frac1{N-1}\sum_i(X_{i,j}^2-\mu(X_j)^2)$. (Here, $\mu(X_j)$ denotes as usual the mean of $X_{i,j}$ over all rows $i$.) \item[\normalfont\textit{\q mahalanobis\q}:] Mahalanobis distance \[ d(u,v) = \sqrt{(u-v)^{\mkern-3mu\top}V (u-v)} \] Here, $V=\textit{extraarg}$, a $(D\times D)$-matrix. If $V$ is not specified, the inverse of the covariance matrix \texttt{numpy.linalg.inv(numpy.cov(X, rowvar=False))} is used: \[ (V^{-1})_{j,k} = \frac1{N-1} \sum_i (X_{i,j}-\mu(X_j))(X_{i,k}-\mu(X_k)) \] \item[\normalfont\textit{\q cityblock\q}:] the Manhattan distance, $L_1$ norm \[ d(u,v) = \sum_j |u_j-v_j| \] \item[\normalfont\textit{\q chebychev\q}:] the supremum norm, $L_\infty$ norm \[ d(u,v) = \max_j |u_j-v_j| \] \item[\normalfont\textit{\q minkowski\q}:] the $L_p$ norm \[ d(u,v) = \left(\sum_j |u_j-v_j|^p\right)^{1/p} \] This metric coincides with the \textit{cityblock}, \textit{euclidean} and \textit{chebychev} metrics for $p=1$, $p=2$ and $p=\infty$ (\texttt{numpy.inf}), respectively. The parameter $p$ is given as the \textit{\q extraarg\q} argument. \item[\normalfont\textit{\q cosine\q}] \[ d(u,v) = 1 - \frac{\langle u,v\rangle}{\|u\|\cdot\|v\|} = 1 - \frac{\sum_j u_jv_j}{\sqrt{\sum_j u_j^2\cdot \sum_j v_j^2}} \] \item[\normalfont\textit{\q correlation\q}:] This method first mean-centers the rows of $X$ and then applies the \textit{cosine} distance. 
Equivalently, the \textit{correlation} distance measures
$1-{}$\textrm{(Pearson's correlation coefficient)}.
\[ d(u,v) = 1 - \frac{\langle u-\mu(u),v-\mu(v)\rangle}{\|u-\mu(u)\|\cdot\|v-\mu(v)\|} \]
\item[\normalfont\textit{\q canberra\q}]
\[ d(u,v) = \sum_j\frac{|u_j-v_j|}{|u_j|+|v_j|} \]
Summands with $u_j=v_j=0$ contribute 0 to the sum.
\item[\normalfont\textit{\q braycurtis\q}]
\[ d(u,v) = \frac{\sum_j |u_j-v_j|}{\sum_j |u_j+v_j|} \]
\item[\textnormal{(user function):}] The parameter \textit{metric} may also be a function
which accepts two NumPy floating point vectors and returns a number. Eg.\ the Euclidean
distance could be emulated with
\begin{quote}
\texttt{fn = lambda u, v: numpy.sqrt(((u-v)*(u-v)).sum())}\\
\texttt{linkage\_vector(X, method=\q single\q, metric=fn)}
\end{quote}
This method, however, is much slower than the built-in function.
\item[\normalfont\textit{\q hamming\q}:] The Hamming distance accepts a Boolean array
(\texttt{X.dtype==bool}) for efficient storage. Any other data type is converted to
\texttt{numpy.double}.
\[ d(u,v) = \frac{|\{j\mid u_j\neq v_j\}|}{D} \]
\item[\normalfont\textit{\q jaccard\q}:] The Jaccard distance accepts a Boolean array
(\texttt{X.dtype\hskip0pt ==\hskip0pt bool}) for efficient storage. Any other data type is
converted to \texttt{numpy.double}.
\[ d(u,v) = \frac{|\{j\mid u_j\neq v_j\}|}{|\{j\mid u_j\neq 0\text{ or } v_j\neq 0\}|} \]
\[ d(0,0) = 0 \]
Python represents \texttt{True} by 1 and \texttt{False} by 0. In the Boolean case, the
Jaccard distance is therefore:
\[ d(u,v) = \frac{|\{j\mid u_j\neq v_j\}|}{|\{j\mid u_j\lor v_j\}|} \]
\end{description}
The following metrics are designed for Boolean vectors. The input array is converted to
the \texttt{bool} data type if it is not Boolean already.
Use the following abbreviations for the entries of a contingency table: \begin{align*} a &= |\{j\mid u_j\land v_j \}| & b &= |\{j\mid u_j\land(\lnot v_j)\}|\\ c &= |\{j\mid (\lnot u_j)\land v_j \}| & d &= |\{j\mid (\lnot u_j)\land(\lnot v_j)\}| \end{align*} Recall that $D$ denotes the number of dimensions, hence $D=a+b+c+d$. \begin{description} \item[\normalfont\textit{\q yule\q}] \[ d(u,v) = \frac{2bc}{ad+bc} \] \item[\normalfont\textit{\q dice\q}] \begin{gather*} d(u,v) = \frac{b+c}{2a+b+c}\\ d(0,0) = 0 \end{gather*} \item[\normalfont\textit{\q rogerstanimoto\q}] \[ d(u,v) = \frac{2(b+c)}{b+c+D} \] \item[\normalfont\textit{\q russellrao\q}] \[ d(u,v) = \frac{b+c+d}{D} \] \item[\normalfont\textit{\q sokalsneath\q}] \begin{gather*} d(u,v) = \frac{2(b+c)}{a+2(b+c)}\\ d(0,0) = 0 \end{gather*} \item[\normalfont\textit{\q kulsinski\q}] \[ d(u,v) = \frac 12\cdot\left(\frac b{a+b} + \frac c{a+c}\right) \] \item[\normalfont\textit{\q matching\q}] \[ d(u,v) = \frac{b+c}{D} \] Notice that when given a Boolean array, the \textit{matching} and \textit{hamming} distance are the same. The \textit{matching} distance formula, however, converts every input to Boolean first. Hence, the vectors $(0,1)$ and $(0,2)$ have zero \textit{matching} distance since they are both converted to $(\mathrm{False}, \mathrm{True})$ but the \textit{hamming} distance is $0.5$. \item[\normalfont\textit{\q sokalmichener\q}] is an alias for \textit{\q matching\q}. \end{description} \end{methods} \section{Behavior for NaN and infinite values}\label{sec:infnan} Whenever the fastcluster package encounters a NaN value as the distance between nodes, either as the initial distance or as an updated distance after some merging steps, it raises an error. This was designed intentionally, even if there might be ways to propagate NaNs through the algorithms in a more or less sensible way. 
Indeed, since the clustering result depends on every single distance value, the presence of NaN values usually indicates a dubious clustering result, and therefore NaN values should be eliminated in preprocessing.\pagebreak[1] In the R interface for vector input, coordinates with {\NA} value are interpreted as missing data and treated in the same way as R's {\dist} function does. This results in valid output whenever the resulting distances are not NaN. The Python interface does not provide any way of handling missing coordinates, and data should be processed accordingly and given as pairwise distances to the clustering algorithms in this case. The fastcluster package handles node distances and coordinates with infinite values correctly, as long as the formulas for the distance updates and the metric (in case of vector input) make sense. In concordance with the statement above, an error is produced if a NaN value results from performing arithmetic with infinity. Also, the usual proviso applies: internal formulas in the code are mathematically equivalent to the formulas as stated in the documentation only for finite, real numbers but might produce different results for $\pm\infty$. Apart from obvious cases like single or complete linkage, it is therefore recommended that users think about how they want infinite values to be treated by the distance update and metric formulas and then check whether the fastcluster code does exactly what they want in these special cases. \section{Differences between the two interfaces} \begin{itemize} \item The \textit{\q mcquitty\q} method in R is called \textit{\q weighted\q} in Python. \item R and SciPy use different conventions for the ``Euclidean'' methods \textit{\q centroid\q}, \textit{\q median\q}! R assumes that the dissimilarity matrix consists of squared Euclidean distances, while SciPy expects non-squared Euclidean distances. 
The fastcluster package respects these conventions and uses different formulas in the two interfaces. The \textit{\q ward\q} method in the Python interface is identical to \textit{\q ward.D2\q} in the R interface. If the same results in both interfaces ought to be obtained, then the \hyperref[hclust]{\texttt{hclust}} function in R must be input the entry-wise square of the distance matrix, \verb!d^2!, for the \textit{\q ward.D\q}, \textit{\q centroid\q} and \textit{\q median\q} methods, and later the square root of the height field in the dendrogram must be taken. The \hyperref[hclust.vector]{\texttt{hclust.vector}} method calculates non-squared Euclidean distances, like R's \dist{} method and identically to the Python interface. See the \hyperref[squared]{example} in the \hyperref[hclust.vector]{\texttt{hclust.vector}} documentation above. For the \textit{\q average\q} and \textit{\q weighted\q} alias \textit{\q mcquitty\q} methods, the same, non-squared distance matrix \texttt{d} as in the Python interface must be used for the same results. The \textit{\q single\q} and \textit{\q complete\q} methods only depend on the relative order of the distances, hence it does not make a difference whether the method operates on the distances or the squared distances. The code example in the R documentation (enter \texttt{?hclust} or \texttt{example(hclust)} in R) contains another instance where the squared distance matrix is generated from Euclidean data. \item The Python interface is not designed to deal with missing values, and NaN values in the vector data raise an error message. The \hyperref[hclust.vector]{\texttt{hclust.vector}} method in the R interface, in contrast, deals with NaN and the (R specific) {\NA} values in the same way as the \dist{} method does. Confer the documentation for \dist{} for details. \end{itemize} \section{References} \begin{trivlist} \item \textit{NumPy: Scientific computing tools for Python}, \url{http://numpy.scipy.org/}. 
\item Eric Jones, Travis Oliphant, Pearu Peterson et al., \textit{SciPy: Open Source Scientific Tools for Python}, 2001, \url{http://www.scipy.org}. \item \textit{R: A Language and Environment for Statistical Computing}, R Foundation for Statistical Computing, Vienna, 2011, \url{http://www.r-project.org}. \end{trivlist} \end{document}
%%% Local variables:
%%% mode: latex
%%% TeX-master: "fastcluster.Rtex"
%%% TeX-PDF-mode: t
%%% End:
fastcluster/tests/0000755000176200001440000000000012452574006013760 5ustar liggesusersfastcluster/tests/test_fastcluster.R0000644000176200001440000001625112452574006017506 0ustar liggesusers# fastcluster: Fast hierarchical clustering routines for R and Python
#
# Copyright © 2011 Daniel Müllner
#
#
# Test script for the R interface

# Draw a random seed and print it so that a failing run can be reproduced.
seed = as.integer(runif(1, 0, 1e9))
set.seed(seed)
cat(sprintf("Random seed: %d\n",seed))

# Error message shown when a test fails; it embeds the random seed so the
# failure can be reproduced and reported.
print_seed <- function() {
  return(sprintf(' Please send a report to the author of the \'fastcluster\' package, Daniel Müllner. For contact details, see . To make the error reproducible, you must include the following number (the random seed value) in your error report: %d.\n\n', seed))
}

# stats::hclust supports the 'ward.D2' method from R 3.1.0 on.
hasWardD2 = getRversion() >= '3.1.0'

# Compare two dendrograms and check whether they are equal, except that
# ties may be resolved differently.
# dg1, dg2: "hclust" objects (with $height, $merge, $order components).
# Returns TRUE if the dendrograms agree up to tie-breaking, FALSE otherwise.
compare <- function(dg1, dg2) {
  h1 <- dg1$height
  h2 <- dg2$height
  # "height" vectors may have small numerical errors.
  rdiffs <- abs(h1-h2)/pmax(abs(h1),abs(h2))
  rdiffs = rdiffs[complete.cases(rdiffs)]
  # NOTE(review): max() of an empty vector yields -Inf (with a warning),
  # which passes the threshold test below.
  rel_error <- max(rdiffs)
  # We allow a relative error of 1e-13.
  if (rel_error>1e-13) {
    print(h1)
    print(h2)
    cat(sprintf('Height vectors differ! The maximum relative error is %e.\n', rel_error))
    return(FALSE)
  }
  # Filter the indices where consecutive merging distances are distinct.
  d = diff(dg1$height)
  b = (c(d,1)!=0 & c(1,d)!=0)
  #cat(sprintf("Percentage of indices where we can test: %g.\n",100.0*length(b[b])/length(b)))
  if (any(b)) {
    m1 = dg1$merge[b,]
    m2 = dg2$merge[b,]
    # r(i): TRUE for merge entries that may be compared; negative entries
    # denote singletons and are always comparable.
    r = function(i) {
      if (i<0) {
        return(1)
      } else {
        return(b[i])
      }
    }
    f = sapply(m1,r)
    fm1 = m1*f
    fm2 = m2*f
    # The "merge" matrices must be identical wherever indices are not ambiguous
    # due to ties.
    if (!identical(fm1,fm2)) {
      cat('Merge matrices differ!\n')
      return(FALSE)
    }
    # Compare the "order" vectors only if all merging distances were distinct.
    if (all(b) && !identical(dg1$order,dg2$order)) {
      cat('Order vectors differ!\n')
      return(FALSE)
    }
  }
  return(TRUE)
}

# Generate uniformly distributed random data
# Returns a fake "dist" object with uniformly distributed dissimilarities.
generate.uniform <- function() {
  n = sample(10:1000,1)
  range_exp = runif(1,min=-10, max=10)
  cat(sprintf("Number of sample points: %d\n",n))
  cat(sprintf("Dissimilarity range: [0,%g]\n",10^range_exp))
  d = runif(n*(n-1)/2, min=0, max=10^range_exp)
  # Fake a compressed distance matrix
  attributes(d) <- NULL
  attr(d,"Size") <- n
  attr(d, "call") <- 'N/A'
  class(d) <- "dist"
  return(d)
}

# Generate normally distributed random data
# Returns the Euclidean "dist" object of a Gaussian point cloud.
generate.normal <- function() {
  n = sample(10:1000,1)
  dim = sample(2:20,1)
  cat (sprintf("Number of sample points: %d\n",n))
  cat (sprintf("Dimension: %d\n",dim))
  pcd = matrix(rnorm(n*dim), c(n,dim))
  d = dist(pcd)
  return(d)
}

# Test the clustering functions when a distance matrix is given.
# Test the clustering functions when a distance matrix is given: run
# stats::hclust and fastcluster::hclust on the same "dist" object for every
# linkage method and require the dendrograms to agree (up to ties). Stops
# with the random seed message on any mismatch or input corruption.
test.dm <- function(d) {
  # Keep a copy to verify that the routines do not modify their input.
  d2 = d
  if (hasWardD2) {
    methods = c('single','complete','average','mcquitty','ward.D','ward.D2','centroid','median')
  } else {
    methods = c('single','complete','average','mcquitty','ward','centroid','median')
  }
  for (method in methods) {
    cat(paste('Method :', method, '\n'))
    dg_stats = stats::hclust(d, method=method)
    if (method == 'ward') {
      # fastcluster only knows the modern name of the classical Ward method.
      method = 'ward.D'
    }
    dg_fastcluster = fastcluster::hclust(d, method=method)
    if (!identical(d,d2)) {
      cat('Input array was corrupted!\n')
      stop(print_seed())
    }
    if (!compare(dg_stats, dg_fastcluster)) {
      stop(print_seed())
    }
  }
  cat('Passed.\n')
}

# Test the clustering functions for vector input in Euclidean space.
# Compares fastcluster::hclust.vector on raw coordinates against
# fastcluster::hclust on the corresponding "dist" object.
test.vector <- function() {
  # generate test data
  n = sample(10:1000,1)
  dim = sample(2:20,1)
  cat (sprintf("Number of sample points: %d\n",n))
  cat (sprintf("Dimension: %d\n",dim))
  range_exp = runif(1,min=-10, max=10)
  pcd = matrix(rnorm(n*dim, sd=10^range_exp), c(n,dim))
  pcd2 = pcd
  # test method='single'
  # Bug fix: 'method' was read before being assigned anywhere in this
  # function; define it explicitly.
  method = 'single'
  cat(paste('Method:', method, '\n'))
  for (metric in c('euclidean', 'maximum', 'manhattan', 'canberra', 'minkowski')) {
    cat(paste(' Metric:', metric, '\n'))
    if (metric=='minkowski') {
      p = runif(1, min=1.0, max=10.0)
      cat (sprintf(" p: %g\n",p));
      dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric, p=p)
      d = dist(pcd, method=metric, p=p)
    } else {
      dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric)
      d = dist(pcd, method=metric)
    }
    d2 = d
    dg_fastcluster_dist = fastcluster::hclust(d, method=method)
    if (!identical(d,d2) || !identical(pcd,pcd2)) {
      cat('Input array was corrupted!\n')
      stop(print_seed())
    }
    if (!compare(dg_fastcluster_dist, dg_fastcluster)) {
      stop(print_seed())
    }
  }

  # The "Euclidean" methods: compare against hclust on (squared) distances.
  for (method in c('ward','centroid','median') ) {
    cat(paste('Method:', method, '\n'))
    dg_fastcluster = fastcluster::hclust.vector(pcd, method=method)
    if (!identical(pcd,pcd2)) {
      cat('Input array was corrupted!\n')
      stop(print_seed())
    }
    d = dist(pcd)
    if(method == "ward" && hasWardD2) {
      method = "ward.D2"
    } else {
      # Workaround: fastcluster::hclust expects _squared_ euclidean distances.
      d = d^2
    }
    d2 = d
    dg_fastcluster_dist = fastcluster::hclust(d, method=method)
    if (!identical(d,d2)) {
      cat('Input array was corrupted!\n')
      stop(print_seed())
    }
    if(method != "ward.D2") {
      # Undo the squaring so that heights are comparable.
      dg_fastcluster_dist$height = sqrt(dg_fastcluster_dist$height)
    }
    # The Euclidean methods may have small numerical errors due to squaring/
    # taking the root in the Euclidean distances.
    if (!compare(dg_fastcluster_dist, dg_fastcluster)) {
      stop(print_seed())
    }
  }
  cat('Passed.\n')
}

# Test the single linkage function with the "binary" metric
test.vector.binary <- function() {
  # generate test data
  cat (sprintf("Uniform sampling for the 'binary' metric:\n"))
  n = sample(10:400,1)
  dim = sample(n:(2*n),1)
  cat (sprintf("Number of sample points: %d\n",n))
  cat (sprintf("Dimension: %d\n",dim))
  pcd = matrix(sample(-1:2, n*dim, replace=TRUE), c(n,dim))
  pcd2 = pcd
  # test method='single' metric='binary'
  # Bug fix: 'method' and 'metric' were read before being assigned.
  method = 'single'
  metric = 'binary'
  cat(paste('Method:', method, '\n'))
  cat(paste(' Metric:', metric, '\n'))
  dg_fastcluster = fastcluster::hclust.vector(pcd, method=method, metric=metric)
  d = dist(pcd, method=metric)
  d2 = d
  dg_fastcluster_dist = fastcluster::hclust(d, method=method)
  # Bug fix: the second condition was a duplicate of the first
  # (!identical(d,d2) twice); check the point-cloud copy like test.vector does.
  if (!identical(d,d2) || !identical(pcd,pcd2)) {
    cat('Input array was corrupted!\n')
    stop(print_seed())
  }
  if (!compare(dg_fastcluster_dist, dg_fastcluster)) {
    stop(print_seed())
  }
  cat('Passed.\n')
}

# Main driver: alternate uniform and Gaussian distance-matrix tests, then
# run the vector-input tests.
N = 15
for (i in (1:N)) {
  if (i%%2==1) {
    cat(sprintf('Random test %d of %d (uniform distribution of distances):\n',i,2*N))
    d = generate.uniform()
  } else {
    cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N))
    d = generate.normal()
  }
  test.dm(d)
}
for (i in (N+1:N)) {
  cat(sprintf('Random test %d of %d (Gaussian density):\n',i,2*N))
  test.vector()
  test.vector.binary()
}
cat('Done.\n')
fastcluster/src/0000755000176200001440000000000012470717051013404 5ustar liggesusersfastcluster/src/fastcluster.cpp0000644000176200001440000014547212470717051016464 0ustar liggesusers/* 
fastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner This library implements various fast algorithms for hierarchical, agglomerative clustering methods: (1) Algorithms for the "stored matrix approach": the input is the array of pairwise dissimilarities. MST_linkage_core: single linkage clustering with the "minimum spanning tree algorithm (Rohlfs) NN_chain_core: nearest-neighbor-chain algorithm, suitable for single, complete, average, weighted and Ward linkage (Murtagh) generic_linkage: generic algorithm, suitable for all distance update formulas (Müllner) (2) Algorithms for the "stored data approach": the input are points in a vector space. MST_linkage_core_vector: single linkage clustering for vector data generic_linkage_vector: generic algorithm for vector data, suitable for the Ward, centroid and median methods. generic_linkage_vector_alternative: alternative scheme for updating the nearest neighbors. This method seems faster than "generic_linkage_vector" for the centroid and median methods but slower for the Ward method. All these implementation treat infinity values correctly. They throw an exception if a NaN distance value occurs. */ #include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::fill_n #include // for std::runtime_error #include // for std::string // Microsoft Visual Studio does not have fenv.h #ifdef _MSC_VER #if (_MSC_VER == 1500 || _MSC_VER == 1600) #define NO_INCLUDE_FENV #endif #endif #ifndef NO_INCLUDE_FENV #include #endif #include // also for DBL_MAX, DBL_MIN #ifndef DBL_MANT_DIG #error The constant DBL_MANT_DIG could not be defined. #endif #define T_FLOAT_MANT_DIG DBL_MANT_DIG #ifndef LONG_MAX #include #endif #ifndef LONG_MAX #error The constant LONG_MAX could not be defined. #endif #ifndef INT_MAX #error The constant INT_MAX could not be defined. 
#endif #ifndef INT32_MAX #define __STDC_LIMIT_MACROS #include #endif #ifndef HAVE_DIAGNOSTIC #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #endif #ifndef HAVE_VISIBILITY #if __GNUC__ >= 4 #define HAVE_VISIBILITY 1 #endif #endif /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. * See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif typedef int_fast32_t t_index; #ifndef INT32_MAX #define MAX_INDEX 0x7fffffffL #else #define MAX_INDEX INT32_MAX #endif #if (LONG_MAX < MAX_INDEX) #error The integer format "t_index" must not have a greater range than "long int". #endif #if (INT_MAX > MAX_INDEX) #error The integer format "int" must not have a greater range than "t_index". #endif typedef double t_float; /* Method codes. These codes must agree with the METHODS array in fastcluster.R and the dictionary mthidx in fastcluster.py. 
*/ enum method_codes { // non-Euclidean methods METHOD_METR_SINGLE = 0, METHOD_METR_COMPLETE = 1, METHOD_METR_AVERAGE = 2, METHOD_METR_WEIGHTED = 3, METHOD_METR_WARD = 4, METHOD_METR_WARD_D = METHOD_METR_WARD, METHOD_METR_CENTROID = 5, METHOD_METR_MEDIAN = 6, METHOD_METR_WARD_D2 = 7, MIN_METHOD_CODE = 0, MAX_METHOD_CODE = 7 }; enum method_codes_vector { // Euclidean methods METHOD_VECTOR_SINGLE = 0, METHOD_VECTOR_WARD = 1, METHOD_VECTOR_CENTROID = 2, METHOD_VECTOR_MEDIAN = 3, MIN_METHOD_VECTOR_CODE = 0, MAX_METHOD_VECTOR_CODE = 3 }; enum { // Return values RET_SUCCESS = 0, RET_MEMORY_ERROR = 1, RET_STL_ERROR = 2, RET_UNKNOWN_ERROR = 3 }; // self-destructing array pointer template class auto_array_ptr{ private: type * ptr; auto_array_ptr(auto_array_ptr const &); // non construction-copyable auto_array_ptr& operator=(auto_array_ptr const &); // non copyable public: auto_array_ptr() : ptr(NULL) { } template auto_array_ptr(index const size) : ptr(new type[size]) { } template auto_array_ptr(index const size, value const val) : ptr(new type[size]) { std::fill_n(ptr, size, val); } ~auto_array_ptr() { delete [] ptr; } void free() { delete [] ptr; ptr = NULL; } template void init(index const size) { ptr = new type [size]; } template void init(index const size, value const val) { init(size); std::fill_n(ptr, size, val); } inline operator type *() const { return ptr; } }; struct node { t_index node1, node2; t_float dist; /* inline bool operator< (const node a) const { return this->dist < a.dist; } */ inline friend bool operator< (const node a, const node b) { return (a.dist < b.dist); } }; class cluster_result { private: auto_array_ptr Z; t_index pos; public: cluster_result(const t_index size) : Z(size) , pos(0) {} void append(const t_index node1, const t_index node2, const t_float dist) { Z[pos].node1 = node1; Z[pos].node2 = node2; Z[pos].dist = dist; ++pos; } node * operator[] (const t_index idx) const { return Z + idx; } /* Define several methods to postprocess the 
distances. All these functions are monotone, so they do not change the sorted order of distances. */ void sqrt() const { for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = ::sqrt(ZZ->dist); } } void sqrt(const t_float) const { // ignore the argument sqrt(); } void sqrtdouble(const t_float) const { // ignore the argument for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = ::sqrt(2*ZZ->dist); } } #ifdef R_pow #define my_pow R_pow #else #define my_pow pow #endif void power(const t_float p) const { t_float const q = 1/p; for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist = my_pow(ZZ->dist,q); } } void plusone(const t_float) const { // ignore the argument for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist += 1; } } void divide(const t_float denom) const { for (node * ZZ=Z; ZZ!=Z+pos; ++ZZ) { ZZ->dist /= denom; } } }; class doubly_linked_list { /* Class for a doubly linked list. Initially, the list is the integer range [0, size]. We provide a forward iterator and a method to delete an index from the list. Typical use: for (i=L.start; L succ; private: auto_array_ptr pred; // Not necessarily private, we just do not need it in this instance. public: doubly_linked_list(const t_index size) // Initialize to the given size. : start(0) , succ(size+1) , pred(size+1) { for (t_index i=0; i(2*N-3-(r_))*(r_)>>1)+(c_)-1] ) // Z is an ((N-1)x4)-array #define Z_(_r, _c) (Z[(_r)*4 + (_c)]) /* Lookup function for a union-find data structure. The function finds the root of idx by going iteratively through all parent elements until a root is found. An element i is a root if nodes[i] is zero. To make subsequent searches faster, the entry for idx and all its parents is updated with the root element. */ class union_find { private: auto_array_ptr parent; t_index nextparent; public: union_find(const t_index size) : parent(size>0 ? 
2*size-1 : 0, 0) , nextparent(size) { } t_index Find (t_index idx) const { if (parent[idx] != 0 ) { // a → b t_index p = idx; idx = parent[idx]; if (parent[idx] != 0 ) { // a → b → c do { idx = parent[idx]; } while (parent[idx] != 0); do { t_index tmp = parent[p]; parent[p] = idx; p = tmp; } while (parent[p] != idx); } } return idx; } void Union (const t_index node1, const t_index node2) { parent[node1] = parent[node2] = nextparent++; } }; class nan_error{}; #ifdef FE_INVALID class fenv_error{}; #endif static void MST_linkage_core(const t_index N, const t_float * const D, cluster_result & Z2) { /* N: integer, number of data points D: condensed distance matrix N*(N-1)/2 Z2: output data structure The basis of this algorithm is an algorithm by Rohlf: F. James Rohlf, Hierarchical clustering using the minimum spanning tree, The Computer Journal, vol. 16, 1973, p. 93–95. */ t_index i; t_index idx2; doubly_linked_list active_nodes(N); auto_array_ptr d(N); t_index prev_node; t_float min; // first iteration idx2 = 1; min = std::numeric_limits::infinity(); for (i=1; i tmp) d[i] = tmp; else if (fc_isnan(tmp)) throw (nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (d[i] < min) { min = d[i]; idx2 = i; } } Z2.append(prev_node, idx2, min); } } /* Functions for the update of the dissimilarity array */ inline static void f_single( t_float * const b, const t_float a ) { if (*b > a) *b = a; } inline static void f_complete( t_float * const b, const t_float a ) { if (*b < a) *b = a; } inline static void f_average( t_float * const b, const t_float a, const t_float s, const t_float t) { *b = s*a + t*(*b); #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_weighted( t_float * const b, const t_float a) { *b = (a+*b)*.5; #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC 
diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_ward( t_float * const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v) { *b = ( (v+s)*a - v*c + (v+t)*(*b) ) / (s+t+v); //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v); #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_centroid( t_float * const b, const t_float a, const t_float stc, const t_float s, const t_float t) { *b = s*a - stc + t*(*b); #ifndef FE_INVALID if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } inline static void f_median( t_float * const b, const t_float a, const t_float c_4) { *b = (a+(*b))*.5 - c_4; #ifndef FE_INVALID #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*b)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #endif } template static void NN_chain_core(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { /* N: integer D: condensed distance matrix N*(N-1)/2 Z2: output data structure This is the NN-chain algorithm, described on page 86 in the following book: Fionn Murtagh, Multidimensional Clustering Algorithms, Vienna, Würzburg: Physica-Verlag, 1985. 
*/ t_index i; auto_array_ptr NN_chain(N); t_index NN_chain_tip = 0; t_index idx1, idx2; t_float size1, size2; doubly_linked_list active_nodes(N); t_float min; for (t_float const * DD=D; DD!=D+(static_cast(N)*(N-1)>>1); ++DD) { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(*DD)) { throw(nan_error()); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } #ifdef FE_INVALID if (feclearexcept(FE_INVALID)) throw fenv_error(); #endif for (t_index j=0; jidx2) { t_index tmp = idx1; idx1 = idx2; idx2 = tmp; } if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD) { size1 = static_cast(members[idx1]); size2 = static_cast(members[idx2]); members[idx2] += members[idx1]; } // Remove the smaller index from the valid indices (active_nodes). active_nodes.remove(idx1); switch (method) { case METHOD_METR_SINGLE: /* Single linkage. Characteristic: new distances are never longer than the old distances. */ // Update the distance matrix in the range [start, idx1). for (i=active_nodes.start; i(members[i]); for (i=active_nodes.start; i(members[i]) ); // Update the distance matrix in the range (idx1, idx2). for (; i(members[i]) ); // Update the distance matrix in the range (idx2, N). for (i=active_nodes.succ[idx2]; i(members[i]) ); break; default: throw std::runtime_error(std::string("Invalid method.")); } } #ifdef FE_INVALID if (fetestexcept(FE_INVALID)) throw fenv_error(); #endif } class binary_min_heap { /* Class for a binary min-heap. The data resides in an array A. The elements of A are not changed but two lists I and R of indices are generated which point to elements of A and backwards. The heap tree structure is H[2*i+1] H[2*i+2] \ / \ / ≤ ≤ \ / \ / H[i] where the children must be less or equal than their parent. Thus, H[0] contains the minimum. The lists I and R are made such that H[i] = A[I[i]] and R[I[i]] = i. This implementation is not designed to handle NaN values. 
*/ private: t_float * const A; t_index size; auto_array_ptr I; auto_array_ptr R; // no default constructor binary_min_heap(); // noncopyable binary_min_heap(binary_min_heap const &); binary_min_heap & operator=(binary_min_heap const &); public: binary_min_heap(t_float * const A_, const t_index size_) : A(A_), size(size_), I(size), R(size) { // Allocate memory and initialize the lists I and R to the identity. This // does not make it a heap. Call heapify afterwards! for (t_index i=0; i>1); idx>0; ) { --idx; update_geq_(idx); } } inline t_index argmin() const { // Return the minimal element. return I[0]; } void heap_pop() { // Remove the minimal element from the heap. --size; I[0] = I[size]; R[I[0]] = 0; update_geq_(0); } void remove(t_index idx) { // Remove an element from the heap. --size; R[I[size]] = R[idx]; I[R[idx]] = I[size]; if ( H(size)<=A[idx] ) { update_leq_(R[idx]); } else { update_geq_(R[idx]); } } void replace ( const t_index idxold, const t_index idxnew, const t_float val) { R[idxnew] = R[idxold]; I[R[idxnew]] = idxnew; if (val<=A[idxold]) update_leq(idxnew, val); else update_geq(idxnew, val); } void update ( const t_index idx, const t_float val ) const { // Update the element A[i] with val and re-arrange the indices to preserve // the heap condition. if (val<=A[idx]) update_leq(idx, val); else update_geq(idx, val); } void update_leq ( const t_index idx, const t_float val ) const { // Use this when the new value is not more than the old value. A[idx] = val; update_leq_(R[idx]); } void update_geq ( const t_index idx, const t_float val ) const { // Use this when the new value is not less than the old value. 
A[idx] = val; update_geq_(R[idx]); } private: void update_leq_ (t_index i) const { t_index j; for ( ; (i>0) && ( H(i)>1) ); i=j) heap_swap(i,j); } void update_geq_ (t_index i) const { t_index j; for ( ; (j=2*i+1)=H(i) ) { ++j; if ( j>=size || H(j)>=H(i) ) break; } else if ( j+1 static void generic_linkage(const t_index N, t_float * const D, t_members * const members, cluster_result & Z2) { /* N: integer, number of data points D: condensed distance matrix N*(N-1)/2 Z2: output data structure */ const t_index N_1 = N-1; t_index i, j; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(N_1); // array of nearest neighbors auto_array_ptr mindist(N_1); // distances to the nearest neighbors auto_array_ptr row_repr(N); // row_repr[i]: node number that the // i-th row represents doubly_linked_list active_nodes(N); binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for // the distance to the nearest neighbor of each point t_index node1, node2; // node numbers in the output t_float size1, size2; // and their cardinalities t_float min; // minimum and row index for nearest-neighbor search t_index idx; for (i=0; ii} D(i,j) for i in range(N-1) t_float const * DD = D; for (i=0; i::infinity(); for (idx=j=i+1; ji} D(i,j) Normally, we have equality. However, this minimum may become invalid due to the updates in the distance matrix. The rules are: 1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct minimum and n_nghbr[i] is a nearest neighbor. 2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the correct minimum. The minimum needs to be recomputed. 3) mindist[i] is never bigger than the true minimum. Hence, we never miss the true minimum if we take the smallest mindist entry, re-compute the value if necessary (thus maybe increasing it) and looking for the now smallest mindist entry until a valid minimal entry is found. This step is done in the lines below. 
The update process for D below takes care that these rules are fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are re-calculated when necessary but re-calculation is avoided whenever possible. The re-calculation of the minima makes the worst-case runtime of this algorithm cubic in N. We avoid this whenever possible, and in most cases the runtime appears to be quadratic. */ idx1 = nn_distances.argmin(); if (method != METHOD_METR_SINGLE) { while ( mindist[idx1] < D_(idx1, n_nghbr[idx1]) ) { // Recompute the minimum mindist[idx1] and n_nghbr[idx1]. n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1 min = D_(idx1,j); for (j=active_nodes.succ[j]; j(members[idx1]); size2 = static_cast(members[idx2]); members[idx2] += members[idx1]; } Z2.append(node1, node2, mindist[idx1]); // Remove idx1 from the list of active indices (active_nodes). active_nodes.remove(idx1); // Index idx2 now represents the new (merged) node with label N+i. row_repr[idx2] = N+i; // Update the distance matrix switch (method) { case METHOD_METR_SINGLE: /* Single linkage. Characteristic: new distances are never longer than the old distances. */ // Update the distance matrix in the range [start, idx1). for (j=active_nodes.start; j(members[j]) ); if (n_nghbr[j] == idx1) n_nghbr[j] = idx2; } // Update the distance matrix in the range (idx1, idx2). for (; j(members[j]) ); if (D_(j, idx2) < mindist[j]) { nn_distances.update_leq(j, D_(j, idx2)); n_nghbr[j] = idx2; } } // Update the distance matrix in the range (idx2, N). if (idx2(members[j]) ); min = D_(idx2,j); for (j=active_nodes.succ[j]; j(members[j]) ); if (D_(idx2,j) < min) { min = D_(idx2,j); n_nghbr[idx2] = j; } } nn_distances.update(idx2, min); } break; case METHOD_METR_CENTROID: { /* Centroid linkage. Shorter and longer distances can occur, not bigger than max(d1,d2) but maybe smaller than min(d1,d2). */ // Update the distance matrix in the range [start, idx1). 
t_float s = size1/(size1+size2); t_float t = size2/(size1+size2); t_float stc = s*t*mindist[idx1]; for (j=active_nodes.start; j static void MST_linkage_core_vector(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure The basis of this algorithm is an algorithm by Rohlf: F. James Rohlf, Hierarchical clustering using the minimum spanning tree, The Computer Journal, vol. 16, 1973, p. 93–95. */ t_index i; t_index idx2; doubly_linked_list active_nodes(N); auto_array_ptr d(N); t_index prev_node; t_float min; // first iteration idx2 = 1; min = std::numeric_limits::infinity(); for (i=1; i tmp) d[i] = tmp; else if (fc_isnan(tmp)) throw (nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (d[i] < min) { min = d[i]; idx2 = i; } } Z2.append(prev_node, idx2, min); } } template static void generic_linkage_vector(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure This algorithm is valid for the distance update methods "Ward", "centroid" and "median" only! 
*/ const t_index N_1 = N-1; t_index i, j; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(N_1); // array of nearest neighbors auto_array_ptr mindist(N_1); // distances to the nearest neighbors auto_array_ptr row_repr(N); // row_repr[i]: node number that the // i-th row represents doubly_linked_list active_nodes(N); binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for // the distance to the nearest neighbor of each point t_index node1, node2; // node numbers in the output t_float min; // minimum and row index for nearest-neighbor search for (i=0; ii} D(i,j) for i in range(N-1) for (i=0; i::infinity(); t_index idx; for (idx=j=i+1; j(i,j); } if (tmp(idx1,j); for (j=active_nodes.succ[j]; j(idx1,j); if (tmp(j, idx2); if (tmp < mindist[j]) { nn_distances.update_leq(j, tmp); n_nghbr[j] = idx2; } else if (n_nghbr[j] == idx2) n_nghbr[j] = idx1; // invalidate } // Find the nearest neighbor for idx2. if (idx2(idx2,j); for (j=active_nodes.succ[j]; j(idx2, j); if (tmp < min) { min = tmp; n_nghbr[idx2] = j; } } nn_distances.update(idx2, min); } } } } template static void generic_linkage_vector_alternative(const t_index N, t_dissimilarity & dist, cluster_result & Z2) { /* N: integer, number of data points dist: function pointer to the metric Z2: output data structure This algorithm is valid for the distance update methods "Ward", "centroid" and "median" only! */ const t_index N_1 = N-1; t_index i, j=0; // loop variables t_index idx1, idx2; // row and column indices auto_array_ptr n_nghbr(2*N-2); // array of nearest neighbors auto_array_ptr mindist(2*N-2); // distances to the nearest neighbors doubly_linked_list active_nodes(N+N_1); binary_min_heap nn_distances(&*mindist, N_1, 2*N-2, 1); // minimum heap // structure for the distance to the nearest neighbor of each point t_float min; // minimum for nearest-neighbor searches // Initialize the minimal distances: // Find the nearest neighbor of each point. 
// n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1) for (i=1; i::infinity(); t_index idx; for (idx=j=0; j(i,j); } if (tmp This module provides fast hierarchical clustering routines. The "linkage" method is designed to provide a replacement for the “linkage” function and its siblings in the scipy.cluster.hierarchy module. You may use the methods in this module with the same syntax as the corresponding SciPy functions but with the benefit of much faster performance. The method "linkage_vector" performs clustering of vector data with memory- saving algorithms. Refer to the User's manual "fastcluster.pdf" for comprehensive details. It is located in the directory inst/doc/ in the source distribution and may also be obtained at . """ __all__ = ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median', 'linkage', 'linkage_vector'] __version_info__ = ('1', '1', '16') __version__ = '.'.join(__version_info__) from numpy import double, empty, array, ndarray, var, cov, dot, bool, \ expand_dims, ceil, sqrt from numpy.linalg import inv try: from scipy.spatial.distance import pdist except ImportError: def pdist(*args, **kwargs): raise ImportError('The fastcluster.linkage function cannot process ' 'vector data since the function ' 'scipy.partial.distance.pdist could not be ' 'imported.') from _fastcluster import linkage_wrap, linkage_vector_wrap def single(D): '''Single linkage clustering (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='single') def complete(D): '''Complete linkage clustering (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='complete') def average(D): '''Hierarchical clustering with the “average” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='average') def weighted(D): '''Hierarchical clustering with the “weighted” distance update formula (alias). 
See the help on the “linkage” function for further information.''' return linkage(D, method='weighted') def ward(D): '''Hierarchical clustering with the “Ward” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='ward') def centroid(D): '''Hierarchical clustering with the “centroid” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='centroid') def median(D): '''Hierarchical clustering with the “median” distance update formula (alias). See the help on the “linkage” function for further information.''' return linkage(D, method='median') # This dictionary must agree with the enum method_codes in fastcluster.cpp. mthidx = {'single' : 0, 'complete' : 1, 'average' : 2, 'weighted' : 3, 'ward' : 4, 'centroid' : 5, 'median' : 6 } def linkage(X, method='single', metric='euclidean', preserve_input=True): '''Hierarchical, agglomerative clustering on a dissimilarity matrix or on Euclidean data. Apart from the argument 'preserve_input', the method has the same input parameters and output format as the functions of the same name in the module scipy.cluster.hierarchy. The argument X is preferably a NumPy array with floating point entries (X.dtype==numpy.double). Any other data format will be converted before it is processed. If X is a one-dimensional array, it is considered a condensed matrix of pairwise dissimilarities in the format which is returned by scipy.spatial.distance.pdist. It contains the flattened, upper- triangular part of a pairwise dissimilarity matrix. That is, if there are N data points and the matrix d contains the dissimilarity between the i-th and j-th observation at position d(i,j), the vector X has length N(N-1)/2 and is ordered as follows: [ d(0,1), d(0,2), ..., d(0,n-1), d(1,2), ..., d(1,n-1), ..., d(n-2,n-1) ] The 'metric' argument is ignored in case of dissimilarity input. 
The optional argument 'preserve_input' specifies whether the method makes a working copy of the dissimilarity vector or writes temporary data into the existing array. If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying 'preserve_input=False'. Note that the input array X contains unspecified values after this procedure. It is therefore safer to write linkage(X, method="...", preserve_input=False) del X to make sure that the matrix X is not accessed accidentally after it has been used as scratch memory. (The single linkage algorithm does not write to the distance matrix or its copy anyway, so the 'preserve_input' flag has no effect in this case.) If X contains vector data, it must be a two-dimensional array with N observations in D dimensions as an (N×D) array. The preserve_input argument is ignored in this case. The specified metric is used to generate pairwise distances from the input. The following two function calls yield the same output: linkage(pdist(X, metric), method="...", preserve_input=False) linkage(X, metric=metric, method="...") The general scheme of the agglomerative clustering procedure is as follows: 1. Start with N singleton clusters (nodes) labeled 0,...,N−1, which represent the input points. 2. Find a pair of nodes with minimal distance among all pairwise distances. 3. Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively N, N+1, ... 4. The distances from the new node to all other nodes is determined by the method parameter (see below). 5. Repeat N−1 times from step 2, until there is one big node, which contains all original input points. The output of linkage is stepwise dendrogram, which is represented as an (N−1)×4 NumPy array with floating point entries (dtype=numpy.double). The first two columns contain the node indices which are joined in each step. 
The input nodes are labeled 0,...,N−1, and the newly generated nodes have the labels N,...,2N−2. The third column contains the distance between the two nodes at each step, ie. the current minimal distance at the time of the merge. The fourth column counts the number of points which comprise each new node. The parameter method specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes. Denote the dissimilarities by d, the nodes to be joined by I, J, the new node by K and any other node by L. The symbol |I| denotes the size of the cluster I. method='single': d(K,L) = min(d(I,L), d(J,L)) The distance between two clusters A, B is the closest distance between any two points in each cluster: d(A,B) = min{ d(a,b) | a∈A, b∈B } method='complete': d(K,L) = max(d(I,L), d(J,L)) The distance between two clusters A, B is the maximal distance between any two points in each cluster: d(A,B) = max{ d(a,b) | a∈A, b∈B } method='average': d(K,L) = ( |I|·d(I,L) + |J|·d(J,L) ) / (|I|+|J|) The distance between two clusters A, B is the average distance between the points in the two clusters: d(A,B) = (|A|·|B|)^(-1) · \sum { d(a,b) | a∈A, b∈B } method='weighted': d(K,L) = (d(I,L)+d(J,L))/2 There is no global description for the distance between clusters since the distance depends on the order of the merging steps. The following three methods are intended for Euclidean data only, ie. when X contains the pairwise (non-squared!) distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. method='centroid': d(K,L) = ( (|I|·d(I,L) + |J|·d(J,L)) / (|I|+|J|) − |I|·|J|·d(I,J)/(|I|+|J|)^2 )^(1/2) There is a geometric interpretation: d(A,B) is the distance between the centroids (ie. barycenters) of the clusters in Euclidean space: d(A,B) = ‖c_A−c_B∥, where c_A denotes the centroid of the points in cluster A. 
method='median': d(K,L) = ( d(I,L)/2 + d(J,L)/2 − d(I,J)/4 )^(1/2) Define the midpoint w_K of a cluster K iteratively as w_K=k if K={k} is a singleton and as the midpoint (w_I+w_J)/2 if K is formed by joining I and J. Then we have d(A,B) = ∥w_A−w_B∥ in Euclidean space for all nodes A,B. Notice however that this distance depends on the order of the merging steps. method='ward': d(K,L) = ( ((|I|+|L)d(I,L) + (|J|+|L|)d(J,L) − |L|d(I,J)) / (|I|+|J|+|L|) )^(1/2) The global cluster dissimilarity can be expressed as d(A,B) = ( 2|A|·|B|/(|A|+|B|) )^(1/2) · ‖c_A−c_B∥, where c_A again denotes the centroid of the points in cluster A. The clustering algorithm handles infinite values correctly, as long as the chosen distance update formula makes sense. If a NaN value occurs, either in the original dissimilarities or as an updated dissimilarity, an error is raised. The linkage method does not treat NumPy's masked arrays as special and simply ignores the mask.''' X = array(X, copy=False, subok=True) if X.ndim==1: if method=='single': preserve_input = False X = array(X, dtype=double, copy=preserve_input, order='C', subok=True) NN = len(X) N = int(ceil(sqrt(NN*2))) if (N*(N-1)//2) != NN: raise ValueError('The length of the condensed distance matrix ' 'must be (k \choose 2) for k data points!') else: assert X.ndim==2 N = len(X) X = pdist(X, metric) X = array(X, dtype=double, copy=False, order='C', subok=True) Z = empty((N-1,4)) if N > 1: linkage_wrap(N, X, Z, mthidx[method]) return Z # This dictionary must agree with the enum metric_codes in fastcluster_python.cpp. 
mtridx = {'euclidean' : 0, 'minkowski' : 1, 'cityblock' : 2, 'seuclidean' : 3, 'sqeuclidean' : 4, 'cosine' : 5, 'hamming' : 6, 'jaccard' : 7, 'chebychev' : 8, 'canberra' : 9, 'braycurtis' : 10, 'mahalanobis' : 11, 'yule' : 12, 'matching' : 13, 'sokalmichener' : 13, # an alias for 'matching' 'dice' : 14, 'rogerstanimoto' : 15, 'russellrao' : 16, 'sokalsneath' : 17, 'kulsinski' : 18, 'USER' : 19, } booleanmetrics = ('yule', 'matching', 'dice', 'kulsinski', 'rogerstanimoto', 'sokalmichener', 'russellrao', 'sokalsneath', 'kulsinski') def linkage_vector(X, method='single', metric='euclidean', extraarg=None): '''Hierarchical (agglomerative) clustering on Euclidean data. Compared to the 'linkage' method, 'linkage_vector' uses a memory-saving algorithm. While the linkage method requires Θ(N^2) memory for clustering of N points, this method needs Θ(ND) for N points in R^D, which is usually much smaller. The argument X has the same format as before, when X describes vector data, ie. it is an (N×D) array. Also the output array has the same format. The parameter method must be one of 'single', 'centroid', 'median', 'ward', ie. only for these methods there exist memory-saving algorithms currently. If 'method', is one of 'centroid', 'median', 'ward', the 'metric' must be 'euclidean'. For single linkage clustering, any dissimilarity function may be chosen. Basically, every metric which is implemented in the method scipy.spatial.distance.pdist is reimplemented here. However, the metrics differ in some instances since a number of mistakes and typos (both in the code and in the documentation) were corrected in the fastcluster package. Therefore, the available metrics with their definitions are listed below as a reference. The symbols u and v mostly denote vectors in R^D with coordinates u_j and v_j respectively. See below for additional metrics for Boolean vectors. 
Unless otherwise stated, the input array X is converted to a floating point array (X.dtype==numpy.double) if it does not have already the required data type. Some metrics accept Boolean input; in this case this is stated explicitly below. If a NaN value occurs, either in the original dissimilarities or as an updated dissimilarity, an error is raised. In principle, the clustering algorithm handles infinite values correctly, but the user is advised to carefully check the behavior of the metric and distance update formulas under these circumstances. The distance formulas combined with the clustering in the 'linkage_vector' method do not have specified behavior if the data X contains infinite or NaN values. Also, the masks in NumPy’s masked arrays are simply ignored. metric='euclidean': Euclidean metric, L_2 norm d(u,v) = ∥u−v∥ = ( \sum_j { (u_j−v_j)^2 } )^(1/2) metric='sqeuclidean': squared Euclidean metric d(u,v) = ∥u−v∥^2 = \sum_j { (u_j−v_j)^2 } metric='seuclidean': standardized Euclidean metric d(u,v) = ( \sum_j { (u_j−v_j)^2 / V_j } )^(1/2) The vector V=(V_0,...,V_{D−1}) is given as the 'extraarg' argument. If no 'extraarg' is given, V_j is by default the unbiased sample variance of all observations in the j-th coordinate: V_j = Var_i (X(i,j) ) = 1/(N−1) · \sum_i ( X(i,j)^2 − μ(X_j)^2 ) (Here, μ(X_j) denotes as usual the mean of X(i,j) over all rows i.) metric='mahalanobis': Mahalanobis distance d(u,v) = ( transpose(u−v) V (u−v) )^(1/2) Here, V=extraarg, a (D×D)-matrix. If V is not specified, the inverse of the covariance matrix numpy.linalg.inv(numpy.cov(X, rowvar=False)) is used. metric='cityblock': the Manhattan distance, L_1 norm d(u,v) = \sum_j |u_j−v_j| metric='chebychev': the supremum norm, L_∞ norm d(u,v) = max_j { |u_j−v_j| } metric='minkowski': the L_p norm d(u,v) = ( \sum_j |u_j−v_j|^p ) ^(1/p) This metric coincides with the cityblock, euclidean and chebychev metrics for p=1, p=2 and p=∞ (numpy.inf), respectively. 
The parameter p is given as the 'extraarg' argument. metric='cosine' d(u,v) = 1 − ⟨u,v⟩ / (∥u∥·∥v∥) = 1 − (\sum_j u_j·v_j) / ( (\sum u_j^2)(\sum v_j^2) )^(1/2) metric='correlation': This method first mean-centers the rows of X and then applies the 'cosine' distance. Equivalently, the correlation distance measures 1 − (Pearson’s correlation coefficient). d(u,v) = 1 − ⟨u−μ(u),v−μ(v)⟩ / (∥u−μ(u)∥·∥v−μ(v)∥) metric='canberra' d(u,v) = \sum_j ( |u_j−v_j| / (|u_j|+|v_j|) ) Summands with u_j=v_j=0 contribute 0 to the sum. metric='braycurtis' d(u,v) = (\sum_j |u_j-v_j|) / (\sum_j |u_j+v_j|) metric=(user function): The parameter metric may also be a function which accepts two NumPy floating point vectors and returns a number. Eg. the Euclidean distance could be emulated with fn = lambda u, v: numpy.sqrt(((u-v)*(u-v)).sum()) linkage_vector(X, method='single', metric=fn) This method, however, is much slower than the build-in function. metric='hamming': The Hamming distance accepts a Boolean array (X.dtype==bool) for efficient storage. Any other data type is converted to numpy.double. d(u,v) = |{j | u_j≠v_j }| metric='jaccard': The Jaccard distance accepts a Boolean array (X.dtype==bool) for efficient storage. Any other data type is converted to numpy.double. d(u,v) = |{j | u_j≠v_j }| / |{j | u_j≠0 or v_j≠0 }| d(0,0) = 0 Python represents True by 1 and False by 0. In the Boolean case, the Jaccard distance is therefore: d(u,v) = |{j | u_j≠v_j }| / |{j | u_j ∨ v_j }| The following metrics are designed for Boolean vectors. The input array is converted to the 'bool' data type if it is not Boolean already. Use the following abbreviations to count the number of True/False combinations: a = |{j | u_j ∧ v_j }| b = |{j | u_j ∧ (¬v_j) }| c = |{j | (¬u_j) ∧ v_j }| d = |{j | (¬u_j) ∧ (¬v_j) }| Recall that D denotes the number of dimensions, hence D=a+b+c+d. 
metric='yule' d(u,v) = 2bc / (ad+bc) metric='dice': d(u,v) = (b+c) / (2a+b+c) d(0,0) = 0 metric='rogerstanimoto': d(u,v) = 2(b+c) / (b+c+D) metric='russellrao': d(u,v) = (b+c+d) / D metric='sokalsneath': d(u,v) = 2(b+c)/ ( a+2(b+c)) d(0,0) = 0 metric='kulsinski' d(u,v) = (b/(a+b) + c/(a+c)) / 2 metric='matching': d(u,v) = (b+c)/D Notice that when given a Boolean array, the 'matching' and 'hamming' distance are the same. The 'matching' distance formula, however, converts every input to Boolean first. Hence, the vectors (0,1) and (0,2) have zero 'matching' distance since they are both converted to (False, True) but the Hamming distance is 0.5. metric='sokalmichener' is an alias for 'matching'.''' if method=='single': assert metric!='USER' if metric in ('hamming', 'jaccard'): X = array(X, copy=False, subok=True) dtype = bool if X.dtype==bool else double else: dtype = bool if metric in booleanmetrics else double X = array(X, dtype=dtype, copy=False, order='C', subok=True) else: assert metric=='euclidean' X = array(X, dtype=double, copy=(method=='ward'), order='C', subok=True) assert X.ndim==2 N = len(X) Z = empty((N-1,4)) if metric=='seuclidean': if extraarg is None: extraarg = var(X, axis=0, ddof=1) elif metric=='mahalanobis': if extraarg is None: extraarg = inv(cov(X, rowvar=False)) # instead of the inverse covariance matrix, pass the matrix product # with the data matrix! 
extraarg = array(dot(X,extraarg),dtype=double, copy=False, order='C', subok=True) elif metric=='correlation': X = X-expand_dims(X.mean(axis=1),1) metric='cosine' elif not isinstance(metric, str): assert extraarg is None metric, extraarg = 'USER', metric elif metric!='minkowski': assert extraarg is None if N > 1: linkage_vector_wrap(X, Z, mthidx[method], mtridx[metric], extraarg) return Z fastcluster/src/python/setup.py0000644000176200001440000001326212254003115016430 0ustar liggesusers#!/usr/bin/env python # -*- coding: utf-8 -*- import sys if sys.hexversion < 0x03000000: # uniform unicode handling for both Python 2.x and 3.x def u(x): return x.decode('utf-8') def textfileopen(filename): return open(filename, mode='r') else: def u(x): return x def textfileopen(filename): return open(filename, mode='r', encoding='utf_8') u(''' fastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner ''') #import distutils.debug #distutils.debug.DEBUG = 'yes' from numpy.distutils.core import setup, Extension with textfileopen('fastcluster.py') as f: for line in f: if line.find('__version_info__ =')==0: version = '.'.join(line.split("'")[1:-1:2]) break print('Version: ' + version) setup(name='fastcluster', \ version=version, \ py_modules=['fastcluster'], \ description='Fast hierarchical clustering routines for R and Python.', \ long_description=u(""" This library provides Python functions for hierarchical clustering. It generates hierarchical clusters from distance matrices or from vector data. Part of this module is intended to replace the functions linkage, single, complete, average, weighted, centroid, median, ward in the module scipy.cluster.hierarchy with the same functionality but much faster algorithms. Moreover, the function 'linkage_vector' provides memory-efficient clustering for vector data. The interface is very similar to MATLAB's Statistics Toolbox API to make code easier to port from MATLAB to Python/Numpy. 
The core implementation of this library is in C++ for efficiency. Installation files for Windows are provided by Christoph Gohlke on his `web page `_. **The fastcluster package is considered stable and will undergo few changes from now on. If some years from now there have not been any updates, this does not necessarily mean that the package is unmaintained but maybe it just was not necessary to correct anything. Of course, please still report potential bugs and incompatibilities to daniel@danifold.net.** Reference: Daniel Müllner, *fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python*, Journal of Statistical Software, **53** (2013), no. 9, 1–18, http://www.jstatsoft.org/v53/i09/. """), requires=['numpy'], provides=['fastcluster'], ext_modules=[Extension('_fastcluster', ['../fastcluster_python.cpp'], # Feel free to uncomment the line below if you use the GCC. # This switches to more aggressive optimization and turns # more warning switches on. No warning should appear in # the compilation process. # # Also, the author's Python distribution generates debug # symbols by default. This can be turned off, resulting a in # much smaller compiled library. 
# # Optimization #extra_compile_args=['-O2', '-g0', '-march=native', '-mtune=native', '-fno-math-errno'], # # List of all warning switches, somewhere from stackoverflow.com #extra_compile_args=['-Wall', '-Weffc++', '-Wextra', '-Wall', '-Wcast-align', '-Wchar-subscripts', '-Wcomment', '-Wconversion', '-Wsign-conversion', '-Wdisabled-optimization', '-Wfloat-equal', '-Wformat', '-Wformat=2', '-Wformat-nonliteral', '-Wformat-security', '-Wformat-y2k', '-Wimport', '-Winit-self', '-Winline', '-Winvalid-pch', '-Wunsafe-loop-optimizations', '-Wmissing-braces', '-Wmissing-field-initializers', '-Wmissing-format-attribute', '-Wmissing-include-dirs', '-Wmissing-noreturn', '-Wpacked', '-Wparentheses', '-Wpointer-arith', '-Wredundant-decls', '-Wreturn-type', '-Wsequence-point', '-Wshadow', '-Wsign-compare', '-Wstack-protector', '-Wstrict-aliasing', '-Wstrict-aliasing=2', '-Wswitch', '-Wswitch-enum', '-Wtrigraphs', '-Wuninitialized', '-Wunknown-pragmas', '-Wunreachable-code', '-Wunused', '-Wunused-function', '-Wunused-label', '-Wunused-parameter', '-Wunused-value', '-Wunused-variable', '-Wvariadic-macros', '-Wvolatile-register-var', '-Wwrite-strings', '-Wlong-long', '-Wpadded', '-Wcast-qual', '-Wswitch-default', '-Wnon-virtual-dtor', '-Wold-style-cast', '-Woverloaded-virtual', '-Waggregate-return', '-Werror'], # # Linker optimization #extra_link_args=['-Wl,--strip-all'], )], keywords=['dendrogram', 'linkage', 'cluster', 'agglomerative', 'hierarchical', 'hierarchy', 'ward'], author=u("Daniel Müllner"), author_email="daniel@danifold.net", license="BSD ", classifiers = ["Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Bio-Informatics", "Topic :: Scientific/Engineering :: Mathematics", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3", "Programming Language :: C++", "Operating System :: OS Independent", "License :: 
OSI Approved :: BSD License", "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", "Intended Audience :: Science/Research", "Development Status :: 5 - Production/Stable"], url = 'http://danifold.net', ) fastcluster/src/python/test/0000755000176200001440000000000012453206217015702 5ustar liggesusersfastcluster/src/python/test/test.py0000644000176200001440000001335112453206217017236 0ustar liggesusers#!/usr/bin/env python # -*- coding: utf-8 -*- import sys if sys.hexversion < 0x03000000: # uniform unicode handling for both Python 2.x and 3.x def u(x): return x.decode('utf-8') else: def u(x): return x print(u('''Test program for the 'fastcluster' package. Copyright (c) 2011 Daniel Müllner, If everything is OK, the test program will run forever, without an error message. ''')) import fastcluster as fc import numpy as np from scipy.spatial.distance import pdist, squareform import math import sys version = '1.1.16' if fc.__version__ != version: raise ValueError('Wrong module version: {} instead of {}.'.format(fc.__version__, version)) import atexit def print_seed(): print("Seed: {0}".format(seed)) atexit.register(print_seed) seed = np.random.randint(0,1e9) np.random.seed(seed) #abstol = 1e-14 # absolute tolerance rtol = 1e-14 # relative tolerance # NaN values are used in computations. Do not warn about them. np.seterr(invalid='ignore') def test_all(): D2 = D.copy() for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']: Z2 = fc.linkage(D, method) if np.any(D2!=D): raise AssertionError('Input array was corrupted.') test(Z2, method) def test(Z2, method): sys.stdout.write("Method: " + method + "...") I = np.array(Z2[:,:2], dtype=int) Ds = squareform(D) n = len(Ds) row_repr = np.arange(2*n-1) row_repr[n:] = -1 size = np.ones(n, dtype=np.int) np.fill_diagonal(Ds, np.nan) mins = np.empty(n-1) for i in range(n-1): for j in range(n-1): # Suppress warning is all distances are NaN. 
if np.all(np.isnan(Ds[j,j+1:])): mins[j] = np.nan else: mins[j] = np.nanmin(Ds[j,j+1:]) gmin = np.nanmin(mins) if (Z2[i,2]-gmin) > max(abs(Z2[i,2]),abs(gmin))*rtol: raise AssertionError('Not the global minimum in step {2}: {0}, {1}'.\ format(Z2[i,2], gmin, i)) i1, i2 = row_repr[I[i,:]] if (i1<0): raise AssertionError('Negative index i1.') if (i2<0): raise AssertionError('Negative index i2.') if I[i,0]>=I[i,1]: raise AssertionError('Convention violated.') if i1>i2: i1, i2 = i2, i1 if (Ds[i1,i2]-gmin) > max(abs(Ds[i1,i2]),abs(gmin))*rtol: raise AssertionError('The global minimum is not at the right place: ' '({0}, {1}): {2} != {3}. Difference: {4}'.\ format(i1, i2, Ds[i1, i2], gmin, Ds[i1, i2]-gmin)) s1 = size[i1] s2 = size[i2] S = float(s1+s2) if method=='single': if i1>0: # mostly unnecessary; workaround for a bug/feature in NumPy 1.7.0.dev # see http://projects.scipy.org/numpy/ticket/2078 Ds[:i1,i2] = np.min( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.minimum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.min( Ds[(i1,i2),i2:],axis=0) elif method=='complete': if i1>0: Ds[:i1,i2] = np.max( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.maximum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.max( Ds[(i1,i2),i2:],axis=0) elif method=='average': Ds[:i1,i2] = ( Ds[:i1,i1]*s1 + Ds[:i1,i2]*s2 ) / S Ds[i1:i2,i2] = ( Ds[i1,i1:i2]*s1 + Ds[i1:i2,i2]*s2 ) / S Ds[i2,i2:] = ( Ds[i1,i2:]*s1 + Ds[i2,i2:]*s2 ) / S elif method=='weighted': if i1>0: Ds[:i1,i2] = np.mean( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = ( Ds[i1,i1:i2] + Ds[i1:i2,i2] ) *.5 Ds[i2,i2:] = np.mean( Ds[(i1,i2),i2:],axis=0) elif method=='ward': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*(s1+size[:i1]) -gmin*gmin*size[:i1]+np.square(Ds[:i1,i2]) *(s2+size[:i1]))/(S+size[:i1])) Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*(s1+size[i1:i2]) -gmin*gmin*size[i1:i2]+np.square(Ds[i1:i2,i2]) *(s2+size[i1:i2]))/(S+size[i1:i2])) Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*(s1+size[i2:]) -gmin*gmin*size[i2:]+np.square(Ds[i2,i2:]) 
*(s2+size[i2:]))/(S+size[i2:])) elif method=='centroid': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*s1 +np.square(Ds[:i1,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*s1 +np.square(Ds[i1:i2,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*s1 +np.square(Ds[i2,i2:])*s2)*S-gmin*gmin*s1*s2) / S elif method=='median': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])+\ np.square(Ds[:i1,i2]))*2-gmin*gmin)*.5 Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])+\ np.square(Ds[i1:i2,i2]))*2-gmin*gmin)*.5 Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])+\ np.square(Ds[i2,i2:]))*2-gmin*gmin)*.5 else: raise ValueError('Unknown method.') Ds[i1, i1:n] = np.nan Ds[:i1, i1] = np.nan row_repr[n+i] = i2 size[i2] = S print('OK.') while True: dim = np.random.random_integers(2,20) n = np.random.random_integers(2,100) print('Dimension: {0}'.format(dim)) print('Number of points: {0}'.format(n)) D = pdist(np.random.randn(n,dim)) try: print('Real distance values:') test_all() D = np.round(D*n/4) print('Integer distance values:') test_all() except AssertionError as E: print(E) print(squareform(D)) sys.exit() fastcluster/src/python/test/nantest.py0000644000176200001440000000303612453206174017734 0ustar liggesusers'''Test whether the fastcluster package correctly recognizes NaN values and raises a FloatingPointError.''' import numpy as np import fastcluster version = '1.1.16' if fastcluster.__version__ != version: raise ValueError('Wrong module version: {} instead of {}.'.format(fastcluster.__version__, version)) n = np.random.random_integers(2,100) # Part 1: distance matrix input N = n*(n-1)//2 D = np.random.rand(N) # Insert a single NaN value pos = np.random.randint(N) D[pos] = np.nan for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage(D, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Next: the original array does 
not contain a NaN, but a NaN occurs # as an updated distance. for method in ['average', 'weighted', 'ward', 'centroid', 'median']: try: fastcluster.linkage([np.inf,-np.inf,-np.inf], method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass # Part 2: vector input dim = np.random.random_integers(2,12) X = np.random.rand(n,dim) pos = (np.random.randint(n), np.random.randint(dim)) # Insert a single NaN coordinate X[pos] = np.nan for method in ['single', 'ward', 'centroid', 'median']: try: fastcluster.linkage_vector(X, method=method) raise AssertionError('fastcluster did not detect a NaN value!') except FloatingPointError: pass print('OK.') fastcluster/src/python/test/vectortest.py0000644000176200001440000002110012453206202020442 0ustar liggesusers#!/usr/bin/env python # -*- coding: utf-8 -*- # TBD test single on integer matrices for hamming/jaccard import sys if sys.hexversion < 0x03000000: # uniform unicode handling for both Python 2.x and 3.x def u(x): return x.decode('utf-8') else: def u(x): return x print(u('''Test program for the 'fastcluster' package. Copyright (c) 2011 Daniel Müllner, If everything is OK, the test program will run forever, without an error message. ''')) import fastcluster as fc import numpy as np from scipy.spatial.distance import pdist, squareform import math import sys version = '1.1.16' if fc.__version__ != version: raise ValueError('Wrong module version: {} instead of {}.'.format(fc.__version__, version)) import atexit def print_seed(): print("Seed: {0}".format(seed)) atexit.register(print_seed) seed = np.random.randint(0,1e9) print_seed() np.random.seed(seed) abstol = 1e-14 # absolute tolerance rtol = 1e-13 # relative tolerance # NaN values are used in computations. Do not warn about them. np.seterr(invalid='ignore') def correct_for_zero_vectors(D, pcd, metric): # Correct some metrics: we want the distance from the zero vector # to itself to be 0, not NaN. 
if metric in ('jaccard', 'dice', 'sokalsneath'): z = np.flatnonzero(np.all(pcd==0, axis=1)) if len(z): DD = squareform(D) DD[np.ix_(z, z)] = 0 D = squareform(DD) return D def test_all(n,dim): method = 'single' # metrics for boolean vectors pcd = np.array(np.random.random_integers(0,1,(n,dim)), dtype=np.bool) pcd2 = pcd.copy() for metric in ('hamming', 'jaccard', 'yule', 'matching', 'dice', 'rogerstanimoto', #'sokalmichener', # exclude, bug in Scipy # http://projects.scipy.org/scipy/ticket/1486 'russellrao', 'sokalsneath', #'kulsinski' # exclude, bug in Scipy # http://projects.scipy.org/scipy/ticket/1484 ): sys.stdout.write("Metric: " + metric + "...") D = pdist(pcd, metric) D = correct_for_zero_vectors(D, pcd, metric) try: Z2 = fc.linkage_vector(pcd, method, metric) except FloatingPointError: # If linkage_vector reported a NaN dissimilarity value, # check whether the distance matrix really contains NaN. if np.any(np.isnan(D)): print("Skip this test: NaN dissimilarity value.") continue else: raise AssertionError('"linkage_vector" erroneously reported NaN.') if np.any(pcd2!=pcd): raise AssertionError('Input array was corrupted.', pcd) test(Z2, method, D) # metrics for real vectors bound = math.sqrt(n) pcd = np.random.random_integers(-bound,bound,(n,dim)) for metric in ['euclidean', 'sqeuclidean', 'cityblock', 'chebychev', 'minkowski', 'cosine', 'correlation', 'hamming', 'jaccard', 'canberra', # canberra: see bug in older Scipy versions # http://projects.scipy.org/scipy/ticket/1430 'braycurtis', 'seuclidean', 'mahalanobis', 'user']: sys.stdout.write("Metric: " + metric + "...") if metric=='minkowski': p = np.random.uniform(1.,10.) 
sys.stdout.write("p: " + str(p) + "...") D = pdist(pcd, metric, p) Z2 = fc.linkage_vector(pcd, method, metric, p) elif metric=='user': # Euclidean metric as a user function fn = (lambda u, v: np.sqrt(((u-v)*(u-v).T).sum())) D = pdist(pcd, fn) Z2 = fc.linkage_vector(pcd, method, fn) else: D = pdist(pcd, metric) D = correct_for_zero_vectors(D, pcd, metric) try: Z2 = fc.linkage_vector(pcd, method, metric) except FloatingPointError: if np.any(np.isnan(D)): print("Skip this test: NaN dissimilarity value.") continue else: raise AssertionError( '"linkage_vector" erroneously reported NaN.') test(Z2, method, D) D = pdist(pcd) for method in ['ward', 'centroid', 'median']: Z2 = fc.linkage_vector(pcd, method) test(Z2, method, D) def test(Z2, method, D): sys.stdout.write("Method: " + method + "...") I = np.array(Z2[:,:2], dtype=int) Ds = squareform(D) n = len(Ds) row_repr = np.arange(2*n-1) row_repr[n:] = -1 size = np.ones(n, dtype=np.int) np.fill_diagonal(Ds, np.nan) mins = np.empty(n-1) for i in range(n-1): for j in range(n-1): mins[j] = np.nanmin(Ds[j,j+1:]) gmin = np.nanmin(mins) if abs(Z2[i,2]-gmin) > max(abs(Z2[i,2]),abs(gmin))*rtol and \ abs(Z2[i,2]-gmin)>abstol: raise AssertionError( 'Not the global minimum in step {2}: {0}, {1}'. format(Z2[i,2], gmin,i), squareform(D)) i1, i2 = row_repr[I[i,:]] if (i1<0): raise AssertionError('Negative index i1.', squareform(D)) if (i2<0): raise AssertionError('Negative index i2.', squareform(D)) if I[i,0]>=I[i,1]: raise AssertionError('Convention violated.', squareform(D)) if i1>i2: i1, i2 = i2, i1 if abs(Ds[i1,i2]-gmin) > max(abs(Ds[i1,i2]),abs(gmin))*rtol and \ abs(Ds[i1,i2]-gmin)>abstol: raise AssertionError( 'The global minimum is not at the right place in step {5}: ' '({0}, {1}): {2} != {3}. 
Difference: {4}' .format(i1, i2, Ds[i1, i2], gmin, Ds[i1, i2]-gmin, i), squareform(D)) s1 = size[i1] s2 = size[i2] S = float(s1+s2) if method=='single': if i1>0: # mostly unnecessary; workaround for a bug/feature in NumPy # 1.7.0.dev, see http://projects.scipy.org/numpy/ticket/2078 Ds[:i1,i2] = np.min( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.minimum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.min( Ds[(i1,i2),i2:],axis=0) elif method=='complete': if i1>0: Ds[:i1,i2] = np.max( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = np.maximum(Ds[i1,i1:i2],Ds[i1:i2,i2]) Ds[i2,i2:] = np.max( Ds[(i1,i2),i2:],axis=0) elif method=='average': Ds[:i1,i2] = ( Ds[:i1,i1]*s1 + Ds[:i1,i2]*s2 ) / S Ds[i1:i2,i2] = ( Ds[i1,i1:i2]*s1 + Ds[i1:i2,i2]*s2 ) / S Ds[i2,i2:] = ( Ds[i1,i2:]*s1 + Ds[i2,i2:]*s2 ) / S elif method=='weighted': if i1>0: Ds[:i1,i2] = np.mean( Ds[:i1,(i1,i2)],axis=1) Ds[i1:i2,i2] = ( Ds[i1,i1:i2] + Ds[i1:i2,i2] )*.5 Ds[i2,i2:] = np.mean( Ds[(i1,i2),i2:],axis=0) elif method=='ward': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*(s1+size[:i1]) -gmin*gmin*size[:i1] +np.square(Ds[:i1,i2])*(s2+size[:i1]))/(S+size[:i1])) Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*(s1+size[i1:i2]) -gmin*gmin*size[i1:i2] +np.square(Ds[i1:i2,i2])*(s2+size[i1:i2])) /(S+size[i1:i2])) Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*(s1+size[i2:]) -gmin*gmin*size[i2:] +np.square(Ds[i2,i2:])*(s2+size[i2:]))/(S+size[i2:])) elif method=='centroid': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1])*s1 +np.square(Ds[:i1,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2])*s1 +np.square(Ds[i1:i2,i2])*s2)*S-gmin*gmin*s1*s2) / S Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:])*s1 +np.square(Ds[i2,i2:])*s2)*S-gmin*gmin*s1*s2) / S elif method=='median': Ds[:i1,i2] = np.sqrt((np.square(Ds[:i1,i1]) +np.square(Ds[:i1,i2]))*2-gmin*gmin)*.5 Ds[i1:i2,i2] = np.sqrt((np.square(Ds[i1,i1:i2]) +np.square(Ds[i1:i2,i2]))*2-gmin*gmin)*.5 Ds[i2,i2:] = np.sqrt((np.square(Ds[i1,i2:]) 
+np.square(Ds[i2,i2:]))*2-gmin*gmin)*.5 else: raise ValueError('Unknown method.') Ds[i1, i1:n] = np.inf Ds[:i1, i1] = np.inf row_repr[n+i] = i2 size[i2] = S print('OK.') while True: dim = np.random.random_integers(2,12) n = np.random.random_integers(max(2*dim,5),200) print('Dimension: {0}'.format(dim)) print('Number of points: {0}'.format(n)) try: test_all(n,dim) except AssertionError as E: print(E.args[0]) print(E.args[1]) sys.exit() fastcluster/src/fastcluster_R.cpp0000644000176200001440000006412212470717051016735 0ustar liggesusers/* fastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner */ #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wredundant-decls" #pragma GCC diagnostic ignored "-Wpadded" #endif #include #include #include // for R_pow #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #define fc_isnan(X) ((X)!=(X)) // There is ISNAN but it is so much slower on my x86_64 system with GCC! #include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::stable_sort #include // for std::runtime_error #include // for std::string #include // for std::bad_alloc #include // for std::exception #include "fastcluster.cpp" /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. * See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif /* Helper function: order the nodes so that they can be displayed nicely in a dendrogram. This is used for the 'order' field in the R output. 
*/ #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpadded" #endif struct pos_node { t_index pos; int node; }; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif void order_nodes(const int N, const int * const merge, const t_index * const node_size, int * const order) { /* Parameters: N : number of data points merge : (N-1)×2 array which specifies the node indices which are merged in each step of the clustering procedure. Negative entries -1...-N point to singleton nodes, while positive entries 1...(N-1) point to nodes which are themselves parents of other nodes. node_size : array of node sizes - makes it easier order : output array of size N Runtime: Θ(N) */ auto_array_ptr queue(N/2); int parent; int child; t_index pos = 0; queue[0].pos = 0; queue[0].node = N-2; t_index idx = 1; do { --idx; pos = queue[idx].pos; parent = queue[idx].node; // First child child = merge[parent]; if (child<0) { // singleton node, write this into the 'order' array. order[pos] = -child; ++pos; } else { /* compound node: put it on top of the queue and decompose it in a later iteration. */ queue[idx].pos = pos; queue[idx].node = child-1; // convert index-1 based to index-0 based ++idx; pos += node_size[child-1]; } // Second child child = merge[parent+N-1]; if (child<0) { order[pos] = -child; } else { queue[idx].pos = pos; queue[idx].node = child-1; ++idx; } } while (idx>0); } #define size_(r_) ( ((r_ void generate_R_dendrogram(int * const merge, double * const height, int * const order, cluster_result & Z2, const int N) { // The array "nodes" is a union-find data structure for the cluster // identites (only needed for unsorted cluster_result input). union_find nodes(sorted ? 
0 : N); if (!sorted) { std::stable_sort(Z2[0], Z2[N-1]); } t_index node1, node2; auto_array_ptr node_size(N-1); for (t_index i=0; inode1; node2 = Z2[i]->node2; } else { node1 = nodes.Find(Z2[i]->node1); node2 = nodes.Find(Z2[i]->node2); // Merge the nodes in the union-find data structure by making them // children of a new node. nodes.Union(node1, node2); } // Sort the nodes in the output array. if (node1>node2) { t_index tmp = node1; node1 = node2; node2 = tmp; } /* Conversion between labeling conventions. Input: singleton nodes 0,...,N-1 compound nodes N,...,2N-2 Output: singleton nodes -1,...,-N compound nodes 1,...,N */ merge[i] = (node1(node1)-1 : static_cast(node1)-N+1; merge[i+N-1] = (node2(node2)-1 : static_cast(node2)-N+1; height[i] = Z2[i]->dist; node_size[i] = size_(node1) + size_(node2); } order_nodes(N, merge, node_size, order); } /* R interface code */ enum { METRIC_R_EUCLIDEAN = 0, METRIC_R_MAXIMUM = 1, METRIC_R_MANHATTAN = 2, METRIC_R_CANBERRA = 3, METRIC_R_BINARY = 4, METRIC_R_MINKOWSKI = 5 }; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wpadded" #endif class R_dissimilarity { private: t_float * Xa; std::ptrdiff_t dim; // std::ptrdiff_t saves many statis_cast<> in products t_float * members; void (cluster_result::*postprocessfn) (const t_float) const; t_float postprocessarg; t_float (R_dissimilarity::*distfn) (const t_index, const t_index) const; auto_array_ptr row_repr; int N; // no default constructor R_dissimilarity(); // noncopyable R_dissimilarity(R_dissimilarity const &); R_dissimilarity & operator=(R_dissimilarity const &); public: // Ignore warning about uninitialized member variables. I know what I am // doing here, and some member variables are only used for certain metrics. 
#if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Weffc++" #endif R_dissimilarity (t_float * const X_, const int N_, const int dim_, t_float * const members_, const unsigned char method, const unsigned char metric, const t_float p, bool make_row_repr) : Xa(X_), dim(dim_), members(members_), postprocessfn(NULL), postprocessarg(p), N(N_) { switch (method) { case METHOD_VECTOR_SINGLE: switch (metric) { case METRIC_R_EUCLIDEAN: distfn = &R_dissimilarity::sqeuclidean; postprocessfn = &cluster_result::sqrt; break; case METRIC_R_MAXIMUM: distfn = &R_dissimilarity::maximum; break; case METRIC_R_MANHATTAN: distfn = &R_dissimilarity::manhattan; break; case METRIC_R_CANBERRA: distfn = &R_dissimilarity::canberra; break; case METRIC_R_BINARY: distfn = &R_dissimilarity::dist_binary; break; case METRIC_R_MINKOWSKI: distfn = &R_dissimilarity::minkowski; postprocessfn = &cluster_result::power; break; default: throw std::runtime_error(std::string("Invalid method.")); } break; case METHOD_VECTOR_WARD: postprocessfn = &cluster_result::sqrtdouble; break; default: postprocessfn = &cluster_result::sqrt; } if (make_row_repr) { row_repr.init(2*N-1); for (t_index i=0; i*distfn)(i,j); } inline t_float X (const t_index i, const t_index j) const { // "C-style" array alignment return Xa[i*dim+j]; } inline t_float * Xptr(const t_index i, const t_index j) const { // "C-style" array alignment return Xa+i*dim+j; } void merge(const t_index i, const t_index j, const t_index newnode) const { merge_inplace(row_repr[i], row_repr[j]); row_repr[newnode] = row_repr[j]; } void merge_inplace(const t_index i, const t_index j) const { for(t_index k=0; k(i1,i2)*members[i1]*members[i2]/ \ (members[i1]+members[i2]); } inline double ward_initial(t_index const i1, t_index const i2) const { /* In the R interface, ward_initial is the same as ward. Only the Python interface has two different functions here. 
*/ return ward(i1,i2); } // This method must not produce NaN if the input is non-NaN. inline static t_float ward_initial_conversion(const t_float min) { // identity return min; } double ward_extended(t_index i1, t_index i2) const { return ward(row_repr[i1], row_repr[i2]); } /* The following definitions and methods have been taken directly from the R source file /src/library/stats/src/distance.c in the R release 2.13.0. The code has only been adapted very slightly. (Unfortunately, the methods cannot be called directly in the R libraries since the functions are declared "static" in the above file.) Note to maintainers: If the code in distance.c changes in future R releases compared to 2.13.0, please update the definitions here, if necessary. */ // translation of variable names #define nc dim #define nr N #define x Xa #define p postprocessarg // The code from distance.c starts here #define both_FINITE(a,b) (R_FINITE(a) && R_FINITE(b)) #ifdef R_160_and_older #define both_non_NA both_FINITE #else #define both_non_NA(a,b) (!ISNAN(a) && !ISNAN(b)) #endif /* We need two variants of the Euclidean metric: one that does not check for a NaN result, which is used for the initial distances, and one which does, for the updated distances during the clustering procedure. 
*/ // still public template double sqeuclidean(t_index const i1, t_index const i2) const { double dev, dist; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = (*p1 - *p2); if(!ISNAN(dev)) { dist += dev * dev; ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); //return sqrt(dist); // we take the square root later if (check_NaN) { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if (fc_isnan(dist)) throw(nan_error()); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } return dist; } inline double sqeuclidean_extended(t_index const i1, t_index const i2) const { return sqeuclidean(row_repr[i1], row_repr[i2]); } private: double maximum(t_index i1, t_index i2) const { double dev, dist; int count, j; count = 0; dist = -DBL_MAX; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = fabs(*p1 - *p2); if(!ISNAN(dev)) { if(dev > dist) dist = dev; ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; return dist; } double manhattan(t_index i1, t_index i2) const { double dev, dist; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = fabs(*p1 - *p2); if(!ISNAN(dev)) { dist += dev; ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); return dist; } double canberra(t_index i1, t_index i2) const { double dev, dist, sum, diff; int count, j; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { sum = fabs(*p1 + *p2); diff = fabs(*p1 - *p2); if (sum > DBL_MIN || diff > DBL_MIN) { dev = diff/sum; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" 
#endif if(!ISNAN(dev) || (!R_FINITE(diff) && diff == sum && /* use Inf = lim x -> oo */ (dev = 1.))) { dist += dev; ++count; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); return dist; } double dist_binary(t_index i1, t_index i2) const { int total, count, dist; int j; total = 0; count = 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { if(!both_FINITE(*p1, *p2)) { // warning(_("treating non-finite values as NA")); } else { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfloat-equal" #endif if(*p1 || *p2) { ++count; if( ! (*p1 && *p2) ) { ++dist; } } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif ++total; } } ++p1; ++p2; } if(total == 0) return NA_REAL; if(count == 0) return 0; return static_cast(dist) / static_cast(count); } double minkowski(t_index i1, t_index i2) const { double dev, dist; int count, j; count= 0; dist = 0; double * p1 = x+i1*nc; double * p2 = x+i2*nc; for(j = 0 ; j < nc ; ++j) { if(both_non_NA(*p1, *p2)) { dev = (*p1 - *p2); if(!ISNAN(dev)) { dist += R_pow(fabs(dev), p); ++count; } } ++p1; ++p2; } if(count == 0) return NA_REAL; if(count != nc) dist /= (static_cast(count)/static_cast(nc)); //return R_pow(dist, 1.0/p); // raise to the (1/p)-th power later return dist; } }; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif extern "C" { SEXP fastcluster(SEXP const N_, SEXP const method_, SEXP D_, SEXP members_) { SEXP r = NULL; // return value try{ /* Input checks */ // Parameter N: number of data points if (!IS_INTEGER(N_) || LENGTH(N_)!=1) Rf_error("'N' must be a single integer."); const int N = INTEGER_VALUE(N_); if (N<2) Rf_error("N must be at least 2."); const R_xlen_t NN = static_cast(N)*(N-1)/2; // Parameter method: dissimilarity index update method if (!IS_INTEGER(method_) || LENGTH(method_)!=1) Rf_error("'method' 
must be a single integer."); const int method = INTEGER_VALUE(method_) - 1; // index-0 based; if (methodMAX_METHOD_CODE) { Rf_error("Invalid method index."); } // Parameter members: number of members in each node auto_array_ptr members; if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD_D || method==METHOD_METR_WARD_D2 || method==METHOD_METR_CENTROID) { members.init(N); if (Rf_isNull(members_)) { for (t_index i=0; i D__; if (method!=METHOD_METR_SINGLE) { D__.init(NN); for (R_xlen_t i=0; i(N)*(N-1)/2; ++DD) *DD *= *DD; } /* Clustering step */ cluster_result Z2(N-1); switch (method) { case METHOD_METR_SINGLE: MST_linkage_core(N, D, Z2); break; case METHOD_METR_COMPLETE: NN_chain_core(N, D__, NULL, Z2); break; case METHOD_METR_AVERAGE: NN_chain_core(N, D__, members, Z2); break; case METHOD_METR_WEIGHTED: NN_chain_core(N, D__, NULL, Z2); break; case METHOD_METR_WARD_D: case METHOD_METR_WARD_D2: NN_chain_core(N, D__, members, Z2); break; case METHOD_METR_CENTROID: generic_linkage(N, D__, members, Z2); break; case METHOD_METR_MEDIAN: generic_linkage(N, D__, NULL, Z2); break; default: throw std::runtime_error(std::string("Invalid method.")); } D__.free(); // Free the memory now members.free(); // (not strictly necessary). 
SEXP m; // return field "merge" PROTECT(m = NEW_INTEGER(2*(N-1))); int * const merge = INTEGER_POINTER(m); SEXP dim_m; // Specify that m is an (N-1)×2 matrix PROTECT(dim_m = NEW_INTEGER(2)); INTEGER(dim_m)[0] = N-1; INTEGER(dim_m)[1] = 2; SET_DIM(m, dim_m); SEXP h; // return field "height" PROTECT(h = NEW_NUMERIC(N-1)); double * const height = NUMERIC_POINTER(h); SEXP o; // return fiels "order' PROTECT(o = NEW_INTEGER(N)); int * const order = INTEGER_POINTER(o); if (method==METHOD_METR_WARD_D2) { Z2.sqrt(); } if (method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) generate_R_dendrogram(merge, height, order, Z2, N); else generate_R_dendrogram(merge, height, order, Z2, N); SEXP n; // names PROTECT(n = NEW_CHARACTER(3)); SET_STRING_ELT(n, 0, COPY_TO_USER_STRING("merge")); SET_STRING_ELT(n, 1, COPY_TO_USER_STRING("height")); SET_STRING_ELT(n, 2, COPY_TO_USER_STRING("order")); PROTECT(r = NEW_LIST(3)); // field names in the output list SET_ELEMENT(r, 0, m); SET_ELEMENT(r, 1, h); SET_ELEMENT(r, 2, o); SET_NAMES(r, n); UNPROTECT(6); // m, dim_m, h, o, r, n } // try catch (const std::bad_alloc&) { Rf_error( "Memory overflow."); } catch(const std::exception& e){ Rf_error( e.what() ); } catch(const nan_error&){ Rf_error("NaN dissimilarity value."); } #ifdef FE_INVALID catch(const fenv_error&){ Rf_error( "NaN dissimilarity value in intermediate results."); } #endif catch(...){ Rf_error( "C++ exception (unknown reason)." 
); } return r; } SEXP fastcluster_vector(SEXP const method_, SEXP const metric_, SEXP X_, SEXP members_, SEXP p_) { SEXP r = NULL; // return value try{ /* Input checks */ // Parameter method: dissimilarity index update method if (!IS_INTEGER(method_) || LENGTH(method_)!=1) Rf_error("'method' must be a single integer."); int method = INTEGER_VALUE(method_) - 1; // index-0 based; if (methodMAX_METHOD_VECTOR_CODE) { Rf_error("Invalid method index."); } // Parameter metric if (!IS_INTEGER(metric_) || LENGTH(metric_)!=1) Rf_error("'metric' must be a single integer."); int metric = INTEGER_VALUE(metric_) - 1; // index-0 based; if (metric<0 || metric>5 || (method!=METHOD_VECTOR_SINGLE && metric!=0) ) { Rf_error("Invalid metric index."); } // data array PROTECT(X_ = AS_NUMERIC(X_)); SEXP dims_ = PROTECT( Rf_getAttrib( X_, R_DimSymbol ) ) ; if( dims_ == R_NilValue || LENGTH(dims_) != 2 ) { Rf_error( "Argument is not a matrix."); } const int * const dims = INTEGER(dims_); const int N = dims[0]; const int dim = dims[1]; if (N<2) Rf_error("There must be at least two data points."); // Make a working copy of the dissimilarity array // for all methods except "single". double * X__ = NUMERIC_POINTER(X_); // Copy the input array and change it from Fortran-contiguous style // to C-contiguous style. 
auto_array_ptr X(LENGTH(X_)); for (std::ptrdiff_t i=0; i members; if (method==METHOD_VECTOR_WARD || method==METHOD_VECTOR_CENTROID) { members.init(N); if (Rf_isNull(members_)) { for (t_index i=0; i(method), static_cast(metric), p, make_row_repr); cluster_result Z2(N-1); /* Clustering step */ switch (method) { case METHOD_VECTOR_SINGLE: MST_linkage_core_vector(N, dist, Z2); break; case METHOD_VECTOR_WARD: generic_linkage_vector(N, dist, Z2); break; case METHOD_VECTOR_CENTROID: generic_linkage_vector_alternative(N, dist, Z2); break; case METHOD_VECTOR_MEDIAN: generic_linkage_vector_alternative(N, dist, Z2); break; default: throw std::runtime_error(std::string("Invalid method.")); } X.free(); // Free the memory now members.free(); // (not strictly necessary). dist.postprocess(Z2); SEXP m; // return field "merge" PROTECT(m = NEW_INTEGER(2*(N-1))); int * const merge = INTEGER_POINTER(m); SEXP dim_m; // Specify that m is an (N-1)×2 matrix PROTECT(dim_m = NEW_INTEGER(2)); INTEGER(dim_m)[0] = N-1; INTEGER(dim_m)[1] = 2; SET_DIM(m, dim_m); SEXP h; // return field "height" PROTECT(h = NEW_NUMERIC(N-1)); double * const height = NUMERIC_POINTER(h); SEXP o; // return fiels "order' PROTECT(o = NEW_INTEGER(N)); int * const order = INTEGER_POINTER(o); if (method==METHOD_VECTOR_SINGLE) generate_R_dendrogram(merge, height, order, Z2, N); else generate_R_dendrogram(merge, height, order, Z2, N); SEXP n; // names PROTECT(n = NEW_CHARACTER(3)); SET_STRING_ELT(n, 0, COPY_TO_USER_STRING("merge")); SET_STRING_ELT(n, 1, COPY_TO_USER_STRING("height")); SET_STRING_ELT(n, 2, COPY_TO_USER_STRING("order")); PROTECT(r = NEW_LIST(3)); // field names in the output list SET_ELEMENT(r, 0, m); SET_ELEMENT(r, 1, h); SET_ELEMENT(r, 2, o); SET_NAMES(r, n); UNPROTECT(6); // m, dim_m, h, o, r, n } // try catch (const std::bad_alloc&) { Rf_error( "Memory overflow."); } catch(const std::exception& e){ Rf_error( e.what() ); } catch(const nan_error&){ Rf_error("NaN dissimilarity value."); } catch(...){ 
Rf_error( "C++ exception (unknown reason)." ); } return r; } #if HAVE_VISIBILITY #pragma GCC visibility push(default) #endif void R_init_fastcluster(DllInfo * const info) { R_CallMethodDef callMethods[] = { {"fastcluster", (DL_FUNC) &fastcluster, 4}, {"fastcluster_vector", (DL_FUNC) &fastcluster_vector, 5}, {NULL, NULL, 0} }; R_registerRoutines(info, NULL, callMethods, NULL, NULL); } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif } // extern "C" #if HAVE_VISIBILITY #pragma GCC visibility pop #endif fastcluster/src/fastcluster_python.cpp0000644000176200001440000010634712470717051020063 0ustar liggesusers/* fastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner */ // for INT32_MAX in fastcluster.cpp // This must be defined here since Python.h loads the header file pyport.h, // and from this stdint.h. INT32_MAX is defined in stdint.h, but only if // __STDC_LIMIT_MACROS is defined. #define __STDC_LIMIT_MACROS #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6)) #define HAVE_DIAGNOSTIC 1 #endif #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wswitch-default" #pragma GCC diagnostic ignored "-Wpadded" #pragma GCC diagnostic ignored "-Wlong-long" #pragma GCC diagnostic ignored "-Wformat" #endif #include #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wlong-long" #pragma GCC diagnostic ignored "-Wpedantic" #pragma GCC diagnostic ignored "-Wpadded" #pragma GCC diagnostic ignored "-Wcast-qual" #endif #include #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif /* It's complicated, but if I do not include the C++ math headers, GCC will complain about conversions from 'double' to 'float', whenever 'isnan' is called in a templated function (but not outside templates). The '#include ' seems to cure the problem. 
*/ //#include #define fc_isnan(X) ((X)!=(X)) // There is Py_IS_NAN but it is so much slower on my x86_64 system with GCC! #include // for std::ptrdiff_t #include // for std::numeric_limits<...>::infinity() #include // for std::stable_sort #include // for std::bad_alloc #include // for std::exception #include "fastcluster.cpp" // backwards compatibility #ifndef NPY_ARRAY_CARRAY_RO #define NPY_ARRAY_CARRAY_RO NPY_CARRAY_RO #endif /* Since the public interface is given by the Python respectively R interface, * we do not want other symbols than the interface initalization routines to be * visible in the shared object file. The "visibility" switch is a GCC concept. * Hiding symbols keeps the relocation table small and decreases startup time. * See http://gcc.gnu.org/wiki/Visibility */ #if HAVE_VISIBILITY #pragma GCC visibility push(hidden) #endif /* Convenience class for the output array: automatic counter. */ class linkage_output { private: t_float * Z; public: linkage_output(t_float * const Z_) : Z(Z_) {} void append(const t_index node1, const t_index node2, const t_float dist, const t_float size) { if (node1(node1); *(Z++) = static_cast(node2); } else { *(Z++) = static_cast(node2); *(Z++) = static_cast(node1); } *(Z++) = dist; *(Z++) = size; } }; /* Generate the SciPy-specific output format for a dendrogram from the clustering output. The list of merging steps can be sorted or unsorted. */ // The size of a node is either 1 (a single point) or is looked up from // one of the clusters. #define size_(r_) ( ((r_ static void generate_SciPy_dendrogram(t_float * const Z, cluster_result & Z2, const t_index N) { // The array "nodes" is a union-find data structure for the cluster // identities (only needed for unsorted cluster_result input). union_find nodes(sorted ? 
0 : N); if (!sorted) { std::stable_sort(Z2[0], Z2[N-1]); } linkage_output output(Z); t_index node1, node2; for (node const * NN=Z2[0]; NN!=Z2[N-1]; ++NN) { // Get two data points whose clusters are merged in step i. if (sorted) { node1 = NN->node1; node2 = NN->node2; } else { // Find the cluster identifiers for these points. node1 = nodes.Find(NN->node1); node2 = nodes.Find(NN->node2); // Merge the nodes in the union-find data structure by making them // children of a new node. nodes.Union(node1, node2); } output.append(node1, node2, NN->dist, size_(node1)+size_(node2)); } } /* Python interface code */ static PyObject * linkage_wrap(PyObject * const self, PyObject * const args); static PyObject * linkage_vector_wrap(PyObject * const self, PyObject * const args); // List the C++ methods that this extension provides. static PyMethodDef _fastclusterWrapMethods[] = { {"linkage_wrap", linkage_wrap, METH_VARARGS, NULL}, {"linkage_vector_wrap", linkage_vector_wrap, METH_VARARGS, NULL}, {NULL, NULL, 0, NULL} /* Sentinel - marks the end of this structure */ }; /* Tell Python about these methods. Python 2.x and 3.x differ in their C APIs for this part. */ #if PY_VERSION_HEX >= 0x03000000 static struct PyModuleDef fastclustermodule = { PyModuleDef_HEAD_INIT, "_fastcluster", NULL, // no module documentation -1, /* size of per-interpreter state of the module, or -1 if the module keeps state in global variables. */ _fastclusterWrapMethods, NULL, NULL, NULL, NULL }; /* Make the interface initalization routines visible in the shared object * file. */ #if HAVE_VISIBILITY #pragma GCC visibility push(default) #endif PyMODINIT_FUNC PyInit__fastcluster(void) { PyObject * m; m = PyModule_Create(&fastclustermodule); if (!m) { return NULL; } import_array(); // Must be present for NumPy. Called first after above line. 
return m; } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif # else // Python 2.x #if HAVE_VISIBILITY #pragma GCC visibility push(default) #endif PyMODINIT_FUNC init_fastcluster(void) { (void) Py_InitModule("_fastcluster", _fastclusterWrapMethods); import_array(); // Must be present for NumPy. Called first after above line. } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif #endif // PY_VERSION class GIL_release { private: // noncopyable GIL_release(GIL_release const &); GIL_release & operator=(GIL_release const &); public: inline GIL_release(bool really = true) : _save(really ? PyEval_SaveThread() : NULL) { } inline ~GIL_release() { if (_save) PyEval_RestoreThread(_save); } private: PyThreadState * _save; }; /* Interface to Python, part 1: The input is a dissimilarity matrix. */ static PyObject *linkage_wrap(PyObject * const, PyObject * const args) { PyArrayObject * D, * Z; long int N_ = 0; unsigned char method; try{ #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif // Parse the input arguments if (!PyArg_ParseTuple(args, "lO!O!b", &N_, // signed long integer &PyArray_Type, &D, // NumPy array &PyArray_Type, &Z, // NumPy array &method)) { // unsigned char return NULL; // Error if the arguments have the wrong type. } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (N_ < 1 ) { // N must be at least 1. PyErr_SetString(PyExc_ValueError, "At least one element is needed for clustering."); return NULL; } /* (1) The biggest index used below is 4*(N-2)+3, as an index to Z. This must fit into the data type used for indices. (2) The largest representable integer, without loss of precision, by a floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we make sure that all cluster labels from 0 to 2N-2 in the output can be accurately represented by a floating point number. 
Conversion of N to 64 bits below is not really necessary but it prevents a warning ("shift count >= width of type") on systems where "long int" is 32 bits wide. */ if (N_ > MAX_INDEX/4 || static_cast(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) { PyErr_SetString(PyExc_ValueError, "Data is too big, index overflow."); return NULL; } t_index N = static_cast(N_); // Allow threads! GIL_release G; t_float * const D_ = reinterpret_cast(PyArray_DATA(D)); cluster_result Z2(N-1); auto_array_ptr members; // For these methods, the distance update formula needs the number of // data points in a cluster. if (method==METHOD_METR_AVERAGE || method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.init(N, 1); } // Operate on squared distances for these methods. if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) { for (t_float * DD = D_; DD!=D_+static_cast(N)*(N-1)/2; ++DD) *DD *= *DD; } switch (method) { case METHOD_METR_SINGLE: MST_linkage_core(N, D_, Z2); break; case METHOD_METR_COMPLETE: NN_chain_core(N, D_, NULL, Z2); break; case METHOD_METR_AVERAGE: NN_chain_core(N, D_, members, Z2); break; case METHOD_METR_WEIGHTED: NN_chain_core(N, D_, NULL, Z2); break; case METHOD_METR_WARD: NN_chain_core(N, D_, members, Z2); break; case METHOD_METR_CENTROID: generic_linkage(N, D_, members, Z2); break; case METHOD_METR_MEDIAN: generic_linkage(N, D_, NULL, Z2); break; default: throw std::runtime_error(std::string("Invalid method index.")); } if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) { Z2.sqrt(); } t_float * const Z_ = reinterpret_cast(PyArray_DATA(Z)); if (method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN) { generate_SciPy_dendrogram(Z_, Z2, N); } else { generate_SciPy_dendrogram(Z_, Z2, N); } } // try catch (const std::bad_alloc&) { return PyErr_NoMemory(); } catch(const std::exception& e){ PyErr_SetString(PyExc_EnvironmentError, e.what()); return NULL; } catch(const nan_error&){ 
PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value."); return NULL; } #ifdef FE_INVALID catch(const fenv_error&){ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value in intermediate results."); return NULL; } #endif catch(...){ PyErr_SetString(PyExc_EnvironmentError, "C++ exception (unknown reason). Please send a bug report."); return NULL; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif Py_RETURN_NONE; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } /* Part 2: Clustering on vector data */ /* Metric codes. These codes must agree with the dictionary mtridx in fastcluster.py. */ enum metric_codes { // metrics METRIC_EUCLIDEAN = 0, METRIC_MINKOWSKI = 1, METRIC_CITYBLOCK = 2, METRIC_SEUCLIDEAN = 3, METRIC_SQEUCLIDEAN = 4, METRIC_COSINE = 5, METRIC_HAMMING = 6, METRIC_JACCARD = 7, METRIC_CHEBYCHEV = 8, METRIC_CANBERRA = 9, METRIC_BRAYCURTIS = 10, METRIC_MAHALANOBIS = 11, METRIC_YULE = 12, METRIC_MATCHING = 13, METRIC_DICE = 14, METRIC_ROGERSTANIMOTO = 15, METRIC_RUSSELLRAO = 16, METRIC_SOKALSNEATH = 17, METRIC_KULSINSKI = 18, METRIC_USER = 19, METRIC_INVALID = 20, // sentinel METRIC_JACCARD_BOOL = 21, // separate function for Jaccard metric on }; // Boolean input data /* Helper class: Throw this if calling the Python interpreter from within C returned an error. */ class pythonerror {}; /* This class handles all the information about the dissimilarity computation. 
*/ class python_dissimilarity { private: t_float * Xa; std::ptrdiff_t dim; // size_t saves many statis_cast<> in products t_index N; auto_array_ptr Xnew; t_index * members; void (cluster_result::*postprocessfn) (const t_float) const; t_float postprocessarg; t_float (python_dissimilarity::*distfn) (const t_index, const t_index) const; // for user-defined metrics PyObject * X_Python; PyObject * userfn; auto_array_ptr precomputed; t_float * precomputed2; PyArrayObject * V; const t_float * V_data; // noncopyable python_dissimilarity(); python_dissimilarity(python_dissimilarity const &); python_dissimilarity & operator=(python_dissimilarity const &); public: // Ignore warning about uninitialized member variables. I know what I am // doing here, and some member variables are only used for certain metrics. #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Weffc++" #endif python_dissimilarity (PyArrayObject * const Xarg, t_index * const members_, const method_codes method, const metric_codes metric, PyObject * const extraarg, bool temp_point_array) : Xa(reinterpret_cast(PyArray_DATA(Xarg))), dim(PyArray_DIM(Xarg, 1)), N(static_cast(PyArray_DIM(Xarg, 0))), Xnew(temp_point_array ? 
(N-1)*dim : 0), members(members_), postprocessfn(NULL), V(NULL) { switch (method) { case METHOD_METR_SINGLE: postprocessfn = NULL; // default switch (metric) { case METRIC_EUCLIDEAN: set_euclidean(); break; case METRIC_SEUCLIDEAN: if (extraarg==NULL) { PyErr_SetString(PyExc_TypeError, "The 'seuclidean' metric needs a variance parameter."); throw pythonerror(); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif V = reinterpret_cast(PyArray_FromAny(extraarg, PyArray_DescrFromType(NPY_DOUBLE), 1, 1, NPY_ARRAY_CARRAY_RO, NULL)); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (PyErr_Occurred()) { throw pythonerror(); } if (PyArray_DIM(V, 0)!=dim) { PyErr_SetString(PyExc_ValueError, "The variance vector must have the same dimensionality as the data."); throw pythonerror(); } V_data = reinterpret_cast(PyArray_DATA(V)); distfn = &python_dissimilarity::seuclidean; postprocessfn = &cluster_result::sqrt; break; case METRIC_SQEUCLIDEAN: distfn = &python_dissimilarity::sqeuclidean; break; case METRIC_CITYBLOCK: set_cityblock(); break; case METRIC_CHEBYCHEV: set_chebychev(); break; case METRIC_MINKOWSKI: set_minkowski(extraarg); break; case METRIC_COSINE: distfn = &python_dissimilarity::cosine; postprocessfn = &cluster_result::plusone; // precompute norms precomputed.init(N); for (t_index i=0; i(dim); break; case METRIC_JACCARD: distfn = &python_dissimilarity::jaccard; break; case METRIC_CANBERRA: distfn = &python_dissimilarity::canberra; break; case METRIC_BRAYCURTIS: distfn = &python_dissimilarity::braycurtis; break; case METRIC_MAHALANOBIS: if (extraarg==NULL) { PyErr_SetString(PyExc_TypeError, "The 'mahalanobis' metric needs a parameter for the inverse covariance."); throw pythonerror(); } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif V = reinterpret_cast(PyArray_FromAny(extraarg, PyArray_DescrFromType(NPY_DOUBLE), 2, 2, NPY_ARRAY_CARRAY_RO, NULL)); 
#if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (PyErr_Occurred()) { throw pythonerror(); } if (PyArray_DIM(V, 0)!=N || PyArray_DIM(V, 1)!=dim) { PyErr_SetString(PyExc_ValueError, "The inverse covariance matrix has the wrong size."); throw pythonerror(); } V_data = reinterpret_cast(PyArray_DATA(V)); distfn = &python_dissimilarity::mahalanobis; postprocessfn = &cluster_result::sqrt; break; case METRIC_YULE: distfn = &python_dissimilarity::yule; break; case METRIC_MATCHING: distfn = &python_dissimilarity::matching; postprocessfn = &cluster_result::divide; postprocessarg = static_cast(dim); break; case METRIC_DICE: distfn = &python_dissimilarity::dice; break; case METRIC_ROGERSTANIMOTO: distfn = &python_dissimilarity::rogerstanimoto; break; case METRIC_RUSSELLRAO: distfn = &python_dissimilarity::russellrao; postprocessfn = &cluster_result::divide; postprocessarg = static_cast(dim); break; case METRIC_SOKALSNEATH: distfn = &python_dissimilarity::sokalsneath; break; case METRIC_KULSINSKI: distfn = &python_dissimilarity::kulsinski; postprocessfn = &cluster_result::plusone; precomputed.init(N); for (t_index i=0; i(sum); } break; case METRIC_USER: X_Python = reinterpret_cast(Xarg); this->userfn = extraarg; distfn = &python_dissimilarity::user; break; default: // case METRIC_JACCARD_BOOL: distfn = &python_dissimilarity::jaccard_bool; } break; case METHOD_METR_WARD: postprocessfn = &cluster_result::sqrtdouble; break; default: postprocessfn = &cluster_result::sqrt; } } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif ~python_dissimilarity() { #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif Py_XDECREF(V); #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } inline t_float operator () (const t_index i, const t_index j) const { return (this->*distfn)(i,j); } inline t_float X (const t_index i, const t_index j) const { return Xa[i*dim+j]; } inline bool Xb (const t_index i, const t_index j) const { return 
reinterpret_cast(Xa)[i*dim+j]; } inline t_float * Xptr(const t_index i, const t_index j) const { return Xa+i*dim+j; } void merge(const t_index i, const t_index j, const t_index newnode) const { t_float const * const Pi = i(members[i]) + Pj[k]*static_cast(members[j])) / static_cast(members[i]+members[j]); } members[newnode] = members[i]+members[j]; } void merge_weighted(const t_index i, const t_index j, const t_index newnode) const { t_float const * const Pi = i(members[i]) + Pj[k]*static_cast(members[j])) / static_cast(members[i]+members[j]); } members[j] += members[i]; } void merge_inplace_weighted(const t_index i, const t_index j) const { t_float const * const Pi = Xa+i*dim; t_float * const Pj = Xa+j*dim; for(t_index k=0; k(members[i]); t_float mj = static_cast(members[j]); return sqeuclidean(i,j)*mi*mj/(mi+mj); } inline t_float ward_initial(const t_index i, const t_index j) const { // alias for sqeuclidean // Factor 2!!! return sqeuclidean(i,j); } // This method must not produce NaN if the input is non-NaN. inline static t_float ward_initial_conversion(const t_float min) { return min*.5; } inline t_float ward_extended(const t_index i, const t_index j) const { t_float mi = static_cast(members[i]); t_float mj = static_cast(members[j]); return sqeuclidean_extended(i,j)*mi*mj/(mi+mj); } /* We need two variants of the Euclidean metric: one that does not check for a NaN result, which is used for the initial distances, and one which does, for the updated distances during the clustering procedure. 
*/ template t_float sqeuclidean(const t_index i, const t_index j) const { t_float sum = 0; /* for (t_index k=0; k::infinity()) { set_chebychev(); } else if (postprocessarg==1.0){ set_cityblock(); } else if (postprocessarg==2.0){ set_euclidean(); } else { distfn = &python_dissimilarity::minkowski; postprocessfn = &cluster_result::power; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } void set_euclidean() { distfn = &python_dissimilarity::sqeuclidean; postprocessfn = &cluster_result::sqrt; } void set_cityblock() { distfn = &python_dissimilarity::cityblock; } void set_chebychev() { distfn = &python_dissimilarity::chebychev; } t_float seuclidean(const t_index i, const t_index j) const { t_float sum = 0; for (t_index k=0; kmax) { max = diff; } } return max; } t_float cosine(const t_index i, const t_index j) const { t_float sum = 0; for (t_index k=0; k(sum1) / static_cast(sum2); } t_float canberra(const t_index i, const t_index j) const { t_float sum = 0; for (t_index k=0; k(dim)-NTT-NXO); // NFFTT } void nbool_correspond_xo(const t_index i, const t_index j) const { NXO = 0; for (t_index k=0; k(2*NTFFT) / static_cast(NTFFT + NFFTT); } // Prevent a zero denominator for equal vectors. t_float dice(const t_index i, const t_index j) const { nbool_correspond(i, j); return (NXO==0) ? 0 : static_cast(NXO) / static_cast(NXO+2*NTT); } t_float rogerstanimoto(const t_index i, const t_index j) const { nbool_correspond_xo(i, j); return static_cast(2*NXO) / static_cast(NXO+dim); } t_float russellrao(const t_index i, const t_index j) const { nbool_correspond_tt(i, j); return static_cast(dim-NTT); } // Prevent a zero denominator for equal vectors. t_float sokalsneath(const t_index i, const t_index j) const { nbool_correspond(i, j); return (NXO==0) ? 
0 : static_cast(2*NXO) / static_cast(NTT+2*NXO); } t_float kulsinski(const t_index i, const t_index j) const { nbool_correspond_tt(i, j); return static_cast(NTT) * (precomputed[i] + precomputed[j]); } // 'matching' distance = Hamming distance t_float matching(const t_index i, const t_index j) const { nbool_correspond_xo(i, j); return static_cast(NXO); } // Prevent a zero denominator for equal vectors. t_float jaccard_bool(const t_index i, const t_index j) const { nbool_correspond(i, j); return (NXO==0) ? 0 : static_cast(NXO) / static_cast(NXO+NTT); } }; static PyObject *linkage_vector_wrap(PyObject * const, PyObject * const args) { PyArrayObject * X, * Z; unsigned char method, metric; PyObject * extraarg; try{ // Parse the input arguments #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif if (!PyArg_ParseTuple(args, "O!O!bbO", &PyArray_Type, &X, // NumPy array &PyArray_Type, &Z, // NumPy array &method, // unsigned char &metric, // unsigned char &extraarg )) { // Python object return NULL; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif if (PyArray_NDIM(X) != 2) { PyErr_SetString(PyExc_ValueError, "The input array must be two-dimensional."); } npy_intp const N_ = PyArray_DIM(X, 0); if (N_ < 1 ) { // N must be at least 1. PyErr_SetString(PyExc_ValueError, "At least one element is needed for clustering."); return NULL; } npy_intp const dim = PyArray_DIM(X, 1); if (dim < 1 ) { PyErr_SetString(PyExc_ValueError, "Invalid dimension of the data set."); return NULL; } /* (1) The biggest index used below is 4*(N-2)+3, as an index to Z. This must fit into the data type used for indices. (2) The largest representable integer, without loss of precision, by a floating point number of type t_float is 2^T_FLOAT_MANT_DIG. Here, we make sure that all cluster labels from 0 to 2N-2 in the output can be accurately represented by a floating point number. 
Conversion of N to 64 bits below is not really necessary but it prevents a warning ("shift count >= width of type") on systems where "int" is 32 bits wide. */ if (N_ > MAX_INDEX/4 || dim > MAX_INDEX || static_cast(N_-1)>>(T_FLOAT_MANT_DIG-1) > 0) { PyErr_SetString(PyExc_ValueError, "Data is too big, index overflow."); return NULL; } t_index N = static_cast(N_); cluster_result Z2(N-1); auto_array_ptr members; if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.init(2*N-1, 1); } if ((method!=METHOD_METR_SINGLE && metric!=METRIC_EUCLIDEAN) || metric>=METRIC_INVALID) { PyErr_SetString(PyExc_IndexError, "Invalid metric index."); return NULL; } if (PyArray_ISBOOL(X)) { if (metric==METRIC_HAMMING) { metric = METRIC_MATCHING; // Alias } if (metric==METRIC_JACCARD) { metric = METRIC_JACCARD_BOOL; } } if (extraarg!=Py_None && metric!=METRIC_MINKOWSKI && metric!=METRIC_SEUCLIDEAN && metric!=METRIC_MAHALANOBIS && metric!=METRIC_USER) { PyErr_SetString(PyExc_TypeError, "No extra parameter is allowed for this metric."); return NULL; } /* temp_point_array must be true if the alternative algorithm is used below (currently for the centroid and median methods). */ bool temp_point_array = (method==METHOD_METR_CENTROID || method==METHOD_METR_MEDIAN); python_dissimilarity dist(X, members, static_cast(method), static_cast(metric), extraarg, temp_point_array); if (method!=METHOD_METR_SINGLE && method!=METHOD_METR_WARD && method!=METHOD_METR_CENTROID && method!=METHOD_METR_MEDIAN) { PyErr_SetString(PyExc_IndexError, "Invalid method index."); return NULL; } // Allow threads if the metric is not "user"! 
GIL_release G(metric!=METRIC_USER); switch (method) { case METHOD_METR_SINGLE: MST_linkage_core_vector(N, dist, Z2); break; case METHOD_METR_WARD: generic_linkage_vector(N, dist, Z2); break; case METHOD_METR_CENTROID: generic_linkage_vector_alternative(N, dist, Z2); break; default: // case METHOD_METR_MEDIAN: generic_linkage_vector_alternative(N, dist, Z2); } if (method==METHOD_METR_WARD || method==METHOD_METR_CENTROID) { members.free(); } dist.postprocess(Z2); t_float * const Z_ = reinterpret_cast(PyArray_DATA(Z)); if (method!=METHOD_METR_SINGLE) { generate_SciPy_dendrogram(Z_, Z2, N); } else { generate_SciPy_dendrogram(Z_, Z2, N); } } // try catch (const std::bad_alloc&) { return PyErr_NoMemory(); } catch(const std::exception& e){ PyErr_SetString(PyExc_EnvironmentError, e.what()); return NULL; } catch(const nan_error&){ PyErr_SetString(PyExc_FloatingPointError, "NaN dissimilarity value."); return NULL; } catch(const pythonerror){ return NULL; } catch(...){ PyErr_SetString(PyExc_EnvironmentError, "C++ exception (unknown reason). Please send a bug report."); return NULL; } #if HAVE_DIAGNOSTIC #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wold-style-cast" #endif Py_RETURN_NONE; #if HAVE_DIAGNOSTIC #pragma GCC diagnostic pop #endif } #if HAVE_VISIBILITY #pragma GCC visibility pop #endif fastcluster/src/Makevars.win0000644000176200001440000000003211727523223015667 0ustar liggesusersOBJECTS = fastcluster_R.o fastcluster/NAMESPACE0000644000176200001440000000011711727523223014033 0ustar liggesusersuseDynLib(fastcluster, .registration=TRUE) export('hclust', 'hclust.vector') fastcluster/INSTALL0000644000176200001440000000751412254004022013640 0ustar liggesusersfastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner Installation ‾‾‾‾‾‾‾‾‾‾‾‾ Installation procedures were tested under 64-bit Ubuntu. CRAN also hosts precompiled binaries (of the R library, not the Python module) for Windows and OS X. 
In principle, it should be possible to install the fastcluster package on any system that has a C++ compiler and R respectively Python with NumPy. There are no unusual libraries needed to compile the package, only the STL library, which every C++ compiler should have by default. Please send me feedback if you accomplish to install the fastcluster package on a certain platform but needed to tweak the configuration! I will update the installation instructions and modify the package if needed (eg. include the right compiler flags for various operating systems). Installation for R ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Enter the command install.packages("fastcluster") in R, and R will download the package automatically, then install it. That's it! If this does not work, please consult R's help function by typing ?INSTALL from within R or read the “R installation and administration” manual: http://cran.r-project.org/doc/manuals/R-admin.html#Installing-packages For manual download, you can get the fastcluster package from the download page at CRAN: http://cran.r-project.org/web/packages/fastcluster/ You may need to start R with administrator rights to be able to install packages. There are ways to install R packages without administrator privileges in your user directories. See this help page for example: http://csg.sph.umich.edu/docs/R/localpackages.html Installation for Python ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Make sure that you have both Python and NumPy installed. 1. Microsoft Windows ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Christoph Gohlke provides installation files for Windows on his web page: http://www.lfd.uci.edu/~gohlke/pythonlibs/#fastcluster 2. Other operating systems, with setuptools ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ If setuptools are installed, type easy_install --upgrade --user fastcluster in a terminal, which automatically downloads the latest version from PyPI, compiles the C++ library and installs the package for a single user without administrator rights. 
If you cannot make this work, use method 3. Also, this method gives you access to all accompanying information, in particular the documentation in docs/fastcluster.pdf. Moreover, it installs (insignificantly) fewer files. 3. Without setuptools ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ If you have not done so already, download the fastcluster package from PyPI here: http://pypi.python.org/pypi/fastcluster/ Open a terminal, go to the directory with the downloaded file and extract the contents of the archive with: tar -xvf fastcluster-(version).tar.gz Alternatively, use your favorite archive manager for unpacking, eg. on Windows. This will generate a new directory “fastcluster-(version)”. Switch to this subdirectory: cd fastcluster-(...) The source distribution on CRAN also contains the complete source files. See the directory src/python there. Now compile and install the Python module by: python setup.py install You may need to precede this command with sudo or install the package in your home directory, like this: python setup.py install --user See the chapter “Installing Python modules” in the Python documentation for further help: http://docs.python.org/install/index.html fastcluster/NEWS0000644000176200001440000001242412470453460013320 0ustar liggesusersfastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner Version history ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ Version 1.0.0, 03/14/2011: • Initial release, dependent on Rcpp. Not available on CRAN. Version 1.0.1, 03/15/2011: • Removed the dependence on Rcpp; only R's original C interface is used. Version 1.0.2, 03/17/2011: • File DESCRIPTION: Fixed a typo Version 1.0.3, 03/20/2011: • File README: Removed the warning about false results from the flashClust package since the new flashClust version 1.01 has this error corrected. • Cleaned the test file fastcluster_test.R up. 
(No dependence on the MASS package any more) Version 1.0.4, 03/21/2011: • Changed the name of the external function from the outdated "Rcpp_linkage" to "fastcluster". • Registered the external function "fastcluster" in R. • Configured the C header inclusions to work on Fedora (thanks to Peter Langfelder). Version 1.1.0, 08/21/2011 • Routines for clustering vector data. • Added a User's manual • Revision of all files Version 1.1.1, 10/08/2011 • Fixed test scripts, which indicated an error on some architectures, even if results were correct. (The assumption was that ties in single linkage clustering are resolved in the same way, both for dissimilarity input and for vector input. This is not necessarily true if the floating point unit uses "excess precision". Now the test scripts are content with arbitrary resolution of ties and do not assume a specific scheme.) • Bug fix: uninitialized function pointer in Version 1.1.0 Version 1.1.2, 10/11/2011 • Fix for Solaris: replaced ssize_t by ptrdiff_t in the C++ code. • Removed the NN-chain algorithm for vector input: it was not clear that it would work under all circumstances with the intricacies of floating- point arithmetic. Especially the effects of the excess precision on the x87 are impossible to control in a portable way. Now, the memory-saving routines for the “Ward” linkage use the generic algorithm, as “centroid” and “median” linkage do. Version 1.1.3, 12/10/2011 • Replaced ptrdiff_t by std::ptrdiff_t, as GCC 4.6.1 complains about this. Version 1.1.4, 02/01/2012 • Release the GIL in the Python package, so that it can be used efficiently in multithreaded applications. • Improved performance for the "Ward" method with vector input. • The "members" parameter in the R interface is now treated as a double array, not an integer array as before. This was a slight incompatibility with the stats::hclust function. Thanks to Matthias Studer, University of Geneva, for pointing this out. 
Version 1.1.5, 02/14/2012 • Updated the "members" specification in the User's manual to reflect the recent change. Version 1.1.6, 03/12/2012 • Bug fix related to GIL release in the Python wrapper. Thanks to Massimo Di Stefano for the bug report. • Small compatibility changes in the Python test scripts (again thanks to Massimo Di Stefano for the report). Version 1.1.7, 09/17/2012 • Scipy import is now optional (suggested by Forest Gregg) • Compatibility fix for NumPy 1.7. Thanks to Semihcan Doken for the bug report. Version 1.1.8, 08/28/2012 • Test for NaN dissimilarity values: Now the algorithms produce an error message instead of silently giving false results. The documentation was updated accordingly. This is the final design as intended: the fastcluster package handles infinity values correctly but complains about NaNs. • The Python interface now works with both Python 2 and Python 3. • Changed the license to BSD. Version 1.1.9, 03/15/2013 • Compatibility fix for the MSVC compilers on Windows. • Simplified GIL release in the Python interface. Version 1.1.10, 05/22/2013 • Updated citation information (JSS paper). • Suppress warnings where applicable. Compilation with GCC should not produce any warning at all, even if all compiler warnings are enabled. (The switch -pedantic still does not work, but this is due to the Python headers.) • Optimization: Hidden symbols. Only the interface functions are exported to the symbol table with GCC. Version 1.1.11, 05/23/2013 • Compatibility fix for Solaris. Version 1.1.12, 12/10/2013 • Tiny maintenance updates: new author web page and e-mail address, new location for R vignette. Version 1.1.13, 12/17/2013 • Moved the "python" directory due to CRAN requirements. Version 1.1.14, 01/02/2015 • Updated the DESCRIPTION file according to CRAN rules. • Renamed the “ward” method for dissimilarity input to “ward.D” in the R interface and created a new method “ward.D2”, following changes in R's hclust package. 
# fastcluster: Fast hierarchical clustering routines for R and Python
#
# Copyright © 2011 Daniel Müllner

# Hierarchical, agglomerative clustering on a condensed dissimilarity matrix.
# Drop-in replacement for stats::hclust; the clustering itself is done by the
# registered C++ routine 'fastcluster'.
#
# Arguments:
#   d       - a dissimilarity object, e.g. as produced by stats::dist
#   method  - linkage criterion, partially matched against METHODS below
#   members - optional vector of cluster sizes for the rows of 'd'
# Returns an object of class "hclust".
hclust <- function(d, method="complete", members=NULL) {
  # "ward" was renamed to "ward.D" when Ward's method was split into
  # "ward.D" and "ward.D2"; keep the old name working but tell the user.
  if (method == "ward") {
    message("The \"ward\" method has been renamed to \"ward.D\"; note new \"ward.D2\"")
    method <- "ward.D"
  }
  # This array must agree with the enum method_codes in fastcluster.cpp.
  METHODS <- c("single", "complete", "average", "mcquitty", "ward.D",
               "centroid", "median", "ward.D2")
  method <- pmatch(method, METHODS)
  if (is.na(method))
    stop("Invalid clustering method.")
  # NOTE(review): pmatch() signals an ambiguous match with NA, not -1, so
  # this branch appears unreachable; kept for compatibility with the
  # original code.
  if (method == -1)
    stop("Ambiguous clustering method.")
  # The native routine returns the merge/height/order components; attach
  # the metadata components that stats::hclust consumers expect.
  dendrogram <- c(
    .Call(fastcluster, attr(d, "Size"), method, d, members),
    list(
      labels = attr(d, "Labels")
      ,method = METHODS[method]
      ,call = match.call()
      ,dist.method = attr(d, "method")
    )
  )
  class(dendrogram) <- "hclust"
  dendrogram
}

# Hierarchical, agglomerative clustering on vector data (memory-saving
# routines: dissimilarities are computed on the fly by the C++ code instead
# of storing a full distance matrix).
#
# Arguments:
#   X       - a matrix (or object coercible to one) with one observation
#             per row
#   method  - linkage criterion; the Euclidean methods "ward", "centroid"
#             and "median" require metric="euclidean"
#   members - optional vector of cluster sizes for the rows of 'X'
#   metric  - distance metric, partially matched against METRICS below
#   p       - exponent for the "minkowski" metric
# Returns an object of class "hclust".
hclust.vector <- function(X, method='single', members=NULL, metric='euclidean', p=NULL) {
  METHODS <- c("single", "ward", "centroid", "median")
  methodidx <- pmatch(method, METHODS)
  if (is.na(methodidx))
    stop(paste0("Invalid clustering method '", method, "' for vector data."))
  # NOTE(review): as in hclust() above, pmatch() returns NA for ambiguous
  # matches, so the -1 tests are kept only for compatibility.
  if (methodidx == -1)
    stop("Ambiguous clustering method.")
  METRICS <- c("euclidean", "maximum", "manhattan", "canberra", "binary",
               "minkowski")
  metric <- pmatch(metric, METRICS)
  if (is.na(metric))
    stop("Invalid metric.")
  if (metric == -1)
    stop("Ambiguous metric.")
  if (methodidx != 1 && metric != 1)
    stop("The Euclidean methods 'ward', 'centroid' and 'median' require the 'euclidean' metric.")
  X <- as.matrix(X)
  dendrogram <- c(
    .Call(fastcluster_vector, methodidx, metric, X, members, p),
    list(
      labels = dimnames(X)[[1L]]
      ,method = METHODS[methodidx]
      ,call = match.call()
      ,dist.method = METRICS[metric]
    )
  )
  class(dendrogram) <- "hclust"
  dendrogram
}
rmdir keep fastcluster/vignettes/fastcluster.Rtex0000644000176200001440000012016112453206244020027 0ustar liggesusers\def\fastclusterversion{1.1.16} \documentclass[fontsize=10pt,paper=letter,BCOR=-6mm]{scrartcl} \usepackage[utf8]{inputenc} \usepackage{lmodern} \normalfont \usepackage[T1]{fontenc} \usepackage{textcomp} \newcommand*\q{\textquotesingle} \usepackage{amsmath} \usepackage{amsfonts} \usepackage{xcolor} \usepackage{ifpdf} \ifpdf \newcommand*\driver{} \else \newcommand*\driver{dvipdfmx} \fi \usepackage[% pdftitle={fastcluster manual}, pdfauthor={Daniel Müllner}, % pdfsubject={}, pdfdisplaydoctitle=true, % pdfduplex=DuplexFlipLongEdge, pdfstartview=FitH, colorlinks=True, pdfhighlight=/I, % pdfborder={0 0 1}, % linkbordercolor={1 .8 .8}, % citebordercolor={.5 .9 .5}, % urlbordercolor={.5 .7 1}, % linkcolor={blue}, % citecolor={blue}, urlcolor={blue!80!black}, linkcolor={red!80!black}, % runcolor={blue}, % filecolor={blue}, pdfpagemode=UseOutlines, bookmarksopen=true, bookmarksopenlevel=1, bookmarksdepth=2, breaklinks=true, unicode=true, \driver ]{hyperref} % Optimize the PDF targets and make the PDF file smaller \ifpdf\RequirePackage{hypdestopt}\fi \renewcommand*\sectionautorefname{Section} \usepackage{typearea} \DeclareMathOperator\size{size} \DeclareMathOperator\Var{Var} \newcommand*\linkage{\href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html}{\texttt{linkage}}} \newcommand*\hierarchy{\href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html}{\texttt{scipy.\hskip0pt cluster.\hskip0pt hierarchy}}} \newcommand*\hclust{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html}{\texttt{hclust}}} \newcommand*\stats{\href{http://stat.ethz.ch/R-manual/R-devel/library/stats/html/00Index.html}{\texttt{stats}}} \newcommand*\flashClustPack{\href{http://cran.r-project.org/web/packages/flashClust/index.html}{\texttt{flashClust}}} 
\newcommand*\dist{\href{http://stat.ethz.ch/R-manual/R-devel/library/stats/html/dist.html}{\texttt{dist}}} \newcommand*\print{\href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/print.html}{\texttt{print}}} \newcommand*\plot{\href{http://stat.ethz.ch/R-manual/R-patched/library/graphics/html/plot.html}{\texttt{plot}}} \newcommand*\identify{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/identify.hclust.html}{\texttt{identify}}} \newcommand*\rect{\href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/rect.hclust.html}{\texttt{rect.hclust}}} \newcommand*\NA{\href{http://stat.ethz.ch/R-manual/R-devel/library/base/html/NA.html}{\texttt{NA}}} %\usepackage{showframe} \makeatletter \newenvironment{methods}{% \list{}{\labelwidth\z@ \itemindent-\leftmargin \let\makelabel\methodslabel}% }{% \endlist } \newcommand*{\methodslabel}[1]{% %\hspace{\labelsep}% \hbox to \textwidth{\hspace{\labelsep}% \normalfont\bfseries\ttfamily #1\hskip-\labelsep\hfill}% } \makeatother \setkomafont{descriptionlabel}{\normalfont\ttfamily\bfseries} \begin{document} %\VignetteIndexEntry{User's manual} \title{The \textit{fastcluster} package: User's manual} \author{\href{http://danifold.net}{Daniel Müllner}} \date{January 7, 2015} \subtitle{Version \fastclusterversion} \maketitle \makeatletter \renewenvironment{quotation}{% \list{}{\listparindent 1em% \itemindent \listparindent \leftmargin2.5em \rightmargin \leftmargin \parsep \z@ \@plus\p@ }% \item\relax }{% \endlist } \makeatother \begin{abstract}\noindent\small The fastcluster package is a C++ library for hierarchical, agglomerative clustering. It efficiently implements the seven most widely used clustering schemes: single, complete, average, weighted/mcquitty, Ward, centroid and median linkage. The library currently has interfaces to two languages: R and Python/SciPy. 
Part of the functionality is designed as drop-in replacement for existing routines: \linkage{} in the SciPy package \hierarchy{}, \hclust{} in R's \stats{} package, and the \flashClustPack{} package. Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain. Moreover, there are memory-saving routines for clustering of vector data, which go beyond what the existing packages provide. \end{abstract} \noindent This document describes the usage for the two interfaces for R and Python and is meant as the reference document for the end user. Installation instructions are given in the file INSTALL in the source distribution and are not repeated here. The sections about the two interfaces are independent and in consequence somewhat redundant, so that users who need a reference for one interface need to consult only one section. If you use the fastcluster package for scientific work, please cite it as: \begin{quote} Daniel Müllner, \textit{fastcluster: Fast Hierarchical, Agglomerative Clustering Routines for R and Python}, Journal of Statistical Software, \textbf{53} (2013), no.~9, 1--18, \url{http://www.jstatsoft.org/v53/i09/}. \end{quote} \textbf{The fastcluster package is considered stable and will undergo few changes from now on. If some years from now there have not been any updates, this does not necessarily mean that the package is unmaintained but maybe it just was not necessary to correct anything. Of course, please still report potential bugs and incompatibilities to \texttt{daniel@danifold.net}.} \tableofcontents \section{The R interface} Load the package with the following command: \begin{quote} \texttt{library(\q fastcluster\q)} \end{quote} The package overwrites the function \hclust{} from the \stats{} package (in the same way as the \flashClustPack{} package does). 
Please remove any references to the \flashClustPack{} package in your R files to not accidentally overwrite the \hclust{} function with the \flashClustPack{} version. The \hyperref[hclust]{new \texttt{hclust} function} has exactly the same calling conventions as the old one. You may just load the package and immediately and effortlessly enjoy the performance improvements. The function is also an improvement to the \texttt{flashClust} function from the \flashClustPack{} package. Just replace every call to \texttt{flashClust} by \hyperref[hclust]{\texttt{hclust}} and expect your code to work as before, only faster.\footnote{If you are using flashClust prior to version 1.01, update it! See the change log for \flashClustPack{} at \url{http://cran.r-project.org/web/packages/flashClust/ChangeLog}.} In case the data includes infinite or NaN values, see \autoref{sec:infnan}. If you need to access the old function or make sure that the right function is called, specify the package as follows: \begin{quote} \texttt{\hyperref[hclust]{fastcluster::hclust}(…)}\\ \texttt{flashClust::hclust(…)}\\ \texttt{stats::hclust(…)} \end{quote} Vector data can be clustered with a memory-saving algorithm with the command: \begin{quote} \texttt{\hyperref[hclust.vector]{hclust.vector}(…)} \end{quote} The following sections contain comprehensive descriptions of these methods. \begin{methods} \item [\normalfont\texttt{\textbf{hclust}}\,(\textit{d, method=\q complete\q, members=NULL})] \phantomsection\label{hclust} \addcontentsline{toc}{subsection}{\texttt{hclust}} Hierarchical, agglomerative clustering on a condensed dissimilarity matrix. This method has the same specifications as the method \hclust{} in the package \stats{} and \texttt{hclust} alias \texttt{flashClust} in the package \flashClustPack{}. In particular, the \print{}, \plot{}, \rect{} and \identify{} methods work as expected. The argument $d$ is a condensed distance matrix, as it is produced by \dist. 
The argument \textit{method} is one of the strings \textit{\q single\q}, \textit{\q complete\q}, \textit{\q average\q}, \textit{\q mcquitty\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward.D\q}, \textit{\q ward.D2\q} or an unambiguous abbreviation thereof.

The argument \textit{members} specifies the sizes of the initial nodes, ie.\ the number of observations in the initial clusters. The default value \texttt{NULL} says that all initial nodes are singletons, ie.\ have size 1. Otherwise, \textit{members} must be a vector whose size is the number of input points. The vector is processed as a \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/double.html}{\texttt{double}} array so that not only integer cardinalities of nodes can be accounted for but also weighted nodes with real weights.

The general scheme of the agglomerative clustering procedure is as follows:
\begin{enumerate}
\item Start with $N$ singleton clusters (nodes) labeled $-1,\ldots, -N$, which represent the input points.
\item Find a pair of nodes with minimal distance among all pairwise distances.
\item Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively $1,2,\ldots$
\item The distances from the new node to all other nodes are determined by the \textit{method} parameter (see below).
\item Repeat $N-1$ times from step 2, until there is one big node, which contains all original input points.
\end{enumerate}

The output of \texttt{hclust} is an object of class \texttt{\q hclust\q} and represents a \emph{stepwise dendrogram}. It contains the following fields:
\begin{description}
\item[\normalfont\textit{merge}] This is an $(N-1)\times 2$ array. Row $i$ specifies the labels of the nodes which are joined in step $i$ of the clustering.
\item[\normalfont\textit{height}] This is a vector of length $N-1$. It contains the sequence of dissimilarities at which every pair of nearest nodes is joined.
\item[\normalfont\textit{order}] This is a vector of length $N$. It contains a permutation of the numbers $1,\ldots,N$ for the \plot{} method. When the dendrogram is plotted, this is the order in which the singleton nodes are plotted as the leaves of a rooted tree. The order is computed so that the dendrogram is plotted without intersections (except the case when there are inversions for the \textit{\q centroid\q} and \textit{\q median\q} methods).

The choice of the \textit{\q order\q} sequence follows the same scheme as the \texttt{stats} package does, only with a faster algorithm. Note that there are many valid choices to order the nodes in a dendrogram without intersections. Also, subsequent points in the \textit{\q order\q} field are not always close in the ultrametric given by the dendrogram.
\item[\normalfont\textit{labels}] This copies the attribute \textit{\q Labels\q} from the first input parameter $d$. It contains the labels for the objects being clustered.
\item[\normalfont\textit{method}] The (unabbreviated) string for the \textit{\q method\q} parameter. See below for a specification of all available methods.
\item[\normalfont\textit{call}] The full command that produced the result. See \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/match.call.html}{\texttt{match.call}}.
\item[\normalfont\textit{dist.method}] The \textit{\q method\q} attribute of the first input parameter $d$. This specifies which metric was used in the \texttt{dist} method which generated the first argument.
\end{description}

The parameter \textit{method} specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes. Denote the dissimilarities by $d$, the nodes to be joined by $I,J$, the new node by $K$ and any other node by $L$. The symbol $|I|$ denotes the size of the cluster $I$.
\begin{description} \item [\normalfont\textit{method=\q single\q}:] $\displaystyle d(K,L) = \min(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the closest distance between any two points in each cluster: \[ d(A,B)=\min_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q complete\q}:] $\displaystyle d(K,L) = \max(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the maximal distance between any two points in each cluster: \[ d(A,B)=\max_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q average\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}$ The distance between two clusters $A,B$ is the average distance between the points in the two clusters: \[ d(A,B)=\frac1{|A||B|}\sum_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q mcquitty\q}:] $\displaystyle d(K,L) = \tfrac12(d(I,L)+d(J,L))$ There is no global description for the distance between clusters since the distance depends on the order of the merging steps. \end{description} The following three methods are intended for Euclidean data only, ie.\ when $X$ contains the pairwise \textbf{squared} distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. \begin{description} \item [\normalfont\textit{method=\q centroid\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}-\frac{|I|\cdot|J|\cdot d(I,J)}{(|I|+|J|)^2}$ There is a geometric interpretation: $d(A,B)$ is the distance between the centroids (ie.\ barycenters) of the clusters in Euclidean space: \[ d(A,B) = \|\vec c_A-\vec c_B\|^2, \] where $\vec c_A$ denotes the centroid of the points in cluster $A$. 
\item [\normalfont\textit{method=\q median\q}:] $\displaystyle d(K,L) = \tfrac12 d(I,L)+\tfrac12 d(J,L)-\tfrac14 d(I,J)$ Define the midpoint $\vec w_K$ of a cluster $K$ iteratively as $\vec w_K=k$ if $K=\{k\}$ is a singleton and as the midpoint $\frac12(\vec w_I+\vec w_J)$ if $K$ is formed by joining $I$ and $J$. Then we have \[ d(A,B)=\|\vec w_A-\vec w_B\|^2 \] in Euclidean space for all nodes $A,B$. Notice however that this distance depends on the order of the merging steps. \item [\normalfont\textit{method=\q ward.D\q}:] $\displaystyle d(K,L) = \frac{(|I|+|L|)\cdot d(I,L)+(|J|+|L|)\cdot d(J,L)-|L|\cdot d(I,J)}{|I|+|J|+|L|}$ The global cluster dissimilarity can be expressed as \[ d(A,B)=\frac{2|A||B|}{|A|+|B|}\cdot\|\vec c_A-\vec c_B\|^2, \] where $\vec c_A$ again denotes the centroid of the points in cluster $A$. \item [\normalfont\textit{method=\q ward.D2\q}:] This is the equivalent of \textit{\q ward.D\q}, but for input consisting of untransformed (in particular: \textbf{non-squared}) Euclidean distances. Internally, all distances are squared first, then method \textit{ward.D} is applied, and finally the square root of all heights in the dendrogram is taken. Thus, global cluster dissimilarity can be expressed as the square root of that for \textit{ward.D}, namely \[ d(A,B)=\sqrt{\frac{2|A||B|}{|A|+|B|}}\cdot\|\vec c_A-\vec c_B\|. \] \end{description} \item [\normalfont\texttt{\textbf{hclust.vector}}\,(\textit{X, method=\q single\q, members=NULL, metric=\q euclidean\q, p=NULL})] \phantomsection\label{hclust.vector} \addcontentsline{toc}{subsection}{\texttt{hclust.vector}} This performs hierarchical, agglomerative clustering on vector data with memory-saving algorithms. While the \hyperref[hclust]{\texttt{hclust}} method requires $\Theta(N^2)$ memory for clustering of $N$ points, this method needs $\Theta(ND)$ for $N$ points in $\mathbb R^D$, which is usually much smaller. 
The argument $X$ must be a two-dimensional matrix with \href{http://stat.ethz.ch/R-manual/R-patched/library/base/html/double.html}{\texttt{double}} precision values. It describes $N$ data points in $\mathbb R^D$ as an $(N\times D)$ matrix. The parameter \textit{\q members\q} is the same as for \hyperref[hclust]{\texttt{hclust}}. The parameter \textit{\q method\q} is one of the strings \textit{\q single\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, or an unambiguous abbreviation thereof. If \textit{method} is \textit{\q single\q}, single linkage clustering is performed on the data points with the metric which is specified by the \textit{metric} parameter. The choices are the same as in the \href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/dist.html}{\texttt{dist}} method: \textit{\q euclidean\q}, \textit{\q maximum\q}, \textit{\q manhattan\q}, \textit{\q canberra\q}, \textit{\q binary\q} and \textit{\q minkowski\q}. Any unambiguous substring can be given. The parameter \textit{p} is used for the \textit{\q minkowski\q} metric only. The call \begin{quote} \texttt{hclust.vector(X, method=\q single\q, metric=[...])} \end{quote} is equivalent to \begin{quote} \texttt{hclust(dist(X, metric=[...]), method=\q single\q)} \end{quote} but uses less memory and is equally fast. Ties may be resolved differently, ie.\ if two pairs of nodes have equal, minimal dissimilarity values at some point, in the specific computer's representation for floating point numbers, either pair may be chosen for the next merging step in the dendrogram. If \textit{method} is one of \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, clustering is performed with respect to Euclidean distances. In this case, the parameter \textit{metric} must be \textit{\q euclidean\q}. 
Notice that \texttt{hclust.vector} operates on Euclidean distances for compatibility reasons with the \dist{} method, while \hyperref[hclust]{\texttt{hclust}} assumes \textbf{squared} Euclidean distances for compatibility with the \href{http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html}{\texttt{stats::hclust}} method! Hence, the call \phantomsection\label{squared} \begin{quote} \texttt{hc = hclust.vector(X, method=\q centroid\q)} \end{quote} is, aside from the lesser memory requirements, equivalent to \begin{quote} \texttt{d = dist(X)}\\ \texttt{hc = hclust(d\textasciicircum 2, method=\q centroid\q)}\\ \texttt{hc\$height = sqrt(hc\$height)} \end{quote} The same applies to the \textit{\q median\q} method. The \textit{\q ward\q} method in \hyperref[hclust.vector]{\texttt{hclust.vector}} is equivalent to \hyperref[hclust]{\texttt{hclust}} with method \textit{\q ward.D2\q}, but to method \textit{\q ward.D\q} only after squaring as above. Differences in these algebraically equivalent methods may arise only from floating-point inaccuracies and the resolution of ties (which may, however, in extreme cases affect the entire clustering result due to the inherently unstable nature of the clustering schemes). 
\end{methods} \section{The Python interface} The fastcluster package is imported as usual by: \begin{quote} \texttt{import fastcluster} \end{quote} It provides the following functions: \begin{quote} \hyperref[linkage]{\texttt{linkage}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, preserve\_input=True})\\ \hyperref[single]{\texttt{single}}\,($X$)\\ \hyperref[complete]{\texttt{complete}}\,($X$)\\ \hyperref[average]{\texttt{average}}\,($X$)\\ \hyperref[weighted]{\texttt{weighted}}\,($X$)\\ \hyperref[ward]{\texttt{ward}}\,($X$)\\ \hyperref[centroid]{\texttt{centroid}}\,($X$)\\ \hyperref[median]{\texttt{median}}\,($X$)\\ \hyperref[linkage_vector]{\texttt{linkage\_vector}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, extraarg=None}) \end{quote} The following sections contain comprehensive descriptions of these methods. \begin{methods} \item [\normalfont\texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, preserve\_input=\q True\q})] \phantomsection\label{linkage} \addcontentsline{toc}{subsection}{\texttt{linkage}} Hierarchical, agglomerative clustering on a condensed dissimilarity matrix or on vector data. Apart from the argument \textit{preserve\_input}, the method has the same input parameters and output format as the function of the same name in the module \hierarchy. The argument $X$ is preferably a \href{http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html}{NumPy array} with floating point entries (\texttt{X.dtype\hskip0pt==\hskip0pt numpy.double}). Any other data format will be converted before it is processed. NumPy's \href{http://docs.scipy.org/doc/numpy/reference/maskedarray.html}{masked arrays} are not treated as special, and the mask is simply ignored. 
If $X$ is a one-dimensional array, it is considered a condensed matrix of pairwise dissimilarities in the format which is returned by \href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html}{\texttt{scipy.spatial.distance.pdist}}. It contains the flattened, upper-triangular part of a pairwise dissimilarity matrix. That is, if there are $N$ data points and the matrix $d$ contains the dissimilarity between the $i$-th and $j$-th observation at position $d_{i,j}$, the vector $X$ has length $\binom N2$ and is ordered as follows: \[ d = \begin{pmatrix} 0&d_{0,1}&d_{0,2}&\ldots&d_{0,n-1}\\ & 0&d_{1,2} & \ldots\\ &&0&\ldots\\ &&&\ddots\\ &&&&0 \end{pmatrix} = \begin{pmatrix} 0&X[0] &X[1]&\ldots&X[n-2]\\ & 0&X[n-1] & \ldots\\ &&0&\ldots\\ &&&\ddots\\ &&&&0 \end{pmatrix} \] The \textit{metric} argument is ignored in case of dissimilarity input. The optional argument \textit{preserve\_input} specifies whether the method makes a working copy of the dissimilarity vector or writes temporary data into the existing array. If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying \textit{preserve\_input=False}. Note that the input array $X$ contains unspecified values after this procedure. It is therefore safer to write \begin{verbatim} linkage(X, method="...", preserve_input=False) del X \end{verbatim} to make sure that the matrix $X$ is not accessed accidentally after it has been used as scratch memory. (The single linkage algorithm does not write to the distance matrix or its copy anyway, so the \textit{preserve\_input} flag has no effect in this case.) If $X$ contains vector data, it must be a two-dimensional array with $N$ observations in $D$ dimensions as an $(N\times D)$ array. The \textit{preserve\_input} argument is ignored in this case. The specified \textit{metric} is used to generate pairwise distances from the input. 
The following two function calls yield equivalent output:
\begin{verbatim}
linkage(pdist(X, metric), method="...", preserve_input=False)
linkage(X, metric=metric, method="...")
\end{verbatim}
The two results are identical in most cases, but differences occur if ties are resolved differently: if the minimum in step 2 below is attained for more than one pair of nodes, either pair may be chosen. It is not guaranteed that both \texttt{linkage} variants choose the same pair in this case.

The general scheme of the agglomerative clustering procedure is as follows:
\begin{enumerate}
\item Start with $N$ singleton clusters (nodes) labeled $0,\ldots, N-1$, which represent the input points.
\item Find a pair of nodes with minimal distance among all pairwise distances.
\item Join the two nodes into a new node and remove the two old nodes. The new nodes are labeled consecutively $N,N+1,\ldots$
\item The distances from the new node to all other nodes are determined by the \textit{method} parameter (see below).
\item Repeat $N-1$ times from step 2, until there is one big node, which contains all original input points.
\end{enumerate}

The output of \texttt{linkage} is a \emph{stepwise dendrogram}, which is represented as an $(N-1)\times 4$ \href{http://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html}{NumPy array} with floating point entries (\texttt{dtype=numpy.double}). The first two columns contain the node indices which are joined in each step. The input nodes are labeled $0,\ldots,N-1$, and the newly generated nodes have the labels $N,\ldots, 2N-2$. The third column contains the distance between the two nodes at each step, ie.\ the current minimal distance at the time of the merge. The fourth column counts the number of points which comprise each new node.

The parameter \textit{method} specifies which clustering scheme to use. The clustering scheme determines the distance from a new node to the other nodes.
Denote the dissimilarities by $d$, the nodes to be joined by $I,J$, the new node by $K$ and any other node by $L$. The symbol $|I|$ denotes the size of the cluster $I$. \begin{description} \item [\normalfont\textit{method=\q single\q}:] $\displaystyle d(K,L) = \min(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the closest distance between any two points in each cluster: \[ d(A,B)=\min_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q complete\q}:] $\displaystyle d(K,L) = \max(d(I,L), d(J,L))$ The distance between two clusters $A,B$ is the maximal distance between any two points in each cluster: \[ d(A,B)=\max_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q average\q}:] $\displaystyle d(K,L) = \frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}$ The distance between two clusters $A,B$ is the average distance between the points in the two clusters: \[ d(A,B)=\frac1{|A||B|}\sum_{a\in A, b\in B}d(a,b) \] \item [\normalfont\textit{method=\q weighted\q}:] $\displaystyle d(K,L) = \tfrac12(d(I,L)+d(J,L))$ There is no global description for the distance between clusters since the distance depends on the order of the merging steps. \end{description} The following three methods are intended for Euclidean data only, ie.\ when $X$ contains the pairwise (non-squared!)\ distances between vectors in Euclidean space. The algorithm will work on any input, however, and it is up to the user to make sure that applying the methods makes sense. 
\begin{description} \item [\normalfont\textit{method=\q centroid\q}:] $\displaystyle d(K,L) = \sqrt{\frac{|I|\cdot d(I,L)+|J|\cdot d(J,L)}{|I|+|J|}-\frac{|I|\cdot|J|\cdot d(I,J)}{(|I|+|J|)^2}}$ There is a geometric interpretation: $d(A,B)$ is the distance between the centroids (ie.\ barycenters) of the clusters in Euclidean space: \[ d(A,B) = \|\vec c_A-\vec c_B\|, \] where $\vec c_A$ denotes the centroid of the points in cluster $A$.\pagebreak[2] \item [\normalfont\textit{method=\q median\q}:] $\displaystyle d(K,L) = \sqrt{\tfrac12 d(I,L)+\tfrac12 d(J,L)-\tfrac14 d(I,J)}$ Define the midpoint $\vec w_K$ of a cluster $K$ iteratively as $\vec w_K=k$ if $K=\{k\}$ is a singleton and as the midpoint $\frac12(\vec w_I+\vec w_J)$ if $K$ is formed by joining $I$ and $J$. Then we have \[ d(A,B)=\|\vec w_A-\vec w_B\| \] in Euclidean space for all nodes $A,B$. Notice however that this distance depends on the order of the merging steps. \item [\normalfont\textit{method=\q ward\q}:] $\displaystyle d(K,L) = \sqrt{\frac{(|I|+|L|)\cdot d(I,L)+(|J|+|L|)\cdot d(J,L)-|L|\cdot d(I,J)}{|I|+|J|+|L|}}$ The global cluster dissimilarity can be expressed as \[ d(A,B)=\sqrt{\frac{2|A||B|}{|A|+|B|}}\cdot\|\vec c_A-\vec c_B\|, \] where $\vec c_A$ again denotes the centroid of the points in cluster $A$. \end{description} \item [\normalfont\texttt{fastcluster.\textbf{single}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{single}}\label{single} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q single\q}). \item [\normalfont\texttt{fastcluster.\textbf{complete}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{complete}}\label{complete} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q complete\q}). 
\item [\normalfont\texttt{fastcluster.\textbf{average}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{average}}\label{average} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q average\q}). \item [\normalfont\texttt{fastcluster.\textbf{weighted}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{weighted}}\label{weighted} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q weighted\q}). \item [\normalfont\texttt{fastcluster.\textbf{centroid}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{centroid}}\label{centroid} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q centroid\q}). \item [\normalfont\texttt{fastcluster.\textbf{median}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{median}}\label{median} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q median\q}). \item [\normalfont\texttt{fastcluster.\textbf{ward}}\,(\textit{X})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{ward}}\label{ward} Alias for \texttt{fastcluster.\textbf{linkage}}\,(\textit{X, method=\q ward\q}). \item [\normalfont\texttt{fastcluster.\textbf{linkage\_vector}}\,(\textit{X, method=\q single\q, metric=\q euclidean\q, extraarg=\q None\q})] \phantomsection\addcontentsline{toc}{subsection}{\texttt{linkage\_vector}}\label{linkage_vector} This performs hierarchical, agglomerative clustering on vector data with memory-saving algorithms. While the \hyperref[linkage]{\texttt{linkage}} method requires $\Theta(N^2)$ memory for clustering of $N$ points, this method needs $\Theta(ND)$ for $N$ points in $\mathbb R^D$, which is usually much smaller. The argument $X$ has the same format as before, when $X$ describes vector data, ie.\ it is an $(N\times D)$ array. Also the output array has the same format. 
The parameter \textit{method} must be one of \textit{\q single\q}, \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, ie.\ memory-saving algorithms currently exist only for these methods. If \textit{method} is one of \textit{\q centroid\q}, \textit{\q median\q}, \textit{\q ward\q}, the \textit{metric} must be \textit{\q euclidean\q}.

Like the \texttt{linkage} method, \texttt{linkage\_vector} does not treat NumPy's \href{http://docs.scipy.org/doc/numpy/reference/maskedarray.html}{masked arrays} as special and simply ignores the mask.

For single linkage clustering, any dissimilarity function may be chosen. Basically, every metric which is implemented in the method \href{http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html}{\texttt{scipy.spatial.distance.pdist}} is reimplemented here. However, the metrics differ in some instances since a number of mistakes and typos (both in the code and in the documentation) were corrected in the \textit{fastcluster} package.\footnote{Hopefully, the SciPy metric will be corrected in future versions and some day coincide with the \textit{fastcluster} definitions. See the bug reports at \url{http://projects.scipy.org/scipy/ticket/1484}, \url{http://projects.scipy.org/scipy/ticket/1486}.} Therefore, the available metrics with their definitions are listed below as a reference.

The symbols $u$ and $v$ mostly denote vectors in $\mathbb R^D$ with coordinates $u_j$ and $v_j$ respectively. See below for additional metrics for Boolean vectors.

Unless otherwise stated, the input array $X$ is converted to a floating point array (\texttt{X.dtype==numpy.double}) if it does not already have the required data type. Some metrics accept Boolean input; in this case this is stated explicitly below.
\begin{description} \item[\normalfont\textit{\q euclidean\q}:] Euclidean metric, $L_2$ norm \[ d(u,v) = \| u-v\|_2 = \sqrt{\sum_j (u_j-v_j)^2} \] \item[\normalfont\textit{\q sqeuclidean\q}:] squared Euclidean metric \[ d(u,v) = \| u-v\|^2_2 = \sum_j (u_j-v_j)^2 \] \item[\normalfont\textit{\q seuclidean\q}:] standardized Euclidean metric \[ d(u,v) = \sqrt{\sum_j (u_j-v_j)^2 /V_j} \] The vector $V=(V_0,\ldots,V_{D-1})$ is given as the \textit{extraarg} argument. If no \textit{extraarg} is given, $V_j$ is by default the unbiased sample variance of all observations in the $j$-th coordinate, $V_j = \Var_i(X_{i,j})=\frac1{N-1}\sum_i(X_{i,j}^2-\mu(X_j)^2)$. (Here, $\mu(X_j)$ denotes as usual the mean of $X_{i,j}$ over all rows $i$.) \item[\normalfont\textit{\q mahalanobis\q}:] Mahalanobis distance \[ d(u,v) = \sqrt{(u-v)^{\mkern-3mu\top}V (u-v)} \] Here, $V=\textit{extraarg}$, a $(D\times D)$-matrix. If $V$ is not specified, the inverse of the covariance matrix \texttt{numpy.linalg.inv(numpy.cov(X, rowvar=False))} is used: \[ (V^{-1})_{j,k} = \frac1{N-1} \sum_i (X_{i,j}-\mu(X_j))(X_{i,k}-\mu(X_k)) \] \item[\normalfont\textit{\q cityblock\q}:] the Manhattan distance, $L_1$ norm \[ d(u,v) = \sum_j |u_j-v_j| \] \item[\normalfont\textit{\q chebychev\q}:] the supremum norm, $L_\infty$ norm \[ d(u,v) = \max_j |u_j-v_j| \] \item[\normalfont\textit{\q minkowski\q}:] the $L_p$ norm \[ d(u,v) = \left(\sum_j |u_j-v_j|^p\right)^{1/p} \] This metric coincides with the \textit{cityblock}, \textit{euclidean} and \textit{chebychev} metrics for $p=1$, $p=2$ and $p=\infty$ (\texttt{numpy.inf}), respectively. The parameter $p$ is given as the \textit{\q extraarg\q} argument. \item[\normalfont\textit{\q cosine\q}] \[ d(u,v) = 1 - \frac{\langle u,v\rangle}{\|u\|\cdot\|v\|} = 1 - \frac{\sum_j u_jv_j}{\sqrt{\sum_j u_j^2\cdot \sum_j v_j^2}} \] \item[\normalfont\textit{\q correlation\q}:] This method first mean-centers the rows of $X$ and then applies the \textit{cosine} distance. 
Equivalently, the \textit{correlation} distance measures $1-{}$\textrm{(Pearson's correlation coefficient)}. \[ d(u,v) = 1 - \frac{\langle u-\mu(u),v-\mu(v)\rangle}{\|u-\mu(u)\|\cdot\|v-\mu(v)\|}, \] \item[\normalfont\textit{\q canberra\q}] \[ d(u,v) = \sum_j\frac{|u_j-v_j|}{|u_j|+|v_j|} \] Summands with $u_j=v_j=0$ contribute 0 to the sum. \item[\normalfont\textit{\q braycurtis\q}] \[ d(u,v) = \frac{\sum_j |u_j-v_j|}{\sum_j |u_j+v_j|} \] \item[\textnormal{(user function):}] The parameter \textit{metric} may also be a function which accepts two NumPy floating point vectors and returns a number. Eg.\ the Euclidean distance could be emulated with \begin{quote} \texttt{fn = lambda u, v: numpy.sqrt(((u-v)*(u-v)).sum())}\\ \texttt{linkage\_vector(X, method=\q single\q, metric=fn)} \end{quote} This method, however, is much slower than the built-in function. \item[\normalfont\textit{\q hamming\q}:] The Hamming distance accepts a Boolean array (\texttt{X.dtype==bool}) for efficient storage. Any other data type is converted to \texttt{numpy.double}. \[ d(u,v) = |\{j\mid u_j\neq v_j\}| \] \item[\normalfont\textit{\q jaccard\q}:] The Jaccard distance accepts a Boolean array (\texttt{X.dtype\hskip0pt ==\hskip0pt bool}) for efficient storage. Any other data type is converted to \texttt{numpy.double}. \[ d(u,v) = \frac{|\{j\mid u_j\neq v_j\}|}{|\{j\mid u_j\neq 0\text{ or } v_j\neq 0\}|} \] \[ d(0,0) = 0 \] Python represents \texttt{True} by 1 and \texttt{False} by 0. In the Boolean case, the Jaccard distance is therefore: \[ d(u,v) = \frac{|\{j\mid u_j\neq v_j\}|}{|\{j\mid u_j\lor v_j\}|} \] \end{description} The following metrics are designed for Boolean vectors. The input array is converted to the \texttt{bool} data type if it is not Boolean already. 
Use the following abbreviations for the entries of a contingency table: \begin{align*} a &= |\{j\mid u_j\land v_j \}| & b &= |\{j\mid u_j\land(\lnot v_j)\}|\\ c &= |\{j\mid (\lnot u_j)\land v_j \}| & d &= |\{j\mid (\lnot u_j)\land(\lnot v_j)\}| \end{align*} Recall that $D$ denotes the number of dimensions, hence $D=a+b+c+d$. \begin{description} \item[\normalfont\textit{\q yule\q}] \[ d(u,v) = \frac{2bc}{ad+bc} \] \item[\normalfont\textit{\q dice\q}] \begin{gather*} d(u,v) = \frac{b+c}{2a+b+c}\\ d(0,0) = 0 \end{gather*} \item[\normalfont\textit{\q rogerstanimoto\q}] \[ d(u,v) = \frac{2(b+c)}{b+c+D} \] \item[\normalfont\textit{\q russellrao\q}] \[ d(u,v) = \frac{b+c+d}{D} \] \item[\normalfont\textit{\q sokalsneath\q}] \begin{gather*} d(u,v) = \frac{2(b+c)}{a+2(b+c)}\\ d(0,0) = 0 \end{gather*} \item[\normalfont\textit{\q kulsinski\q}] \[ d(u,v) = \frac 12\cdot\left(\frac b{a+b} + \frac c{a+c}\right) \] \item[\normalfont\textit{\q matching\q}] \[ d(u,v) = \frac{b+c}{D} \] Notice that when given a Boolean array, the \textit{matching} and \textit{hamming} distance are the same. The \textit{matching} distance formula, however, converts every input to Boolean first. Hence, the vectors $(0,1)$ and $(0,2)$ have zero \textit{matching} distance since they are both converted to $(\mathrm{False}, \mathrm{True})$ but the \textit{hamming} distance is $0.5$. \item[\normalfont\textit{\q sokalmichener\q}] is an alias for \textit{\q matching\q}. \end{description} \end{methods} \section{Behavior for NaN and infinite values}\label{sec:infnan} Whenever the fastcluster package encounters a NaN value as the distance between nodes, either as the initial distance or as an updated distance after some merging steps, it raises an error. This was designed intentionally, even if there might be ways to propagate NaNs through the algorithms in a more or less sensible way. 
Indeed, since the clustering result depends on every single distance value, the presence of NaN values usually indicates a dubious clustering result, and therefore NaN values should be eliminated in preprocessing.\pagebreak[1] In the R interface for vector input, coordinates with {\NA} value are interpreted as missing data and treated in the same way as R's {\dist} function does. This results in valid output whenever the resulting distances are not NaN. The Python interface does not provide any way of handling missing coordinates, and data should be processed accordingly and given as pairwise distances to the clustering algorithms in this case. The fastcluster package handles node distances and coordinates with infinite values correctly, as long as the formulas for the distance updates and the metric (in case of vector input) make sense. In concordance with the statement above, an error is produced if a NaN value results from performing arithmetic with infinity. Also, the usual proviso applies: internal formulas in the code are mathematically equivalent to the formulas as stated in the documentation only for finite, real numbers but might produce different results for $\pm\infty$. Apart from obvious cases like single or complete linkage, it is therefore recommended that users think about how they want infinite values to be treated by the distance update and metric formulas and then check whether the fastcluster code does exactly what they want in these special cases. \section{Differences between the two interfaces} \begin{itemize} \item The \textit{\q mcquitty\q} method in R is called \textit{\q weighted\q} in Python. \item R and SciPy use different conventions for the ``Euclidean'' methods \textit{\q centroid\q}, \textit{\q median\q}! R assumes that the dissimilarity matrix consists of squared Euclidean distances, while SciPy expects non-squared Euclidean distances. 
The fastcluster package respects these conventions and uses different formulas in the two interfaces. The \textit{\q ward\q} method in the Python interface is identical to \textit{\q ward.D2\q} in the R interface. If the same results in both interfaces ought to be obtained, then the \hyperref[hclust]{\texttt{hclust}} function in R must be input the entry-wise square of the distance matrix, \verb!d^2!, for the \textit{\q ward.D\q}, \textit{\q centroid\q} and \textit{\q median\q} methods, and later the square root of the height field in the dendrogram must be taken. The \hyperref[hclust.vector]{\texttt{hclust.vector}} method calculates non-squared Euclidean distances, like R's \dist{} method and identically to the Python interface. See the \hyperref[squared]{example} in the \hyperref[hclust.vector]{\texttt{hclust.vector}} documentation above. For the \textit{\q average\q} and \textit{\q weighted\q} alias \textit{\q mcquitty\q} methods, the same, non-squared distance matrix \texttt{d} as in the Python interface must be used for the same results. The \textit{\q single\q} and \textit{\q complete\q} methods only depend on the relative order of the distances, hence it does not make a difference whether the method operates on the distances or the squared distances. The code example in the R documentation (enter \texttt{?hclust} or \texttt{example(hclust)} in R) contains another instance where the squared distance matrix is generated from Euclidean data. \item The Python interface is not designed to deal with missing values, and NaN values in the vector data raise an error message. The \hyperref[hclust.vector]{\texttt{hclust.vector}} method in the R interface, in contrast, deals with NaN and the (R specific) {\NA} values in the same way as the \dist{} method does. Confer the documentation for \dist{} for details. \end{itemize} \section{References} \begin{trivlist} \item \textit{NumPy: Scientific computing tools for Python}, \url{http://numpy.scipy.org/}. 
\item Eric Jones, Travis Oliphant, Pearu Peterson et al., \textit{SciPy: Open Source Scientific Tools for Python}, 2001, \url{http://www.scipy.org}. \item \textit{R: A Language and Environment for Statistical Computing}, R Foundation for Statistical Computing, Vienna, 2011, \url{http://www.r-project.org}. \end{trivlist} \end{document} %%% Local variables: %%% mode: latex %%% TeX-master: "fastcluster.Rtex" %%% TeX-PDF-mode: t %%% End: fastcluster/MD50000644000176200001440000000261312470743142013127 0ustar liggesusers7f74c6c3764af313e48a09cc141b6727 *DESCRIPTION f999854b6b3aab607fac6eadb564dd88 *INSTALL 59e6750f727695ec043303b3a9f37fa4 *LICENSE da8e9d68585993250a9c29c3e9bff50b *NAMESPACE 62cc71cc39349059f485d6c8bdf63126 *NEWS 5fd601c6a56b9625b79593f71463d50c *R/fastcluster.R 50c4d271555e475a4561b10f60f3e667 *README 82098d0efda6851e7f29b0a289217c95 *build/vignette.rds 459081fd7078ab4eadf2e3ce7e45bab1 *inst/CITATION 1892d8bf745674b836a1c59e68e78200 *inst/doc/fastcluster.Rtex 022319f8f025a0c2091435656d26c30a *inst/doc/fastcluster.pdf e6686a163ad4b71d2da8e903d8a96569 *man/fastcluster.Rd 6ad560d2393f210e52cb0bc4e9fabd68 *man/hclust.Rd 6d4bd49e86326aed71ec5bfba7c82e53 *man/hclust.vector.Rd 97bb0f9bf046e498c47423129fc3691a *src/Makevars 7b8a328733afe582986d5292e9c91278 *src/Makevars.win aef5054bc316771422d6973f2072a8a5 *src/fastcluster.cpp 5021c6f2e1270f7448dc93f2ef5de5e8 *src/fastcluster_R.cpp b9b2b3c57417b5ed19c880fd19b20ca5 *src/fastcluster_python.cpp f53dc20a5d1a35c9c07f552cfa705035 *src/python/fastcluster.py 62a29578aeb9744900ec9faaa038798f *src/python/setup.py 2e104e2472d2dca0ef1229b993e88d56 *src/python/test/nantest.py 3e6cfc5d931e71e7eaa660faee897bee *src/python/test/test.py 44f55e80338781f6d5db099431ad3404 *src/python/test/vectortest.py 7dba0c8af8d88099a7898cf95b049fe9 *tests/test_fastcluster.R 9cbb544a7574e9d55aed550e5f3608a4 *vignettes/Makefile 1892d8bf745674b836a1c59e68e78200 *vignettes/fastcluster.Rtex 
fastcluster/README0000644000176200001440000001413312252103644013472 0ustar liggesusersfastcluster: Fast hierarchical clustering routines for R and Python Copyright © 2011 Daniel Müllner The fastcluster package is a C++ library for hierarchical, agglomerative clustering. It efficiently implements the seven most widely used clustering schemes: single, complete, average, weighted/McQuitty, Ward, centroid and median linkage. The library currently has interfaces to two languages: R and Python/NumPy. Part of the functionality is designed as drop-in replacement for existing routines: “linkage” in the SciPy package “scipy.cluster.hierarchy”, “hclust” in R's “stats” package, and the “flashClust” package. Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain. Moreover, there are memory-saving routines for clustering of vector data, which go beyond what the existing packages provide. See the author's home page for more information, in particular a performance comparison with other clustering packages. The User's manual is the file inst/doc/fastcluster.pdf in the source distribution. The fastcluster package is distributed under the BSD license. See the file LICENSE in the source distribution or . Installation ‾‾‾‾‾‾‾‾‾‾‾‾ See the file INSTALL in the source distribution. Usage ‾‾‾‾‾ 1. R ‾‾‾‾ In R, load the package with the following command: library('fastcluster') The package overwrites the function hclust from the “stats” package (in the same way as the flashClust package does). Please remove any references to the flashClust package in your R files to not accidentally overwrite the hclust function with the flashClust version. The new hclust function has exactly the same calling conventions as the old one. You may just load the package and immediately and effortlessly enjoy the performance improvements. 
The function is also an improvement to the flashClust function from the “flashClust” package. Just replace every call to flashClust by hclust and expect your code to work as before, only faster. (If you are using flashClust prior to version 1.01, update it! See the change log for flashClust: http://cran.r-project.org/web/packages/flashClust/ChangeLog ) If you need to access the old function or make sure that the right function is called, specify the package as follows: fastcluster::hclust(…) flashClust::hclust(…) stats::hclust(…) Vector data can be clustered with a memory-saving algorithm with the command hclust.vector(…) See the User's manual inst/doc/fastcluster.pdf for further details. WARNING ‾‾‾‾‾‾‾ R and Matlab/SciPy use different conventions for the “Ward”, “centroid” and “median” methods. R assumes that the dissimilarity matrix consists of squared Euclidean distances, while Matlab and SciPy expect non-squared Euclidean distances. The fastcluster package respects these conventions and uses different formulas in the two interfaces. If you want the same results in both interfaces, then feed the hclust function in R with the entry-wise square of the distance matrix, D^2, for the “Ward”, “centroid” and “median” methods and later take the square root of the height field in the dendrogram. For the “average” and “weighted” alias “mcquitty” methods, you must still take the same distance matrix D as in the Python interface for the same results. The “single” and “complete” methods only depend on the relative order of the distances, hence it does not make a difference whether the method operates on the distances or the squared distances. The code example in the R documentation (enter ?hclust or example(hclust) in R) contains an instance where the squared distance matrix is generated from Euclidean data. 2. 
Python ‾‾‾‾‾‾‾‾‾ The fastcluster package is imported as usual by import fastcluster It provides the following functions: linkage(X, method='single', metric='euclidean', preserve_input=True) single(X) complete(X) average(X) weighted(X) ward(X) centroid(X) median(X) linkage_vector(X, method='single', metric='euclidean', extraarg=None) The argument X is either a compressed distance matrix or a collection of n observation vectors in d dimensions as an (n×d) array. Apart from the argument preserve_input, the methods have the same input and output as the functions of the same name in the package scipy.cluster.hierarchy. The additional, optional argument preserve_input specifies whether the fastcluster package first copies the distance matrix or writes into the existing array. If the dissimilarities are generated for the clustering step only and are not needed afterward, approximately half the memory can be saved by specifying preserve_input=False. Note that the input array X contains unspecified values after this procedure. You may want to write linkage(X, method='…', preserve_input=False) del X to make sure that the matrix X is not accessed accidentally after it has been used as scratch memory. The method linkage_vector(X, method='single', metric='euclidean', extraarg=None) provides memory-saving clustering for vector data. It also accepts a collection of n observation vectors in d dimensions as an (n×d) array as the first parameter. The parameter 'method' is either 'single', 'ward', 'centroid' or 'median'. The 'ward', 'centroid' and 'median' methods require the Euclidean metric. In case of single linkage, the 'metric' parameter can be chosen from all metrics which are implemented in scipy.spatial.dist.pdist. There may be differences between linkage(scipy.spatial.dist.pdist(X, metric='…')) and linkage_vector(X, metric='…') since there have been made a few corrections compared to the pdist function. 
Please consult the User's manual inst/doc/fastcluster.pdf for comprehensive details.
License: FreeBSD | GPL-2 | file LICENSE URL: http://danifold.net/fastcluster.html Packaged: 2015-02-17 20:01:45 UTC; muellner Author: Daniel Müllner [aut, cph, cre] Maintainer: Daniel Müllner NeedsCompilation: yes Repository: CRAN Date/Publication: 2015-02-17 23:53:22 fastcluster/man/0000755000176200001440000000000012452210521013356 5ustar liggesusersfastcluster/man/hclust.Rd0000644000176200001440000000441012452210501015144 0ustar liggesusers\name{hclust} \alias{hclust} \title{Fast hierarchical, agglomerative clustering of dissimilarity data} \description{ This function implements hierarchical clustering with the same interface as \code{\link[stats:hclust]{hclust}} from the \pkg{\link{stats}} package but with much faster algorithms. } \usage{hclust(d, method="complete", members=NULL)} \arguments{ \item{d}{a dissimilarity structure as produced by \code{dist}.} \item{method}{the agglomeration method to be used. This must be (an unambiguous abbreviation of) one of \code{"single"}, \code{"complete"}, \code{"average"}, \code{"mcquitty"}, \code{"ward.D"}, \code{"ward.D2"}, \code{"centroid"} or \code{"median"}.} \item{members}{\code{NULL} or a vector with length the number of observations.} } \value{An object of class \code{'hclust'}. It encodes a stepwise dendrogram.} \details{See the documentation of the original function \code{\link[stats:hclust]{hclust}} in the \pkg{\link{stats}} package. A comprehensive User's manual \href{http://cran.r-project.org/web/packages/fastcluster/vignettes/fastcluster.pdf}{fastcluster.pdf} is available as a vignette. Get this from the R command line with \code{vignette('fastcluster')}. } \references{\url{http://danifold.net/fastcluster.html}} \author{Daniel Müllner} \seealso{\code{\link{fastcluster}}, \code{\link{hclust.vector}}, \code{\link[stats:hclust]{stats::hclust}}} \examples{# Taken and modified from stats::hclust # # hclust(...) # new method # stats::hclust(...) 
# old method require(fastcluster) require(graphics) hc <- hclust(dist(USArrests), "ave") plot(hc) plot(hc, hang = -1) ## Do the same with centroid clustering and squared Euclidean distance, ## cut the tree into ten clusters and reconstruct the upper part of the ## tree from the cluster centers. hc <- hclust(dist(USArrests)^2, "cen") memb <- cutree(hc, k = 10) cent <- NULL for(k in 1:10){ cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE])) } hc1 <- hclust(dist(cent)^2, method = "cen", members = table(memb)) opar <- par(mfrow = c(1, 2)) plot(hc, labels = FALSE, hang = -1, main = "Original Tree") plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters") par(opar) } \keyword{multivariate} \keyword{cluster} fastcluster/man/fastcluster.Rd0000644000176200001440000000571612452210467016226 0ustar liggesusers\name{fastcluster} \alias{fastcluster} \alias{fastcluster-package} \docType{package} \title{Fast hierarchical, agglomerative clustering routines for R and Python} \description{The \pkg{fastcluster} package provides efficient algorithms for hierarchical, agglomerative clustering. In addition to the R interface, there is also a Python interface to the underlying C++ library, to be found in the source distribution. } \details{The function \code{\link{hclust}} provides clustering when the input is a dissimilarity matrix. A dissimilarity matrix can be computed from vector data by \code{\link{dist}}. The \code{\link{hclust}} function can be used as a drop-in replacement for existing routines: \code{\link[stats:hclust]{stats::hclust}} and \code{\link[flashClust:hclust]{flashClust::hclust}} alias \code{\link[flashClust:flashClust]{flashClust::flashClust}}. Once the fastcluster library is loaded at the beginning of the code, every program that uses hierarchical clustering can benefit immediately and effortlessly from the performance gain When the package is loaded, it overwrites the function \code{\link{hclust}} with the new code. 
The function \code{\link{hclust.vector}} provides memory-saving routines when the input is vector data. Further information: \itemize{ \item R documentation pages: \code{\link{hclust}}, \code{\link{hclust.vector}} \item A comprehensive User's manual: \href{http://cran.r-project.org/web/packages/fastcluster/vignettes/fastcluster.pdf}{fastcluster.pdf}. Get this from the R command line with \code{vignette('fastcluster')}. \item JSS paper: \url{http://www.jstatsoft.org/v53/i09/}. \item See the author's home page for a performance comparison: \url{http://danifold.net/fastcluster.html}. } } \references{\url{http://danifold.net/fastcluster.html}} \author{Daniel Müllner} \seealso{\code{\link{hclust}}, \code{\link{hclust.vector}}} \examples{# Taken and modified from stats::hclust # # hclust(...) # new method # hclust.vector(...) # new method # stats::hclust(...) # old method require(fastcluster) require(graphics) hc <- hclust(dist(USArrests), "ave") plot(hc) plot(hc, hang = -1) ## Do the same with centroid clustering and squared Euclidean distance, ## cut the tree into ten clusters and reconstruct the upper part of the ## tree from the cluster centers. 
hc <- hclust.vector(USArrests, "cen") # squared Euclidean distances hc$height <- hc$height^2 memb <- cutree(hc, k = 10) cent <- NULL for(k in 1:10){ cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE])) } hc1 <- hclust.vector(cent, method = "cen", members = table(memb)) # squared Euclidean distances hc1$height <- hc1$height^2 opar <- par(mfrow = c(1, 2)) plot(hc, labels = FALSE, hang = -1, main = "Original Tree") plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters") par(opar) } \keyword{multivariate} \keyword{cluster} fastcluster/man/hclust.vector.Rd0000644000176200001440000000701412452210521016452 0ustar liggesusers\name{hclust.vector} \alias{hclust.vector} \title{Fast hierarchical, agglomerative clustering of vector data} \description{ This function implements hierarchical, agglomerative clustering with memory-saving algorithms.} \usage{hclust.vector(X, method="single", members=NULL, metric='euclidean', p=NULL)} \arguments{ \item{X}{an \eqn{(N\times D)}{(N×D)} matrix of '\link{double}' values: \eqn{N}{N} observations in \eqn{N}{N} variables.} \item{method}{the agglomeration method to be used. This must be (an unambiguous abbreviation of) one of \code{"single"}, \code{"ward"}, \code{"centroid"} or \code{"median"}.} \item{members}{\code{NULL} or a vector with length the number of observations.} \item{metric}{the distance measure to be used. This must be one of \code{"euclidean"}, \code{"maximum"}, \code{"manhattan"}, \code{"canberra"}, \code{"binary"} or \code{"minkowski"}. Any unambiguous substring can be given.} \item{p}{parameter for the Minkowski metric.} } \details{The function \code{\link{hclust.vector}} provides clustering when the input is vector data. It uses memory-saving algorithms which allow processing of larger data sets than \code{\link{hclust}} does. The \code{"ward"}, \code{"centroid"} and \code{"median"} methods require \code{metric="euclidean"} and cluster the data set with respect to Euclidean distances. 
For \code{"single"} linkage clustering, any dissimilarity measure may be chosen. Currently, the same metrics are implemented as the \code{\link[stats:dist]{dist}} function provides. The call\preformatted{ hclust.vector(X, method='single', metric=[...])} gives the same result as\preformatted{ hclust(dist(X, metric=[...]), method='single')} but uses less memory and is equally fast. For the Euclidean methods, care must be taken since \code{\link{hclust}} expects \bold{squared} Euclidean distances. Hence, the call\preformatted{ hclust.vector(X, method='centroid')} is, aside from the lesser memory requirements, equivalent to\preformatted{ d = dist(X) hc = hclust(d^2, method='centroid') hc$height = sqrt(hc$height)} The same applies to the \code{"median"} method. The \code{"ward"} method in \code{\link{hclust.vector}} is equivalent to \code{\link{hclust}} with method \code{"ward.D2"}, but to method \code{"ward.D"} only after squaring as above. More details are in the User's manual \href{http://cran.r-project.org/web/packages/fastcluster/vignettes/fastcluster.pdf}{fastcluster.pdf}, which is available as a vignette. Get this from the R command line with \code{vignette('fastcluster')}. } \references{\url{http://danifold.net/fastcluster.html}} \author{Daniel Müllner} \seealso{\code{\link{fastcluster}}, \code{\link{hclust}}} \examples{# Taken and modified from stats::hclust ## Perform centroid clustering with squared Euclidean distances, ## cut the tree into ten clusters and reconstruct the upper part of the ## tree from the cluster centers. 
hc <- hclust.vector(USArrests, "cen") # squared Euclidean distances hc$height <- hc$height^2 memb <- cutree(hc, k = 10) cent <- NULL for(k in 1:10){ cent <- rbind(cent, colMeans(USArrests[memb == k, , drop = FALSE])) } hc1 <- hclust.vector(cent, method = "cen", members = table(memb)) # squared Euclidean distances hc1$height <- hc1$height^2 opar <- par(mfrow = c(1, 2)) plot(hc, labels = FALSE, hang = -1, main = "Original Tree") plot(hc1, labels = FALSE, hang = -1, main = "Re-start from 10 clusters") par(opar) } \keyword{multivariate} \keyword{cluster} fastcluster/LICENSE0000644000176200001440000000242112252103564013615 0ustar liggesusersCopyright © 2011, Daniel Müllner All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.