tm/0000755000175100001440000000000014367745152010726 5ustar hornikuserstm/NAMESPACE0000644000175100001440000002017414270254224012135 0ustar hornikusersuseDynLib("tm", .registration = TRUE) importFrom("NLP", "content", "content<-", "meta", "meta<-", "words", "as.Token_Tokenizer", "is.Span_Tokenizer", "Token_Tokenizer", "TaggedTextDocument") importFrom("Rcpp", "evalCpp") importFrom("graphics", "abline", "plot") importFrom("parallel", "parLapply") importFrom("stats", "coef", "cor", "lm", "setNames") importFrom("utils", "download.file", "getS3method", "unzip") importFrom("slam", "as.simple_triplet_matrix", "col_sums", "crossapply_simple_triplet_matrix", "read_stm_MC", "rollup", "row_sums", "simple_triplet_matrix") importFrom("xml2", "read_xml", "xml_contents", "xml_find_all", "xml_missing", "xml_text") export("as.DocumentTermMatrix", "as.TermDocumentMatrix", "as.VCorpus", "Boost_tokenizer", "content_transformer", "Corpus", "DataframeSource", "DirSource", "Docs", "DocumentTermMatrix", "DublinCore", "DublinCore<-", "eoi", "FunctionGenerator", "getElem", "getMeta", "Heaps_plot", "findAssocs", "findFreqTerms", "findMostFreqTerms", "getReaders", "getSources", "getTokenizers", "getTransformations", "inspect", "MC_tokenizer", "nDocs", "nTerms", "PCorpus", "pGetElem", "PlainTextDocument", "read_dtm_Blei_et_al", "read_dtm_MC", "readDataframe", "readDOC", "reader", "readPlain", "readReut21578XML", "readReut21578XMLasPlain", "readRCV1", "readRCV1asPlain", "readPDF", "readTagged", "readXML", "removeNumbers", "removePunctuation", "removeSparseTerms", "removeWords", "scan_tokenizer", "SimpleCorpus", "SimpleSource", "stemCompletion", "stemDocument", "stepNext", "stopwords", "stripWhitespace", "TermDocumentMatrix", "termFreq", "Terms", "tm_filter", "tm_index", "tm_map", "tm_parLapply", "tm_parLapply_engine", "tm_reduce", "tm_term_score", "URISource", "VCorpus", "VectorSource", "WeightFunction", "weightTf", "weightTfIdf", "weightBin", "weightSMART", "writeCorpus", "XMLSource", "XMLTextDocument", "Zipf_plot", "ZipSource") S3method("removeNumbers", "character") S3method("removePunctuation", "character") S3method("removeWords", "character") S3method("stemDocument", "character") S3method("stripWhitespace", "character") S3method("words", "character") S3method("[", "DataframeSource") S3method("[[", "DataframeSource") S3method("getElem", "DataframeSource") S3method("getMeta", "DataframeSource") S3method("pGetElem", "DataframeSource") S3method("[", "DirSource") S3method("[[", "DirSource") S3method("getElem", "DirSource") S3method("pGetElem", "DirSource") S3method("[", "DocumentTermMatrix") S3method("c", "DocumentTermMatrix") S3method("dimnames<-", "DocumentTermMatrix") S3method("findAssocs", "DocumentTermMatrix") S3method("findMostFreqTerms", "DocumentTermMatrix") S3method("inspect", "DocumentTermMatrix") S3method("plot", "DocumentTermMatrix") S3method("print", "DocumentTermMatrix") S3method("t", "DocumentTermMatrix") S3method("tm_term_score", "DocumentTermMatrix") S3method("as.VCorpus", "list") S3method("tm_term_score", "term_frequency") S3method("[", "PCorpus") S3method("[[", "PCorpus") S3method("[[<-", "PCorpus") S3method("as.list", "PCorpus") S3method("content", "PCorpus") S3method("format", "PCorpus") S3method("inspect", "PCorpus") S3method("length", "PCorpus") S3method("meta", "PCorpus") S3method("meta<-", "PCorpus") S3method("names", "PCorpus") S3method("names<-", "PCorpus") S3method("print", "PCorpus", .print_via_format) S3method("TermDocumentMatrix", "PCorpus") S3method("tm_filter", "PCorpus") 
S3method("tm_index", "PCorpus") S3method("tm_map", "PCorpus") S3method("as.character", "PlainTextDocument") S3method("content", "PlainTextDocument") S3method("content<-", "PlainTextDocument") S3method("format", "PlainTextDocument") S3method("meta", "PlainTextDocument") S3method("meta<-", "PlainTextDocument") S3method("print", "PlainTextDocument", .print_via_format) S3method("removeNumbers", "PlainTextDocument") S3method("removePunctuation", "PlainTextDocument") S3method("removeWords", "PlainTextDocument") S3method("stemDocument", "PlainTextDocument") S3method("stripWhitespace", "PlainTextDocument") S3method("tm_term_score", "PlainTextDocument") S3method("words", "PlainTextDocument") S3method("[", "SimpleCorpus") S3method("[[", "SimpleCorpus") S3method("[[<-", "SimpleCorpus") S3method("as.list", "SimpleCorpus") S3method("content", "SimpleCorpus") S3method("format", "SimpleCorpus") S3method("inspect", "SimpleCorpus") S3method("length", "SimpleCorpus") S3method("meta", "SimpleCorpus") S3method("meta<-", "SimpleCorpus") S3method("names", "SimpleCorpus") S3method("print", "SimpleCorpus", .print_via_format) S3method("TermDocumentMatrix", "SimpleCorpus") S3method("tm_filter", "SimpleCorpus") S3method("tm_index", "SimpleCorpus") S3method("tm_map", "SimpleCorpus") S3method("close", "SimpleSource") S3method("eoi", "SimpleSource") S3method("length", "SimpleSource") S3method("open", "SimpleSource") S3method("reader", "SimpleSource") S3method("stepNext", "SimpleSource") S3method("c", "TermDocumentMatrix") S3method("[", "TermDocumentMatrix") S3method("dimnames<-", "TermDocumentMatrix") S3method("findAssocs", "TermDocumentMatrix") S3method("findMostFreqTerms", "TermDocumentMatrix") S3method("inspect", "TermDocumentMatrix") S3method("plot", "TermDocumentMatrix") S3method("print", "TermDocumentMatrix") S3method("t", "TermDocumentMatrix") S3method("tm_term_score", "TermDocumentMatrix") S3method("c", "term_frequency") S3method("findMostFreqTerms", "term_frequency") S3method("c", "TextDocument") S3method("inspect", "TextDocument") S3method("print", "TextDocumentMeta") S3method("[", "URISource") S3method("[[", "URISource") S3method("getElem", "URISource") S3method("pGetElem", "URISource") S3method("[", "VCorpus") S3method("[[", "VCorpus") S3method("[[<-", "VCorpus") S3method("as.list", "VCorpus") S3method("as.VCorpus", "VCorpus") S3method("c", "VCorpus") S3method("content", "VCorpus") S3method("format", "VCorpus") S3method("inspect", "VCorpus") S3method("length", "VCorpus") S3method("meta", "VCorpus") S3method("meta<-", "VCorpus") S3method("names", "VCorpus") S3method("names<-", "VCorpus") S3method("print", "VCorpus", .print_via_format) S3method("TermDocumentMatrix", "VCorpus") S3method("tm_filter", "VCorpus") S3method("tm_index", "VCorpus") S3method("tm_map", "VCorpus") S3method("[", "VectorSource") S3method("[[", "VectorSource") S3method("getElem", "VectorSource") S3method("pGetElem", "VectorSource") S3method("getElem", "XMLSource") S3method("as.character", "XMLTextDocument") S3method("content", "XMLTextDocument") S3method("content<-", "XMLTextDocument") S3method("format", "XMLTextDocument") S3method("meta", "XMLTextDocument") S3method("meta<-", "XMLTextDocument") S3method("print", "XMLTextDocument", .print_via_format) S3method("close", "ZipSource") S3method("getElem", "ZipSource") S3method("open", "ZipSource") S3method("pGetElem", "ZipSource") S3method("TermDocumentMatrix", "default") S3method("as.DocumentTermMatrix", "DocumentTermMatrix") S3method("as.DocumentTermMatrix", "TermDocumentMatrix") 
S3method("as.DocumentTermMatrix", "default") S3method("as.DocumentTermMatrix", "term_frequency") S3method("as.DocumentTermMatrix", "textcnt") S3method("as.TermDocumentMatrix", "TermDocumentMatrix") S3method("as.TermDocumentMatrix", "DocumentTermMatrix") S3method("as.TermDocumentMatrix", "default") S3method("as.TermDocumentMatrix", "term_frequency") S3method("as.TermDocumentMatrix", "textcnt") S3method("Docs", "DocumentTermMatrix") S3method("Docs", "TermDocumentMatrix") S3method("Terms", "DocumentTermMatrix") S3method("Terms", "TermDocumentMatrix") S3method("nDocs", "DocumentTermMatrix") S3method("nDocs", "TermDocumentMatrix") S3method("nTerms", "DocumentTermMatrix") S3method("nTerms", "TermDocumentMatrix") tm/data/0000755000175100001440000000000012315572766011637 5ustar hornikuserstm/data/crude.rda0000644000175100001440000002701314367743046013434 0ustar hornikusers}r#G%͌jggwX F!lB6HCje*A԰P諹¯+?~wNfUD8c{=KFH @Vɓ;'/ƓO4|D/?~BQƧ'~Zok;>Շ0$`iߨnT@{f<6~'^0W(e$Sj8I?_%aF RAfJNSZI64 y'cVOuf(*"rD1FV;ɛ?b^6KnL6O:M<䁴Z"=^cg_-TF M&1Ѵo*"ЭfܚԎNcDZ3sZ̼sF4ײ/-4_mihjS{W^,ӱNSGiۗO"u#IGG>^T? -O.c7Il_㣗gCVnI~ǝ~w>Wݫ@u/{_y뉉W{?Pߝ /;'+o<99 WE+.[{tE>IvYu%P#>lScrz0J{?!Yџ.He&D A"Eo$ <}9pbFbOuL_q&:E$Y#Q*Bɯ8(<]"()IL~ %@DlX&X2jt_eAj`ĆMbMM/84BbY-%L3w!#M)i`zJ26ӟޥHɲć^v{YY- k II,jro ~".Y1E yFէY9}=NrRjwۣBs{ѹ* pJsc sY'pE΢$Onxk=j5QB^HZ'/;6beOibIbLlE3"(c3keIX9_kodžGIr#+Լxw, ^eeɩH cR%f%dɔT,?7t.7ie c.Γͨ#ĉL&`#҇ڻ5Ȩ"5\CȤ5!" ;%F';I|@e/tǏ7f^lWHPGzDG$S631;~+|B֜)E[(&$K_Ĉqa-+Pd| Μ}/7+=g*ߎ4,'ƙ&=lm'aSk:553'DfFB$"!% *o8thdDyhԲc?cL%:"~OtI :oWb/!/ C y[,CK`93`&y 1jFj}K)[[%mZG $Nj|5IlR,1XNNqAKG,'bRl 'l2mglM.M 99ԏyAo8 hsyqg'CȊe\2No@s&6@Xbji@>Yҟz7O>K,ӷ:ݼ ULB0͈D^|I;rD! aZ42Fi񩝤|jq/thPa*% NxʱqX=YX3LsA"d/EG?J21Mas%$ HSb #I÷Vd,aS-HHKgtU+_œd^dfX>ٷto^Vޡ:|9::=n;N?4܏s 7ߜw6m?Vǝo{jWg\YOSmy`x|P΃UwY~?cBVҺy}{sΑ .Qml@"1%)Gf䵟;ר~(w~?$^?tK󰣾~J_.X^0$vW>o7`Y?˲<}?i3}MJIx"+W5f ~%H|25nnD('#8GrDV̰yZVd)_iӤ Y,Z v]ɫKJK&$5܏(<>\vsu^O{Wgwx#1twa}Z?8)2 U0EĦ Zq`fެ&&8<%@e#E^;S Xȋ[ɔd\.[059-ށ#DJ\a'E,>VGEXn}'vt~^'b6s,<}H"Q &K"sĽ@#gfCBDtNV4#Y6JEߔgjcd37),liN1t{g3$3]e p?-(͑o~KeL< *fD% dZ& 3r'g^ui\qQld'#՛t LM)%ȩ+̕^B 12rH? k+y{--Ԧ$ͥmlMYb w^Cda[PwDlfU'!H#{vcspX/X&0w e(җinHV͞i\DWLҳ𬵵B1aNruj$$R)Gab[5)  Ƈ~LHKxSA4gSq YuqMEgNO8.dK!,@\u]G` p4{`׽%4j*.c 1Kf^NB?%ORy|jq X-cbya2`-*c!4d,I8"ZpK<& /64rtBi\$E#M DQ2wIg )[ȆFd>)^^aXl^° ǖ92eZ+T +ۑ'7,Z{֫),E ltP7`I ,S:LEY$ɕ,^i Ѱb+(k,G1NbbPA9 (B(K] Bw, OMx&:@hWɳ-%G6n$0/,ebcۻ.б}p? 75;O8n~ss2T:頋@/Uww}-5~s'xζ~|âψh,o'$I֞!^ Iܜ&YgKc΄8㯐e!H,7k*:?/FɘC^$X5,rAAci qD:3P  ,hm4[l*DQq y^0|Ǒ2u+P&,w7Er @r5@DZY}35vv 6XntU%kq5ZHGPUB)kSI)y681lec +@J)''dk[gj݋TɎnDݐf8_imIotԠ;W^~pg']=>g^!٧ ʦ8,ƣ @AHBG",BJƜ. 
tܞf@H~_$w%6tiKgDOjヶ5LddyUrJZORN%Gm:z]e],k.W*N(Qs@HGDg.ʬO"!2poTKr})2^~}'a-Z7(b,%*?!VY("%#K=s8Y"p]̏BU$R, %&ral69y[y[ոYSn|'O1YF&8ٓ-,n`0/ ;XB%lsi><&Or 8ڳ#b}ȇXNk/XyRKbkgi} gc8ok?i<ƽ:W'uT0P]vNd[{vpgMdI˝YIl Jݡ(S` 61H-(*g=24;U @JcIAu8NYB?Ŵ' SP>Yu6$3<3P 2 n%֑zEvåĵ|B,NCb1JqVDL@[xU`ΒIt;baBueP/AK;J D<=%kRSwߩܐXf)MFkw8U=zһP6&Ǎ.Eq(zi˙Cd8E7B_f2[[j,ݒ﯒`eJVI[7&Zz4ÉiUܤG/@N$|1 #Plbn$5끵.HcH2n_TM''MԯfG6I if;Ð@,獯Φ]c]Ώ[AB&`ep&-kJD`!;Fǥn  ڴӓnA#ΝYܽ"]^L<ȗcLH7=v  jz39Q|cݙ@=np]hr=%UͳY]͍YmFFRBAY -)I?.M+q$%"]& "D:5ϺVGuC賋aPuOޡuoHCu/;wjhwku;X>\QԱw4֑NŀTA%FB|:k wqTP.k#$DHyKƘ ͋gfesBGD4Lɵ><鶠xm[^ *-uerҌY\$D(1;(O!_FNpm3T*Չ-q=/r@[26߾2i"NG҃!!fP#$jI~_VsU~^Dn@-pYm6,ZpX5˺izen[V#%0vzUͅf(7zUKTUz)fl CwVh4) ( :ĻfG_m%.dݰNIm=ǩ_^UQ6^.6{yٿPW yׁeB]gEFunչGs?4PZ4'w>h:BH.g2v!3ccmFݰu cյh՘wdbreO2kVu\Gw43hW 㳩9&]iVoYh?UpR{?Z%UqmyxwWMqÇ+*uwWWuv1vkwkEYk<* CSXZ@my2 D(µI7e" O!E_3A0jEp!.j0ZU?BێLk]MD*F~7h2pWtQ1MOB<;R]DCU ( jJw:|niy{8:)s%-z-nТW%A7ej!PBMc%)A$ ^ko@U]쯸9ᦈ @ 4iSQ2 Ѡ5< ɕ<ҟz6hdI4O4zhh=@M$B h COBi=M14 dAHT$TL"z@''F#FF=4z$ A24$)y2PFhL4 R} 4 nDlrxl㓥ru2Y;AQ,Q:(E@$I.@= P={GL(aT ;+dKPHS*B@A`Ae@`"!~Ɉ*ȨD~͟>Oc. ߡU@Ֆ_z{aĎ 'ʀ  v3r"߼?<>uj)Q*TM]:o7VG"7b*$rQVOf͞Opf:1Qx#V&H1ׯ~v{dev3-ʽ5dY׻Mz 3MC !:G Zyf?6?n9B1?%g6X6 :&iAmtydz9-u~F8|M(79(eb{MzN(0ylCرzb8瓶8=nr5Ęgm3sό~`IFDA[Y`A, ~ U-RPdk+IOD (5B Ô<9ǧ~دBЈV6gjh"֕{ڲ.V1@/'db}w8J&ǔHֳja8s-KdG<ўWmD13U{ߦBNEՌ<;+B(>Cg6]dHׯ?\0׫W/^[nW {t;;O^ڼ$}{aJ+ղt l ru$%'}o{:v'.CEUQ lmV }9f 얛6&bpXJ,x}6o]Uw8 h[BoᑘOyWKo8d`e]3bU.Iz*:X$8~Ң/5yוQ!ϛQc0es vt"yLHg?iӉrQ/}.vynm"&> @]NIJiVkgtx(%t=: |W^►]Dz,PU#Gfrѱ$ y2M8zRxu.ڠy4YvmNz9p(_"XpcۦpmgwK濻k`k_u4dN697o·Ybr<՗H-C>'VfV8D:f[H"nZ b<(=`sҵ\x}7멜^mRZ}w2(m?jV+zTec3_),R'b|$@.疪d{68y(lsmr2Y BŕÖr-ESEeN]~l;v?:yVX]?;G)P3ӵ4gyW_OՆU(ܮ*[mp]B%iB/Rqr*vuTx|t~we'u;f*o!ケ{؇d@ʃÖ uNLl AĿWcƳ\  jM":7-C] LUr():$BjCyS a}ao,s{$Dpb:h&θ2(LS )_\;$+%r]i|g,s =,3I{>?3zMV(/ޛ-X _dC}r]`{ɽŨx腥ntT7CRg!Ir"]iA.4N‘X/bNˤ)S/M\a&*dK.5`%!poϲL'd{K?;ZɉuھnƥǙNB;acEkZtBJ.8gQNkP[r>㡎պ3aG+KXfv}YLpeCde{3D<D#i·_cǞo)̚|/&[K.Sh=r[ǍlK E<q5U}gGu޶:Z0"EJQ$mr:=V;(uXi7 Yh4!ij$;d=9a d#?܉et[2`a}<\G:穈⨐CY::3,H%C}~,;$2'@fj8 X3!ͧqch?~GMPnu#,L:/߰5OO0jKB<bveO'H ;Qy]8X!UI'ͮa$q߻N'C|g$TcTpWM`&ʀpD 4W}s1spF GqR,/1EvɕzwBLYV/M3?3dxU6 >r] Ģ*E9X4)2jKT5{՜Iw/p&wg{'l?`G3e&%1\vF'gλ(]3zȫܩ[4$RrGf\-u] "Apf0ꗈ8ϟsx6K8MdN$VJᚆTc3iEz P,zS,]K8 DDrpwYgWq0>;`@Xv>r w E ਟeLпn[z.:mϷTx.|ܻv;"+?Mf/JNT>J ߈ W:7f(} =y*;~ 6=.Wf\~o4rY]h]Bj]Ww]c5Rtm"dβU[1.[d7 s]1Q!np.O0|vg D|xYZv_qXeKUWֶٝ[Fhb9Ks s6;\~`PF?$$=l >OlR;}mWeOʻ_(=n6^l!vs!Ƶl!҇{vq;o|Z 5 ';s~d$">ϯUq c/v{x#{qM,O;\kTOz^~Aq#!Kϵ-f]-\)Eiv ~^DXJOGs&0RR^:kK^q.姳W op/a~VK;sc̥9N`X֬|]o1B )p@to?D i^[>s99b3/_~7fo|wH\vb'-\2CR3$= ( { yx=Kb) 4$(xLlҫԺSTLO );!?&ր K8;P2obySUA-$Y鯇?;S=CL&SvI7~n;t!:Ec,{ܴXa__on<i?Gݤ ꀘ3gf#XQi8(|7v^g!3C0'{5M"~;'n˫zTڐO D %x "fq}V׶]h]D(<ۯaۊ4LYƭ#;r^dypQpszBvt=32 Fi ߹LRBG҂m] Ah>SI3Y0N@Ł73ڷ1^*I%⮘TQID@x+t~lO_NVӀ\ g/H{`Ε'_?ԠNxPEU'UToYOpH{SϿ8<1]OƔjdWkZ ;,쬡U|v'<54bOzA8԰NЃ$"d)"bI 12 킓 @_;y:~-g@;R}c֩߂@Hxxى+ `1~Hn s8T >5$;LDﭻ>S! 3W~s`ͱYEF؞PGz[[jO٢ &TH~#x9Rs@@zU9I`]2gf RJPU~[І>ΪW18v6<[ QTIl7oV}|LcnR }HDG? 
8z]Z5Er~dAxo~W@ qyZ!$"H/,TE-Ш:-.IOWwxp(\&A"~ 'b:#uO?؟(Pl;X2PNV04Oru(e|fpʀ 5%%=PݬdT'=&D4rl <74{/b2&A큘CKE3kmfZ]=ǰd"!^M(L^z(k'q&' *"wBTBc 1[u;wwdSNz>iHvVyBG(5^ZzWwz JLO#:9"HTAͶ9"= ;^S||{8nu'KZҳ$ / vJNub,ޕrd-Go<Tsqdhx/ci﫸NN?Id)jN$_r.%Zƙ șc%5D%R]xñ߷[{Hk îA`QDD(ւlsPD_/Ϡu5420 T 󁈿U"f QqƳ?fe&NtM÷'Lboj L+|~֋j|xy"2I~EZbQQ!z)}* 5vϨitf MuӿP<ԁ7CA361 ,N<^"H r8ӱ̚k)r7ڈ^Jnh;L%a93 H/ۊf(@:ሼ:VuՕ ; +  ""wB#Y 5!y{h xrlAߢbP 1=E.ٍHo+ F pـ>>tSG1M" *tX5t;q mXWVTV(\J[mZ 7]4Mci]W3Z0!l.6,Wv7…0pD0Sh3.Uf)*J 3&5TG30@*ֳ2-7Lփk4ⒺE*Wmeɫ\\jVn 6JZ7TDLr3B6Pd:QV&Q[KB[Q)Pm8Cw\Cv7bAyK[ )8V'VI-fk2p)'xSpC'(:BJ3 zt/zANt {гY&듣PLqQf%NuB ""ɾk3>{Y1?L Ms* V q%n xl^6Qր,LY8 M*ωp5 jP ֛\;9К1V.FgU\B`Uļ3LRsd@b0z@֩&#Rb8c#;>fȽyoH0@Ye/9A$ %$"]s+EBidc_ٸk -1>0';0\Z8']#xO0Dv  K#`{q밚[QrRN;pBl dS 䱮ZC`iq^+gW;0!IjȒ - `- GNVAj4GsÅqQtp9` Y=<41U_%DWn~mq*nݤ0d 4- i.o ؅PKUӬbΆׄ| &`59]|d8 0$#ٻ`ʼD _h쉕J kBA`_T[g$N]<-UݩajwR~-`iAg& M =u9Qx!ܝmd3dC)=4t\x|ތd=d]Ez4g֬KML T˜S)kf79&@><[J$&eoד=X1idsKDT#[H2Wů?@#a9_dRﵛ%9z(1As pI&[Ēb$'wz]KƑ+4XH5A״ p^U*" vne\3cj|G>8LNن!F>-tOjGc^/8aXSbvtةA<q˙Ap`3CE3db?,;"D@=DMP`>$ j|9= hqϳ]ĕT 1E|wp+U{"3`֪R7Er? 1Cz#ltlIAA96J7~y N Ɍ +J`YCU%@MG F"p@N5, Mپ"]{"My]Gر&\ L'G:B*ŭی'ø8:ͲaG[6׺-qZYU*/UlX(;6!P;3ݎ|;'x|%JM<,699~c<^ u+X67{;K}H! ؘdI 4$X/ʝ$ 5P}=VM" sT @L@ AT|sTX LL ={'!&j&hə\*,'JR?Iä*r;&!g<BRH].i- cY~|& 9Td$Pӊ 5,ޔ/D"a_cqm&,5HѶy{QD`Dd8~4@rjo N^CE^s$0&"âd]A{x>RE`"x*&a1RZ4%RbW-R0FFQ[aܐlA}!R~H!"{$J4d'I( B-$Q@X)AF2AIB-XLCIXEfBB([dP' Ў39#eB"YP<'L2Y1X#4l/`C5λ~SF96Z<͇2MI$QHtg@+hzxBLxC>] ,JPRFqϞnLMagzrj>Vgax]l9Y\ ;Vt[JaS^Gawݪu7F4 A9 $U eF@FS/۝DҍS+˥ #$(>QTA!i%Vo;P硌aU,& DD:D'ϣSMՀ P5N}%6~\ |7sŸ'*:xLD}w)9`[!PE1 Jcg(/_8IGiJgZd0#Au8d&&[ReAޘh>t\ {xܨ)& f9™Lg? ]i{ݲ$VTT[- -BЪ"z2 |SԹ8=G&߄6 l=O҅ۑy'^)}!;Ʊ8ی[aȔ9C&LHͪQ@h $H {:'ES痘O67b Hc3^[Wx+3ˮ11;vBuk{aEPE9ΡV9Z)̔Aހ=ϊ}_y yPCu7  "$"%J%I@?-9tc]Or>DpzCss{Bk~6>HD:}573?m>ʁD`,>hC^_Eg y;昐P  'q-3Ysaɼ|^1`u&Zdۣ(3,-,h]m afE<j4)01`sp DP?p>.[P*c@f#Z 4ӥDCf"8Mg/?xxt_  ) M1y:5xɌ_s6M .3.&%i>?Oq{FNE/ WcD@Ndz8GcS=ݳYL'1;Oo*QP)%deUWN"95W$7#^$L׋ g8`b ^pa>"=>U Ly(%n/scH -x Bsn"LH ꩠ%SJRM,TqĐ59 q=|XJӋwTEmMR$N6^Z]6Re3/Q*.xpjᘈoĪveqa >/A;HP `t#+@vŶzX/_4ư,0ؙNm HPggnNO˰#s!J@].MJ yiXV!6fKB^$v5 % [c,㼖h `EwA[S+P =0R2杰"]lvZ"ܤ)r6uEA˓mnSyDA I! eſ.8tS C>){1 'xCV΢xRpi"e.NBI6Q8 :pz,qwovщuաN{6Jr;@)iNJS;qB5LU/JM2hf&r@sAZw3'\fKs^g  si 2C1 u  1d$BD=1ɺ~ogGpE݃;¢lE38Ztn0 VgR H" Qljfk09yMdǚ,n~ [f:fqp=N$eLlm7d׽,^gyp@j/8/y]7]/$\>g SI$ $ Dfbr2;$=xdzfW(CTcY.uHI6~2fv~e6)+ c^κU.=79#ƈ,qe.ئ+9(آvx)LN>W]t07˞aOS IT J#4Ȁ%JV%%;C_Q='(nސЁmgt냁5 xSzJ͈, A` "$U!"RH@RE"+gr(>Vw.#z 8 Mij leUMCڵɣ5sfK} sc%gYg*$ ,+EQwTE&e{4Qfff8f;365va)%D:lֱLS-,| 2vmZ;lY fXHRB)>7ؓ dRHB'|IڢTekZ:}"NTi7c2a,,\:ƥG<;RnSw>D>n.2őC&{i5Bb Ca- {7u kZЎ.{!<8"80ÿl.F3-קpO=ESYWa.U|8 . 
umt] K7~؁E mF냄6&>cè^ A${/e}zd~,Wyw8 `s7>d[J!$ S&EqJ{ʪHȇ5>4M Ox;vrAU >Q$MCSHC[hԠ30 YFK17hdi [$vT-r`ֱ?>%JI)A9SR5dS,LCn a>sAز ❘W\) ,Uua4ݐ#*N'j2l`BpBĂBP&!kO1DE ;qYQI<12E_ غ**Xλ1H_LbÐ)-}mм=@ igX \4KZûk0!hp AIs,q„ VNJ\a&;ZDAFV@4 c5dl>tGX:^p gJp3U 6wmsG4X}_!i O"Hx֮Qt 0} and 0 otherwise.} \item{"L"}{(log average) is defined as \eqn{\frac{1 + \log_2(\mathit{tf}_{i,j})}{1+\log_2(\mathrm{ave}_{i\in j}(\mathit{tf}_{i,j}))}}.} } The second letter of \code{spec} specifies a weighting schema of document frequencies for \code{m}: \describe{ \item{"n"}{(no) is defined as 1.} \item{"t"}{(idf) is defined as \eqn{\log_2 \frac{N}{\mathit{df}_t}} where \eqn{\mathit{df}_t} denotes how often term \eqn{t} occurs in all documents.} \item{"p"}{(prob idf) is defined as \eqn{\max(0, \log_2(\frac{N - \mathit{df}_t}{\mathit{df}_t}))}.} } The third letter of \code{spec} specifies a schema for normalization of \code{m}: \describe{ \item{"n"}{(none) is defined as 1.} \item{"c"}{(cosine) is defined as \eqn{\sqrt{\mathrm{col\_sums}(m ^ 2)}}.} \item{"u"}{(pivoted unique) is defined as \eqn{\mathit{slope} * \sqrt{\mathrm{col\_sums}(m ^ 2)} + (1 - \mathit{slope}) * \mathit{pivot}} where both \code{slope} and \code{pivot} must be set via named tags in the \code{control} list.} \item{"b"}{(byte size) is defined as \eqn{\frac{1}{\mathit{CharLength}^\alpha}}. The parameter \eqn{\alpha} must be set via the named tag \code{alpha} in the \code{control} list.} } The final result is defined by multiplication of the chosen term frequency component with the chosen document frequency component with the chosen normalization component. } \value{ The weighted matrix. } \references{ Christopher D. Manning and Prabhakar Raghavan and Hinrich Schütze (2008). \emph{Introduction to Information Retrieval}. Cambridge University Press, ISBN 0521865719. } \examples{ data("crude") TermDocumentMatrix(crude, control = list(removePunctuation = TRUE, stopwords = TRUE, weighting = function(x) weightSMART(x, spec = "ntc"))) } tm/man/weightTfIdf.Rd0000644000175100001440000000267613025174645014200 0ustar hornikusers\name{weightTfIdf} \alias{weightTfIdf} \title{Weight by Term Frequency - Inverse Document Frequency} \description{ Weight a term-document matrix by term frequency - inverse document frequency. } \usage{ weightTfIdf(m, normalize = TRUE) } \arguments{ \item{m}{A \code{\link{TermDocumentMatrix}} in term frequency format.} \item{normalize}{A Boolean value indicating whether the term frequencies should be normalized.} } \details{ Formally this function is of class \code{WeightingFunction} with the additional attributes \code{name} and \code{acronym}. \emph{Term frequency} \eqn{\mathit{tf}_{i,j}} counts the number of occurrences \eqn{n_{i,j}} of a term \eqn{t_i} in a document \eqn{d_j}. In the case of normalization, the term frequency \eqn{\mathit{tf}_{i,j}} is divided by \eqn{\sum_k n_{k,j}}. \emph{Inverse document frequency} for a term \eqn{t_i} is defined as \deqn{\mathit{idf}_i = \log_2 \frac{|D|}{|\{d \mid t_i \in d\}|}} where \eqn{|D|} denotes the total number of documents and where \eqn{|\{d \mid t_i \in d\}|} is the number of documents where the term \eqn{t_i} appears. \emph{Term frequency - inverse document frequency} is now defined as \eqn{\mathit{tf}_{i,j} \cdot \mathit{idf}_i}. } \value{ The weighted matrix. } \references{ Gerard Salton and Christopher Buckley (1988). Term-weighting approaches in automatic text retrieval. \emph{Information Processing and Management}, \bold{24}/5, 513--523. 
}
tm/man/WeightFunction.Rd0000644000175100001440000000142612324523350014711 0ustar hornikusers
\name{WeightFunction}
\alias{WeightFunction}
\title{Weighting Function}
\description{
Construct a weighting function for term-document matrices.
}
\usage{
WeightFunction(x, name, acronym)
}
\arguments{
\item{x}{A function which takes a \code{\link{TermDocumentMatrix}} with term
frequencies as input, weights the elements, and returns the weighted matrix.}
\item{name}{A character naming the weighting function.}
\item{acronym}{A character giving an acronym for the name of the weighting
function.}
}
\value{
An object of class \code{WeightFunction} which extends the class
\code{function} representing a weighting function.
}
\examples{
weightCutBin <- WeightFunction(function(m, cutoff) m > cutoff,
                               "binary with cutoff", "bincut")
}
tm/DESCRIPTION0000644000175100001440000000256414367745152012441 0ustar hornikusers
Package: tm
Title: Text Mining Package
Version: 0.7-11
Date: 2023-02-05
Authors@R: c(person("Ingo", "Feinerer", role = c("aut", "cre"),
                    email = "feinerer@logic.at",
                    comment = c(ORCID = "0000-0001-7656-8338")),
             person("Kurt", "Hornik", role = "aut",
                    email = "Kurt.Hornik@R-project.org",
                    comment = c(ORCID = "0000-0003-4198-9911")),
             person("Artifex Software, Inc.", role = c("ctb", "cph"),
                    comment = "pdf_info.ps taken from GPL Ghostscript"))
Depends: R (>= 3.2.0), NLP (>= 0.2-0)
Imports: Rcpp, parallel, slam (>= 0.1-37), stats, tools, utils, graphics, xml2
LinkingTo: BH, Rcpp
Suggests: antiword, filehash, methods, pdftools, Rcampdf, Rgraphviz, Rpoppler, SnowballC, testthat, tm.lexicon.GeneralInquirer
Description: A framework for text mining applications within R.
License: GPL-3
URL: https://tm.r-forge.r-project.org/
Additional_repositories: https://datacube.wu.ac.at
NeedsCompilation: yes
Packaged: 2023-02-05 15:07:18 UTC; hornik
Author: Ingo Feinerer [aut, cre] (<https://orcid.org/0000-0001-7656-8338>),
  Kurt Hornik [aut] (<https://orcid.org/0000-0003-4198-9911>),
  Artifex Software, Inc. [ctb, cph] (pdf_info.ps taken from GPL Ghostscript)
Maintainer: Ingo Feinerer <feinerer@logic.at>
Repository: CRAN
Date/Publication: 2023-02-05 15:25:30 UTC
tm/build/0000755000175100001440000000000014367743045012024 5ustar hornikuserstm/build/vignette.rds0000644000175100001440000000034514367743045014365 0ustar hornikusers
[binary RDS payload (vignette index) omitted; not recoverable as text]
tm/tests/0000755000175100001440000000000013065660374012064 5ustar hornikuserstm/tests/testthat/0000755000175100001440000000000014367745152013730 5ustar hornikuserstm/tests/testthat/test-TermDocumentMatrix.R0000644000175100001440000000406113206514642020611 0ustar hornikusers
context("Term-document matrices")
test_that("construction works", {
  vs <- VectorSource(c("one two two three three three",
                       "This is a short text with a few words"))
  scorpus <- Corpus(vs)
  vcorpus <- VCorpus(vs)
  ms <- TermDocumentMatrix(scorpus)
  mv <- TermDocumentMatrix(vcorpus)
  terms <- c("few", "one", "short", "text", "this", "three", "two", "with", "words")
  docs <- c("1", "2")
  expect_equal(sort(Terms(ms)), terms)
  expect_equal(sort(Terms(mv)), terms)
  expect_equal(Docs(ms), docs)
  expect_equal(Docs(mv), docs)
  m <- matrix(c(0, 1, 0, 0, 0, 3, 2, 0, 0,
                1, 0, 1, 1, 1, 0, 0, 1, 1),
              ncol = 2,
              dimnames = list("Terms" = terms, "Docs" = docs))
  expect_equal(as.matrix(ms[order(Terms(ms)), ]), m)
  expect_equal(as.matrix(mv), m)
})
test_that("construction with control arguments works", {
  vs <- VectorSource("one two two three three three")
  scorpus <- Corpus(vs)
  vcorpus <- VCorpus(vs)
  docs <- "1"
  ctrl <- list(dictionary = c("three", "two", "zero"))
  ms <- TermDocumentMatrix(scorpus, ctrl)
  mv <- TermDocumentMatrix(vcorpus, ctrl)
  m <- matrix(c(3, 2, 0),
              dimnames = list("Terms" = ctrl$dictionary, "Docs" = docs))
  expect_equal(as.matrix(ms[order(Terms(ms)), ]), m)
  expect_equal(as.matrix(mv), m)
})
test_that("zero matrix works", {
  vs <- VectorSource("one two three")
  scorpus <- Corpus(vs)
  vcorpus <- VCorpus(vs)
  ctrl <- list(dictionary = "four", wordLengths = c(1, Inf))
  ms <- TermDocumentMatrix(scorpus, ctrl)
  mv <- TermDocumentMatrix(vcorpus, ctrl)
  m <- matrix(0, dimnames = list("Terms" = ctrl$dictionary, "Docs" = "1"))
  expect_equal(as.matrix(ms), m)
  expect_equal(as.matrix(mv), m)
})
test_that("empty matrix works", {
  docs <- "1"
  ds <- DataframeSource(data.frame(doc_id = docs, text = NA))
  scorpus <- Corpus(ds)
  vcorpus <- VCorpus(ds)
  ms <- TermDocumentMatrix(scorpus)
  mv <- TermDocumentMatrix(vcorpus)
  m <- matrix(numeric(), dimnames = list("Terms" = character(), "Docs" = docs))
  expect_equal(as.matrix(ms), m)
  expect_equal(as.matrix(mv), m)
})
tm/tests/testthat/test-Tokenizer.R0000644000175100001440000000061213206536446016774 0ustar hornikusers
context("Tokenizers")
test_that("scan_tokenizer works with character vectors", {
  tokens <- c("a", "character", "vector", "consisting", "of", "multiple", "elements")
  expect_equal(scan_tokenizer(c(paste0(tokens[1:3], collapse = " "),
                                paste0(tokens[4:5], collapse = " "),
                                paste0(tokens[6:7], collapse = " "))),
               tokens)
})
tm/tests/testthat/test-Transformation.R0000644000175100001440000000130713207271046020023 0ustar hornikusers
context("Transformations")
test_that("removePunctuation works in latin1 locale", {
  if (nzchar(suppressWarnings(Sys.setlocale("LC_CTYPE", "en_US.iso88591")))) {
    id <- c(73L, 108L, 32L, 115L, 39L, 101L, 120L, 112L, 114L, 105L, 109L,
            97L, 105L, 116L, 32L, 101L, 110L, 32L, 117L, 110L, 32L, 108L,
            97L, 110L, 103L, 97L, 103L, 101L, 32L, 99L, 104L, 226L, 116L,
            105L, 233L)
    iu <- intToUtf8(id)
    il <- iconv(iu, from =
"UTF-8", to = "latin1") td <- id[-5L] tu <- intToUtf8(td) tl <- iconv(tu, from = "UTF-8", to = "latin1") expect_equal(removePunctuation(iu), tu) expect_equal(removePunctuation(il), tl) } else skip("latin1 locale not available") }) tm/tests/testthat/test-Source.R0000644000175100001440000000117213110235234016245 0ustar hornikuserscontext("Sources") test_that("DataframeSource works", { txt <- c("First document.", "Second document.") dm1 <- 1:2 dm2 <- letters[1:2] df <- data.frame(doc_id = c("doc_1", "doc_2"), text = txt, dmeta1 = dm1, dmeta2 = dm2, stringsAsFactors = FALSE) ds <- DataframeSource(df) scorpus <- Corpus(ds) vcorpus <- VCorpus(ds) expect_equal(as.character(scorpus[[2]]), as.character(vcorpus[[2]])) expect_equal(as.character(scorpus[[2]]), txt[2]) expect_equal(meta(scorpus), meta(vcorpus)) expect_equal(meta(scorpus), data.frame(dmeta1 = dm1, dmeta2 = dm2, stringsAsFactors = FALSE)) }) tm/tests/testthat.R0000644000175100001440000000006013065660374014043 0ustar hornikuserslibrary(testthat) library(tm) test_check("tm") tm/src/0000755000175100001440000000000014367743046011515 5ustar hornikuserstm/src/init.c0000644000175100001440000000161413572675110012617 0ustar hornikusers#include #include #include SEXP _tm_copyCorpus(SEXP x, SEXP y); SEXP _tm_remove_chars(SEXP x, SEXP which); SEXP _tm_scan(SEXP x, SEXP which); SEXP _tm_tdm(SEXP stringsSEXP, SEXP remove_punctsSEXP, SEXP remove_digitsSEXP, SEXP stopwordsSEXP, SEXP dictionarySEXP, SEXP min_term_freqSEXP, SEXP max_term_freqSEXP, SEXP min_word_lengthSEXP, SEXP max_word_lengthSEXP); SEXP _tm_Boost_Tokenizer(SEXP stringsSEXP); static const R_CallMethodDef CallEntries[] = { {"_tm_copyCorpus", (DL_FUNC) &_tm_copyCorpus, 2}, {"_tm_remove_chars", (DL_FUNC) &_tm_remove_chars, 2}, {"_tm_scan", (DL_FUNC) &_tm_scan, 2}, {"_tm_tdm", (DL_FUNC) &_tm_tdm, 9}, {"_tm_Boost_Tokenizer", (DL_FUNC) &_tm_Boost_Tokenizer, 1}, {NULL, NULL, 0} }; void R_init_tm(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } tm/src/tokenizer.cpp0000644000175100001440000000164113324403054014215 0ustar hornikusers// [[Rcpp::depends(BH)]] #include #include using namespace Rcpp; // [[Rcpp::export]] StringVector Boost_Tokenizer(const StringVector strings) { std::vector tokens; std::vector places; for (unsigned int index = 0; index < strings.size(); index++) { if(StringVector::is_na(strings[index])) { places.push_back(tokens.size()); tokens.push_back(""); continue; } std::string str = std::string(strings(index)); typedef boost::tokenizer > tokenizer; boost::char_separator sep(" \f\n\r\t\v"); tokenizer tok(str, sep); for (tokenizer::iterator it = tok.begin(); it != tok.end(); ++it) { tokens.push_back(*it); } } StringVector y = wrap(tokens); for(unsigned int i = 0; i < places.size(); i++) { y[places[i]] = NA_STRING; } return y; } tm/src/copy.c0000644000175100001440000000015613572675071012634 0ustar hornikusers#include SEXP _tm_copyCorpus(SEXP x, SEXP y) { copyVector(x, y); return R_NilValue; } tm/src/scan.c0000644000175100001440000000501714323476315012602 0ustar hornikusers#include #include /* #include static int is_ascii_space(int c) { return (isspace(c) && isascii(c)); } static int is_space_or_ascii_punct(int c) { return(isspace(c) || (ispunct(c) && isascii(c))); } */ static int is_ascii_space(int c) { static const char *s = " \f\n\r\t\v"; return strchr(s, c) == NULL ? 0 : 1; } static int is_ascii_space_or_punct(int c) { static const char *s = " \f\n\r\t\v!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; return strchr(s, c) == NULL ? 
0 : 1;
}
static SEXP tm_scan_one(SEXP this, int (*test) (int))
{
    SEXP y;
    Rboolean skip;
    int size = 256, i, j, nb = 0, ne = 0, u, v, w;
    int *beg, *end;
    const char *s;
    char c, *t, *p;
    cetype_t e;

    if(this == NA_STRING) {
        return ScalarString(NA_STRING);
    }

    beg = Calloc(size, int);
    end = Calloc(size, int);

    e = getCharCE(this);
    s = CHAR(this);
    i = 0;
    skip = TRUE;
    while((c = *s++) != '\0') {
        if(skip && !test(c)) {
            skip = FALSE;
            if(nb >= size) {
                if(size > INT_MAX / 2) error("too many items");
                size *= 2;
                beg = Realloc(beg, size, int);
                end = Realloc(end, size, int);
            }
            beg[nb] = i;
            nb++;
        } else if(!skip && test(c)) {
            skip = TRUE;
            end[ne] = i - 1;
            ne++;
        }
        i++;
    }
    if(ne < nb)
        end[ne] = i - 1;

    PROTECT(y = NEW_CHARACTER(nb));
    s = CHAR(this);
    v = -1;
    for(i = 0; i < nb; i++) {
        u = beg[i];
        s += (u - v - 1);
        v = end[i];
        w = v - u + 1;
        p = t = (char *) R_alloc(w + 1, sizeof(char));
        for(j = 0; j < w; j++) {
            *t++ = *s++;
        }
        *t = '\0';
        SET_STRING_ELT(y, i, mkCharCE(p, e));
    }

    Free(beg);
    Free(end);

    UNPROTECT(1);
    return y;
}
SEXP _tm_scan(SEXP x, SEXP which)
{
    SEXP y, z, this;
    R_xlen_t i, j, k, nx, ny;
    int w;
    int (*test) (int) = is_ascii_space;

    if(LENGTH(which) > 0) {
        PROTECT(this = AS_INTEGER(which));
        w = INTEGER(this)[0];
        if(w == 1)
            test = is_ascii_space_or_punct;
        UNPROTECT(1);
    }

    nx = LENGTH(x);
    if(nx < 1) return NEW_CHARACTER(0);
    if(nx == 1) return tm_scan_one(STRING_ELT(x, 0), test);

    PROTECT(z = NEW_LIST(nx));
    ny = 0;
    for(i = 0; i < nx; i++) {
        this = tm_scan_one(STRING_ELT(x, i), test);
        SET_VECTOR_ELT(z, i, this);
        ny += LENGTH(this);
    }
    // Now unlist.
    k = 0;
    PROTECT(y = NEW_STRING(ny));
    for(i = 0; i < nx; i++) {
        this = VECTOR_ELT(z, i);
        for(j = 0; j < LENGTH(this); j++, k++)
            SET_STRING_ELT(y, k, STRING_ELT(this, j));
    }
    UNPROTECT(2);
    return y;
}
tm/src/tdm.cpp0000644000175100001440000000602013410131457012764 0ustar hornikusers
// [[Rcpp::depends(BH)]]
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
#include <boost/tokenizer.hpp>
using namespace Rcpp;

static int is_ascii_digit(int c) {
    static const char *s = "0123456789";
    return strchr(s, c) == NULL ? 0 : 1;
}
static int is_ascii_punct(int c) {
    static const char *s = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
    return strchr(s, c) == NULL ?
0 : 1;
}

// [[Rcpp::export]]
List tdm(const StringVector strings,
         const bool remove_puncts,
         const bool remove_digits,
         const std::vector<std::string> stopwords,
         const std::vector<std::string> dictionary,
         const unsigned int min_term_freq,
         const unsigned int max_term_freq,
         const unsigned int min_word_length,
         const unsigned int max_word_length)
{
    unsigned int column = 1;
    // line: term counts for the current document; terms_pos: term -> row index
    std::map<std::string, unsigned int> line, terms_pos;
    std::set<std::string> dict(dictionary.begin(), dictionary.end()),
                          sw(stopwords.begin(), stopwords.end());
    // Simple triplet representation: i = term index, j = document index, v = frequency
    std::vector<unsigned int> i, j, v;
    std::vector<std::string> terms;

    for (unsigned int index = 0; index < strings.size(); index++) {
        std::string s = std::string(strings(index));
        typedef boost::tokenizer<boost::char_separator<char> > tokenizer;
        boost::char_separator<char> sep(" \f\n\r\t\v");
        tokenizer tok(s, sep);
        line.clear();
        for (tokenizer::iterator it = tok.begin(); it != tok.end(); ++it) {
            std::string token = *it;
            if(remove_puncts)
                token.erase(std::remove_if(token.begin(), token.end(),
                                           &is_ascii_punct), token.end());
            if(remove_digits)
                token.erase(std::remove_if(token.begin(), token.end(),
                                           &is_ascii_digit), token.end());
            if ((dict.empty() || dict.count(token)) &&
                min_word_length <= token.length() &&
                token.length() <= max_word_length &&
                !sw.count(token))
                line[token]++;
        }
        for (std::map<std::string, unsigned int>::iterator it = line.begin();
             it != line.end(); ++it) {
            std::string term = it->first;
            unsigned int freq = it->second;
            if (min_term_freq <= freq && freq <= max_term_freq) {
                unsigned int tpt;
                if (!terms_pos.count(term)) {
                    tpt = column++;
                    terms_pos[term] = tpt;
                    terms.push_back(term);
                } else {
                    tpt = terms_pos[term];
                }
                i.push_back(tpt);
                j.push_back(index + 1);
                v.push_back(freq);
            }
        }
    }

    // Dictionary terms never observed still get (empty) rows in the result.
    for (const std::string &term : dictionary)
        if (std::find(terms.begin(), terms.end(), term) == terms.end())
            terms.push_back(term);

    return List::create(Named("i") = i,
                        Named("j") = j,
                        Named("v") = v,
                        Named("terms") = terms);
}
tm/src/remove.c0000644000175100001440000000250114323476227013150 0ustar hornikusers
#include <R.h>
#include <Rdefines.h>
/*
#include <ctype.h>
static int is_ascii_digit(int c) {
    return(isdigit(c) && isascii(c));
}
static int is_ascii_punct(int c) {
    return(ispunct(c) && isascii(c));
}
*/
static int is_ascii_digit(int c) {
    static const char *s = "0123456789";
    return strchr(s, c) == NULL ? 0 : 1;
}
static int is_ascii_punct(int c) {
    static const char *s = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
    return strchr(s, c) == NULL ?
0 : 1;
}
SEXP _tm_remove_chars(SEXP x, SEXP which)
{
    SEXP y, this;
    int n, i, w;
    const char *s;
    char c, *t, *p;
    cetype_t e;
    int (*test) (int) = is_ascii_punct;

    if(LENGTH(which) > 0) {
        PROTECT(this = AS_INTEGER(which));
        w = INTEGER(this)[0];
        if(w == 1)
            test = is_ascii_digit;
        UNPROTECT(1);
    }

    PROTECT(x = AS_CHARACTER(x));
    n = LENGTH(x);
    PROTECT(y = NEW_CHARACTER(n));
    for(i = 0; i < n; i++) {
        this = STRING_ELT(x, i);
        if(this == NA_STRING) {
            SET_STRING_ELT(y, i, NA_STRING);
            continue;
        }
        e = getCharCE(this);
        s = CHAR(this);
        t = p = (char *) R_alloc(strlen(s) + 1, sizeof(char));
        while((c = *s++) != '\0') {
            if(!test(c)) *t++ = c;
        }
        *t = '\0';
        SET_STRING_ELT(y, i, mkCharCE(p, e));
    }
    setAttrib(y, R_NamesSymbol, getAttrib(x, R_NamesSymbol));
    UNPROTECT(2);
    return y;
}
tm/src/RcppExports.cpp0000644000175100001440000000433413404766411014506 0ustar hornikusers
// Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

#include <Rcpp.h>

using namespace Rcpp;

// tdm
List tdm(const StringVector strings, const bool remove_puncts, const bool remove_digits, const std::vector<std::string> stopwords, const std::vector<std::string> dictionary, const unsigned int min_term_freq, const unsigned int max_term_freq, const unsigned int min_word_length, const unsigned int max_word_length);
RcppExport SEXP _tm_tdm(SEXP stringsSEXP, SEXP remove_punctsSEXP, SEXP remove_digitsSEXP, SEXP stopwordsSEXP, SEXP dictionarySEXP, SEXP min_term_freqSEXP, SEXP max_term_freqSEXP, SEXP min_word_lengthSEXP, SEXP max_word_lengthSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< const StringVector >::type strings(stringsSEXP);
    Rcpp::traits::input_parameter< const bool >::type remove_puncts(remove_punctsSEXP);
    Rcpp::traits::input_parameter< const bool >::type remove_digits(remove_digitsSEXP);
    Rcpp::traits::input_parameter< const std::vector<std::string> >::type stopwords(stopwordsSEXP);
    Rcpp::traits::input_parameter< const std::vector<std::string> >::type dictionary(dictionarySEXP);
    Rcpp::traits::input_parameter< const unsigned int >::type min_term_freq(min_term_freqSEXP);
    Rcpp::traits::input_parameter< const unsigned int >::type max_term_freq(max_term_freqSEXP);
    Rcpp::traits::input_parameter< const unsigned int >::type min_word_length(min_word_lengthSEXP);
    Rcpp::traits::input_parameter< const unsigned int >::type max_word_length(max_word_lengthSEXP);
    rcpp_result_gen = Rcpp::wrap(tdm(strings, remove_puncts, remove_digits, stopwords, dictionary, min_term_freq, max_term_freq, min_word_length, max_word_length));
    return rcpp_result_gen;
END_RCPP
}
// Boost_Tokenizer
StringVector Boost_Tokenizer(const StringVector strings);
RcppExport SEXP _tm_Boost_Tokenizer(SEXP stringsSEXP) {
BEGIN_RCPP
    Rcpp::RObject rcpp_result_gen;
    Rcpp::RNGScope rcpp_rngScope_gen;
    Rcpp::traits::input_parameter< const StringVector >::type strings(stringsSEXP);
    rcpp_result_gen = Rcpp::wrap(Boost_Tokenizer(strings));
    return rcpp_result_gen;
END_RCPP
}
tm/vignettes/0000755000175100001440000000000014367743045012735 5ustar hornikuserstm/vignettes/references.bib0000644000175100001440000000131311704521032015510 0ustar hornikusers
@Article{Feinerer_etal_2008,
  author =  {Ingo Feinerer and Kurt Hornik and David Meyer},
  title =   {Text Mining Infrastructure in {R}},
  journal = {Journal of Statistical Software},
  volume =  25,
  number =  5,
  pages =   {1--54},
  month =   {March},
  year =    2008,
  issn =    {1548-7660},
  coden =   {JSSOBK},
  url =     {http://www.jstatsoft.org/v25/i05}
}
@Article{Rnews:Feinerer:2008,
  author =  {Ingo Feinerer},
  title
= {An Introduction to Text Mining in {R}}, journal = {R News}, year = 2008, volume = 8, number = 2, pages = {19--22}, month = oct, url = {http://CRAN.R-project.org/doc/Rnews/}, pdf = {http://CRAN.R-project.org/doc/Rnews/Rnews_2008-2.pdf} } tm/vignettes/tm.Rnw0000644000175100001440000003350513155253051014036 0ustar hornikusers\documentclass[a4paper]{article} \usepackage[margin=2cm]{geometry} \usepackage[utf8]{inputenc} \usepackage[round]{natbib} \usepackage{url} \newcommand{\acronym}[1]{\textsc{#1}} \newcommand{\class}[1]{\mbox{\textsf{#1}}} \newcommand{\code}[1]{\mbox{\texttt{#1}}} \newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}} \newcommand{\proglang}[1]{\textsf{#1}} %% \VignetteIndexEntry{Introduction to the tm Package} \begin{document} <>= library("tm") data("crude") @ \title{Introduction to the \pkg{tm} Package\\Text Mining in \proglang{R}} \author{Ingo Feinerer} \maketitle \section*{Introduction} This vignette gives a short introduction to text mining in \proglang{R} utilizing the text mining framework provided by the \pkg{tm} package. We present methods for data import, corpus handling, preprocessing, metadata management, and creation of term-document matrices. Our focus is on the main aspects of getting started with text mining in \proglang{R}---an in-depth description of the text mining infrastructure offered by \pkg{tm} was published in the \emph{Journal of Statistical Software}~\citep{Feinerer_etal_2008}. An introductory article on text mining in \proglang{R} was published in \emph{R News}~\citep{Rnews:Feinerer:2008}. \section*{Data Import} The main structure for managing documents in \pkg{tm} is a so-called \class{Corpus}, representing a collection of text documents. A corpus is an abstract concept, and there can exist several implementations in parallel. The default implementation is the so-called \class{VCorpus} (short for \emph{Volatile Corpus}) which realizes a semantics as known from most \proglang{R} objects: corpora are \proglang{R} objects held fully in memory. We denote this as volatile since once the \proglang{R} object is destroyed, the whole corpus is gone. Such a volatile corpus can be created via the constructor \code{VCorpus(x, readerControl)}. Another implementation is the \class{PCorpus} which implements a \emph{Permanent Corpus} semantics, i.e., the documents are physically stored outside of \proglang{R} (e.g., in a database), corresponding \proglang{R} objects are basically only pointers to external structures, and changes to the underlying corpus are reflected to all \proglang{R} objects associated with it. Compared to the volatile corpus the corpus encapsulated by a permanent corpus object is not destroyed if the corresponding \proglang{R} object is released. Within the corpus constructor, \code{x} must be a \class{Source} object which abstracts the input location. \pkg{tm} provides a set of predefined sources, e.g., \class{DirSource}, \class{VectorSource}, or \class{DataframeSource}, which handle a directory, a vector interpreting each component as document, or data frame like structures (like \acronym{CSV} files), respectively. Except \class{DirSource}, which is designed solely for directories on a file system, and \class{VectorSource}, which only accepts (character) vectors, most other implemented sources can take connections as input (a character string is interpreted as file path). \code{getSources()} lists available sources, and users can create their own sources. 
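For a quick overview of what ships with the package, both the source and the
reader registries can be queried directly (a small illustrative snippet; the
exact listing depends on the installed \pkg{tm} version):
<<>>=
getSources()
getReaders()
@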
The second argument \code{readerControl} of the corpus constructor has to be a list with the named components \code{reader} and \code{language}. The first component \code{reader} constructs a text document from elements delivered by a source. The \pkg{tm} package ships with several readers (e.g., \code{readPlain()}, \code{readPDF()}, \code{readDOC()}, \ldots). See \code{getReaders()} for an up-to-date list of available readers. Each source has a default reader which can be overridden. E.g., for \code{DirSource} the default just reads in the input files and interprets their content as text. Finally, the second component \code{language} sets the texts' language (preferably using \acronym{ISO} 639-2 codes). In case of a permanent corpus, a third argument \code{dbControl} has to be a list with the named components \code{dbName} giving the filename holding the sourced out objects (i.e., the database), and \code{dbType} holding a valid database type as supported by package \pkg{filehash}. Activated database support reduces the memory demand, however, access gets slower since each operation is limited by the hard disk's read and write capabilities. So e.g., plain text files in the directory \code{txt} containing Latin (\code{lat}) texts by the Roman poet \emph{Ovid} can be read in with following code: <>= txt <- system.file("texts", "txt", package = "tm") (ovid <- VCorpus(DirSource(txt, encoding = "UTF-8"), readerControl = list(language = "lat"))) @ For simple examples \code{VectorSource} is quite useful, as it can create a corpus from character vectors, e.g.: <>= docs <- c("This is a text.", "This another one.") VCorpus(VectorSource(docs)) @ Finally we create a corpus for some Reuters documents as example for later use: <>= reut21578 <- system.file("texts", "crude", package = "tm") reuters <- VCorpus(DirSource(reut21578, mode = "binary"), readerControl = list(reader = readReut21578XMLasPlain)) @ \section*{Data Export} For the case you have created a corpus via manipulating other objects in \proglang{R}, thus do not have the texts already stored on a hard disk, and want to save the text documents to disk, you can simply use \code{writeCorpus()} <>= writeCorpus(ovid) @ which writes a character representation of the documents in a corpus to multiple files on disk. \section*{Inspecting Corpora} Custom \code{print()} methods are available which hide the raw amount of information (consider a corpus could consist of several thousand documents, like a database). \code{print()} gives a concise overview whereas more details are displayed with \code{inspect()}. <<>>= inspect(ovid[1:2]) @ Individual documents can be accessed via \code{[[}, either via the position in the corpus, or via their identifier. <>= meta(ovid[[2]], "id") identical(ovid[[2]], ovid[["ovid_2.txt"]]) @ A character representation of a document is available via \code{as.character()} which is also used when inspecting a document: <>= inspect(ovid[[2]]) lapply(ovid[1:2], as.character) @ \section*{Transformations} Once we have a corpus we typically want to modify the documents in it, e.g., stemming, stopword removal, et cetera. In \pkg{tm}, all this functionality is subsumed into the concept of a \emph{transformation}. Transformations are done via the \code{tm\_map()} function which applies (maps) a function to all elements of the corpus. Basically, all transformations work on single text documents and \code{tm\_map()} just applies them to all documents in a corpus. 
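Before turning to the predefined transformations below, note that custom
transformations are easy to build from any string-processing function. The
following sketch (the helper name \code{toSpace} and the digit pattern are
made up purely for illustration) wraps \code{gsub()} via
\code{content\_transformer()} to blank out all digits:
<<>>=
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
tm_map(reuters, toSpace, "[[:digit:]]+")
@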
\subsection*{Eliminating Extra Whitespace} Extra whitespace is eliminated by: <<>>= reuters <- tm_map(reuters, stripWhitespace) @ \subsection*{Convert to Lower Case} Conversion to lower case by: <<>>= reuters <- tm_map(reuters, content_transformer(tolower)) @ We can use arbitrary character processing functions as transformations as long as the function returns a text document. In this case we use \code{content\_transformer()} which provides a convenience wrapper to access and set the content of a document. Consequently most text manipulation functions from base \proglang{R} can directly be used with this wrapper. This works for \code{tolower()} as used here but also e.g.\ for \code{gsub()} which comes quite handy for a broad range of text manipulation tasks. \subsection*{Remove Stopwords} Removal of stopwords by: <>= reuters <- tm_map(reuters, removeWords, stopwords("english")) @ \subsection*{Stemming} Stemming is done by: <>= tm_map(reuters, stemDocument) @ \section*{Filters} Often it is of special interest to filter out documents satisfying given properties. For this purpose the function \code{tm\_filter} is designed. It is possible to write custom filter functions which get applied to each document in the corpus. Alternatively, we can create indices based on selections and subset the corpus with them. E.g., the following statement filters out those documents having an \code{ID} equal to \code{"237"} and the string \code{"INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE"} as their heading. <<>>= idx <- meta(reuters, "id") == '237' & meta(reuters, "heading") == 'INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE' reuters[idx] @ \section*{Metadata Management} Metadata is used to annotate text documents or whole corpora with additional information. The easiest way to accomplish this with \pkg{tm} is to use the \code{meta()} function. A text document has a few predefined attributes like \code{author} but can be extended with an arbitrary number of additional user-defined metadata tags. These additional metadata tags are individually attached to a single text document. From a corpus perspective these metadata attachments are locally stored together with each individual text document. Alternatively to \code{meta()} the function \code{DublinCore()} provides a full mapping between Simple Dublin Core metadata and \pkg{tm} metadata structures and can be similarly used to get and set metadata information for text documents, e.g.: <>= DublinCore(crude[[1]], "Creator") <- "Ano Nymous" meta(crude[[1]]) @ For corpora the story is a bit more sophisticated. Corpora in \pkg{tm} have two types of metadata: one is the metadata on the corpus level (\code{corpus}), the other is the metadata related to the individual documents (\code{indexed}) in form of a data frame. The latter is often done for performance reasons (hence the named \code{indexed} for indexing) or because the metadata has an own entity but still relates directly to individual text documents, e.g., a classification result; the classifications directly relate to the documents but the set of classification levels forms an own entity. 
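For instance, a tag attached locally to a single document is stored with that
document itself, in contrast to the indexed corpus-level storage just
discussed (a minimal sketch using a made-up tag name):
<<>>=
meta(crude[[1]], "classification") <- "energy"
meta(crude[[1]], "classification")
@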
Both of these cases, corpus-level and indexed document-level metadata, can be handled with \code{meta()}:
<<>>=
meta(crude, tag = "test", type = "corpus") <- "test meta"
meta(crude, type = "corpus")
meta(crude, "foo") <- letters[1:20]
meta(crude)
@
\section*{Standard Operators and Functions}
Many standard operators and functions (\code{[}, \code{[<-}, \code{[[}, \code{[[<-}, \code{c()}, \code{lapply()}) are available for corpora with semantics similar to standard \proglang{R} routines. E.g., \code{c()} concatenates two (or more) corpora. Applied to several text documents it returns a corpus. The metadata is automatically updated if corpora are concatenated (i.e., merged).
\section*{Creating Term-Document Matrices}
A common approach in text mining is to create a term-document matrix from a corpus. In the \pkg{tm} package the classes \class{TermDocumentMatrix} and \class{DocumentTermMatrix} (depending on whether you want terms as rows and documents as columns, or vice versa) employ sparse matrices for corpora. Inspecting a term-document matrix displays a sample, whereas \code{as.matrix()} yields the full matrix in dense format (which can be very memory consuming for large matrices).
<<>>=
dtm <- DocumentTermMatrix(reuters)
inspect(dtm)
@
\section*{Operations on Term-Document Matrices}
Besides the fact that a huge number of \proglang{R} functions (like clustering, classification, etc.) can be applied to this matrix, the package also provides some shortcuts. Imagine we want to find those terms that occur at least five times; then we can use the \code{findFreqTerms()} function:
<<>>=
findFreqTerms(dtm, 5)
@
Or suppose we want to find associations (i.e., terms which correlate) with at least $0.8$ correlation for the term \code{opec}; then we use \code{findAssocs()}:
<<>>=
findAssocs(dtm, "opec", 0.8)
@
Term-document matrices tend to get very big already for normal-sized data sets. Therefore we provide a method to remove \emph{sparse} terms, i.e., terms occurring only in very few documents. Normally, this reduces the matrix dramatically without losing significant relations inherent to the matrix:
<<>>=
inspect(removeSparseTerms(dtm, 0.4))
@
This function call removes those terms for which at least 40 percent of the entries are sparse (i.e., documents in which the term occurs 0 times).
\section*{Dictionary}
A dictionary is a (multi-)set of strings. It is often used to denote relevant terms in text mining. We represent a dictionary with a character vector which may be passed to the \code{DocumentTermMatrix()} constructor as a control argument. Then the created matrix is tabulated against the dictionary, i.e., only terms from the dictionary appear in the matrix. This allows us to restrict the dimension of the matrix a priori and to focus on specific terms for distinct text mining contexts, e.g.,
<<>>=
inspect(DocumentTermMatrix(reuters, list(dictionary = c("prices", "crude", "oil"))))
@
\section*{Performance}
Often you do not need all the generality, modularity and full range of features offered by \pkg{tm}, as this sometimes comes at the price of performance. \class{SimpleCorpus} provides a corpus which is optimized for the most common usage scenario: importing plain texts from files in a directory or directly from a vector in \proglang{R}, preprocessing and transforming the texts, and finally exporting them to a term-document matrix. The aim is to boost performance and minimize memory pressure. It loads all documents into memory, and is designed for medium-sized to large data sets.
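Switching between the two corpus implementations is cheap to try out, since
both accept the same sources. A minimal sketch (object names are made up;
sizes and timings will of course depend on your data and machine):
<<>>=
txts <- rep(c("Some short text.", "Another short text."), 1000)
sc <- SimpleCorpus(VectorSource(txts))
vc <- VCorpus(VectorSource(txts))
object.size(sc)
object.size(vc)
@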
However, \class{SimpleCorpus} operates only under the following constraints:
\begin{itemize}
\item only \code{DirSource} and \code{VectorSource} are supported,
\item no custom readers, i.e., each document is read in and stored as plain text (as a string, i.e., a character vector of length one),
\item transformations applied via \code{tm\_map} must be able to process strings and return strings,
\item no lazy transformations in \code{tm\_map},
\item no meta data for individual documents (i.e., no \code{"local"} in \code{meta()}).
\end{itemize}
\bibliographystyle{abbrvnat}
\bibliography{references}
\end{document}
tm/vignettes/extensions.Rnw0000644000175100001440000002727113177024075015624 0ustar hornikusers
\documentclass[a4paper]{article}
\usepackage[margin=2cm]{geometry}
\usepackage[round]{natbib}
\usepackage{url}
\newcommand{\acronym}[1]{\textsc{#1}}
\newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}}
\newcommand{\proglang}[1]{\textsf{#1}}
\let\code\texttt
%% \VignetteIndexEntry{Extensions}
\begin{document}
<>=
library("tm")
library("xml2")
@
\title{Extensions\\How to Handle Custom File Formats}
\author{Ingo Feinerer}
\maketitle
\section*{Introduction}
The ability to handle custom file formats is a substantial feature in any modern text mining infrastructure. \pkg{tm} has been designed with this aspect in mind from the beginning, and has modular components which allow for extensions. A general explanation of \pkg{tm}'s extension mechanism is given by~\citet[Sec.~3.3]{Feinerer_etal_2008}, with an updated description as follows.
\section*{Sources}
A source abstracts input locations and provides uniform methods for access. Each source must provide implementations of the following interface functions:
\begin{description}
\item[close()] closes the source and returns it,
\item[eoi()] returns \code{TRUE} if the end of input of the source is reached,
\item[getElem()] fetches the element at the current position,
\item[length()] gives the number of elements,
\item[open()] opens the source and returns it,
\item[reader()] returns a default reader for processing elements,
\item[pGetElem()] (optional) retrieves all elements in parallel at once, and
\item[stepNext()] increases the position in the source to the next element.
\end{description}
Retrieved elements must be encapsulated in a list with the named components \code{content} holding the document and \code{uri} pointing to the origin of the document (e.g., a file path or a \acronym{URL}; \code{NULL} if not applicable or unavailable).
Custom sources are required to inherit from the virtual base class \code{Source} and typically do so by extending the functionality provided by the simple reference implementation \code{SimpleSource}. E.g., a simple source which accepts an \proglang{R} vector as input could be defined as
<>=
VecSource <- function(x)
    SimpleSource(length = length(x), content = as.character(x),
                 class = "VecSource")
@
which overrides a few defaults (see \code{?SimpleSource} for defaults) and stores the vector in the \code{content} component.
The functions \code{close()}, \code{eoi()}, \code{open()}, and \code{stepNext()} have reasonable default methods already for the \code{SimpleSource} class: the identity function for \code{open()} and \code{close()}, incrementing a position counter for \code{stepNext()}, and comparing the current position with the number of available elements as claimed by \code{length()} for \code{eoi()}, respectively.
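To make these defaults concrete, they behave roughly like the following sketch
(simplified re-implementations for illustration only, written against the
\code{length} and \code{position} components stored by \code{SimpleSource()};
the actual methods shipped with \pkg{tm} may differ in detail):
<<>>=
my_open     <- function(s) s
my_close    <- function(s) s
my_stepNext <- function(s) { s$position <- s$position + 1; s }
my_eoi      <- function(s) s$position >= s$length
@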
So we only need custom methods for element access:
<>=
getElem.VecSource <- function(x)
    list(content = x$content[x$position], uri = NULL)
pGetElem.VecSource <- function(x)
    lapply(x$content, function(y) list(content = y, uri = NULL))
@
\section*{Readers}
Readers are functions for extracting textual content and metadata out of elements delivered by a source and for constructing a text document. Each reader must accept the following arguments in its signature:
\begin{description}
\item[elem] a list with the named components \code{content} and \code{uri} (as delivered by a source via \code{getElem()} or \code{pGetElem()}),
\item[language] a string giving the language, and
\item[id] a character giving a unique identifier for the created text document.
\end{description}
The element \code{elem} is typically provided by a source whereas the language and the identifier are normally provided by a corpus constructor (for the case that \code{elem\$content} does not give information on these two essential items).
In case a reader expects configuration arguments we can use a function generator. A function generator is indicated by inheriting from class \code{FunctionGenerator} and \code{function}. It allows us to process additional arguments, store them in an environment, return a reader function with the well-defined signature described above, and still be able to access the additional arguments via lexical scoping. All corpus constructors in package \pkg{tm} check the reader function for being a function generator and if so apply it to yield the reader with the expected signature.
E.g., the reader function \code{readPlain()} is defined as
<>=
readPlain <- function(elem, language, id)
    PlainTextDocument(elem$content, id = id, language = language)
@
For examples on readers using the function generator please have a look at \code{?readDOC} or \code{?readPDF}.
However, for many cases it is not necessary to define each detailed aspect of how to extend \pkg{tm}. Typical examples are \acronym{XML} files which are very common but can be rather easily handled via standard conforming \acronym{XML} parsers. The aim of the remainder of this document is to give an overview on how simpler, more user-friendly forms of extension mechanisms can be applied in \pkg{tm}.
\section*{Custom Data Formats}
A general situation is that you have gathered together some information into a tabular data structure (like a data frame or a list matrix) that suffices to describe documents in a corpus. However, you do not have a distinct file format because you extracted the information out of various resources, e.g., as delivered by \code{readtext()} in package \pkg{readtext}. Now you want to use your information to build a corpus which is recognized by \pkg{tm}.
We assume that your information is put together in a data frame. E.g., consider the following example:
<>=
df <- data.frame(doc_id = c("doc 1"    , "doc 2"    , "doc 3"    ),
                 text    = c("content 1", "content 2", "content 3"),
                 title   = c("title 1"  , "title 2"  , "title 3"  ),
                 authors = c("author 1" , "author 2" , "author 3" ),
                 topics  = c("topic 1"  , "topic 2"  , "topic 3"  ),
                 stringsAsFactors = FALSE)
@
We want to map the data frame rows to the relevant entries of a text document. An entry \code{text} in the mapping will be matched to fill the actual content of the text document, \code{doc\_id} will be used as document ID, and all other fields will be used as metadata tags.
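Since \code{DataframeSource} insists on the \code{doc\_id} and \code{text}
columns, a small defensive check (purely illustrative) can catch malformed
input early:
<<>>=
stopifnot(all(c("doc_id", "text") %in% names(df)))
@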
So we can construct a corpus out of the data frame: <<>>= (corpus <- Corpus(DataframeSource(df))) corpus[[1]] meta(corpus[[1]]) @ \section*{Custom XML Sources} Many modern file formats already come in \acronym{XML} format which allows to extract information with any \acronym{XML} conforming parser, e.g., as implemented in \proglang{R} by the \pkg{xml2} package. Now assume we have some custom \acronym{XML} format which we want to access with \pkg{tm}. Then a viable way is to create a custom \acronym{XML} source which can be configured with only a few commands. E.g., have a look at the following example: <>= custom.xml <- system.file("texts", "custom.xml", package = "tm") print(readLines(custom.xml), quote = FALSE) @ As you see there is a top-level tag stating that there is a corpus, and several document tags below. In fact, this structure is very common in \acronym{XML} files found in text mining applications (e.g., both the Reuters-21578 and the Reuters Corpus Volume 1 data sets follow this general scheme). In \pkg{tm} we expect a source to deliver self-contained blocks of information to a reader function, each block containing all information necessary such that the reader can construct a (subclass of a) \code{TextDocument} from it. The \code{XMLSource()} function can now be used to construct a custom \acronym{XML} source. It has three arguments: \begin{description} \item[x] a character giving a uniform resource identifier, \item[parser] a function accepting an \acronym{XML} document (as delivered by \code{read\_xml()} in package \pkg{xml2}) as input and returning a \acronym{XML} elements/nodes (each element/node will then be delivered to the reader as a self-contained block), \item[reader] a reader function capable of turning \acronym{XML} elements/nodes as returned by the parser into a subclass of \code{TextDocument}. \end{description} E.g., a custom source which can cope with our custom \acronym{XML} format could be: <>= mySource <- function(x) XMLSource(x, parser = xml2::xml_children, reader = myXMLReader) @ As you notice in this example we also provide a custom reader function (\code{myXMLReader}). See the next section for details. \section*{Custom XML Readers} As we saw in the previous section we often need a custom reader function to extract information out of \acronym{XML} chunks (typically as delivered by some source). Fortunately, \pkg{tm} provides an easy way to define custom \acronym{XML} reader functions. All you need to do is to provide a so-called \emph{specification}. Let us start with an example which defines a reader function for the file format from the previous section: <>= myXMLReader <- readXML( spec = list(author = list("node", "writer"), content = list("node", "description"), datetimestamp = list("function", function(x) as.POSIXlt(Sys.time(), tz = "GMT")), description = list("node", "@short"), heading = list("node", "caption"), id = list("function", function(x) tempfile()), origin = list("unevaluated", "My private bibliography"), type = list("node", "type")), doc = PlainTextDocument()) @ Formally, \code{readXML()} is the relevant function which constructs an reader. The customization is done via the first argument \code{spec}, the second provides an empty instance of the document which should be returned (augmented with the extracted information out of the \acronym{XML} chunks). The specification must consist of a named list of lists each containing two character vectors. 
The constructed reader will map each list entry to the content or a metadatum of the text document, as specified by the entry's name. The name \code{content} fills the document's content, whereas all other names are mapped to the corresponding metadata entries. Each list entry consists of two components: the first is a character string describing the type of the second, and the second is the specification entry itself (a string or a function, depending on the type). Valid combinations are:
\begin{description}
\item[\code{type = "node", spec = "XPathExpression"}] the XPath (1.0) expression \code{spec} extracts information out of an \acronym{XML} node (as seen for \code{author}, \code{content}, \code{description}, \code{heading}, and \code{type} in our example specification).
\item[\code{type = "function", spec = function(doc) \ldots}] the function \code{spec} is called with the \acronym{XML} document (as delivered by \code{read\_xml()} from package \pkg{xml2}) as its first argument (as seen for \code{datetimestamp} and \code{id}). Note that the function need not actually use the passed document; e.g., for \code{id} we simply create a unique identification string via \code{tempfile()}.
\item[\code{type = "unevaluated", spec = "String"}] the character vector \code{spec} is returned without modification (e.g., \code{origin} in our specification).
\end{description}
Now that we have all we need to cope with our custom file format, we can apply the source and reader function at any place in \pkg{tm} where a source or a reader is expected, respectively. E.g.,
<<>>=
corpus <- VCorpus(mySource(custom.xml))
@
constructs a corpus out of the information in our \acronym{XML} file:
<<>>=
corpus[[1]]
meta(corpus[[1]])
@
\bibliographystyle{abbrvnat}
\bibliography{references}
\end{document}
tm/R/0000755000175100001440000000000014346266014011117 5ustar hornikuserstm/R/utils.R0000644000175100001440000000330213206313007012375 0ustar hornikusers## Helper functions
.print_via_format <- function(x, ...) {
    writeLines(format(x, ...))
    invisible(x)
}

## Efficient alternative to table() proposed by Kurt Hornik
.table <- function(x) {
    u <- sort(unique(x))
    if(!length(u)) return(integer())
    v <- tabulate(match(x, u))
    names(v) <- u
    v
}

.xml_content <- function(doc, spec) {
    switch(spec[[1]],
           node = xml_text(xml_find_all(doc, spec[[2]])),
           "function" = spec[[2]](doc),
           unevaluated = spec[[2]])
}

IETF_Snowball_map <-
    list("danish" = c("da", "dan"),
         "dutch" = c("nl", "nld", "dut"),
         "english" = c("en", "eng"),
         "finnish" = c("fi", "fin"),
         "french" = c("fr", "fra", "fre"),
         "german" = c("de", "deu", "ger"),
         "hungarian" = c("hu", "hun"),
         "italian" = c("it", "ita"),
         "norwegian" = c("no", "nor"),
         "portuguese" = c("pt", "por"),
         "romanian" = c("ro", "ron", "rum"),
         "russian" = c("ru", "rus"),
         "spanish" = c("es", "esl", "spa"),
         "swedish" = c("sv", "swe"),
         ## Have stopwords but no SnowballC stemmer ...
         "catalan" = c("ca", "cat"),
         ## Have SnowballC stemmer but no stopwords ...
"turkish" = c("tr", "tur") ) # Map IETF language tags to languages used by the Snowball stemmer project # http://en.wikipedia.org/wiki/IETF_language_tag map_IETF_Snowball <- local({ codes <- unlist(IETF_Snowball_map, use.names = FALSE) names <- rep.int(names(IETF_Snowball_map), lengths(IETF_Snowball_map)) function(code) { code <- as.character(code) if (identical(code, "") || identical(code, character(0)) || is.na(code)) return("porter") names[charmatch(gsub("-.*", "", code), codes)] } }) tm/R/source.R0000644000175100001440000002016614346266014012547 0ustar hornikusers## Author: Ingo Feinerer ## Sources getSources <- function() c("DataframeSource", "DirSource", "URISource", "VectorSource", "XMLSource", "ZipSource") SimpleSource <- function(encoding = "", length = 0, position = 0, reader = readPlain, ..., class) { if (!is.character(encoding)) stop("invalid encoding") if (!is.numeric(length) || (length < 0)) stop("invalid length entry denoting the number of elements") if (!is.numeric(position)) stop("invalid position") if (!is.function(reader)) stop("invalid default reader") s <- list(encoding = encoding, length = length, position = position, reader = reader, ...) class(s) <- unique(c(class, "SimpleSource", "Source")) s } # A data frame where each row is interpreted as document DataframeSource <- function(x) { stopifnot(all(!is.na(match(c("doc_id", "text"), names(x))))) SimpleSource(length = nrow(x), reader = readDataframe, content = x, class = "DataframeSource") } # A directory with files interpreted as documents DirSource <- function(directory = ".", encoding = "", pattern = NULL, recursive = FALSE, ignore.case = FALSE, mode = "text") { if (!identical(mode, "text") && !identical(mode, "binary") && !identical(mode, "")) stop(sprintf("invalid mode '%s'", mode)) d <- dir(directory, full.names = TRUE, pattern = pattern, recursive = recursive, ignore.case = ignore.case) if (!length(d)) stop("empty directory") isfile <- !file.info(d)[["isdir"]] if (any(is.na(isfile))) stop("non-existent or non-readable file(s): ", paste(d[is.na(isfile)], collapse = " ")) SimpleSource(encoding = encoding, length = sum(isfile), mode = mode, filelist = d[isfile], class = "DirSource") } # Documents identified by a Uniform Resource Identifier URISource <- function(x, encoding = "", mode = "text") { if (!identical(mode, "text") && !identical(mode, "binary") && !identical(mode, "")) stop(sprintf("invalid mode '%s'", mode)) SimpleSource(encoding = encoding, length = length(x), mode = mode, uri = x, class = "URISource") } # A vector where each component is interpreted as document VectorSource <- function(x) SimpleSource(length = length(x), content = x, class = "VectorSource") XMLSource <- function(x, parser = xml_contents, reader) { xmldoc <- read_xml(x) content <- parser(xmldoc) SimpleSource(length = length(content), reader = reader, content = content, uri = x, class = "XMLSource") } # A ZIP file with its compressed files interpreted as documents ZipSource <- function(zipfile, pattern = NULL, recursive = FALSE, ignore.case = FALSE, mode = "text") { if (!identical(mode, "text") && !identical(mode, "binary") && !identical(mode, "")) stop(sprintf("invalid mode '%s'", mode)) SimpleSource(exdir = NULL, files = NULL, mode = mode, pattern = pattern, recursive = recursive, ignore.case = ignore.case, zipfile = zipfile, class = "ZipSource") } # tau:::read_all_bytes read_all_bytes <- function(con, chunksize = 2 ^ 16) { if (is.character(con)) { return(readBin(con, raw(), file.info(con)$size)) } if (!isOpen(con)) { open(con, "rb") 
on.exit(close(con)) } bytes <- list() repeat { chunk <- readBin(con, raw(), chunksize) bytes <- c(bytes, list(chunk)) if (length(chunk) < chunksize) break } unlist(bytes) } readContent <- function(x, encoding, mode) { if (identical(mode, "text")) iconv(readLines(x, warn = FALSE), encoding, "UTF-8", "byte") else if (identical(mode, "binary")) read_all_bytes(x) else if (identical(mode, "")) NULL else stop("invalid mode") } open.SimpleSource <- close.SimpleSource <- function(con, ...) con open.ZipSource <- function(con, ...) { x <- con exdir <- tempfile("ZipSource") dir.create(exdir, mode = "0700") destfile <- x$zipfile if (!file.exists(destfile)) { destfile <- tempfile() download.file(x$zipfile, destfile) on.exit(file.remove(destfile)) } files <- unzip(destfile, list = TRUE) ## Directories have length 0 files <- files[files$Length > 0, "Name"] ## Idea: Subdirectories contain file separators if (!x$recursive) files <- files[!grepl(.Platform$file.sep, files, fixed = TRUE)] ## Idea: pattern and ignore.case refer to the file name (like basename) ## Cf. also ?dir if (!is.null(x$pattern)) files <- files[grepl(x$pattern, files, ignore.case = x$ignore.case)] unzip(destfile, files, exdir = exdir) x$exdir <- exdir x$files <- files x$length <- length(files) x } close.ZipSource <- function(con, ...) { x <- con if (!is.null(x$exdir)) { unlink(x$exdir, recursive = TRUE) x$exdir <- NULL x$files <- NULL x$length <- 0 } x } eoi <- function(x) UseMethod("eoi", x) eoi.SimpleSource <- function(x) x$length <= x$position getElem <- function(x) UseMethod("getElem", x) getElem.DataframeSource <- function(x) list(content = x$content[x$position, ], uri = NULL) getElem.DirSource <- function(x) { filename <- x$filelist[x$position] list(content = readContent(filename, x$encoding, x$mode), uri = paste0("file://", filename)) } getElem.URISource <- function(x) list(content = readContent(x$uri[x$position], x$encoding, x$mode), uri = x$uri[x$position]) getElem.VectorSource <- function(x) list(content = x$content[x$position], uri = NULL) getElem.XMLSource <- function(x) list(content = x$content[[x$position]], uri = x$uri) getElem.ZipSource <- function(x) { path <- file.path(x$exdir, x$files[x$position]) list(content = readContent(path, x$encoding, x$mode), uri = paste0("file://", path)) } getMeta <- function(x) UseMethod("getMeta", x) getMeta.DataframeSource <- function(x) list(cmeta = NULL, dmeta = x$content[, is.na(match(names(x$content), c("doc_id", "text"))), drop = FALSE]) length.SimpleSource <- function(x) x$length pGetElem <- function(x) UseMethod("pGetElem", x) pGetElem.DataframeSource <- function(x) tm_parLapply(seq_len(x$length), function(y) list(content = x$content[y, ], uri = NULL)) `[.DataframeSource` <- function(x, i, j, ...) x$content[i, j, ...] `[[.DataframeSource` <- function(x, ...) x$content[[...]] pGetElem.DirSource <- function(x) tm_parLapply(x$filelist, function(f) list(content = readContent(f, x$encoding, x$mode), uri = paste0("file://", f))) `[.DirSource` <- function(x, i, ...) x$filelist[i, ...] `[[.DirSource` <- function(x, i, ...) x$filelist[[i, ...]] pGetElem.URISource <- function(x) tm_parLapply(x$uri, function(uri) list(content = readContent(uri, x$encoding, x$mode), uri = uri)) `[.URISource` <- function(x, i, ...) x$uri[i, ...] `[[.URISource` <- function(x, i, ...) x$uri[[i, ...]] pGetElem.VectorSource <- function(x) tm_parLapply(x$content, function(y) list(content = y, uri = NULL)) `[.VectorSource` <- function(x, i, ...) x$content[i, ...] `[[.VectorSource` <- function(x, i, ...) 
x$content[[i, ...]] pGetElem.ZipSource <- function(x) tm_parLapply(file.path(x$exdir, x$files), function(f) list(content = readContent(f, x$encoding, x$mode), uri = paste0("file://", f))) reader <- function(x) UseMethod("reader", x) reader.SimpleSource <- function(x) x$reader stepNext <- function(x) UseMethod("stepNext", x) stepNext.SimpleSource <- function(x) { x$position <- x$position + 1 x } tm/R/hpc.R0000644000175100001440000000075413037140514012012 0ustar hornikuserstm_parLapply_engine <- local({ val <- NULL ## Could do some checking on new if given: should inherit from ## "cluster" or have formals (X, FUN, ...). function(new) { if (missing(new)) val else val <<- new } }) tm_parLapply <- function(X, FUN, ...) { engine <- tm_parLapply_engine() if (inherits(engine, "cluster")) parLapply(engine, X, FUN, ...) else if (is.function(engine)) engine(X, FUN, ...) else lapply(X, FUN, ...) } tm/R/doc.R0000644000175100001440000000577713177022574012031 0ustar hornikusersc.TextDocument <- function(..., recursive = FALSE) { args <- list(...) x <- args[[1L]] if (length(args) == 1L) return(x) if (!all(unlist(lapply(args, inherits, class(x))))) stop("not all arguments are text documents") v <- list(content = args, meta = CorpusMeta(), dmeta = data.frame(row.names = seq_along(args))) class(v) <- c("VCorpus", "Corpus") v } .format_TextDocument <- function(x, ...) c(sprintf("<<%s>>", class(x)[1L]), sprintf("Metadata: %d", length(meta(x)))) inspect.TextDocument <- function(x) { print(x) cat("\n") writeLines(as.character(x)) invisible(x) } PlainTextDocument <- function(x = character(0), author = character(0), datetimestamp = as.POSIXlt(Sys.time(), tz = "GMT"), description = character(0), heading = character(0), id = character(0), language = character(0), origin = character(0), ..., meta = NULL, class = NULL) { p <- list(content = as.character(x), meta = TextDocumentMeta(author, datetimestamp, description, heading, id, language, origin, ..., meta = meta)) class(p) <- unique(c(class, "PlainTextDocument", "TextDocument")) p } as.character.PlainTextDocument <- function(x, ...) content(x) content.PlainTextDocument <- function(x) x$content `content<-.PlainTextDocument` <- function(x, value) { x$content <- as.character(value) x } format.PlainTextDocument <- function(x, ...) c(.format_TextDocument(x), sprintf("Content: chars: %d", sum(nchar(x$content)))) meta.PlainTextDocument <- function(x, tag = NULL, ...) if (is.null(tag)) x$meta else x$meta[[tag]] `meta<-.PlainTextDocument` <- function(x, tag = NULL, ..., value) { if (is.null(tag)) x$meta <- value else x$meta[[tag]] <- value x } words.character <- words.PlainTextDocument <- function(x, ...) scan_tokenizer(x) XMLTextDocument <- function(x = xml_missing(), author = character(0), datetimestamp = as.POSIXlt(Sys.time(), tz = "GMT"), description = character(0), heading = character(0), id = character(0), language = character(0), origin = character(0), ..., meta = NULL) { d <- list(content = x, meta = TextDocumentMeta(author, datetimestamp, description, heading, id, language, origin, ..., meta = meta)) class(d) <- c("XMLTextDocument", "TextDocument") d } as.character.XMLTextDocument <- function(x, ...) 
xml_text(content(x)) content.XMLTextDocument <- function(x) x$content `content<-.XMLTextDocument` <- function(x, value) { x$content <- value x } format.XMLTextDocument <- .format_TextDocument meta.XMLTextDocument <- meta.PlainTextDocument `meta<-.XMLTextDocument` <- `meta<-.PlainTextDocument` tm/R/corpus.R0000644000175100001440000002216413754747777012610 0ustar hornikusers# Author: Ingo Feinerer Corpus <- function(x, readerControl = list(reader = reader(x), language = "en")) { stopifnot(inherits(x, "Source")) readerControl <- prepareReader(readerControl, reader(x)) if ( (inherits(x, "DataframeSource") || inherits(x, "DirSource") || inherits(x, "VectorSource") ) && identical(readerControl$reader, reader(x))) SimpleCorpus(x, readerControl) else VCorpus(x, readerControl) } PCorpus <- function(x, readerControl = list(reader = reader(x), language = "en"), dbControl = list(dbName = "", dbType = "DB1")) { stopifnot(inherits(x, "Source")) readerControl <- prepareReader(readerControl, reader(x)) if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType)) stop("error in creating database") db <- filehash::dbInit(dbControl$dbName, dbControl$dbType) x <- open(x) tdl <- vector("list", length(x)) counter <- 1 while (!eoi(x)) { x <- stepNext(x) elem <- getElem(x) doc <- readerControl$reader(elem, readerControl$language, as.character(counter)) filehash::dbInsert(db, meta(doc, "id"), doc) tdl[[counter]] <- meta(doc, "id") counter <- counter + 1 } x <- close(x) cmeta <- CorpusMeta() dmeta <- data.frame(row.names = seq_along(tdl)) ## Check if metadata retrieval is supported for(cl in class(x)) { if (is.function(getS3method("getMeta", cl, TRUE))) { m <- getMeta(x) if (!is.null(m$cmeta)) cmeta <- m$cmeta if (!is.null(m$dmeta)) dmeta <- m$dmeta break } } p <- list(content = tdl, meta = cmeta, dmeta = dmeta, dbcontrol = dbControl) class(p) <- c("PCorpus", "Corpus") p } SimpleCorpus <- function(x, control = list(language = "en")) { stopifnot(inherits(x, "Source")) if (!is.null(control$reader) && !identical(control$reader, reader(x))) warning("custom reader is ignored") content <- if (inherits(x, "VectorSource")) { if (is.character(x$content)) x$content else as.character(x$content) } else if (inherits(x, "DirSource")) { setNames(as.character( lapply(x$filelist, function(f) paste(readContent(f, x$encoding, "text"), collapse = "\n")) ), basename(x$filelist)) } else if (inherits(x, "DataframeSource")) { setNames(as.character(x$content[, "text"]), x$content[, "doc_id"]) } else stop("unsupported source type") dmeta <- if (inherits(x, "DataframeSource")) x$content[, is.na(match(names(x$content), c("doc_id", "text"))), drop = FALSE] else data.frame(row.names = seq_along(x)) s <- list(content = content, meta = CorpusMeta(language = control$language), dmeta = dmeta) class(s) <- c("SimpleCorpus", "Corpus") s } VCorpus <- function(x, readerControl = list(reader = reader(x), language = "en")) { stopifnot(inherits(x, "Source")) readerControl <- prepareReader(readerControl, reader(x)) x <- open(x) tdl <- vector("list", length(x)) ## Check for parallel element access found <- FALSE for(cl in class(x)) { if (is.function(getS3method("pGetElem", cl, TRUE))) { tdl <- mapply(function(elem, id) readerControl$reader(elem, readerControl$language, id), pGetElem(x), id = as.character(seq_along(x)), SIMPLIFY = FALSE) found <- TRUE break } } if(!found) { counter <- 1 while (!eoi(x)) { x <- stepNext(x) elem <- getElem(x) doc <- readerControl$reader(elem, readerControl$language, as.character(counter)) tdl[[counter]] <- doc counter <- counter 
+ 1 } } x <- close(x) cmeta <- CorpusMeta() dmeta <- data.frame(row.names = seq_along(tdl)) ## Check if metadata retrieval is supported for(cl in class(x)) { if (is.function(getS3method("getMeta", cl, TRUE))) { m <- getMeta(x) if (!is.null(m$cmeta)) cmeta <- m$cmeta if (!is.null(m$dmeta)) dmeta <- m$dmeta break } } v <- as.VCorpus(tdl) v$meta <- cmeta v$dmeta <- dmeta v } `[.PCorpus` <- `[.SimpleCorpus` <- function(x, i) { if (!missing(i)) { x$content <- x$content[i] x$dmeta <- x$dmeta[i, , drop = FALSE] } x } `[.VCorpus` <- function(x, i) { if (!missing(i)) { x$content <- x$content[i] x$dmeta <- x$dmeta[i, , drop = FALSE] if (!is.null(x$lazy)) x$lazy$index <- x$lazy$index[i] } x } .map_name_index <- function(x, i) { if (is.character(i)) match(i, meta(x, "id", "local")) else i } `[[.PCorpus` <- function(x, i) { i <- .map_name_index(x, i) db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]]) filehash::dbFetch(db, x$content[[i]]) } `[[.SimpleCorpus` <- function(x, i) { i <- .map_name_index(x, i) n <- names(x$content) PlainTextDocument(x$content[[i]], id = if (is.null(n)) i else n[i], language = meta(x, "language")) } `[[.VCorpus` <- function(x, i) { i <- .map_name_index(x, i) if (!is.null(x$lazy)) .Call(`_tm_copyCorpus`, x, materialize(x, i)) x$content[[i]] } `[[<-.SimpleCorpus` <- function(x, i, value) { x$content[i] <- paste0(as.character(value), collapse = "\n") x } `[[<-.PCorpus` <- function(x, i, value) { i <- .map_name_index(x, i) db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]]) db[[x$content[[i]]]] <- value x } `[[<-.VCorpus` <- function(x, i, value) { i <- .map_name_index(x, i) # Mark new objects as inactive for lazy mapping if (!is.null(x$lazy)) x$lazy$index[i] <- FALSE x$content[[i]] <- value x } as.list.PCorpus <- as.list.VCorpus <- function(x, ...) setNames(content(x), as.character(lapply(content(x), meta, "id"))) as.list.SimpleCorpus <- function(x, ...) as.list(content(x)) as.VCorpus <- function(x) UseMethod("as.VCorpus") as.VCorpus.VCorpus <- identity as.VCorpus.list <- function(x) { v <- list(content = x, meta = CorpusMeta(), dmeta = data.frame(row.names = seq_along(x))) class(v) <- c("VCorpus", "Corpus") v } outer_union <- function(x, y, ...) { if (nrow(x) > 0L) x[, setdiff(names(y), names(x))] <- NA if (nrow(y) > 0L) y[, setdiff(names(x), names(y))] <- NA res <- rbind(x, y) if (ncol(res) == 0L) res <- data.frame(row.names = seq_len(nrow(x) + nrow(y))) res } c.VCorpus <- function(..., recursive = FALSE) { args <- list(...) 
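    # Concatenating corpora: combine the document lists, merge the corpus-level metadata, and outer-join the document-level metadata data frames (outer_union() above fills missing columns with NA).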
x <- args[[1L]] if (length(args) == 1L) return(x) if (!all(unlist(lapply(args, inherits, class(x))))) stop("not all arguments are of the same corpus type") v <- list(content = do.call("c", lapply(args, content)), meta = CorpusMeta(meta = do.call("c", lapply(args, function(a) meta(a, type = "corpus")))), dmeta = Reduce(outer_union, lapply(args, meta))) class(v) <- c("VCorpus", "Corpus") v } content.VCorpus <- function(x) { if (!is.null(x$lazy)) .Call(`_tm_copyCorpus`, x, materialize(x)) x$content } content.SimpleCorpus <- function(x) x$content content.PCorpus <- function(x) { db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]]) filehash::dbMultiFetch(db, unlist(x$content)) } inspect <- function(x) UseMethod("inspect", x) inspect.PCorpus <- inspect.SimpleCorpus <- inspect.VCorpus <- function(x) { print(x) cat("\n") print(noquote(content(x))) invisible(x) } length.PCorpus <- length.SimpleCorpus <- length.VCorpus <- function(x) length(x$content) names.PCorpus <- names.SimpleCorpus <- names.VCorpus <- function(x) as.character(meta(x, "id", "local")) `names<-.PCorpus` <- `names<-.VCorpus` <- function(x, value) { meta(x, "id", "local") <- as.character(value) x } format.PCorpus <- format.SimpleCorpus <- format.VCorpus <- function(x, ...) { c(sprintf("<<%s>>", class(x)[1L]), sprintf("Metadata: corpus specific: %d, document level (indexed): %d", length(meta(x, type = "corpus")), ncol(meta(x, type = "indexed"))), sprintf("Content: documents: %d", length(x))) } writeCorpus <- function(x, path = ".", filenames = NULL) { filenames <- file.path(path, if (is.null(filenames)) sprintf("%s.txt", as.character(meta(x, "id", "local"))) else filenames) stopifnot(length(x) == length(filenames)) mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames) invisible(x) } tm/R/pdftools.R0000644000175100001440000001046312776627444013115 0ustar hornikuserspdf_info_via_xpdf <- function(file, options = NULL) { outfile <- tempfile("pdfinfo") on.exit(unlink(outfile)) status <- system2("pdfinfo", c(options, shQuote(normalizePath(file))), stdout = outfile) ## Could check the status ... ## This does not work ... ## info <- as.list(read.dcf(outfile)[1L, ]) tags <- c("Title", "Subject", "Keywords", "Author", "Creator", "Producer", "CreationDate", "ModDate", "Tagged", "Form", "Pages", "Encrypted", "Page size", "File size", "Optimized", "PDF version") re <- sprintf("^(%s)", paste(sprintf("%-16s", sprintf("%s:", tags)), collapse = "|")) lines <- readLines(outfile, warn = FALSE) ind <- grepl(re, lines) tags <- sub(": *", "", substring(lines[ind], 1L, 16L)) info <- split(sub(re, "", lines), cumsum(ind)) names(info) <- tags fmt <- "%a %b %d %X %Y" if (!is.null(d <- info$CreationDate)) info$CreationDate <- strptime(d, fmt) if (!is.null(d <- info$ModDate)) info$ModDate <- strptime(d, fmt) if (!is.null(p <- info$Pages)) info$Pages <- as.integer(p) info } pdf_info_via_gs <- function(file) { file <- normalizePath(file) gs_cmd <- tools::find_gs_cmd() out <- system2(gs_cmd, c("-dNODISPLAY -q", sprintf("-sFile=%s", shQuote(file)), system.file("ghostscript", "pdf_info.ps", package = "tm")), stdout = TRUE) out <- out[cumsum(out == "") == 2L][-1L] val <- sub("^[^:]+:[[:space:]]*", "", out) names(val) <- sub(":.*", "", out) val <- as.list(val) if (!is.null(d <- val$CreationDate)) val$CreationDate <- PDF_Date_to_POSIXt(d) if (!is.null(d <- val$ModDate)) val$ModDate <- PDF_Date_to_POSIXt(d) val } PDF_Date_to_POSIXt <- function(s) { ## Strip optional 'D:' prefix. 
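    ## PDF date strings have the form D:YYYYMMDDHHmmSS followed by an optional time zone part (Z or an +HH'mm' style offset); the branches below handle truncated strings, UTC, and explicit offsets.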
s <- sub("^D:", "", s) ## Strip apostrophes in offset spec. s <- gsub("'", "", s) if (nchar(s) <= 14L) { s <- sprintf("%s%s", s, substring(" 0101000000", nchar(s) + 1L, 14L)) strptime(s, "%Y%m%d%H%M%S") } else if (substring(s, 15L, 15L) == "Z") { strptime(substring(s, 1L, 14L), "%Y%m%d%H%M%S") } else { strptime(s, "%Y%m%d%H%M%S%z") } } pdf_text_via_gs <- function(file) { file <- normalizePath(file) gs_cmd <- tools::find_gs_cmd() tf <- tempfile("pdf") on.exit(unlink(tf)) ## The current mechanism is first converting PDF to Postscript using ## the ps2write device, and then extract text using the ps2ascii.ps ## program. This fails for some files (e.g., ## /data/rsync/PKGS/AlleleRetain/inst/doc/AlleleRetain_User_Guide.pdf ## which Ghostscript also fails to render. Note that rendering via ## gv works "fine": but this uses the pswrite device which produces ## bitmap (from which no text can be extracted, of course). ## Using the txtwrite device is simply too unstable: e.g., ## gs -dBATCH -dNOPAUSE -sDEVICE=txtwrite -dQUIET -sOutputFile=- \ ## /data/rsync/PKGS/AlleleRetain/inst/doc/AlleleRetain_User_Guide.pdf ## keeps segfaulting. ## An additional nuisance is that there seems no simple way to ## detect a ps2ascii.ps failure. ## Finally, note that we currently use -DSIMPLE: without this, more ## information would be made available, but require post-processing. ## Step 1. Convert PDF to Postscript. res <- system2(gs_cmd, c("-q -dNOPAUSE -dBATCH -P- -dSAFER -sDEVICE=ps2write", sprintf("-sOutputFile=%s", tf), "-c save pop -f", shQuote(file))) ## Step 2. Extract text. txt <- system2(gs_cmd, c("-q -dNODISPLAY -P- -dSAFER -dDELAYBIND -dWRITESYSTEMDICT -dSIMPLE", "-c save -f ps2ascii.ps", tf, "-c quit"), stdout = TRUE) ## Argh. How can we catch errors? ## The return values are always 0 ... if (any(grepl("Error handled by opdfread.ps", txt))) { stop(paste(c("Ghostscript failed, with output:", txt), collapse = "\n")) } strsplit(paste(txt, collapse = "\n"), "\f")[[1L]] } tm/R/RcppExports.R0000644000175100001440000000077113404767400013537 0ustar hornikusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 tdm <- function(strings, remove_puncts, remove_digits, stopwords, dictionary, min_term_freq, max_term_freq, min_word_length, max_word_length) { .Call(`_tm_tdm`, strings, remove_puncts, remove_digits, stopwords, dictionary, min_term_freq, max_term_freq, min_word_length, max_word_length) } Boost_Tokenizer <- function(strings) { .Call(`_tm_Boost_Tokenizer`, strings) } tm/R/matrix.R0000644000175100001440000004462214262016526012554 0ustar hornikusers## Authors: Ingo Feinerer, Kurt Hornik TermDocumentMatrix_classes <- c("TermDocumentMatrix", "simple_triplet_matrix") DocumentTermMatrix_classes <- c("DocumentTermMatrix", "simple_triplet_matrix") .TermDocumentMatrix <- function(x, weighting) { x <- as.simple_triplet_matrix(x) if (!is.null(dimnames(x))) names(dimnames(x)) <- c("Terms", "Docs") class(x) <- TermDocumentMatrix_classes if (is.null(weighting)) weighting <- weightTf ## ## Note that if weighting is a weight function, it already needs to ## know whether we have a term-document or document-term matrix. ## ## Ideally we would require weighting to be a WeightFunction object ## or a character string of length 2. 
But then ## dtm <- DocumentTermMatrix(crude, ## control = list(weighting = ## function(x) ## weightTfIdf(x, normalize = ## FALSE), ## stopwords = TRUE)) ## in example("DocumentTermMatrix") fails [because weightTfIdf() is ## a weight function and not a weight function generator ...] ## Hence, for now, instead of ## if (inherits(weighting, "WeightFunction")) ## x <- weighting(x) ## use if (is.function(weighting)) x <- weighting(x) ## and hope for the best ... ## else if (is.character(weighting) && (length(weighting) == 2L)) attr(x, "weighting") <- weighting x } .SimpleTripletMatrix <- function(i, j, v, terms, corpus) { docs <- as.character(meta(corpus, "id", "local")) if (length(docs) != length(corpus)) { warning("invalid document identifiers") docs <- NULL } simple_triplet_matrix(i, j, v, nrow = length(terms), ncol = length(corpus), dimnames = list(Terms = terms, Docs = docs)) } filter_global_bounds <- function(m, bounds) { m <- as.simple_triplet_matrix(m) if (length(bounds) == 2L && is.numeric(bounds)) { rs <- row_sums(m > 0) m <- m[(rs >= bounds[1]) & (rs <= bounds[2]), ] } m } TermDocumentMatrix <- function(x, control = list()) UseMethod("TermDocumentMatrix", x) TermDocumentMatrix.SimpleCorpus <- function(x, control = list()) { stopifnot(is.list(control)) if (any(unlist(lapply(control, is.function)))) warning("custom functions are ignored") if (!is.null(control$tokenize) && !identical(control$tokenize, "Boost")) warning("custom tokenizer is ignored") txt <- content(x) ## Conversion to lower case if (is.null(control$tolower) || isTRUE(control$tolower)) txt <- tolower(txt) ## Stopword filtering .stopwords <- if (isTRUE(control$stopwords)) stopwords(meta(x, "language")) else if (is.character(control$stopwords)) control$stopwords else character(0) .dictionary <- if (is.null(control$dictionary)) character(0) else control$dictionary ## Ensure local bounds bl <- control$bounds$local min_term_freq <- if (length(bl) == 2L && is.numeric(bl) && bl[1] >= 0) bl[1] else 0L max_term_freq <- if (length(bl) == 2L && is.numeric(bl) && bl[2] >= 0) min(bl[2], .Machine$integer.max) else .Machine$integer.max ## Filter out too short or too long terms wl <- control$wordLengths min_word_length <- if (is.numeric(wl[1]) && wl[1] >= 0) wl[1] else 3L max_word_length <- if (is.numeric(wl[2]) && wl[2] >= 0) min(wl[2], .Machine$integer.max) else .Machine$integer.max m <- tdm(txt, isTRUE(control$removePunctuation), isTRUE(control$removeNumbers), .stopwords, .dictionary, as.integer(min_term_freq), as.integer(max_term_freq), as.integer(min_word_length), as.integer(max_word_length)) Encoding(m$terms) <- "UTF-8" m <- .SimpleTripletMatrix(m$i, m$j, m$v, m$terms, x) ## Stemming ## ## Ideally tdm() could perform stemming as well but there is no easy way to ## access the SnowballC::wordStem() function from C++ (via Rcpp) without ## significant overhead (as SnowballC does not export its internal C ## functions). ## ## Stemming afterwards is still quite performant as we already have ## all terms. However, there is some overhead involved as we need ## to recheck local bounds and word lengths. ## if (isTRUE(control$stemming)) { stems <- SnowballC::wordStem(m$dimnames$Terms, meta(x, "language")) ## Do as.factor(stems) "by hand" for performance reasons. 
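    ## match() against the sorted unique stems yields the integer codes; assigning the levels and class attributes then gives a factor (for rollup() below) without the overhead of as.factor().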
uniqs <- sort(unique(stems)) stems <- match(stems, uniqs) attributes(stems) <- list(levels = uniqs, class = "factor") m <- rollup(m, "Terms", stems) ## Recheck local bounds ## No need to check lower local bound as rollup aggregates frequencies m[m > max_term_freq] <- 0 ## Recheck word lengths terms_length <- nchar(rownames(m)) m <- m[min_word_length <= terms_length & terms_length <= max_word_length, ] } m <- filter_global_bounds(m, control$bounds$global) .TermDocumentMatrix(m, control$weighting) } TermDocumentMatrix.PCorpus <- TermDocumentMatrix.VCorpus <- function(x, control = list()) { stopifnot(is.list(control)) tflist <- tm_parLapply(unname(content(x)), termFreq, control) v <- unlist(tflist) i <- names(v) terms <- sort(unique(as.character(if (is.null(control$dictionary)) i else control$dictionary))) i <- match(i, terms) j <- rep.int(seq_along(x), lengths(tflist)) m <- .SimpleTripletMatrix(i, j, as.numeric(v), terms, x) m <- filter_global_bounds(m, control$bounds$global) .TermDocumentMatrix(m, control$weighting) } TermDocumentMatrix.default <- function(x, control = list()) TermDocumentMatrix(Corpus(VectorSource(x)), control) DocumentTermMatrix <- function(x, control = list()) t(TermDocumentMatrix(x, control)) as.TermDocumentMatrix <- function(x, ...) UseMethod("as.TermDocumentMatrix") as.TermDocumentMatrix.TermDocumentMatrix <- function(x, ...) x as.TermDocumentMatrix.DocumentTermMatrix <- function(x, ...) t(x) as.TermDocumentMatrix.term_frequency <- as.TermDocumentMatrix.textcnt <- function(x, ...) { m <- simple_triplet_matrix(i = seq_along(x), j = rep_len(1L, length(x)), v = as.numeric(x), nrow = length(x), ncol = 1, dimnames = list(Terms = names(x), Docs = NA_character_)) .TermDocumentMatrix(m, weightTf) } as.TermDocumentMatrix.default <- function(x, weighting, ...) .TermDocumentMatrix(x, weighting) as.DocumentTermMatrix <- function(x, ...) UseMethod("as.DocumentTermMatrix") as.DocumentTermMatrix.DocumentTermMatrix <- function(x, ...) x as.DocumentTermMatrix.TermDocumentMatrix <- function(x, ...) t(x) as.DocumentTermMatrix.term_frequency <- as.DocumentTermMatrix.textcnt <- function(x, ...) t(as.TermDocumentMatrix(x)) as.DocumentTermMatrix.default <- function(x, weighting, ...) 
{ x <- as.simple_triplet_matrix(x) t(.TermDocumentMatrix(t(x), weighting)) } t.TermDocumentMatrix <- t.DocumentTermMatrix <- function(x) { m <- NextMethod("t") attr(m, "weighting") <- attr(x, "weighting") class(m) <- if (inherits(x, "DocumentTermMatrix")) TermDocumentMatrix_classes else DocumentTermMatrix_classes m } termFreq <- function(doc, control = list()) { stopifnot(inherits(doc, "TextDocument") || is.character(doc), is.list(control)) ## Tokenize the corpus .tokenize <- control$tokenize if (is.null(.tokenize) || identical(.tokenize, "words")) .tokenize <- words else if (identical(.tokenize, "Boost")) .tokenize <- Boost_tokenizer else if (identical(.tokenize, "MC")) .tokenize <- MC_tokenizer else if (identical(.tokenize, "scan")) .tokenize <- scan_tokenizer else if (is.Span_Tokenizer(.tokenize)) .tokenize <- as.Token_Tokenizer(.tokenize) if (is.function(.tokenize)) txt <- .tokenize(doc) else stop("invalid tokenizer") ## Conversion to lower case .tolower <- control$tolower if (is.null(.tolower) || isTRUE(.tolower)) .tolower <- tolower if (is.function(.tolower)) txt <- .tolower(txt) ## Punctuation removal .removePunctuation <- control$removePunctuation if (isTRUE(.removePunctuation)) .removePunctuation <- removePunctuation else if (is.list(.removePunctuation)) .removePunctuation <- function(x) do.call(removePunctuation, c(list(x), control$removePunctuation)) ## Number removal .removeNumbers <- control$removeNumbers if (isTRUE(.removeNumbers)) .removeNumbers <- removeNumbers .language <- control$language if (inherits(doc, "TextDocument")) .language <- meta(doc, "language") if (is.null(.language)) .language <- "en" ## Stopword filtering .stopwords <- control$stopwords if (isTRUE(.stopwords)) .stopwords <- function(x) x[is.na(match(x, stopwords(.language)))] else if (is.character(.stopwords)) .stopwords <- function(x) x[is.na(match(x, control$stopwords))] ## Stemming .stemming <- control$stemming if (isTRUE(.stemming)) .stemming <- function(x) SnowballC::wordStem(x, .language) ## Default order for options which support reordering or <- c("removePunctuation", "removeNumbers", "stopwords", "stemming") ## Process control options in specified order nc <- names(control) n <- nc[!is.na(match(nc, or))] for (name in sprintf(".%s", c(n, setdiff(or, n)))) { g <- get(name) if (is.function(g)) txt <- g(txt) } ## If dictionary is set tabulate against it dictionary <- control$dictionary tab <- .table(if (is.null(dictionary)) txt else txt[!is.na(match(txt, dictionary))]) ## Ensure local bounds bl <- control$bounds$local if (length(bl) == 2L && is.numeric(bl)) tab <- tab[(tab >= bl[1]) & (tab <= bl[2]), drop = FALSE] ## Filter out too short or too long terms nc <- nchar(names(tab), type = "chars") wl <- control$wordLengths lb <- if (is.numeric(wl[1])) wl[1] else 3 ub <- if (is.numeric(wl[2])) wl[2] else Inf tab <- tab[(nc >= lb) & (nc <= ub), drop = FALSE] class(tab) <- c("term_frequency", class(tab)) tab } print.TermDocumentMatrix <- print.DocumentTermMatrix <- function(x, ...) 
{ format <- c("term", "document") if (inherits(x, "DocumentTermMatrix")) format <- rev(format) writeLines(sprintf("<<%s (%ss: %d, %ss: %d)>>", class(x)[1], format[1L], nrow(x), format[2L], ncol(x))) writeLines(sprintf("Non-/sparse entries: %d/%.0f", length(x$v), prod(dim(x)) - length(x$v))) sparsity <- if (!prod(dim(x))) 100 else round( (1 - length(x$v) / prod(dim(x))) * 100) writeLines(sprintf("Sparsity : %s%%", sparsity)) writeLines(sprintf("Maximal term length: %s", max(nchar(Terms(x), type = "chars"), 0))) writeLines(sprintf("Weighting : %s (%s)", attr(x, "weighting")[1L], attr(x, "weighting")[2L])) invisible(x) } inspect.TermDocumentMatrix <- inspect.DocumentTermMatrix <- function(x) { print(x) cat("Sample :\n") print(as.matrix(sample.TermDocumentMatrix(x))) } `[.TermDocumentMatrix` <- `[.DocumentTermMatrix` <- function(x, i, j, ..., drop) { m <- NextMethod("[") attr(m, "weighting") <- attr(x, "weighting") class(m) <- if (inherits(x, "DocumentTermMatrix")) DocumentTermMatrix_classes else TermDocumentMatrix_classes m } `dimnames<-.DocumentTermMatrix` <- function(x, value) { x <- NextMethod("dimnames<-") dnx <- x$dimnames if (!is.null(dnx)) names(dnx) <- c("Docs", "Terms") x$dimnames <- dnx x } `dimnames<-.TermDocumentMatrix` <- function(x, value) { x <- NextMethod("dimnames<-") dnx <- x$dimnames if (!is.null(dnx)) names(dnx) <- c("Terms", "Docs") x$dimnames <- dnx x } nDocs <- function(x) UseMethod("nDocs") nTerms <- function(x) UseMethod("nTerms") nDocs.DocumentTermMatrix <- nTerms.TermDocumentMatrix <- function(x) x$nrow nDocs.TermDocumentMatrix <- nTerms.DocumentTermMatrix <- function(x) x$ncol Docs <- function(x) UseMethod("Docs") Terms <- function(x) UseMethod("Terms") Docs.DocumentTermMatrix <- Terms.TermDocumentMatrix <- function(x) { s <- x$dimnames[[1L]] if (is.null(s)) s <- rep.int(NA_character_, x$nrow) s } Docs.TermDocumentMatrix <- Terms.DocumentTermMatrix <- function(x) { s <- x$dimnames[[2L]] if (is.null(s)) s <- rep.int(NA_character_, x$ncol) s } c.term_frequency <- function(..., recursive = FALSE) { do.call("c", lapply(list(...), as.TermDocumentMatrix)) } c.TermDocumentMatrix <- function(..., recursive = FALSE) { m <- lapply(list(...), as.TermDocumentMatrix) if (length(m) == 1L) return(m[[1L]]) weighting <- attr(m[[1L]], "weighting") allTermsNonUnique <- unlist(lapply(m, function(x) Terms(x)[x$i])) allTerms <- unique(allTermsNonUnique) allDocs <- unlist(lapply(m, Docs)) cs <- cumsum(lapply(m, nDocs)) cs <- c(0, cs[-length(cs)]) j <- lapply(m, "[[", "j") m <- simple_triplet_matrix(i = match(allTermsNonUnique, allTerms), j = unlist(j) + rep.int(cs, lengths(j)), v = unlist(lapply(m, "[[", "v")), nrow = length(allTerms), ncol = length(allDocs), dimnames = list(Terms = allTerms, Docs = allDocs)) ## ## - We assume that all arguments have the same weighting ## - Even if all matrices have the same input weighting it might be ## necessary to take additional steps (e.g., normalization for tf-idf or ## check for (0,1)-range for binary tf) ## .TermDocumentMatrix(m, weighting) } c.DocumentTermMatrix <- function(..., recursive = FALSE) { t(do.call("c", lapply(list(...), as.TermDocumentMatrix))) } findFreqTerms <- function(x, lowfreq = 0, highfreq = Inf) { stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")), is.numeric(lowfreq), is.numeric(highfreq)) if (inherits(x, "DocumentTermMatrix")) x <- t(x) rs <- row_sums(x) names(rs[rs >= lowfreq & rs <= highfreq]) } findAssocs <- function(x, terms, corlimit) UseMethod("findAssocs", x) findAssocs.TermDocumentMatrix <- 
function(x, terms, corlimit) findAssocs(t(x), terms, corlimit) findAssocs.DocumentTermMatrix <- function(x, terms, corlimit) { stopifnot(is.character(terms), is.numeric(corlimit), corlimit >= 0, corlimit <= 1) j <- match(unique(terms), Terms(x), nomatch = 0L) suppressWarnings( findAssocs(crossapply_simple_triplet_matrix(x[, j], x[, -j], cor), terms, rep_len(corlimit, length(terms)))) } findAssocs.matrix <- function(x, terms, corlimit) { stopifnot(is.numeric(x)) i <- match(terms, rownames(x), nomatch = 0L) names(i) <- terms Map(function(i, cl) { xi <- x[i, ] t <- sort(round(xi[which(xi >= cl)], 2), TRUE) if (!length(t)) names(t) <- NULL t }, i, corlimit) } removeSparseTerms <- function(x, sparse) { stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")), is.numeric(sparse), sparse > 0, sparse < 1) m <- if (inherits(x, "DocumentTermMatrix")) t(x) else x t <- table(m$i) > m$ncol * (1 - sparse) termIndex <- as.numeric(names(t[t])) if (inherits(x, "DocumentTermMatrix")) x[, termIndex] else x[termIndex, ] } sample.TermDocumentMatrix <- function(x, size = 10) { stopifnot(inherits(x, c("DocumentTermMatrix", "TermDocumentMatrix")), is.numeric(size), size >= 0) if (length(x$v) == 0L) return(x) m <- if (inherits(x, "DocumentTermMatrix")) t(x) else x terms <- sort(names(sort(row_sums(m), decreasing = TRUE) [0:min(size, nTerms(m))])) docs <- sort(names(sort(col_sums(m), decreasing = TRUE) [0:min(size, nDocs(m))])) if (inherits(x, "DocumentTermMatrix")) x[docs, terms] else x[terms, docs] } CategorizedDocumentTermMatrix <- function(x, c) { if (inherits(x, "TermDocumentMatrix")) x <- t(x) else if (!inherits(x, "DocumentTermMatrix")) stop("wrong class") if (length(c) != nDocs(x)) stop("invalid category ids") attr(x, "Category") <- c class(x) <- c("CategorizedDocumentTermMatrix", DocumentTermMatrix_classes) x } findMostFreqTerms <- function(x, n = 6L, ...) UseMethod("findMostFreqTerms") findMostFreqTerms.term_frequency <- function(x, n = 6L, ...) { y <- x[order(x, decreasing = TRUE)[seq_len(n)]] y[y > 0] } findMostFreqTerms.DocumentTermMatrix <- function(x, n = 6L, INDEX = NULL, ...) { terms <- Terms(x) if (!is.null(INDEX)) x <- rollup(x, 1L, INDEX) f <- factor(x$i, seq_len(x$nrow)) js <- split(x$j, f) vs <- split(x$v, f) y <- Map(function(j, v, n) { p <- order(v, decreasing = TRUE)[seq_len(n)] v <- v[p] names(v) <- terms[j[p]] v }, js, vs, pmin(lengths(vs), n)) names(y) <- x$dimnames[[1L]] y } findMostFreqTerms.TermDocumentMatrix <- function(x, n = 6L, INDEX = NULL, ...) { terms <- Terms(x) if (!is.null(INDEX)) x <- rollup(x, 2L, INDEX) f <- factor(x$j, seq_len(x$ncol)) is <- split(x$i, f) vs <- split(x$v, f) y <- Map(function(i, v, n) { p <- order(v, decreasing = TRUE)[seq_len(n)] v <- v[p] names(v) <- terms[i[p]] v }, is, vs, pmin(lengths(vs), n)) names(y) <- x$dimnames[[2L]] y } tm/R/stopwords.R0000644000175100001440000000103213034740255013277 0ustar hornikusersstopwords <- { function(kind = "en") { kind <- as.character(kind) resolved <- map_IETF_Snowball(kind) base <- if (is.na(resolved)) kind else if (identical(resolved, "porter")) "english" else resolved s <- system.file("stopwords", paste0(base, ".dat"), package = "tm") if (identical(s, "")) stop(paste("no stopwords available for '", base, "'", sep = "")) readLines(s, encoding = "UTF-8") } } tm/R/transform.R0000644000175100001440000001067413311700175013255 0ustar hornikusers# Author: Ingo Feinerer # Transformations tm_map <- function(x, FUN, ...) 
UseMethod("tm_map", x) tm_map.VCorpus <- function(x, FUN, ..., lazy = FALSE) { # Lazy mapping if (lazy) { fun <- function(x) FUN(x, ...) if (is.null(x$lazy)) x$lazy <- list(index = rep_len(TRUE, length(x)), maps = list(fun)) else x$lazy$maps <- c(x$lazy$maps, list(fun)) } else x$content <- tm_parLapply(content(x), FUN, ...) x } tm_map.SimpleCorpus <- function(x, FUN, ...) { if (inherits(FUN, "content_transformer")) FUN <- get("FUN", envir = environment(FUN)) n <- names(content(x)) x$content <- FUN(content(x), ...) if (length(content(x)) != length(n)) warning("transformation drops documents") else names(x$content) <- n x } tm_map.PCorpus <- function(x, FUN, ...) { db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]]) for (i in seq_along(x)) db[[x$content[[i]]]] <- FUN(x[[i]], ...) filehash::dbReorganize(db) x } # Materialize lazy mappings materialize <- function(x, range = seq_along(x)) { if (!is.null(x$lazy)) { i <- (seq_along(x) %in% range) & x$lazy$index if (any(i)) { x$content[i] <- tm_parLapply(x$content[i], function(d) tm_reduce(d, x$lazy$maps)) x$lazy$index[i] <- FALSE } # Clean up if everything is materialized if (!any(x$lazy$index)) x["lazy"] <- list(NULL) } x } tm_reduce <- function(x, tmFuns, ...) Reduce(function(f, ...) f(...), tmFuns, x, right = TRUE) getTransformations <- function() c("removeNumbers", "removePunctuation", "removeWords", "stemDocument", "stripWhitespace") content_transformer <- function(FUN) { f <- function(x, ...) { content(x) <- FUN(content(x), ...) x } class(f) <- c("content_transformer", "function") f } removeNumbers <- function(x, ...) UseMethod("removeNumbers") removeNumbers.character <- function(x, ucp = FALSE, ...) { if (ucp) gsub("\\p{Nd}+", "", x, perl = TRUE) else .Call(`_tm_remove_chars`, x, 1L) } removeNumbers.PlainTextDocument <- content_transformer(removeNumbers.character) removePunctuation <- function(x, ...) UseMethod("removePunctuation") removePunctuation.character <- function(x, preserve_intra_word_contractions = FALSE, preserve_intra_word_dashes = FALSE, ucp = FALSE, ...) { # Assume there are no ASCII 0x01 (SOH) or ASCII 0x02 (STX) characters. 
if (preserve_intra_word_contractions) x <- gsub("(\\w)'(\\w)", "\\1\1\\2", x, perl = TRUE) if (preserve_intra_word_dashes) x <- gsub("(\\w)-(\\w)", "\\1\2\\2", x, perl = TRUE) if (ucp) x <- gsub("\\p{P}+", "", x, perl = TRUE) else x <- .Call(`_tm_remove_chars`, x, 0L) if (preserve_intra_word_contractions) x <- gsub("\1", "'", x, fixed = TRUE) if (preserve_intra_word_dashes) x <- gsub("\2", "-", x, fixed = TRUE) x } removePunctuation.PlainTextDocument <- content_transformer(removePunctuation.character) removeWords <- function(x, words) UseMethod("removeWords", x) # Improvements by Kurt Hornik removeWords.character <- function(x, words) gsub(sprintf("(*UCP)\\b(%s)\\b", paste(sort(words, decreasing = TRUE), collapse = "|")), "", x, perl = TRUE) removeWords.PlainTextDocument <- content_transformer(removeWords.character) stemDocument <- function(x, language = "english") UseMethod("stemDocument", x) stemDocument.character <- function(x, language = "english") { s <- unlist(lapply(x, function(line) paste(SnowballC::wordStem(words(line), as.character(language)), collapse = " "))) if (is.character(s)) s else "" } stemDocument.PlainTextDocument <- function(x, language = meta(x, "language")) { language <- as.character(language) if (identical(language, "") || identical(language, character(0)) || is.na(language)) language <- "english" content_transformer(stemDocument.character)(x) } stripWhitespace <- function(x) UseMethod("stripWhitespace", x) stripWhitespace.character <- function(x) gsub("[[:space:]]+", " ", x) stripWhitespace.PlainTextDocument <- content_transformer(stripWhitespace.character) tm/R/complete.R0000644000175100001440000000372113667334116013061 0ustar hornikusers# Author: Ingo Feinerer stemCompletion <- function(x, dictionary, type = c("prevalent", "first", "longest", "none", "random", "shortest")) { if (inherits(dictionary, "Corpus")) dictionary <- unlist(lapply(dictionary, words)) type <- match.arg(type) possibleCompletions <- lapply(x, function(w) grep(sprintf("^%s", w), dictionary, value = TRUE)) switch(type, first = { setNames(sapply(possibleCompletions, "[", 1), x) }, longest = { ordering <- lapply(possibleCompletions, function(x) order(nchar(x), decreasing = TRUE)) possibleCompletions <- mapply(function(x, id) x[id], possibleCompletions, ordering, SIMPLIFY = FALSE) setNames(sapply(possibleCompletions, "[", 1), x) }, none = { setNames(x, x) }, prevalent = { possibleCompletions <- lapply(possibleCompletions, function(x) sort(table(x), decreasing = TRUE)) n <- names(sapply(possibleCompletions, "[", 1)) setNames(if (length(n)) n else rep_len(NA, length(x)), x) }, random = { setNames(sapply(possibleCompletions, function(x) { if (length(x)) sample(x, 1) else NA }), x) }, shortest = { ordering <- lapply(possibleCompletions, function(x) order(nchar(x))) possibleCompletions <- mapply(function(x, id) x[id], possibleCompletions, ordering, SIMPLIFY = FALSE) setNames(sapply(possibleCompletions, "[", 1), x) } ) } tm/R/score.R0000644000175100001440000000143013023472115012343 0ustar hornikuserstm_term_score <- function(x, terms, FUN) UseMethod("tm_term_score", x) tm_term_score.term_frequency <- function(x, terms, FUN = function(x) sum(x, na.rm = TRUE)) FUN(x[match(terms, names(x), nomatch = 0L)]) tm_term_score.PlainTextDocument <- function(x, terms, FUN = function(x) sum(x, na.rm = TRUE)) tm_term_score(termFreq(x, control = list(tolower = FALSE, removePunctuation = TRUE, wordLengths = c(1, Inf))), terms, FUN) tm_term_score.TermDocumentMatrix <- function(x, terms, FUN = col_sums) FUN(x[match(terms, 
Terms(x), nomatch = 0L), ]) tm_term_score.DocumentTermMatrix <- function(x, terms, FUN = row_sums) FUN(x[, match(terms, Terms(x), nomatch = 0L)]) tm/R/plot.R0000644000175100001440000000531213023472034012211 0ustar hornikusersplot.TermDocumentMatrix <- plot.DocumentTermMatrix <- function(x, terms = sample(Terms(x), 20), corThreshold = 0.7, weighting = FALSE, attrs = list(graph = list(rankdir = "BT"), node = list(shape = "rectangle", fixedsize = FALSE)), ...) { if (system.file(package = "Rgraphviz") == "") stop("Plotting requires package 'Rgraphviz'.") m <- if (inherits(x, "TermDocumentMatrix")) t(x) else x m <- as.matrix(m[, terms]) c <- cor(m) c[c < corThreshold] <- 0 c[is.na(c)] <- 0 diag(c) <- 0 p <- Rgraphviz::plot(methods::as(c, "graphNEL"), attrs = attrs, ...) if (weighting) { i <- 1 lw <- round(c[lower.tri(c) & c >= corThreshold] * 10) for (ae in Rgraphviz::AgEdge(p)) { Rgraphviz::lines(ae, lwd = lw[i], len = 1) i <- i + 1 } } invisible(p) } ## Plotting functions for Zipf's and Heaps'law contributed by Kurt Hornik ## See http://en.wikipedia.org/wiki/Zipf%27s_law Zipf_plot <- function(x, type = "l", ...) { if (inherits(x, "TermDocumentMatrix")) x <- t(x) y <- log(sort(col_sums(x), decreasing = TRUE)) x <- log(seq_along(y)) m <- lm(y ~ x) dots <- list(...) if (is.null(dots$xlab)) dots$xlab <- "log(rank)" if (is.null(dots$ylab)) dots$ylab <- "log(frequency)" do.call(plot, c(list(x, y, type = type), dots)) abline(m) ## ## Perhaps this should (invisibly) return the fitted linear model ## instead of just the coefficients? coef(m) ## } ## http://en.wikipedia.org/wiki/Heaps%27_law ## http://en.wikipedia.org/wiki/Text_corpus ## cum_vocabulary_size <- ## function(m) ## { ## ## Should work in general, but it very slow for large simple triplet ## ## matrices ... ## s <- double(nrow(m)) ## v <- double(ncol(m)) ## for(i in seq_along(s)) { ## v <- pmax(v, c(m[i, ])) ## s[i] <- sum(v > 0) ## } ## s ## } cum_vocabulary_size <- function(m) { ## Only works for simple triplet matrices. i <- sapply(split(m$i, m$j), min) tab <- table(i) v <- double(nrow(m)) v[as.numeric(names(tab))] <- tab cumsum(v) } Heaps_plot <- function(x, type = "l", ...) { if (inherits(x, "TermDocumentMatrix")) x <- t(x) y <- log(cum_vocabulary_size(x)) x <- log(cumsum(row_sums(x))) m <- lm(y ~ x) dots <- list(...) if (is.null(dots$xlab)) dots$xlab <- "log(T)" if (is.null(dots$ylab)) dots$ylab <- "log(V)" do.call(plot, c(list(x, y, type = type), dots)) abline(m) ## ## Perhaps this should (invisibly) return the fitted linear model ## instead of just the coefficients? 
coef(m) ## } tm/R/weight.R0000644000175100001440000001211512776627444012546 0ustar hornikusers# Author: Ingo Feinerer WeightFunction <- function(x, name, acronym) { class(x) <- c("WeightFunction", "function") attr(x, "name") <- name attr(x, "acronym") <- acronym x } # Actual TermDocumentMatrix weighting functions weightTf <- WeightFunction(function(m) { attr(m, "weighting") <- c("term frequency", "tf") m }, "term frequency", "tf") weightTfIdf <- WeightFunction(function(m, normalize = TRUE) { isDTM <- inherits(m, "DocumentTermMatrix") if (isDTM) m <- t(m) if (normalize) { cs <- col_sums(m) if (any(cs == 0)) warning("empty document(s): ", paste(Docs(m)[cs == 0], collapse = " ")) names(cs) <- seq_len(nDocs(m)) m$v <- m$v / cs[m$j] } rs <- row_sums(m > 0) if (any(rs == 0)) warning("unreferenced term(s): ", paste(Terms(m)[rs == 0], collapse = " ")) lnrs <- log2(nDocs(m) / rs) lnrs[!is.finite(lnrs)] <- 0 m <- m * lnrs attr(m, "weighting") <- c(sprintf("%s%s", "term frequency - inverse document frequency", if (normalize) " (normalized)" else ""), "tf-idf") if (isDTM) t(m) else m }, "term frequency - inverse document frequency", "tf-idf") weightSMART <- WeightFunction(function(m, spec = "nnn", control = list()) { stopifnot(inherits(m, c("DocumentTermMatrix", "TermDocumentMatrix")), is.character(spec), nchar(spec) == 3L, is.list(control)) term_frequency <- match.arg(substr(spec, 1L, 1L), c("n", "l", "a", "b", "L")) document_frequency <- match.arg(substr(spec, 2L, 2L), c("n", "t", "p")) normalization <- match.arg(substr(spec, 3L, 3L), c("n", "c", "u", "b")) isDTM <- inherits(m, "DocumentTermMatrix") if (isDTM) m <- t(m) if (normalization == "b") { ## Need to compute the character lengths of the documents ## before starting the weighting. charlengths <- tapply(nchar(Terms(m))[m$i] * m$v, m$j, sum) } ## Term frequency m$v <- switch(term_frequency, ## natural n = m$v, ## logarithm l = 1 + log2(m$v), ## augmented a = { s <- tapply(m$v, m$j, max) 0.5 + (0.5 * m$v) / s[as.character(m$j)] }, ## boolean b = as.numeric(m$v > 0), ## log ave L = { s <- tapply(m$v, m$j, mean) ((1 + log2(m$v)) / (1 + log2(s[as.character(m$j)]))) }) ## Document frequency rs <- row_sums(m > 0) if (any(rs == 0)) warning("unreferenced term(s): ", paste(Terms(m)[rs == 0], collapse = " ")) df <- switch(document_frequency, ## natural n = 1, ## idf t = log2(nDocs(m) / rs), ## prob idf p = max(0, log2((nDocs(m) - rs) / rs))) df[!is.finite(df)] <- 0 ## Normalization cs <- col_sums(m) if (any(cs == 0)) warning("empty document(s): ", paste(Docs(m)[cs == 0], collapse = " ")) norm <- switch(normalization, ## none n = rep.int(1, nDocs(m)), ## cosine c = sqrt(col_sums(m ^ 2)), ## pivoted unique u = { if (is.null(pivot <- control$pivot)) stop("invalid control argument pivot") if (is.null(slope <- control$slope)) stop("invalid control argument slope") (slope * sqrt(col_sums(m ^ 2)) + (1 - slope) * pivot) }, ## byte size b = { if (is.null(alpha <- control$alpha)) stop("invalid control argument alpha") norm <- double(nDocs(m)) norm[match(names(charlengths), seq_along(norm))] <- charlengths ^ alpha norm }) m <- m * df m$v <- m$v / norm[m$j] attr(m, "weighting") <- c(paste("SMART", spec), "SMART") if (isDTM) t(m) else m }, "SMART", "SMART") weightBin <- WeightFunction(function(m) { m$v <- rep_len(1L, length(m$v)) attr(m, "weighting") <- c("binary", "bin") m }, "binary", "bin") tm/R/tokenizer.R0000644000175100001440000000202613307435131013246 0ustar hornikusersgetTokenizers <- function() c("Boost_tokenizer", "MC_tokenizer", "scan_tokenizer") ## 
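## Each tokenizer below is wrapped in NLP's Token_Tokenizer so that it can be passed wherever tm expects a tokenizer (e.g., as the tokenize entry of the termFreq() control list).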
Boost_tokenizer <- Token_Tokenizer(function(x) { y <- Boost_Tokenizer(as.character(x)) Encoding(y) <- "UTF-8" y }) ## MC_tokenizer <- Token_Tokenizer(function(x) { x <- as.character(x) if(!length(x)) return(character()) ASCII_letters <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" id <- sprintf("[%s]+", ASCII_letters) http <- sprintf("(https?://%s(\\.%s)*)", id, id) email <- sprintf("(%s@%s(\\.%s)*)", id, id, id) http_or_email <- sprintf("%s|%s", http, email) y <- c(unlist(regmatches(x, gregexpr(http_or_email, x)), FALSE, FALSE), unlist(strsplit(gsub(http_or_email, "", x), sprintf("[^%s]", ASCII_letters)), FALSE, FALSE)) y[nzchar(y)] }) scan_tokenizer <- Token_Tokenizer(function(x) { .Call(`_tm_scan`, as.character(x), 0L) }) tm/R/filter.R0000644000175100001440000000067013667624327012544 0ustar hornikusers# Author: Ingo Feinerer # Filters tm_filter <- function(x, FUN, ...) UseMethod("tm_filter", x) tm_filter.PCorpus <- tm_filter.SimpleCorpus <- tm_filter.VCorpus <- function(x, FUN, ...) x[tm_index(x, FUN, ...)] tm_index <- function(x, FUN, ...) UseMethod("tm_index", x) tm_index.PCorpus <- tm_index.SimpleCorpus <- tm_index.VCorpus <- function(x, FUN, ...) unlist(tm_parLapply(content(x), function(y) isTRUE(FUN(y, ...)))) tm/R/reader.R0000644000175100001440000001630513177046106012510 0ustar hornikusers## Author: Ingo Feinerer ## Readers FunctionGenerator <- function(x) { class(x) <- c("FunctionGenerator", "function") x } getReaders <- function() c("readDataframe", "readDOC", "readPDF", "readPlain", "readRCV1", "readRCV1asPlain", "readReut21578XML", "readReut21578XMLasPlain", "readTagged", "readXML") prepareReader <- function(readerControl, reader = NULL, ...) { if (is.null(readerControl$reader)) readerControl$reader <- reader if (inherits(readerControl$reader, "FunctionGenerator")) readerControl$reader <- readerControl$reader(...) 
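    # A function generator is applied here (with any extra arguments) to produce the actual reader with the standard (elem, language, id) signature; cf. the FunctionGenerator class.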
if (is.null(readerControl$language)) readerControl$language <- "en" readerControl } processURI <- function(uri) { uri <- as.character(uri) if (identical(substr(uri, 1, 7), "file://")) uri <- substr(uri, 8, nchar(uri)) uri } readDataframe <- function(elem, language, id) { PlainTextDocument(elem$content[, "text"], id = elem$content[, "doc_id"], language = language) } # readDOC needs antiword installed to be able to extract the text readDOC <- function(engine = c("antiword", "executable"), AntiwordOptions = "") { stopifnot(is.character(engine), is.character(AntiwordOptions)) engine <- match.arg(engine) antiword <- switch(engine, antiword = antiword::antiword, executable = function(x) system2("antiword", c(AntiwordOptions, shQuote(normalizePath(x))), stdout = TRUE)) if (!is.function(antiword)) stop("invalid function for DOC extraction") function(elem, language, id) { uri <- processURI(elem$uri) content <- antiword(uri) PlainTextDocument(content, id = basename(elem$uri), language = language) } } class(readDOC) <- c("FunctionGenerator", "function") readPDF <- function(engine = c("pdftools", "xpdf", "Rpoppler", "ghostscript", "Rcampdf", "custom"), control = list(info = NULL, text = NULL)) { stopifnot(is.character(engine), is.list(control)) engine <- match.arg(engine) pdf_info <- switch(engine, pdftools = function(x) { i <- pdftools::pdf_info(x) c(i$keys, list(CreationDate = i$created)) }, xpdf = function(x) pdf_info_via_xpdf(x, control$info), Rpoppler = Rpoppler::PDF_info, ghostscript = pdf_info_via_gs, Rcampdf = Rcampdf::pdf_info, custom = control$info) pdf_text <- switch(engine, pdftools = pdftools::pdf_text, xpdf = function(x) system2("pdftotext", c(control$text, shQuote(x), "-"), stdout = TRUE), Rpoppler = Rpoppler::PDF_text, ghostscript = pdf_text_via_gs, Rcampdf = Rcampdf::pdf_text, custom = control$text) if (!is.function(pdf_info) || !is.function(pdf_text)) stop("invalid function for PDF extraction") function(elem, language, id) { uri <- processURI(elem$uri) meta <- pdf_info(uri) content <- pdf_text(uri) PlainTextDocument(content, meta$Author, meta$CreationDate, meta$Subject, meta$Title, basename(elem$uri), language, meta$Creator) } } class(readPDF) <- c("FunctionGenerator", "function") readPlain <- function(elem, language, id) { if (!is.null(elem$uri)) id <- basename(elem$uri) PlainTextDocument(elem$content, id = id, language = language) } readXML <- function(spec, doc) { stopifnot(is.list(spec), inherits(doc, "TextDocument")) function(elem, language, id) { content <- elem$content node <- if(inherits(content, "xml_node")) content else if(is.character(content)) read_xml(paste(elem$content, collapse = "\n")) else read_xml(content) content(doc) <- if ("content" %in% names(spec)) .xml_content(node, spec[["content"]]) else node for (n in setdiff(names(spec), "content")) meta(doc, n) <- .xml_content(node, spec[[n]]) if (!is.null(elem$uri)) id <- basename(elem$uri) if (!length(meta(doc, "id"))) meta(doc, "id") <- as.character(id) if (!length(meta(doc, "language"))) meta(doc, "language") <- as.character(language) doc } } class(readXML) <- c("FunctionGenerator", "function") RCV1Spec <- list(author = list("unevaluated", ""), datetimestamp = list("function", function(node) as.POSIXlt(xml_text(xml_find_all(node, "@date")), tz = "GMT")), description = list("unevaluated", ""), heading = list("node", "title"), id = list("node", "@itemid"), origin = list("unevaluated", "Reuters Corpus Volume 1"), publisher = list("node", "metadata/dc[@element='dc.publisher']/@value"), topics = list("node", 
"metadata/codes[@class='bip:topics:1.0']/code/@code"), industries = list("node", "metadata/codes[@class='bip:industries:1.0']/code/@code"), countries = list("node", "metadata/codes[@class='bip:countries:1.0']/code/@code")) readRCV1 <- readXML(spec = RCV1Spec, doc = XMLTextDocument()) readRCV1asPlain <- readXML(spec = c(RCV1Spec, list(content = list("node", "text"))), doc = PlainTextDocument()) Reut21578XMLSpec <- list(author = list("node", "TEXT/AUTHOR"), datetimestamp = list("function", function(node) strptime(xml_text(xml_find_all(node, "DATE")), format = "%d-%B-%Y %H:%M:%S", tz = "GMT")), description = list("unevaluated", ""), heading = list("node", "TEXT/TITLE"), id = list("node", "@NEWID"), topics = list("node", "@TOPICS"), lewissplit = list("node", "@LEWISSPLIT"), cgisplit = list("node", "@CGISPLIT"), oldid = list("node", "@OLDID"), origin = list("unevaluated", "Reuters-21578 XML"), topics_cat = list("node", "TOPICS/D"), places = list("node", "PLACES/D"), people = list("node", "PEOPLE/D"), orgs = list("node", "ORGS/D"), exchanges = list("node", "EXCHANGES/D")) readReut21578XML <- readXML(spec = Reut21578XMLSpec, doc = XMLTextDocument()) readReut21578XMLasPlain <- readXML(spec = c(Reut21578XMLSpec, list(content = list("node", "TEXT/BODY"))), doc = PlainTextDocument()) readTagged <- function(...) { args <- list(...) function(elem, language, id) { if (!is.null(elem$content)) { con <- textConnection(elem$content) on.exit(close(con)) } else con <- elem$uri if (!is.null(elem$uri)) id <- basename(elem$uri) a <- c(list(con = con, meta = list(id = id, language = language)), args) do.call(TaggedTextDocument, a) } } class(readTagged) <- c("FunctionGenerator", "function") tm/R/meta.R0000644000175100001440000001067113110235234012161 0ustar hornikusers# Author: Ingo Feinerer TextDocumentMeta <- function(author, datetimestamp, description, heading, id, language, origin, ..., meta = NULL) { if (is.null(meta)) meta <- list(author = author, datetimestamp = datetimestamp, description = description, heading = heading, id = id, language = language, origin = origin, ...) stopifnot(is.list(meta)) if (!is.null(meta$author) && !inherits(meta$author, "person")) meta$author <- as.character(meta$author) if (!is.null(meta$datetimestamp) && !inherits(meta$datetimestamp, "POSIXt")) meta$datetimestamp <- as.character(meta$datetimestamp) if (!is.null(meta$description)) meta$description <- as.character(meta$description) if (!is.null(meta$heading)) meta$heading <- as.character(meta$heading) if (!is.null(meta$id)) meta$id <- as.character(meta$id) if (!is.null(meta$language)) meta$language <- as.character(meta$language) if (!is.null(meta$origin)) meta$origin <- as.character(meta$origin) class(meta) <- "TextDocumentMeta" meta } print.TextDocumentMeta <- function(x, ...) { cat(sprintf(" %s: %s", format(names(x), justify = "left"), sapply(x, as.character)), sep = "\n") invisible(x) } CorpusMeta <- function(..., meta = NULL) { if (is.null(meta)) meta <- list(...) stopifnot(is.list(meta)) class(meta) <- "CorpusMeta" meta } meta.SimpleCorpus <- function(x, tag = NULL, type = c("indexed", "corpus"), ...) 
meta.SimpleCorpus <- function(x, tag = NULL, type = c("indexed", "corpus"), ...)
{
    if (identical(tag, "id")) {
        n <- names(content(x))
        return(if (is.null(n)) as.character(seq_along(x)) else n)
    }
    if (!is.null(tag) && missing(type))
        type <- if (tag %in% names(x$meta)) "corpus" else "indexed"
    type <- match.arg(type)
    if (identical(type, "indexed"))
        if (is.null(tag)) x$dmeta else x$dmeta[tag]
    else if (identical(type, "corpus"))
        if (is.null(tag)) x$meta else x$meta[[tag]]
    else
        stop("invalid type")
}

meta.VCorpus <- meta.PCorpus <-
function(x, tag = NULL, type = c("indexed", "corpus", "local"), ...)
{
    if (!is.null(tag) && missing(type)) {
        type <- if (tag %in% names(x$dmeta)) "indexed"
                else if (tag %in% names(x$meta)) "corpus"
                else "local"
    }
    type <- match.arg(type)
    if (identical(type, "indexed"))
        if (is.null(tag)) x$dmeta else x$dmeta[tag]
    else if (identical(type, "corpus"))
        if (is.null(tag)) x$meta else x$meta[[tag]]
    else if (identical(type, "local"))
        lapply(x, meta, tag)
    else
        stop("invalid type")
}

`meta<-.SimpleCorpus` <- function(x, tag, type = c("indexed", "corpus"), ..., value)
{
    type <- match.arg(type)
    if (identical(type, "indexed"))
        x$dmeta[, tag] <- value
    else if (type == "corpus")
        x$meta[[tag]] <- value
    else
        stop("invalid type")
    x
}

`meta<-.VCorpus` <- `meta<-.PCorpus` <-
function(x, tag, type = c("indexed", "corpus", "local"), ..., value)
{
    type <- match.arg(type)
    if (identical(type, "indexed"))
        x$dmeta[, tag] <- value
    else if (type == "corpus")
        x$meta[[tag]] <- value
    else if (identical(type, "local")) {
        for (i in seq_along(x))
            meta(x[[i]], tag) <- value[i]
    } else
        stop("invalid type")
    x
}

# Simple Dublin Core to tm metadata mapping
# http://en.wikipedia.org/wiki/Dublin_core#Simple_Dublin_Core
Dublin_Core_tm_map <-
    list("contributor" = "contributor",
         "coverage"    = "coverage",
         "creator"     = "author",
         "date"        = "datetimestamp",
         "description" = "description",
         "format"      = "format",
         "identifier"  = "id",
         "language"    = "language",
         "publisher"   = "publisher",
         "relation"    = "relation",
         "rights"      = "rights",
         "source"      = "source", # or better "origin"?
         "subject"     = "subject",
         "title"       = "heading",
         "type"        = "type")

DublinCore <- function(x, tag = NULL) {
    tmm <- unlist(Dublin_Core_tm_map, use.names = FALSE)
    dcm <- names(Dublin_Core_tm_map)
    if (is.null(tag)) {
        m <- lapply(tmm, function(t) meta(x, t))
        names(m) <- dcm
        class(m) <- "TextDocumentMeta"
        m
    } else
        meta(x, tmm[charmatch(tolower(tag), dcm)])
}

`DublinCore<-` <- function(x, tag, value) {
    tmm <- unlist(Dublin_Core_tm_map, use.names = FALSE)
    dcm <- names(Dublin_Core_tm_map)
    meta(x, tmm[charmatch(tolower(tag), dcm)]) <- value
    x
}
tm/R/foreign.R0000644000175100001440000000313513023471774012677 0ustar hornikusers
## Readers and writers (eventually?) for foreign document-term matrix
## format files.

## CLUTO: as we do not know the weighting, there is no high-level DTM
## reader. If the weighting is weightTf, one can do
##   as.DocumentTermMatrix(read_stm_CLUTO(file), weightTf)
## as CLUTO always has rows as documents and cols as terms.

## MC: a simple reader for now, could certainly use more effort to name
## the weightings more properly.
read_dtm_MC <- function(file, scalingtype = NULL) {
    m <- read_stm_MC(file, scalingtype)
    s <- attr(m, "scalingtype")
    as.DocumentTermMatrix(m, rep.int(s, 2L))
}

## To write a decent writer we would need to be able to turn weighting
## information into MC scaling information, which may not even be
## possible. Alternatively, we could always use 'txx', or use this in
## case we cannot map ...

## Data files for the Blei et al LDA and CTM codes are in a List of List
## format, with lines
##   n j1:x1 j2:x2 ... jn:xn
## (see http://www.cs.princeton.edu/~blei/lda-c/).
## As they are used for topic models, they *always* contain raw term
## frequencies.
read_dtm_Blei_et_al <- function(file, vocab = NULL) {
    x <- scan(file, character(), quiet = TRUE)
    ind <- grepl(":", x, fixed = TRUE)
    counts <- x[!ind]
    i <- rep.int(seq_along(counts), counts)
    x <- strsplit(x[ind], ":", fixed = TRUE)
    j <- as.integer(unlist(lapply(x, `[`, 1L))) + 1L
    x <- as.numeric(unlist(lapply(x, `[`, 2L)))
    m <- simple_triplet_matrix(i, j, x)
    if (!is.null(vocab))
        colnames(m) <- readLines(vocab)
    as.DocumentTermMatrix(m, weightTf)
}
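## Illustration (not part of the package sources): a worked example of the
## Blei et al "list of lists" format parsed above. The line "3 0:2 4:1 7:5"
## says: 3 distinct terms follow; the 0-based term indices 0, 4 and 7 occur
## 2, 1 and 5 times, respectively. File and object names are invented.
lol <- tempfile()
writeLines(c("3 0:2 4:1 7:5",
             "1 2:4"),
           lol)
dtm <- read_dtm_Blei_et_al(lol)  # a 2-document DocumentTermMatrix with
                                 # raw term frequencies (weightTf)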
tm/MD50000644000175100001440000002646414367745152011237 0ustar hornikusers
3dc28ad3ce2504d672c24916ef916f09 *DESCRIPTION 9e447128d47f58d4210403c129e3fd94 *NAMESPACE c587e5c09daeb47a355ec8510eb75341 *R/RcppExports.R 881f00e795e17803432949ff05facc96 *R/complete.R dffd17856a1f0d1ad45fa6e9cc8deaa1 *R/corpus.R c1ac8a79992c42d3ec695df39b8c3bc9 *R/doc.R beba1a821bfdf61ece1708123ab71324 *R/filter.R b205235d27368949ee5ea0dd3a10b9d7 *R/foreign.R cb5367e831c1be819b9773304985724a *R/hpc.R 32b666ea3b78f2b188cb56c7f3e26790 *R/matrix.R c36f8ed69c326c2b027a670d2662e1d1 *R/meta.R 07d1407f6cfdbdbb6060ebfb11f97f6f *R/pdftools.R b9cd19804a89de8eca51394726256e68 *R/plot.R fd701389b291a843584167ab7385c453 *R/reader.R 5f6ff8b218e7679919b85230b11cdebb *R/score.R 658b904bc1ec319e536ca3844568dabd *R/source.R dee7e0a8b245fd670436a019c54d904c *R/stopwords.R e57141f4a63f3dc13b0ef97c6960a41b *R/tokenizer.R 22ebb540c91c1a2d1494967c6c7395a5 *R/transform.R 1c59b79f99cdeb623f387ea378d0331c *R/utils.R c1de3acc3bc1bc9f64926b93c3be8301 *R/weight.R ed6e51c4f5c25ae8d26028157b7ff787 *build/vignette.rds 9963f39d1ae0521163b9184bda8c9d72 *data/acq.rda 7b73fc31a572a15012a06dd7ff499cff *data/crude.rda 1710cf3dc724c13df75da9f29169d59d *inst/CITATION 9ca80cc27a1b68768ddbb284bcf03428 *inst/NEWS.Rd ad6a6fe44b80541732690af3f36a4c32 *inst/doc/extensions.R d194109d976d7f242e64a8eab85026f8 *inst/doc/extensions.Rnw a085ab84a911f56373bd186c0b43730b *inst/doc/extensions.pdf fa525227cffc24edeea2b34cfdf2dbf7 *inst/doc/tm.R e71ae9442d42f286eefa9d77a171c807 *inst/doc/tm.Rnw 00875059461330584910db259b211826 *inst/doc/tm.pdf 98f3b5f3d1f670032af4131a627c18d7 *inst/ghostscript/pdf_info.ps 7ec7b5de9c642afedf1159021c89f12a *inst/stopwords/SMART.dat 4c8fb2c1404c10540c267425fcc005f0 *inst/stopwords/catalan.dat 4e8d44fa90d87908846a2d92c2618b31 *inst/stopwords/danish.dat a638b876d5cbec644685d12d452a7407 *inst/stopwords/dutch.dat e181651a30ec45694b7fafc787f357dc *inst/stopwords/english.dat 1094269bf20052a5259983e23c69a552 *inst/stopwords/finnish.dat 29772f7c7dacf306981ad50c5484c4ad *inst/stopwords/french.dat 4a562db64979f200804127c3751a6efa *inst/stopwords/german.dat 1e1f45e67297e049bb22527d7efa8025 *inst/stopwords/hungarian.dat 7dfee49b4660f65f7bb935bef0c773bd *inst/stopwords/italian.dat 4cd3ddc90492cc5a3cbb9f0292d3844d *inst/stopwords/norwegian.dat d3483742365aa7d477512fd1810452c5 *inst/stopwords/portuguese.dat f6a262767ae1863b9e8cc92f78e3bb01 *inst/stopwords/romanian.dat 4bf4046fe7701b4940b8eb2c86f19c08 *inst/stopwords/russian.dat fddb7f14207d2649597b36e22b5eab18 *inst/stopwords/spanish.dat d3930c86664d4112ae772285dca85fd6 *inst/stopwords/swedish.dat 4dc7bdaa3323e71845cf4c018e871048 *inst/texts/acq/reut-00001.xml a63b803ca46191dc3a30eda875d95136 *inst/texts/acq/reut-00002.xml 7638d681bcb7d2f3539b8be8a454dff9 *inst/texts/acq/reut-00003.xml f822ea4bdb0691950284856b51c87e41 *inst/texts/acq/reut-00004.xml 1f8f1f8699bb3883748fa29807477a55 *inst/texts/acq/reut-00005.xml
f44aa9f0b51556f382cf8a91d7f36244 *inst/texts/acq/reut-00006.xml e0d5ea56a8f42146f5b7d3735da730dc *inst/texts/acq/reut-00007.xml b7560c91c1f18e919d7548d9d1b59843 *inst/texts/acq/reut-00008.xml 6b2913f0f666d7f84dd38ac05b326726 *inst/texts/acq/reut-00009.xml 5625c064bfff14db909a25a6719dc3f8 *inst/texts/acq/reut-00010.xml 047f38558920a11ebaeab94727465e58 *inst/texts/acq/reut-00011.xml eb26151fa8a7fcd2c87065b0ad8f0924 *inst/texts/acq/reut-00012.xml abdbeb14424b6f5994674e604a0a5590 *inst/texts/acq/reut-00013.xml 05b945b892bbb8d575c6ff6193bb17b8 *inst/texts/acq/reut-00014.xml e5159c22413cae49c015a631df3a74e2 *inst/texts/acq/reut-00015.xml cd87fc59bfcbe37c847bd1548537effa *inst/texts/acq/reut-00016.xml 75ec08b1337a6035d553f8344ece2c2a *inst/texts/acq/reut-00017.xml 908e51c4b6f9f4e65805adef7029c884 *inst/texts/acq/reut-00018.xml e67944c5bb9ef8e0fe811b1ead21199b *inst/texts/acq/reut-00020.xml 1d19206cd4478bfc03bc9335316f6816 *inst/texts/acq/reut-00021.xml 621a7e8ba27aac9b8040adc7fc1d11f9 *inst/texts/acq/reut-00022.xml 736bff1fabc3f07b35cd992e8630ed90 *inst/texts/acq/reut-00023.xml da2ddc7ac585134cb7fe80e812d3ac80 *inst/texts/acq/reut-00024.xml a04162294ae6ae69f3d1a74f0ad0b9b1 *inst/texts/acq/reut-00025.xml 5e757cb13baa266c292da3ff010f1434 *inst/texts/acq/reut-00026.xml 7974dd802d4ca66b7f7f51c355c8e558 *inst/texts/acq/reut-00027.xml 62368bea00c9a71f01293060708fc6a4 *inst/texts/acq/reut-00028.xml 7e06015b7518b608148002364989c4f7 *inst/texts/acq/reut-00029.xml f24469e27c9f16266db0e141892e97d1 *inst/texts/acq/reut-00030.xml acc36dbfdffe0362d39975db07569b85 *inst/texts/acq/reut-00031.xml 7e342636219116a2d428e2188b1dcb0b *inst/texts/acq/reut-00032.xml c40ce905c6896410a672bee72f132b46 *inst/texts/acq/reut-00034.xml ead5a03af44fb5cf4e896f039a122e4b *inst/texts/acq/reut-00035.xml 684ddc28a9bb0fbb6f49fa412b54231d *inst/texts/acq/reut-00036.xml 1be33a6347aa406b843132da98286506 *inst/texts/acq/reut-00039.xml 1bdf38586ab43a0f6996d3135ff1f48c *inst/texts/acq/reut-00040.xml b89e5d9aeba1b0e02cf3bf3fa729e346 *inst/texts/acq/reut-00042.xml 7c3703135baad41765ad1f58fcab0ba5 *inst/texts/acq/reut-00043.xml d5ab6f6dfe5fefb25422b258bcd339d0 *inst/texts/acq/reut-00045.xml 1af51ea6ba1898d33a84b680c1fa4d09 *inst/texts/acq/reut-00046.xml cb00fc7833f2eb9e3ac97c12d900dd4f *inst/texts/acq/reut-00047.xml e5b440d419fa528d4c996cd47e88c0b4 *inst/texts/acq/reut-00048.xml 4ed77929b16a0c6f3264272183b6c951 *inst/texts/acq/reut-00049.xml 7f6df11fcb6617c253921861e217c3c6 *inst/texts/acq/reut-00050.xml ba0a88d8b9caaa0d0fa8bba01bf2a9d9 *inst/texts/acq/reut-00051.xml c8b4ee7875ddba1c1d2886c3e32a7cb6 *inst/texts/acq/reut-00052.xml b0e4f9f398ba4e2ab847e1dc44c2594e *inst/texts/acq/reut-00053.xml ea25a8bf959fe2769e578474d5f0176f *inst/texts/acq/reut-00054.xml 574a5170c695ad0bbc91055ef8fdd2e9 *inst/texts/acq/reut-00055.xml 66cf87f5587906604d96c3f64ab77a9b *inst/texts/acq/reut-00056.xml e1c26b346a6683c393b2f420593b02e5 *inst/texts/crude/reut-00001.xml 401049764894ad7b37be02cee2e926f6 *inst/texts/crude/reut-00002.xml 15a57b39a4172799d7926c440548b1fd *inst/texts/crude/reut-00004.xml 95474b7494ce4835ed952374601f921e *inst/texts/crude/reut-00005.xml e91c3ec329c1f82fc27ea79d33650d32 *inst/texts/crude/reut-00006.xml 5344713574482c3d393766422bd72498 *inst/texts/crude/reut-00007.xml 5803359fee327a77342d4d16bc467271 *inst/texts/crude/reut-00008.xml c0f88331bbf3da5ec273838ac832e7fa *inst/texts/crude/reut-00009.xml ed3994f50fa16217a6c62dfae5909a03 *inst/texts/crude/reut-00010.xml c74f1b54db67c730bcc117536903dc52 *inst/texts/crude/reut-00011.xml 
32cf0da1d923fd2aee4fe28200047c3b *inst/texts/crude/reut-00012.xml 42f6d47f40304ddc482e62bf1d1c3c21 *inst/texts/crude/reut-00013.xml 51565e0b464e626cf1db1d812642e295 *inst/texts/crude/reut-00014.xml 8b107465269cd463e8d7deb470423dda *inst/texts/crude/reut-00015.xml 6b69f531b6953be522a58b0456820e04 *inst/texts/crude/reut-00016.xml 5deaf389a9067a5b6090c13195c0d254 *inst/texts/crude/reut-00018.xml 9e745c906a03765fb0b364ae78bbdcd5 *inst/texts/crude/reut-00019.xml 488f96e28466feeac3175f57724a1f8e *inst/texts/crude/reut-00021.xml da9f871a845a256e2c12ace2a2e2fb36 *inst/texts/crude/reut-00022.xml 2439e7823a1ff6403efd3108fa5ecc45 *inst/texts/crude/reut-00023.xml 7d9482d1fc4a624492dacf584a940b4c *inst/texts/custom.xml 717801d47bc20af5d69340eee342ce21 *inst/texts/loremipsum.txt e76c36aad136268277f2c036dc1c37cd *inst/texts/rcv1_2330.xml eda82aaa0c873d62be4905cb32dedb05 *inst/texts/reuters-21578.xml 5901120140c757daf5f21fba990e2bbe *inst/texts/txt/ovid_1.txt 2b5dc16305207ed29df7bbe0cc47abee *inst/texts/txt/ovid_2.txt 08197bca339b621d395220bd7ab719a7 *inst/texts/txt/ovid_3.txt 832ea34c305426cc653701df40750edf *inst/texts/txt/ovid_4.txt 3b3cb14d62de578684d6c59fa6dcba60 *inst/texts/txt/ovid_5.txt d44474e05cd96e80932106e24ed572a1 *man/Corpus.Rd 6339b0d2bae8c6d1e3a383bdea82d425 *man/DataframeSource.Rd 1c104e63fd71cd63ad6e0da3669fbdf5 *man/DirSource.Rd 5871b5f9883ba4359e269bbfca27db37 *man/Docs.Rd 00fa0c14e4086a140646ad23597ca5eb *man/PCorpus.Rd 8a778ebd67c6b9c7af89a2654e665bf6 *man/PlainTextDocument.Rd f1c465f51d627af46612833ffcc17f59 *man/Reader.Rd b4d2dcdc0c2b16f38561637956a7a328 *man/SimpleCorpus.Rd 79170405ed1af7434fbfa37adebd56f7 *man/Source.Rd 0874f71fccd7c7d141f46f405b1ae105 *man/TextDocument.Rd c82a889b500268683904a4ad7fc9d3b1 *man/URISource.Rd 7c84cd5a42cdac47a1b0301e2b6459a6 *man/VCorpus.Rd 3fb4034c6df0b6277f07a028a958b932 *man/VectorSource.Rd 5a32dfd6e72da8d3c8569803d6761126 *man/WeightFunction.Rd 0b79ee972dac094d6f0ed9c1f4d2685f *man/XMLSource.Rd 0a982a855094b02e983d7c7bf5e60c2b *man/XMLTextDocument.Rd 2d25fcd9863b4ac7128c1d2a521e27f2 *man/ZipSource.Rd ca38d43ef3a58075443e49cd244bd1ea *man/Zipf_n_Heaps.Rd abc871c091ed08f52258b0c7b56d6758 *man/acq.Rd aa36762f11d31e840ba6115b9b913341 *man/combine.Rd 0f0ed4b165a6c3744b83c69abf59c7a9 *man/content_transformer.Rd f522a5904055ee58353a704f29e7263f *man/crude.Rd f30ebc7d2c9ad750ef0e6037d1669827 *man/findAssocs.Rd 74d7ea8ee4c4ac46492bbc3b52a10dca *man/findFreqTerms.Rd 36e135250b446bbd0e677115bcf1a82a *man/findMostFreqTerms.Rd 1fcf051f6859a582dac1c9486e22061c *man/foreign.Rd be785d88b0821a06be0b4772868dc37c *man/getTokenizers.Rd 9ad9e3d7afb9815f04529a435f430a53 *man/getTransformations.Rd 16dddb0c44c025166a329e2d5920f97c *man/hpc.Rd 6a72cef1df5795bb189bd1a0177e5d4d *man/inspect.Rd 65457f7e41f5926f971a2bffd83e6484 *man/matrix.Rd 33870f4b1f105daa8307e58f3ec61fa2 *man/meta.Rd a90444b9479d7cf70c0c07b5806d7aac *man/plot.Rd 7de11cf5180caee710b5fda07b211eb8 *man/readDOC.Rd 13b3964279323a7d94ccab25ca7afaef *man/readDataframe.Rd 56f162b724f8a1ffd21bd47633bbd068 *man/readPDF.Rd d625f0434c021f98e4529ce1427703cf *man/readPlain.Rd b49b3852a0344d682e6bb4f6b30aa6d5 *man/readRCV1.Rd 39dd5ac2e088dd5f2e9f4cad6248905e *man/readReut21578XML.Rd ec13c14161ee1c95f89ce75237aa3df7 *man/readTagged.Rd ce6a6feb64dd79693b7ceba7bdb4c6a0 *man/readXML.Rd 295b85ec0a37c83bc105f97ca48dfc9a *man/removeNumbers.Rd f8e578de76e389cf55176fb546743468 *man/removePunctuation.Rd ef0d87508b367cdd71f066244605407e *man/removeSparseTerms.Rd 2484a54292458f80e26f2956fc5d7501 *man/removeWords.Rd 
5bdcaccf0076e98a2341078e61c59be5 *man/stemCompletion.Rd ce3570d40ff709d339fbe5ba16385607 *man/stemDocument.Rd 8d6a6276a9bfbcf885c66218dc1b6bff *man/stopwords.Rd 15b8549fd381105839451d9b15c7efa3 *man/stripWhitespace.Rd 3b168f48614dfd541907617b9a4ffeb7 *man/termFreq.Rd 1dd2e47bdc3ac7481366dc0d359ef94a *man/tm_filter.Rd 29e0ffff4b61d1422fe7964e053a85bf *man/tm_map.Rd 6eb083c9b6f1b08700065fd58bf1f8be *man/tm_reduce.Rd 458b061071b9b320951c3b48adf16264 *man/tm_term_score.Rd fddf92931c8de1612e2b481c89afbcd6 *man/tokenizer.Rd 47bc8704437b53709120add15f205be0 *man/weightBin.Rd abe06433d8438326d1e03c8367312a59 *man/weightSMART.Rd 4e7d2dd30d4de494ba122cd3aff128ee *man/weightTf.Rd 88fbb7eda2e788887e1fe67cb7fd0855 *man/weightTfIdf.Rd 193b23f2d16e20a4944846725eebd155 *man/writeCorpus.Rd 813f07011de972121885f35821e6426b *src/RcppExports.cpp 1b7544de4c9e45507e82e6e5033819fa *src/copy.c 45b4524bfac392e34ba96c2609c77f7c *src/init.c 706a1d7e181fc2acd829a541bc769478 *src/remove.c 850321db7f6b4a0e324e563f39b0c5e7 *src/scan.c fe9ef490894f8d93571ffb091669c7dd *src/tdm.cpp b7995b66ea58d9604a6bf61ef68381fb *src/tokenizer.cpp f280e050264388e7c120d4869357efb7 *tests/testthat.R 7987b16eeb87d6c4e9787b85e5b764a4 *tests/testthat/test-Source.R ef259599b4562c161bf3e0c4529ebcf5 *tests/testthat/test-TermDocumentMatrix.R 7f1736751d70509612e9a728766fe146 *tests/testthat/test-Tokenizer.R 2003b069d4a811c99d5edf34a42eb2a1 *tests/testthat/test-Transformation.R d194109d976d7f242e64a8eab85026f8 *vignettes/extensions.Rnw 3641da272a48168ad7b4ffef9fbf7d21 *vignettes/references.bib e71ae9442d42f286eefa9d77a171c807 *vignettes/tm.Rnw tm/inst/0000755000175100001440000000000014367743045011702 5ustar hornikuserstm/inst/stopwords/0000755000175100001440000000000012327630227013735 5ustar hornikuserstm/inst/stopwords/catalan.dat0000644000175100001440000001066412074065306016040 0ustar hornikusersa abans abans-d'ahir abintestat ací adesiara adés adéu adàgio ah ahir ai aitambé aitampoc aitan aitant aitantost aixà això així aleshores algun alguna algunes alguns algú alhora allà allèn allò allí almenys alto altra altre altres altresí altri alça al·legro amargament amb ambdues ambdós amunt amén anc andante andantino anit ans antany apa aprés aqueix aqueixa aqueixes aqueixos aqueixs aquell aquella aquelles aquells aquest aquesta aquestes aquests aquèn aquí ara arran arrera arrere arreu arri arruix atxim au avall avant aviat avui açò bah baix baldament ballmanetes banzim-banzam bastant bastants ben bis bitllo-bitllo bo bé ca cada cal cap car caram catorze cent centes cents cerca cert certa certes certs cinc cinquanta cinquena cinquenes cinquens cinquè com comsevulla contra cordons corrents cric-crac d daixonses daixò dallonses dallò dalt daltabaix damunt darrera darrere davall davant de debades dedins defora dejorn dejús dellà dementre dempeus demés demà des desena desenes desens després dessobre dessota dessús desè deu devers devora deçà diferents dinou dins dintre disset divers diversa diverses diversos divuit doncs dos dotze dues durant ecs eh el ela elis ell ella elles ells els em emperò en enans enant encara encontinent endalt endarrera endarrere endavant endebades endemig endemés endemà endins endintre enfora engir enguany enguanyasses enjús enlaire enlloc enllà enrera enrere ens ensems ensota ensús entorn entre entremig entretant entrò envers envides environs enviró ençà ep ep era eren eres ergo es escar essent esser est esta estada estades estan estant estar estaran estarem estareu estaria estarien estaries estaré estarà estaràs estaríem 
estaríeu estat estats estava estaven estaves estem estes esteu estic estiguem estigueren estigueres estigues estiguessis estigueu estigui estiguin estiguis estigué estiguérem estiguéreu estigués estiguí estos està estàs estàvem estàveu et etc etcètera ets excepte fins fora foren fores força fos fossin fossis fou fra fui fóra fórem fóreu fóreu fóssim fóssiu gaire gairebé gaires gens girientorn gratis ha hagi hagin hagis haguda hagudes hagueren hagueres haguessin haguessis hagut haguts hagué haguérem haguéreu hagués haguéssim haguéssiu haguí hala han has hauran haurem haureu hauria haurien hauries hauré haurà hauràs hauríem hauríeu havem havent haver haveu havia havien havies havíem havíeu he hem heu hi ho hom hui hàgim hàgiu i igual iguals inclusive ja jamai jo l la leri-leri les li lla llavors llevat lluny llur llurs lo los ls m ma mai mal malament malgrat manco mant manta mantes mantinent mants massa mateix mateixa mateixes mateixos me mentre mentrestant menys mes meu meua meues meus meva meves mi mig mil mitges mitja mitjançant mitjos moixoni molt molta moltes molts mon mos més n na ne ni ningú no nogensmenys només noranta nos nosaltres nostra nostre nostres nou novena novenes novens novè ns nòs nós o oh oi oidà on onsevulga onsevulla onze pas pengim-penjam per perquè pertot però piano pla poc poca pocs poques potser prest primer primera primeres primers pro prompte prop prou puix pus pàssim qual quals qualsevol qualsevulla qualssevol qualssevulla quan quant quanta quantes quants quaranta quart quarta quartes quarts quasi quatre que quelcom qui quin quina quines quins quinze quisvulla què ran re rebé renoi rera rere res retruc s sa salvament salvant salvat se segon segona segones segons seguida seixanta sempre sengles sens sense ser seran serem sereu seria serien series seré serà seràs seríem seríeu ses set setanta setena setenes setens setze setè seu seua seues seus seva seves si sia siau sic siguem sigues sigueu sigui siguin siguis sinó sis sisena sisenes sisens sisè sobre sobretot sol sola solament soles sols som son sos sota sots sou sovint suara sí sóc són t ta tal tals també tampoc tan tanmateix tant tanta tantes tantost tants te tercer tercera terceres tercers tes teu teua teues teus teva teves ton tos tost tostemps tot tota total totes tothom tothora tots trenta tres tret tretze tu tururut u uf ui uix ultra un una unes uns up upa us va vagi vagin vagis vaig vair vam van vares vas vau vem verbigràcia vers vet veu vint vora vos vosaltres vostra vostre vostres vostè vostès vuit vuitanta vuitena vuitenes vuitens vuitè vés vàreig vàrem vàreu vós xano-xano xau-xau xec érem éreu és ésser àdhuc àlies ça ço òlim ídem últim última últimes últims únic única únics úniques tm/inst/stopwords/french.dat0000644000175100001440000000150512156574723015705 0ustar hornikusersau aux avec ce ces dans de des du elle en et eux il je la le leur lui ma mais me même mes moi mon ne nos notre nous on ou par pas pour qu que qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l à m n s t y été étée étées étés étant suis es est sommes êtes sont serai seras sera serons serez seront serais serait serions seriez seraient étais était étions étiez étaient fus fut fûmes fûtes furent sois soit soyons soyez soient fusse fusses fût fussions fussiez fussent ayant eu eue eues eus ai as avons avez ont aurai auras aura aurons aurez auront aurais aurait aurions auriez auraient avais avait avions aviez avaient eut eûmes eûtes eurent aie aies ait ayons ayez aient eusse eusses eût eussions eussiez eussent ceci 
cela celà cet cette ici ils les leurs quel quels quelle quelles sans soi tm/inst/stopwords/danish.dat0000644000175100001440000000065012156574721015704 0ustar hornikusersog i jeg det at en den til er som på de med han af for ikke der var mig sig men et har om vi min havde ham hun nu over da fra du ud sin dem os op man hans hvor eller hvad skal selv her alle vil blev kunne ind når være dog noget ville jo deres efter ned skulle denne end dette mit også under have dig anden hende mine alt meget sit sine vor mod disse hvis din nogle hos blive mange ad bliver hendes været thi jer sådan tm/inst/stopwords/hungarian.dat0000644000175100001440000000231312156574725016414 0ustar hornikusersa ahogy ahol aki akik akkor alatt által általában amely amelyek amelyekben amelyeket amelyet amelynek ami amit amolyan amíg amikor át abban ahhoz annak arra arról az azok azon azt azzal azért aztán azután azonban bár be belül benne cikk cikkek cikkeket csak de e eddig egész egy egyes egyetlen egyéb egyik egyre ekkor el elég ellen elő először előtt első én éppen ebben ehhez emilyen ennek erre ez ezt ezek ezen ezzel ezért és fel felé hanem hiszen hogy hogyan igen így illetve ill. ill ilyen ilyenkor ison ismét itt jó jól jobban kell kellett keresztül keressünk ki kívül között közül legalább lehet lehetett legyen lenne lenni lesz lett maga magát majd majd már más másik meg még mellett mert mely melyek mi mit míg miért milyen mikor minden mindent mindenki mindig mint mintha mivel most nagy nagyobb nagyon ne néha nekem neki nem néhány nélkül nincs olyan ott össze ő ők őket pedig persze rá s saját sem semmi sok sokat sokkal számára szemben szerint szinte talán tehát teljes tovább továbbá több úgy ugyanis új újabb újra után utána utolsó vagy vagyis valaki valami valamint való vagyok van vannak volt voltam voltak voltunk vissza vele viszont volna tm/inst/stopwords/swedish.dat0000644000175100001440000000105712156574731016107 0ustar hornikusersoch det att i en jag hon som han på den med var sig för så till är men ett om hade de av icke mig du henne då sin nu har inte hans honom skulle hennes där min man ej vid kunde något från ut när efter upp vi dem vara vad över än dig kan sina här ha mot alla under någon eller allt mycket sedan ju denna själv detta åt utan varit hur ingen mitt ni bli blev oss din dessa några deras blir mina samma vilken er sådan vår blivit dess inom mellan sådant varför varje vilka ditt vem vilket sitta sådana vart dina vars vårt våra ert era vilkas tm/inst/stopwords/romanian.dat0000644000175100001440000000341712327630227016240 0ustar hornikusersa abia acea aceasta această această aceea aceia acel acela acelaşi acelaşi acele acelea aceluiaşi acest acesta aceste acestea acestei aceşti aceştia acestor acestora acestui acolo acum adică ai aia aici al ăla alături ale alt alta altă altceva alte altele altfel alţi alţii altul am anume apoi ar are aş aşa asemenea asta astăzi astfel asupra atare atât atâta atâtea atâţi atâţia aţi atît atîti atîţia atunci au avea avem avut azi ba bine ca că cam când care căreia cărora căruia cât câtă câte câţi către ce cea ceea cei ceilalţi cel cele celelalte celor ceva chiar ci cînd cine cineva cît cîte cîteva cîţi cîţiva cu cui cum cumva da daca dacă dar de deasupra decât deci decît deja deşi despre din dintr dintre doar după ea ei el ele era este eu fără fecăreia fel fi fie fiecare fiecărui fiecăruia fiind foarte fost i-au iar ieri îi îl îmi împotriva în în înainte înapoi înca încît însă însă însuşi într între între îşi îţi l-am la le li lor lui mă mai mare mereu mod mult multă 
multe mulţi ne nici niciodata nimeni nimic nişte noi noştri noştri nostru nouă nu numai o oarecare oarece oarecine oarecui or orice oricum până pe pentru peste pînă plus poată prea prin printr-o puţini s-ar sa să să-i să-mi să-şi să-ţi săi sale sau său se şi sînt sîntem sînteţi spre sub sunt suntem sunteţi te ţi toată toate tocmai tot toţi totul totuşi tu tuturor un una unde unei unele uneori unii unor unui unul va vă voi vom vor vreo vreun tm/inst/stopwords/dutch.dat0000644000175100001440000000070512156574722015547 0ustar hornikusersde en van ik te dat die in een hij het niet zijn is was op aan met als voor had er maar om hem dan zou of wat mijn men dit zo door over ze zich bij ook tot je mij uit der daar haar naar heb hoe heeft hebben deze u want nog zal me zij nu ge geen omdat iets worden toch al waren veel meer doen toen moet ben zonder kan hun dus alles onder ja eens hier wie werd altijd doch wordt wezen kunnen ons zelf tegen na reeds wil kon niets uw iemand geweest andere tm/inst/stopwords/SMART.dat0000644000175100001440000000700512074065306015316 0ustar hornikusersa a's able about above according accordingly across actually after afterwards again against ain't all allow allows almost alone along already also although always am among amongst an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate are aren't around as aside ask asking associated at available away awfully b be became because become becomes becoming been before beforehand behind being believe below beside besides best better between beyond both brief but by c c'mon c's came can can't cannot cant cause causes certain certainly changes clearly co com come comes concerning consequently consider considering contain containing contains corresponding could couldn't course currently d definitely described despite did didn't different do does doesn't doing don't done down downwards during e each edu eg eight either else elsewhere enough entirely especially et etc even ever every everybody everyone everything everywhere ex exactly example except f far few fifth first five followed following follows for former formerly forth four from further furthermore g get gets getting given gives go goes going gone got gotten greetings h had hadn't happens hardly has hasn't have haven't having he he's hello help hence her here here's hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how howbeit however i i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated indicates inner insofar instead into inward is isn't it it'd it'll it's its itself j just k keep keeps kept know knows known l last lately later latter latterly least less lest let let's like liked likely little look looking looks ltd m mainly many may maybe me mean meanwhile merely might more moreover most mostly much must my myself n name namely nd near nearly necessary need needs neither never nevertheless new next nine no nobody non none noone nor normally not nothing novel now nowhere o obviously of off often oh ok okay old on once one ones only onto or other others otherwise ought our ours ourselves out outside over overall own p particular particularly per perhaps placed please plus possible presumably probably provides q que quite qv r rather rd re really reasonably regarding regardless regards relatively respectively right s said same saw say saying says second secondly see seeing seem seemed seeming seems seen self selves sensible sent serious seriously seven 
several shall she should shouldn't since six so some somebody somehow someone something sometime sometimes somewhat somewhere soon sorry specified specify specifying still sub such sup sure t t's take taken tell tends th than thank thanks thanx that that's thats the their theirs them themselves then thence there there's thereafter thereby therefore therein theres thereupon these they they'd they'll they're they've think third this thorough thoroughly those though three through throughout thru thus to together too took toward towards tried tries truly try trying twice two u un under unfortunately unless unlikely until unto up upon us use used useful uses using usually uucp v value various very via viz vs w want wants was wasn't way we we'd we'll we're we've welcome well went were weren't what what's whatever when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which while whither who who's whoever whole whom whose why will willing wish with within without won't wonder would would wouldn't x y yes yet you you'd you'll you're you've your yours yourself yourselves z zero tm/inst/stopwords/norwegian.dat0000644000175100001440000000152312156574726016434 0ustar hornikusersog i jeg det at en et den til er som på de med han av ikke ikkje der så var meg seg men ett har om vi min mitt ha hadde hun nå over da ved fra du ut sin dem oss opp man kan hans hvor eller hva skal selv sjøl her alle vil bli ble blei blitt kunne inn når være kom noen noe ville dere som deres kun ja etter ned skulle denne for deg si sine sitt mot å meget hvorfor dette disse uten hvordan ingen din ditt blir samme hvilken hvilke sånn inni mellom vår hver hvem vors hvis både bare enn fordi før mange også slik vært være båe begge siden dykk dykkar dei deira deires deim di då eg ein eit eitt elles honom hjå ho hoe henne hennar hennes hoss hossen ikkje ingi inkje korleis korso kva kvar kvarhelst kven kvi kvifor me medan mi mine mykje no nokon noka nokor noko nokre si sia sidan so somt somme um upp vere vore verte vort varte vart tm/inst/stopwords/finnish.dat0000644000175100001440000000056012156574723016076 0ustar hornikusersolla olen olet on olemme olette ovat ole oli olisi olisit olisin olisimme olisitte olisivat olit olin olimme olitte olivat ollut olleet en et ei emme ette eivät minä sinä hän me te he tämä tuo se nämä nuo ne kuka ketkä mikä mitkä joka jotka että ja jos koska kuin mutta niin sekä sillä tai vaan vai vaikka kanssa mukaan noin poikki yli kun niin nyt itse tm/inst/stopwords/english.dat0000644000175100001440000000167212156574722016075 0ustar hornikusersi me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing would should could ought i'm you're he's she's it's we're they're i've you've we've they've i'd you'd he'd she'd we'd they'd i'll you'll he'll she'll we'll they'll isn't aren't wasn't weren't hasn't haven't hadn't doesn't don't didn't won't wouldn't shan't shouldn't can't cannot couldn't mustn't let's that's who's what's here's there's when's where's why's how's a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than 
too very tm/inst/stopwords/spanish.dat0000644000175100001440000000412212156574730016101 0ustar hornikusersde la que el en y a los del se las por un para con no una su al lo como más pero sus le ya o este sí porque esta entre cuando muy sin sobre también me hasta hay donde quien desde todo nos durante todos uno les ni contra otros ese eso ante ellos e esto mí antes algunos qué unos yo otro otras otra él tanto esa estos mucho quienes nada muchos cual poco ella estar estas algunas algo nosotros mi mis tú te ti tu tus ellas nosotras vosotros vosotras os mío mía míos mías tuyo tuya tuyos tuyas suyo suya suyos suyas nuestro nuestra nuestros nuestras vuestro vuestra vuestros vuestras esos esas estoy estás está estamos estáis están esté estés estemos estéis estén estaré estarás estará estaremos estaréis estarán estaría estarías estaríamos estaríais estarían estaba estabas estábamos estabais estaban estuve estuviste estuvo estuvimos estuvisteis estuvieron estuviera estuvieras estuviéramos estuvierais estuvieran estuviese estuvieses estuviésemos estuvieseis estuviesen estando estado estada estados estadas estad he has ha hemos habéis han haya hayas hayamos hayáis hayan habré habrás habrá habremos habréis habrán habría habrías habríamos habríais habrían había habías habíamos habíais habían hube hubiste hubo hubimos hubisteis hubieron hubiera hubieras hubiéramos hubierais hubieran hubiese hubieses hubiésemos hubieseis hubiesen habiendo habido habida habidos habidas soy eres es somos sois son sea seas seamos seáis sean seré serás será seremos seréis serán sería serías seríamos seríais serían era eras éramos erais eran fui fuiste fue fuimos fuisteis fueron fuera fueras fuéramos fuerais fueran fuese fueses fuésemos fueseis fuesen siendo sido tengo tienes tiene tenemos tenéis tienen tenga tengas tengamos tengáis tengan tendré tendrás tendrá tendremos tendréis tendrán tendría tendrías tendríamos tendríais tendrían tenía tenías teníamos teníais tenían tuve tuviste tuvo tuvimos tuvisteis tuvieron tuviera tuvieras tuviéramos tuvierais tuvieran tuviese tuvieses tuviésemos tuvieseis tuviesen teniendo tenido tenida tenidos tenidas tened tm/inst/stopwords/italian.dat0000644000175100001440000000316612156574725016070 0ustar hornikusersad al allo ai agli all agl alla alle con col coi da dal dallo dai dagli dall dagl dalla dalle di del dello dei degli dell degl della delle in nel nello nei negli nell negl nella nelle su sul sullo sui sugli sull sugl sulla sulle per tra contro io tu lui lei noi voi loro mio mia miei mie tuo tua tuoi tue suo sua suoi sue nostro nostra nostri nostre vostro vostra vostri vostre mi ti ci vi lo la li le gli ne il un uno una ma ed se perché anche come dov dove che chi cui non più quale quanto quanti quanta quante quello quelli quella quelle questo questi questa queste si tutto tutti a c e i l o ho hai ha abbiamo avete hanno abbia abbiate abbiano avrò avrai avrà avremo avrete avranno avrei avresti avrebbe avremmo avreste avrebbero avevo avevi aveva avevamo avevate avevano ebbi avesti ebbe avemmo aveste ebbero avessi avesse avessimo avessero avendo avuto avuta avuti avute sono sei è siamo siete sia siate siano sarò sarai sarà saremo sarete saranno sarei saresti sarebbe saremmo sareste sarebbero ero eri era eravamo eravate erano fui fosti fu fummo foste furono fossi fosse fossimo fossero essendo faccio fai facciamo fanno faccia facciate facciano farò farai farà faremo farete faranno farei faresti farebbe faremmo fareste farebbero facevo facevi faceva facevamo facevate facevano feci facesti 
fece facemmo faceste fecero facessi facesse facessimo facessero facendo sto stai sta stiamo stanno stia stiate stiano starò starai starà staremo starete staranno starei staresti starebbe staremmo stareste starebbero stavo stavi stava stavamo stavate stavano stetti stesti stette stemmo steste stettero stessi stesse stessimo stessero stando tm/inst/stopwords/portuguese.dat0000644000175100001440000000236312156574727016651 0ustar hornikusersde a o que e do da em um para com não uma os no se na por mais as dos como mas ao ele das à seu sua ou quando muito nos já eu também só pelo pela até isso ela entre depois sem mesmo aos seus quem nas me esse eles você essa num nem suas meu às minha numa pelos elas qual nós lhe deles essas esses pelas este dele tu te vocês vos lhes meus minhas teu tua teus tuas nosso nossa nossos nossas dela delas esta estes estas aquele aquela aqueles aquelas isto aquilo estou está estamos estão estive esteve estivemos estiveram estava estávamos estavam estivera estivéramos esteja estejamos estejam estivesse estivéssemos estivessem estiver estivermos estiverem hei há havemos hão houve houvemos houveram houvera houvéramos haja hajamos hajam houvesse houvéssemos houvessem houver houvermos houverem houverei houverá houveremos houverão houveria houveríamos houveriam sou somos são era éramos eram fui foi fomos foram fora fôramos seja sejamos sejam fosse fôssemos fossem for formos forem serei será seremos serão seria seríamos seriam tenho tem temos tém tinha tínhamos tinham tive teve tivemos tiveram tivera tivéramos tenha tenhamos tenham tivesse tivéssemos tivessem tiver tivermos tiverem terei terá teremos terão teria teríamos teriam tm/inst/stopwords/russian.dat0000644000175100001440000000250512156574727016131 0ustar hornikusersи в во не что он на я с со как а то все она так его но да ты к у же вы за бы по только ее мне было вот от меня еще нет о из ему теперь когда даже ну вдруг ли если уже или ни быть был него до вас нибудь опять уж вам сказал ведь там потом себя ничего ей может они тут где есть надо ней для мы тебя их чем была сам чтоб без будто человек чего раз тоже себе под жизнь будет ж тогда кто этот говорил того потому этого какой совсем ним здесь этом один почти мой тем чтобы нее кажется сейчас были куда зачем сказать всех никогда сегодня можно при наконец два об другой хоть после над больше тот через эти нас про всего них какая много разве сказала три эту моя впрочем хорошо свою этой перед иногда лучше чуть том нельзя такой им более всегда конечно всю между tm/inst/stopwords/german.dat0000644000175100001440000000250512156574724015713 0ustar hornikusersaber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei bin bis bist da damit dann der den des dem die das daß derselbe derselben denselben desselben demselben dieselbe dieselben dasselbe dazu dein deine deinem deinen deiner deines denn derer dessen dich dir du dies diese diesem diesen dieser dieses doch dort durch ein eine einem einen einer eines einig einige einigem einigen einiger einiges einmal er ihn ihm es etwas euer eure eurem euren eurer eures für gegen gewesen hab habe haben hat hatte hatten hier hin hinter ich mich mir ihr ihre ihrem ihren ihrer ihres euch im in indem ins ist jede jedem jeden jeder jedes jene jenem jenen jener jenes jetzt kann kein keine keinem keinen keiner keines können könnte machen man manche manchem manchen mancher manches mein meine meinem meinen meiner meines mit muss musste nach nicht nichts noch nun nur 
ob oder ohne sehr sein seine seinem seinen seiner seines selbst sich sie ihnen sind so solche solchem solchen solcher solches soll sollte sondern sonst über um und uns unse unsem unsen unser unses unter viel vom von vor während war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte würde würden zu zum zur zwar zwischen
tm/inst/doc/0000755000175100001440000000000014367743045012447 5ustar hornikusers
tm/inst/doc/tm.R0000644000175100001440000001036014367743045013212 0ustar hornikusers
### R code from vignette source 'tm.Rnw'

###################################################
### code chunk number 1: Init
###################################################
library("tm")
data("crude")


###################################################
### code chunk number 2: Ovid
###################################################
txt <- system.file("texts", "txt", package = "tm")
(ovid <- VCorpus(DirSource(txt, encoding = "UTF-8"),
                 readerControl = list(language = "lat")))


###################################################
### code chunk number 3: VectorSource
###################################################
docs <- c("This is a text.", "This another one.")
VCorpus(VectorSource(docs))


###################################################
### code chunk number 4: Reuters
###################################################
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- VCorpus(DirSource(reut21578, mode = "binary"),
                   readerControl = list(reader = readReut21578XMLasPlain))


###################################################
### code chunk number 5: tm.Rnw:117-118 (eval = FALSE)
###################################################
## writeCorpus(ovid)


###################################################
### code chunk number 6: tm.Rnw:128-129
###################################################
inspect(ovid[1:2])


###################################################
### code chunk number 7: tm.Rnw:133-135
###################################################
meta(ovid[[2]], "id")
identical(ovid[[2]], ovid[["ovid_2.txt"]])


###################################################
### code chunk number 8: tm.Rnw:139-141
###################################################
inspect(ovid[[2]])
lapply(ovid[1:2], as.character)


###################################################
### code chunk number 9: tm.Rnw:155-156
###################################################
reuters <- tm_map(reuters, stripWhitespace)


###################################################
### code chunk number 10: tm.Rnw:161-162
###################################################
reuters <- tm_map(reuters, content_transformer(tolower))


###################################################
### code chunk number 11: Stopwords
###################################################
reuters <- tm_map(reuters, removeWords, stopwords("english"))


###################################################
### code chunk number 12: Stemming
###################################################
tm_map(reuters, stemDocument)


###################################################
### code chunk number 13: tm.Rnw:193-196
###################################################
idx <- meta(reuters, "id") == '237' &
    meta(reuters, "heading") == 'INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE'
reuters[idx]


###################################################
### code chunk number 14: DublinCore
###################################################
DublinCore(crude[[1]], "Creator") <- "Ano Nymous"
meta(crude[[1]])


###################################################
### code chunk number 15: tm.Rnw:227-231
###################################################
meta(crude, tag = "test", type = "corpus") <- "test meta"
meta(crude, type = "corpus")
meta(crude, "foo") <- letters[1:20]
meta(crude)


###################################################
### code chunk number 16: tm.Rnw:250-252
###################################################
dtm <- DocumentTermMatrix(reuters)
inspect(dtm)
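###################################################
### extra code chunk (not from the original vignette): tf-idf weighting
### of the Reuters document-term matrix, via the exported weightTfIdf
###################################################
inspect(DocumentTermMatrix(reuters,
                           control = list(weighting = weightTfIdf)))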
###################################################
### code chunk number 17: tm.Rnw:261-262
###################################################
findFreqTerms(dtm, 5)


###################################################
### code chunk number 18: tm.Rnw:267-268
###################################################
findAssocs(dtm, "opec", 0.8)


###################################################
### code chunk number 19: tm.Rnw:276-277
###################################################
inspect(removeSparseTerms(dtm, 0.4))


###################################################
### code chunk number 20: tm.Rnw:291-293
###################################################
inspect(DocumentTermMatrix(reuters,
                           list(dictionary = c("prices", "crude", "oil"))))
tm/inst/doc/extensions.pdf0000644000175100001440000016045014367743045015347 0ustar hornikusers
[binary data omitted: compiled PDF of the 'extensions' vignette]
_C$A6a}w$6)g?{^yOoW4ּ$(h3y?endstream endobj 39 0 obj << /Filter /FlateDecode /Subtype /Type1C /Length 2166 >> stream xV{PSg!JRuw:ڎuƪ`Tc}!!Ay{ τG\EYЭmEv]l1Ӻ/uƽ;w~s9bF !fnNI~kUџ b!g ˓~MD9 J6mQ(3+ BJl'~K,&RiNu"M'DLl#yS" ;l11w* J=Ae_ZhfꗉY6;Y8?}(f8Ds}(OoJXʪPiƩZу/|gAyEB׈%c4"c>,dy`lۯKe0\IW _+B,H%XWa7//c17p //\oW}nw`:ڮY(-̭ȁ51e'mMdk"g=:Ih%-a$>ڿHn&.ۂsq 0< iuXAo͂ohX= cuz 1ZHȈu nM*5ݕ)ǃ܀=κ nWe }NKׄ 3몂jTidf[sm7Pj=|F2'+ja+9B:Cɛ-Yt Ǐ0=~p/*$XAcbhT5jŁ@q`51M5̄?nNZ/aipA+pzb ~Ǒvy1tZV!>ZNߦ>{LYkp{.Wϩ.ߎrD|̭%2TUJ 5\>_x(0~0f gp9 }♟YΗ'V\uC=M n]CŔzFz[vmLKٸj5UM֟]Uչw}rqI ʹW4W'N\dmӪ͔"@A~=e I_}ۻp׺^O7y>5ŏDk5dɟe3RY_#BL+@MVת(kjۂF;!(zu(K?>Cc̖R3LO;.xjdE'c_6xlR2~ɴA$A k2A 2+SHFylߜHҢ 'lxG<q ~ s`Ȑendstream endobj 40 0 obj << /Filter /FlateDecode /Subtype /Type1C /Length 2233 >> stream xM}TW'fjKEiwjwk]֭.."P$B $3ɼ !&APREݭֺ֯Swںzn89nR9;3s}yߟ"$ɴkSxaQbbd3Ĭ͉Bls츻y"<,*Y[.="os R Fi-"XMk׈׉G ".|A"$&&}Iz01<JCK֦3G1_B 4-YsVJ+]TϽˈTn9J+qa@M)UA'Sn{ XCmS 銟;!k(t~-,HlC Wߙ5fQ %zyM\` >T A!*J,{Fq@>X#>"6fso"xs˲k Vtga#H/)*` d|+ P9FRD5JluP~5,y.m8겯&4,OU*A.vnlk"X=4,8:b0(JKxWz` qyFFf)vJ|8@VC%:ƫԱs֋Zjez-v4F җcؔNJU %4qSNzå0b:B'x!NtLkLRY2`S1qe5oa4}u y-cf?Rag&Lމre (0WW P7j^hDŽ%Hȩ{*'qp1oln(Z:>oMfH詖?_yH e{ZK(%#0m`1C]h;ⱗ*+]:]T8Gɟd#k2(=rюhDSֹC < 9.G>h)OC[`־h1&DzU6Ksֲx>ՀNRηC;:uÓ8'Xbz@b;t9 ZДە:}[ꛪ8+k3R j[-HWty*;KK+**> ИݣcxoPZjiwt1DD-:w5g= Xhekj] ݣN)$cߺr5ToY5/mhj K(X:PlmsVt 2t$'|{VS־O(S^[O۩ö+_#~ ƈ3@l>NN t}lƯ`% A& \U)2gQ_ ֣в&o3u k$xGv=ic醜3Tk17|vW׾p ϼ]-))8m2Ӹ'fM12M,Z^óHĩp%܊egIXg[3փ]AOw[',`VQ[VY[ HźoTg0BWCn{t9@>Ҳ'^^YL]*S%w9NV`C!bJ-YPJs q)yGG>91p3Db?=UZHrY0`~PfL_܁xu7e? >;byJɲLAmGՀfnadd7nyiUr(}:9*i!^ȝR(A-&nc/he?^'_Ͼ+~Ԯq{%gc~)04ύاލuyo\=J]+#p0֓ u. 8 #,MMVr TBi-/䔐 {m`YILUPjn\Àk\`*+L^?;fӥ v:>qyrRendstream endobj 41 0 obj << /Filter /FlateDecode /Subtype /Type1C /Length 2605 >> stream xmV{Tg%BVWۺ o1(`#!$"ЈZm}UZ]k=nkYwowbGwfΜ3ߙ~W@Q`DtnԈpG(G=<0nTT>{陣c)m Qۥ1I7&HJQKej"KFQ+U$j.5¨hJJͣQ j ̇*C0Mo@P??r'.- caY6eDԘG~*xqI?LzkzMB{kY\gDwhТјH_- b9Rx9kp8Zs "<3 G巘0kL;n58`05Eź|r{"h-t`r [588ޖE Z*'CJljM{]{wԘNp!MrN*X#EP=MV*70-䦇oMt \HcBtxebM6㤁'Uǁ8,F|/9ksȕ;#VXjc.`tZZQVU)3U*N_"'y~ss_8`}></ˌHlj~vWt? p5ۜ~ {p#kb , 'sH.7h ] ·ݿ`'hE  8Z.CAQ'R" ){e.գGؗUVXR0272d IΛa؃,*zg> stream xytSW5ƺ 4KH%%$B ݀ 5t),۲ew˒e`Lu 4„$!9bM[|eꞳ}(݋HHH)ύ7c@h@DЯwǎ `Gԩsf!?[:@a^Ei)śo^emo.\*Z|بт1c=&˓^y4A ED*xxxXB,%Fˈg b1XIL'Fo31* b,1E"BD, ^&ăD61H }i8r$%kkۉL)Ihޗ,,ӫg7_; 83p·>4hSOK~1ڐ!\raÅC}kÔ~|d=d102G׍x )S뻋+YBS@E, 6[{;` !@A/*fЗ[ (#dJ/lʀ*`З#WgE)zW)c^&z-ZP/mg%B $t- oL0<:3"x1)>] *#M@n̶l.ߕhcO{vϹsri4.nZл@gir zv,5ԯu^_롬8ݑh[t{<dk]mbVDmrS]S.Ui2c ` O u4>"T_OwBe| RLz BI7Q/8;8wA F8 @~Az?)YM MY49B sy/Q&91~!PTAoC˲BKh"ZQh2| t:8)ɭ)tn' 0ygoOKۻ>q ADU@l.ݤUW3=XBKY8*M}>MqP"LiY˽t8n$| XA n]J #5÷DPFF D/rn^x2Wc." ?Ke ߁ire]òK(%iļN{-~,0+$9$Ǘ.CРx}PQo#? 
tm/inst/doc/tm.Rnw0000644000175100001440000003350513155253051013550 0ustar hornikusers
\documentclass[a4paper]{article}
\usepackage[margin=2cm]{geometry}
\usepackage[utf8]{inputenc}
\usepackage[round]{natbib}
\usepackage{url}

\newcommand{\acronym}[1]{\textsc{#1}}
\newcommand{\class}[1]{\mbox{\textsf{#1}}}
\newcommand{\code}[1]{\mbox{\texttt{#1}}}
\newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}}
\newcommand{\proglang}[1]{\textsf{#1}}

%% \VignetteIndexEntry{Introduction to the tm Package}

\begin{document}
<<echo=FALSE>>=
library("tm")
data("crude")
@

\title{Introduction to the \pkg{tm} Package\\Text Mining in \proglang{R}}
\author{Ingo Feinerer}
\maketitle

\section*{Introduction}

This vignette gives a short introduction to text mining in \proglang{R}
utilizing the text mining framework provided by the \pkg{tm} package. We
present methods for data import, corpus handling, preprocessing, metadata
management, and creation of term-document matrices. Our focus is on the main
aspects of getting started with text mining in \proglang{R}---an in-depth
description of the text mining infrastructure offered by \pkg{tm} was
published in the \emph{Journal of Statistical
Software}~\citep{Feinerer_etal_2008}. An introductory article on text mining
in \proglang{R} was published in \emph{R News}~\citep{Rnews:Feinerer:2008}.

\section*{Data Import}

The main structure for managing documents in \pkg{tm} is a so-called
\class{Corpus}, representing a collection of text documents. A corpus is an
abstract concept, and there can exist several implementations in parallel.
The default implementation is the so-called \class{VCorpus} (short for
\emph{Volatile Corpus}) which realizes the semantics familiar from most
\proglang{R} objects: corpora are \proglang{R} objects held fully in memory.
We denote this as volatile since once the \proglang{R} object is destroyed,
the whole corpus is gone. Such a volatile corpus can be created via the
constructor \code{VCorpus(x, readerControl)}. Another implementation is the
\class{PCorpus} which implements a \emph{Permanent Corpus} semantics, i.e.,
the documents are physically stored outside of \proglang{R} (e.g., in a
database), corresponding \proglang{R} objects are basically only pointers to
external structures, and changes to the underlying corpus are reflected in
all \proglang{R} objects associated with it. Compared to the volatile corpus
the corpus encapsulated by a permanent corpus object is not destroyed if the
corresponding \proglang{R} object is released.

Within the corpus constructor, \code{x} must be a \class{Source} object which
abstracts the input location. \pkg{tm} provides a set of predefined sources,
e.g., \class{DirSource}, \class{VectorSource}, or \class{DataframeSource},
which handle a directory, a vector interpreting each component as a document,
or data-frame-like structures (such as \acronym{CSV} files), respectively.
Except for \class{DirSource}, which is designed solely for directories on a
file system, and \class{VectorSource}, which only accepts (character)
vectors, most other implemented sources can take connections as input (a
character string is interpreted as a file path). \code{getSources()} lists
available sources, and users can create their own sources.

The second argument \code{readerControl} of the corpus constructor has to be
a list with the named components \code{reader} and \code{language}. The first
component \code{reader} constructs a text document from elements delivered by
a source. The \pkg{tm} package ships with several readers (e.g.,
\code{readPlain()}, \code{readPDF()}, \code{readDOC()}, \ldots). See
\code{getReaders()} for an up-to-date list of available readers. Each source
has a default reader which can be overridden. E.g., for \code{DirSource} the
default just reads in the input files and interprets their content as text.
Finally, the second component \code{language} sets the texts' language
(preferably using \acronym{ISO} 639-2 codes).

In the case of a permanent corpus, a third argument \code{dbControl} has to
be a list with the named components \code{dbName} giving the filename holding
the sourced-out objects (i.e., the database), and \code{dbType} holding a
valid database type as supported by package \pkg{filehash}. Activated
database support reduces the memory demand; however, access gets slower since
each operation is limited by the hard disk's read and write capabilities.
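For instance, a permanent corpus could be created as follows (a minimal
sketch, assuming package \pkg{filehash} is installed; the database file name
\code{"corpus.db"} is a made-up example, and \code{"DB1"} is the default
database type of \pkg{filehash}):

<<eval=FALSE>>=
## A sketch: keep the documents in a filehash database on disk
## instead of holding them fully in memory.
pc <- PCorpus(VectorSource(c("Some text.", "More text.")),
              readerControl = list(language = "en"),
              dbControl = list(dbName = "corpus.db", dbType = "DB1"))
@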
So, e.g., plain text files in the directory \code{txt} containing Latin
(\code{lat}) texts by the Roman poet \emph{Ovid} can be read in with the
following code:

<<>>=
txt <- system.file("texts", "txt", package = "tm")
(ovid <- VCorpus(DirSource(txt, encoding = "UTF-8"),
                 readerControl = list(language = "lat")))
@

For simple examples \code{VectorSource} is quite useful, as it can create a
corpus from character vectors, e.g.:

<<>>=
docs <- c("This is a text.", "This another one.")
VCorpus(VectorSource(docs))
@

Finally we create a corpus for some Reuters documents as an example for later
use:

<<>>=
reut21578 <- system.file("texts", "crude", package = "tm")
reuters <- VCorpus(DirSource(reut21578, mode = "binary"),
                   readerControl = list(reader = readReut21578XMLasPlain))
@

\section*{Data Export}

In case you have created a corpus by manipulating other objects in
\proglang{R}, and thus do not have the texts already stored on a hard disk,
you can simply use \code{writeCorpus()}

<<>>=
writeCorpus(ovid)
@

which writes a character representation of the documents in a corpus to
multiple files on disk.

\section*{Inspecting Corpora}

Custom \code{print()} methods are available which hide the raw amount of
information (consider that a corpus could consist of several thousand
documents, like a database). \code{print()} gives a concise overview whereas
more details are displayed with \code{inspect()}.

<<>>=
inspect(ovid[1:2])
@

Individual documents can be accessed via \code{[[}, either via the position
in the corpus, or via their identifier.

<<>>=
meta(ovid[[2]], "id")
identical(ovid[[2]], ovid[["ovid_2.txt"]])
@

A character representation of a document is available via
\code{as.character()} which is also used when inspecting a document:

<<>>=
inspect(ovid[[2]])
lapply(ovid[1:2], as.character)
@

\section*{Transformations}

Once we have a corpus we typically want to modify the documents in it, e.g.,
stemming, stopword removal, et cetera. In \pkg{tm}, all this functionality is
subsumed into the concept of a \emph{transformation}. Transformations are
done via the \code{tm\_map()} function which applies (maps) a function to all
elements of the corpus. Basically, all transformations work on single text
documents and \code{tm\_map()} just applies them to all documents in a
corpus.
\subsection*{Eliminating Extra Whitespace}

Extra whitespace is eliminated by:

<<>>=
reuters <- tm_map(reuters, stripWhitespace)
@

\subsection*{Convert to Lower Case}

Conversion to lower case is done by:

<<>>=
reuters <- tm_map(reuters, content_transformer(tolower))
@

We can use arbitrary character processing functions as transformations as
long as the function returns a text document. In this case we use
\code{content\_transformer()} which provides a convenience wrapper to access
and set the content of a document. Consequently most text manipulation
functions from base \proglang{R} can be used directly with this wrapper. This
works for \code{tolower()} as used here but also, e.g., for \code{gsub()}
which comes in quite handy for a broad range of text manipulation tasks.
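For instance, a \code{gsub()}-based transformation can be parameterized
directly via \code{tm\_map()} (a minimal sketch; the pattern and replacement
are made-up examples):

<<eval=FALSE>>=
## A sketch: replace every occurrence of "crude" by "raw" in all
## documents; extra arguments to tm_map() are passed on to gsub().
reuters <- tm_map(reuters, content_transformer(gsub),
                  pattern = "crude", replacement = "raw")
@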
\subsection*{Remove Stopwords}

Removal of stopwords is done by:

<<>>=
reuters <- tm_map(reuters, removeWords, stopwords("english"))
@

\subsection*{Stemming}

Stemming is done by:

<<>>=
tm_map(reuters, stemDocument)
@

\section*{Filters}

Often it is of special interest to filter out documents satisfying given
properties. The function \code{tm\_filter} is designed for this purpose. It
is possible to write custom filter functions which get applied to each
document in the corpus. Alternatively, we can create indices based on
selections and subset the corpus with them. E.g., the following statement
selects those documents having an \code{ID} equal to \code{"237"} and the
string \code{"INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE"} as their
heading.

<<>>=
idx <- meta(reuters, "id") == '237' &
    meta(reuters, "heading") == 'INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE'
reuters[idx]
@

\section*{Metadata Management}

Metadata is used to annotate text documents or whole corpora with additional
information. The easiest way to accomplish this with \pkg{tm} is to use the
\code{meta()} function. A text document has a few predefined attributes like
\code{author} but can be extended with an arbitrary number of additional
user-defined metadata tags. These additional metadata tags are individually
attached to a single text document. From a corpus perspective these metadata
attachments are locally stored together with each individual text document.
As an alternative to \code{meta()}, the function \code{DublinCore()} provides
a full mapping between Simple Dublin Core metadata and \pkg{tm} metadata
structures and can be similarly used to get and set metadata information for
text documents, e.g.:

<<>>=
DublinCore(crude[[1]], "Creator") <- "Ano Nymous"
meta(crude[[1]])
@

For corpora the story is a bit more sophisticated. Corpora in \pkg{tm} have
two types of metadata: one is the metadata on the corpus level
(\code{corpus}), the other is the metadata related to the individual
documents (\code{indexed}) in the form of a data frame. The latter is often
done for performance reasons (hence the name \code{indexed}, for indexing) or
because the metadata is an entity of its own but still relates directly to
individual text documents, e.g., a classification result; the classifications
directly relate to the documents but the set of classification levels forms
an entity of its own.

Both cases can be handled with \code{meta()}:

<<>>=
meta(crude, tag = "test", type = "corpus") <- "test meta"
meta(crude, type = "corpus")
meta(crude, "foo") <- letters[1:20]
meta(crude)
@

\section*{Standard Operators and Functions}

Many standard operators and functions (\code{[}, \code{[<-}, \code{[[},
\code{[[<-}, \code{c()}, \code{lapply()}) are available for corpora with
semantics similar to standard \proglang{R} routines. E.g., \code{c()}
concatenates two (or more) corpora. Applied to several text documents it
returns a corpus. The metadata is automatically updated if corpora are
concatenated (i.e., merged).
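For instance (a minimal sketch, reusing the \code{ovid} and \code{reuters}
corpora created above):

<<eval=FALSE>>=
## A sketch: concatenating two corpora yields a single corpus with
## automatically merged metadata.
combined <- c(ovid, reuters)
combined
@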
\section*{Creating Term-Document Matrices}

A common approach in text mining is to create a term-document matrix from a
corpus. In the \pkg{tm} package the classes \class{TermDocumentMatrix} and
\class{DocumentTermMatrix} (depending on whether you want terms as rows and
documents as columns, or vice versa) employ sparse matrices for corpora.
Inspecting a term-document matrix displays a sample, whereas
\code{as.matrix()} yields the full matrix in dense format (which can be very
memory consuming for large matrices).

<<>>=
dtm <- DocumentTermMatrix(reuters)
inspect(dtm)
@

\section*{Operations on Term-Document Matrices}

Besides the fact that a huge number of \proglang{R} functions (like
clustering, classification, etc.) can be applied to this matrix, this package
brings some shortcuts. Imagine we want to find those terms that occur at
least five times; then we can use the \code{findFreqTerms()} function:

<<>>=
findFreqTerms(dtm, 5)
@

Or suppose we want to find associations (i.e., terms which correlate) with at
least a $0.8$ correlation for the term \code{opec}; then we use
\code{findAssocs()}:

<<>>=
findAssocs(dtm, "opec", 0.8)
@

Term-document matrices tend to get very big already for normal-sized data
sets. Therefore we provide a method to remove \emph{sparse} terms, i.e.,
terms occurring only in very few documents. Normally, this reduces the matrix
dramatically without losing significant relations inherent to the matrix:

<<>>=
inspect(removeSparseTerms(dtm, 0.4))
@

This function call removes those terms for which at least 40 percent of the
entries are sparse (i.e., the term occurs 0 times in at least 40 percent of
the documents).

\section*{Dictionary}

A dictionary is a (multi-)set of strings. It is often used to denote relevant
terms in text mining. We represent a dictionary with a character vector which
may be passed to the \code{DocumentTermMatrix()} constructor as a control
argument. Then the created matrix is tabulated against the dictionary, i.e.,
only terms from the dictionary appear in the matrix. This allows us to
restrict the dimension of the matrix a priori and to focus on specific terms
for distinct text mining contexts, e.g.,

<<>>=
inspect(DocumentTermMatrix(reuters,
                           list(dictionary = c("prices", "crude", "oil"))))
@

\section*{Performance}

Often you do not need all the generality, modularity and full range of
features offered by \pkg{tm} as this sometimes comes at the price of
performance. \class{SimpleCorpus} provides a corpus which is optimized for
the most common usage scenario: importing plain texts from files in a
directory or directly from a vector in \proglang{R}, preprocessing and
transforming the texts, and finally exporting them to a term-document matrix.
The aim is to boost performance and minimize memory pressure. It loads all
documents into memory, and is designed for medium-sized to large data sets.

However, it operates only under the following constraints:
\begin{itemize}
\item only \code{DirSource} and \code{VectorSource} are supported,
\item no custom readers, i.e., each document is read in and stored as plain
  text (as a string, i.e., a character vector of length one),
\item transformations applied via \code{tm\_map} must be able to process
  strings and return strings,
\item no lazy transformations in \code{tm\_map},
\item no meta data for individual documents (i.e., no \code{"local"} in
  \code{meta()}).
\end{itemize}
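Under these constraints a typical fast pipeline could look as follows (a
minimal sketch, reusing the \code{txt} directory from the Data Import
section):

<<eval=FALSE>>=
## A sketch: fast import, string-based transformation, and export to
## a document-term matrix via SimpleCorpus.
sc <- SimpleCorpus(DirSource(txt, encoding = "UTF-8"),
                   control = list(language = "lat"))
sc <- tm_map(sc, tolower)  # processes strings, returns strings
DocumentTermMatrix(sc)
@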
\bibliographystyle{abbrvnat}
\bibliography{references}

\end{document}
tm/inst/doc/extensions.Rnw0000644000175100001440000002727113177024075015330 0ustar hornikusers
\documentclass[a4paper]{article}
\usepackage[margin=2cm]{geometry}
\usepackage[round]{natbib}
\usepackage{url}

\newcommand{\acronym}[1]{\textsc{#1}}
\newcommand{\pkg}[1]{{\normalfont\fontseries{b}\selectfont #1}}
\newcommand{\proglang}[1]{\textsf{#1}}
\let\code\texttt

%% \VignetteIndexEntry{Extensions}

\begin{document}
<<Init, echo=FALSE>>=
library("tm")
library("xml2")
@

\title{Extensions\\How to Handle Custom File Formats}
\author{Ingo Feinerer}
\maketitle

\section*{Introduction}

The possibility to handle custom file formats is a substantial feature in any
modern text mining infrastructure. \pkg{tm} has been designed with this
aspect in mind from the beginning, and has modular components which allow for
extensions. A general explanation of \pkg{tm}'s extension mechanism is given
by~\citet[Sec.~3.3]{Feinerer_etal_2008}; an updated description follows.

\section*{Sources}

A source abstracts input locations and provides uniform methods for access.
Each source must provide implementations for the following interface
functions:
\begin{description}
\item[close()] closes the source and returns it,
\item[eoi()] returns \code{TRUE} if the end of input of the source is
  reached,
\item[getElem()] fetches the element at the current position,
\item[length()] gives the number of elements,
\item[open()] opens the source and returns it,
\item[reader()] returns a default reader for processing elements,
\item[pGetElem()] (optional) retrieves all elements in parallel at once, and
\item[stepNext()] increases the position in the source to the next element.
\end{description}

Retrieved elements must be encapsulated in a list with the named components
\code{content} holding the document and \code{uri} pointing to the origin of
the document (e.g., a file path or a \acronym{URL}; \code{NULL} if not
applicable or unavailable).

Custom sources are required to inherit from the virtual base class
\code{Source} and typically do so by extending the functionality provided by
the simple reference implementation \code{SimpleSource}. E.g., a simple
source which accepts an \proglang{R} vector as input could be defined as

<<>>=
VecSource <- function(x)
    SimpleSource(length = length(x), content = as.character(x),
                 class = "VecSource")
@

which overrides a few defaults (see \code{?SimpleSource} for defaults) and
stores the vector in the \code{content} component. The functions
\code{close()}, \code{eoi()}, \code{open()}, and \code{stepNext()} already
have reasonable default methods for the \code{SimpleSource} class: the
identity function for \code{open()} and \code{close()}, incrementing a
position counter for \code{stepNext()}, and comparing the current position
with the number of available elements as claimed by \code{length()} for
\code{eoi()}, respectively. So we only need custom methods for element
access:

<<>>=
getElem.VecSource <- function(x)
    list(content = x$content[x$position], uri = NULL)
pGetElem.VecSource <- function(x)
    lapply(x$content, function(y) list(content = y, uri = NULL))
@
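With these two methods in place the new source can be used wherever \pkg{tm}
expects a source (a minimal sketch; the default reader \code{readPlain()}
supplied by \code{SimpleSource} turns each element into a plain text
document):

<<eval=FALSE>>=
## A sketch: build a volatile corpus from the custom vector source.
corpus <- VCorpus(VecSource(c("First text.", "Second text.")))
inspect(corpus)
@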
\section*{Readers}

Readers are functions for extracting textual content and metadata out of
elements delivered by a source and for constructing a text document. Each
reader must accept the following arguments in its signature:
\begin{description}
\item[elem] a list with the named components \code{content} and \code{uri}
  (as delivered by a source via \code{getElem()} or \code{pGetElem()}),
\item[language] a string giving the language, and
\item[id] a character giving a unique identifier for the created text
  document.
\end{description}

The element \code{elem} is typically provided by a source whereas the
language and the identifier are normally provided by a corpus constructor
(for the case that \code{elem\$content} does not give information on these
two essential items).

In case a reader expects configuration arguments we can use a function
generator. A function generator is indicated by inheriting from class
\code{FunctionGenerator} and \code{function}. It allows us to process
additional arguments, store them in an environment, return a reader function
with the well-defined signature described above, and still be able to access
the additional arguments via lexical scoping. All corpus constructors in
package \pkg{tm} check the reader function for being a function generator and
if so apply it to yield the reader with the expected signature.

E.g., the reader function \code{readPlain()} is defined as

<<>>=
readPlain <- function(elem, language, id)
    PlainTextDocument(elem$content, id = id, language = language)
@

For examples on readers using the function generator please have a look at
\code{?readPDF} or \code{?readDOC}.
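The general pattern of a function generator can be sketched as follows (a
made-up example; the reader \code{readUpper()} and its \code{upper} argument
are hypothetical):

<<eval=FALSE>>=
## A sketch: 'upper' is captured via lexical scoping, and a reader
## with the standard (elem, language, id) signature is returned.
readUpper <- function(upper = TRUE) {
    function(elem, language, id) {
        txt <- if (upper) toupper(elem$content) else elem$content
        PlainTextDocument(txt, id = id, language = language)
    }
}
class(readUpper) <- c("FunctionGenerator", "function")
@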
However, in many cases it is not necessary to define each detailed aspect of
how to extend \pkg{tm}. Typical examples are \acronym{XML} files which are
very common but can be rather easily handled via standard-conforming
\acronym{XML} parsers. The aim of the remainder of this document is to give
an overview of how simpler, more user-friendly forms of extension mechanisms
can be applied in \pkg{tm}.

\section*{Custom Data Formats}

A general situation is that you have gathered together some information into
a tabular data structure (like a data frame or a list matrix) that suffices
to describe documents in a corpus. However, you do not have a distinct file
format because you extracted the information out of various resources, e.g.,
as delivered by \code{readtext()} in package \pkg{readtext}. Now you want to
use your information to build a corpus which is recognized by \pkg{tm}. We
assume that your information is put together in a data frame. E.g., consider
the following example:

<<>>=
df <- data.frame(doc_id = c("doc 1" , "doc 2" , "doc 3" ),
                 text = c("content 1", "content 2", "content 3"),
                 title = c("title 1" , "title 2" , "title 3" ),
                 authors = c("author 1" , "author 2" , "author 3" ),
                 topics = c("topic 1" , "topic 2" , "topic 3" ),
                 stringsAsFactors = FALSE)
@

We want to map the data frame rows to the relevant entries of a text
document. An entry \code{text} in the mapping will be matched to fill the
actual content of the text document, \code{doc\_id} will be used as document
ID, and all other fields will be used as metadata tags. So we can construct a
corpus out of the data frame:

<<>>=
(corpus <- Corpus(DataframeSource(df)))
corpus[[1]]
meta(corpus[[1]])
@

\section*{Custom XML Sources}

Many modern file formats already come in \acronym{XML} format which allows
information to be extracted with any standard-conforming \acronym{XML}
parser, e.g., as implemented in \proglang{R} by the \pkg{xml2} package. Now
assume we have some custom \acronym{XML} format which we want to access with
\pkg{tm}. Then a viable way is to create a custom \acronym{XML} source which
can be configured with only a few commands. E.g., have a look at the
following example:

<<CustomXMLFile>>=
custom.xml <- system.file("texts", "custom.xml", package = "tm")
print(readLines(custom.xml), quote = FALSE)
@

As you see there is a top-level tag stating that there is a corpus, and
several document tags below. In fact, this structure is very common in
\acronym{XML} files found in text mining applications (e.g., both the
Reuters-21578 and the Reuters Corpus Volume 1 data sets follow this general
scheme). In \pkg{tm} we expect a source to deliver self-contained blocks of
information to a reader function, each block containing all information
necessary such that the reader can construct a (subclass of a)
\code{TextDocument} from it.

The \code{XMLSource()} function can now be used to construct a custom
\acronym{XML} source. It has three arguments:
\begin{description}
\item[x] a character giving a uniform resource identifier,
\item[parser] a function accepting an \acronym{XML} document (as delivered by
  \code{read\_xml()} in package \pkg{xml2}) as input and returning
  \acronym{XML} elements/nodes (each element/node will then be delivered to
  the reader as a self-contained block),
\item[reader] a reader function capable of turning \acronym{XML}
  elements/nodes as returned by the parser into a subclass of
  \code{TextDocument}.
\end{description}

E.g., a custom source which can cope with our custom \acronym{XML} format
could be:

<<mySource>>=
mySource <- function(x)
    XMLSource(x, parser = xml2::xml_children, reader = myXMLReader)
@

As you notice in this example we also provide a custom reader function
(\code{myXMLReader}). See the next section for details.

\section*{Custom XML Readers}

As we saw in the previous section we often need a custom reader function to
extract information out of \acronym{XML} chunks (typically as delivered by
some source). Fortunately, \pkg{tm} provides an easy way to define custom
\acronym{XML} reader functions. All you need to do is to provide a so-called
\emph{specification}. Let us start with an example which defines a reader
function for the file format from the previous section:

<<myXMLReader>>=
myXMLReader <- readXML(
    spec = list(author = list("node", "writer"),
                content = list("node", "description"),
                datetimestamp = list("function",
                                     function(x) as.POSIXlt(Sys.time(), tz = "GMT")),
                description = list("node", "@short"),
                heading = list("node", "caption"),
                id = list("function", function(x) tempfile()),
                origin = list("unevaluated", "My private bibliography"),
                type = list("node", "type")),
    doc = PlainTextDocument())
@

Formally, \code{readXML()} is the relevant function which constructs a
reader. The customization is done via the first argument \code{spec}, whereas
the second argument provides an empty instance of the document which should
be returned (augmented with the information extracted out of the
\acronym{XML} chunks). The specification must consist of a named list of
lists, each containing two entries. The constructed reader will map each list
entry to the content or a metadatum of the text document as specified by the
named list entry. Valid names include \code{content} to access the document's
content, and character strings which are mapped to metadata entries.

Each list entry must consist of two elements: the first describes the type of
the second argument, and the second is the specification entry itself. Valid
combinations are:
\begin{description}
\item[\code{type = "node", spec = "XPathExpression"}] the XPath (1.0)
  expression \code{spec} extracts information out of an \acronym{XML} node
  (as seen for \code{author}, \code{content}, \code{description},
  \code{heading}, and \code{type} in our example specification).
\item[\code{type = "function", spec = function(doc) \ldots}] The function
  \code{spec} is called, passing over the \acronym{XML} document (as
  delivered by \code{read\_xml()} from package \pkg{xml2}) as first argument
  (as seen for \code{datetimestamp} and \code{id}). As you notice in our
  example nobody forces us to actually use the passed-over document; instead
  we can do anything we want (e.g., create a unique character vector via
  \code{tempfile()} to have a unique identification string).
\item[\code{type = "unevaluated", spec = "String"}] the character vector
  \code{spec} is returned without modification (e.g., \code{origin} in our
  specification).
\end{description}

Now that we have all we need to cope with our custom file format, we can
apply the source and reader function at any place in \pkg{tm} where a source
or reader is expected, respectively. E.g.,

<<>>=
corpus <- VCorpus(mySource(custom.xml))
@

constructs a corpus out of the information in our \acronym{XML} file:

<<>>=
corpus[[1]]
meta(corpus[[1]])
@

\bibliographystyle{abbrvnat}
\bibliography{references}

\end{document}
tm/inst/doc/tm.pdf0000644000175100001440000022316514367743046013574 0ustar hornikusers
tm/inst/doc/extensions.R0000644000175100001440000000653114367743043014774 0ustar hornikusers
### R code from vignette source 'extensions.Rnw'

###################################################
### code chunk number 1: Init
###################################################
library("tm")
library("xml2")


###################################################
### code chunk number 2: extensions.Rnw:55-58
###################################################
VecSource <- function(x)
    SimpleSource(length = length(x), content = as.character(x),
                 class = "VecSource")


###################################################
### code chunk number 3: extensions.Rnw:68-72
###################################################
getElem.VecSource <- function(x)
    list(content = x$content[x$position], uri = NULL)
pGetElem.VecSource <- function(x)
    lapply(x$content, function(y) list(content = y, uri = NULL))


###################################################
### code chunk number 4: extensions.Rnw:100-102
###################################################
readPlain <- function(elem, language, id)
    PlainTextDocument(elem$content, id = id, language = language)


###################################################
### code chunk number 5: extensions.Rnw:124-130
###################################################
df <- data.frame(doc_id = c("doc 1" , "doc 2" , "doc 3" ),
                 text = c("content 1", "content 2", "content 3"),
                 title = c("title 1" , "title 2" , "title 3" ),
                 authors = c("author 1" , "author 2" , "author 3" ),
                 topics = c("topic 1" , "topic 2" , "topic 3" ),
                 stringsAsFactors = FALSE)


###################################################
### code chunk number 6: extensions.Rnw:138-141
###################################################
(corpus <- Corpus(DataframeSource(df)))
corpus[[1]]
meta(corpus[[1]])


###################################################
### code chunk number 7: CustomXMLFile
###################################################
custom.xml <- system.file("texts", "custom.xml", package = "tm")
print(readLines(custom.xml), quote = FALSE)


###################################################
### code chunk number 8: mySource
###################################################
### code chunk number 5: extensions.Rnw:124-130
###################################################
df <- data.frame(doc_id  = c("doc 1"    , "doc 2"    , "doc 3"    ),
                 text    = c("content 1", "content 2", "content 3"),
                 title   = c("title 1"  , "title 2"  , "title 3"  ),
                 authors = c("author 1" , "author 2" , "author 3" ),
                 topics  = c("topic 1"  , "topic 2"  , "topic 3"  ),
                 stringsAsFactors = FALSE)


###################################################
### code chunk number 6: extensions.Rnw:138-141
###################################################
(corpus <- Corpus(DataframeSource(df)))
corpus[[1]]
meta(corpus[[1]])


###################################################
### code chunk number 7: CustomXMLFile
###################################################
custom.xml <- system.file("texts", "custom.xml", package = "tm")
print(readLines(custom.xml), quote = FALSE)


###################################################
### code chunk number 8: mySource
###################################################
mySource <- function(x)
    XMLSource(x, parser = xml2::xml_children, reader = myXMLReader)


###################################################
### code chunk number 9: myXMLReader
###################################################
myXMLReader <- readXML(
    spec = list(author = list("node", "writer"),
                content = list("node", "description"),
                datetimestamp = list("function",
                                     function(x) as.POSIXlt(Sys.time(), tz = "GMT")),
                description = list("node", "@short"),
                heading = list("node", "caption"),
                id = list("function", function(x) tempfile()),
                origin = list("unevaluated", "My private bibliography"),
                type = list("node", "type")),
    doc = PlainTextDocument())


###################################################
### code chunk number 10: extensions.Rnw:244-245
###################################################
corpus <- VCorpus(mySource(custom.xml))


###################################################
### code chunk number 11: extensions.Rnw:249-251
###################################################
corpus[[1]]
meta(corpus[[1]])
tm/inst/CITATION0000644000175100001440000000150614367741745013046 0ustar hornikuserscitation(auto = meta)

bibentry(bibtype = "Article",
         title = "Text Mining Infrastructure in R",
         author = c(person("Ingo", "Feinerer",
                           email = "feinerer@logic.at",
                           comment = c(ORCID = "0000-0001-7656-8338")),
                    person("Kurt", "Hornik",
                           email = "Kurt.Hornik@R-project.org",
                           comment = c(ORCID = "0000-0003-4198-9911")),
                    person("David", "Meyer",
                           email = "David.Meyer@wu.ac.at",
                           comment = c(ORCID = "0000-0002-5196-3048"))),
         year = 2008,
         journal = "Journal of Statistical Software",
         volume = 25,
         number = 5,
         pages = "1--54",
         month = "March",
         doi = "10.18637/jss.v025.i05"
)
tm/inst/ghostscript/0000755000175100001440000000000012213264557014245 5ustar hornikuserstm/inst/ghostscript/pdf_info.ps0000644000175100001440000001604212200717467016377 0ustar hornikusers%!PS
% Copyright (C) 2007 Artifex Software, Inc. All rights reserved.
%
% This software is provided AS-IS with no warranty, either express or
% implied.
%
% This software is distributed under license and may not be copied,
% modified or distributed except as expressly authorized under the terms
% of the license contained in the file LICENSE in this distribution.
%
% For more information about licensing, please refer to
% http://www.ghostscript.com/licensing/. For information on
% commercial licensing, go to http://www.artifex.com/licensing/ or
% contact Artifex Software, Inc., 101 Lucas Valley Road #110,
% San Rafael, CA 94903, U.S.A., +1(415)492-9861.
%
% $Id: pdf_info.ps 6300 2005-12-28 19:56:24Z alexcher $
% Dump some info from a PDF file
% usage: gs -dNODISPLAY -q -sFile=____.pdf [-dDumpMediaSizes] [-dDumpFontsUsed [-dShowEmbeddedFonts] ] toolbin/pdf_info.ps

/showoptions {
  ( where "options" are:) =
  ( -dDumpMediaSizes=false (default true) MediaBox and CropBox for each page) =
  ( -dDumpFontsNeeded=false (default true)Fonts used, but not embedded) =
  ( -dDumpFontsUsed List all fonts used) =
  ( -dShowEmbeddedFonts only meaningful with -dDumpFontsUsed) =
  (\n If no options are given, the default is -dDumpMediaSizes -dDumpFontsNeeded) =
  () = flush
} bind def

/DumpMediaSizes where { pop } { /DumpMediaSizes true def } ifelse
/DumpFontsNeeded where { pop } { /DumpFontsNeeded true def } ifelse

[ shellarguments
{
  counttomark 1 eq {
    dup 0 get (-) 0 get ne {
      % File specified on the command line using: -- toolbin/pdf_info.ps infile.pdf
      /File exch def
      false     % don't show usage
    } {
      true      % show usage and quit
    } ifelse
  } {
    true
  } ifelse
  {
    (\n*** Usage: gs [options] -- toolbin/pdf_info.ps infile.pdf ***\n\n) print
    showoptions
    quit
  } if
} if

/File where not {
  (\n *** Missing input file name \(use -sFile=____.pdf\)\n) =
  ( usage: gs -dNODISPLAY -q -sFile=____.pdf [ options ] toolbin/pdf_info.ps\n) =
  showoptions
  quit
} if
pop     % discard the dict from where

/QUIET true def         % in case they forgot
() =
File dup (r) file runpdfbegin
/PDFPageCount pdfpagecount def
( ) print print ( has ) print PDFPageCount =print ( pages.\n) = flush

% Print out the "Info" dictionary if present
Trailer /Info knownoget {
  dup /Title knownoget { (Title: ) print = flush } if
  dup /Author knownoget { (Author: ) print = flush } if
  dup /Subject knownoget { (Subject: ) print = flush } if
  dup /Keywords knownoget { (Keywords: ) print = flush } if
  dup /Creator knownoget { (Creator: ) print = flush } if
  dup /Producer knownoget { (Producer: ) print = flush } if
  dup /CreationDate knownoget { (CreationDate: ) print = flush } if
  dup /ModDate knownoget { (ModDate: ) print = flush } if
  dup /Trapped knownoget { (Trapped: ) print = flush } if
} if    % if Info known

DumpMediaSizes {
  () =
  % Print out the Page Size info for each page.
  1 1 PDFPageCount {
    dup (Page ) print =print
    pdfgetpage
    dup /MediaBox pget {
      ( MediaBox: ) print oforce_array ==only
    } if
    dup /CropBox pget {
      ( CropBox: ) print oforce_array ==only
    } if
    dup /Rotate pget {
      ( Rotate = ) print =print
    } if
    pageusestransparency {
      ( Page uses transparency features) print
    } if
    () = flush
  } for
} if

% List of standard font names for use when we are showing the FontsNeeded
/StdFontNames [
  /Times-Roman /Helvetica /Courier /Symbol
  /Times-Bold /Helvetica-Bold /Courier-Bold /ZapfDingbats
  /Times-Italic /Helvetica-Oblique /Courier-Oblique
  /Times-BoldItalic /Helvetica-BoldOblique /Courier-BoldOblique
] def

/res-type-dict 10 dict begin
/Font {
  { exch pop oforce
    dup //null ne {
      dup /DescendantFonts knownoget { exch pop 0 get oforce } if
      dup /FontDescriptor knownoget {
        dup /FontFile known 1 index /FontFile2 known or exch /FontFile3 known or
        /ShowEmbeddedFonts where { pop pop //false } if
        { pop           % skip embedded fonts
        } {
          /BaseFont knownoget {         % not embedded
            FontsUsed exch //null put
          } if
        } ifelse
      } {
        /BaseFont knownoget {           % no FontDescriptor, not embedded
          FontsUsed exch //null put
        } if
      } ifelse
    } { pop } ifelse
  } forall      % traverse the dictionary
} bind def
/XObject {
  { exch pop oforce
    dup //null ne {
      dup /Subtype knownoget {
        /Form eq {
          /Resources knownoget { get-fonts-from-res } if
        } { pop } ifelse
      } { pop } ifelse
    } { pop } ifelse
  } forall
} bind def
/Pattern {
  { exch pop oforce
    dup //null ne {
      /Resources knownoget { get-fonts-from-res } if
    } { pop } ifelse
  } forall
} bind def
currentdict end readonly def

% <> get-fonts-from-res -
/get-fonts-from-res {
  oforce
  dup //null ne {
    { oforce
      dup //null ne {
        //res-type-dict 3 -1 roll .knownget { exec } { pop } ifelse
      } { pop pop } ifelse
    } forall
  } { pop } ifelse
} bind def
currentdict /res-type-dict undef

/getPDFfonts {  % (filename) getPDFfonts array_of_font_names
  /FontsUsed 1000 dict def      % this will increase if needed
  mark
  1 1 PDFPageCount {
    pdfgetpage  % get pagedict
    dup /Resources pget { get-fonts-from-res } if
    /Annots knownoget {
      { oforce
        dup //null ne {
          /AP knownoget {
            { exch pop oforce
              dup //null ne {
                dup /Resources knownoget { get-fonts-from-res } if
                { exch pop oforce
                  dup type /dicttype eq {
                    /Resources knownoget { get-fonts-from-res } if
                  } { pop } ifelse
                } forall
              } { pop } ifelse
            } forall
          } if
        } { pop } ifelse
      } forall
    } if
  } for
  % If DumpFontsUsed is not true, then remove the 'standard' fonts from the list
  systemdict /DumpFontsUsed known not {
    StdFontNames { FontsUsed 1 index known { FontsUsed 1 index undef } if pop } forall
  } if
  % Now dump the FontsUsed dict into an array so we can sort it.
  [ FontsUsed { pop } forall ]
  { 100 string cvs exch 100 string cvs exch lt } .sort
} bind def

systemdict /DumpFontsUsed known {
  (\nFont or CIDFont resources used:) =
  getPDFfonts { = } forall
} {
  DumpFontsNeeded {
    getPDFfonts
    dup length 0 gt {
      (\nFonts Needed that are not embedded \(system fonts required\):) =
      { ( ) print = } forall
    } {
      pop (\nNo system fonts are needed.) =
    } ifelse
  } if
} ifelse

quit
tm/inst/NEWS.Rd0000644000175100001440000005264414367742735012756 0ustar hornikusers\name{NEWS} \title{News for Package 'tm'} \encoding{UTF-8} \section{Changes in tm version 0.7-11}{ \subsection{BUG FIXES}{ \itemize{ \item Use the default C++ standard instead of C++11. } } } \section{Changes in tm version 0.7-10}{ \subsection{NEW FEATURES}{ \itemize{ \item All built-in \code{pGetElem()} methods now use \code{tm_parLapply()}. } } } \section{Changes in tm version 0.7-9}{ \subsection{BUG FIXES}{ \itemize{ \item Compilation fixes.
} } } \section{Changes in tm version 0.7-8}{ \subsection{BUG FIXES}{ \itemize{ \item Fix invalid counting in \code{prevalent} \code{stemCompletion()}. Reported by Bernard Chang. \item \code{tm_index()} now interprets all non-\code{TRUE} logical values returned by the filter function as \code{FALSE}. This fixes corner cases where filter functions return \code{logical(0)} or \code{NA}. Reported by Tom Nicholls. } } } \section{Changes in tm version 0.7-6}{ \subsection{NEW FEATURES}{ \itemize{ \item \code{TermDocumentMatrix.SimpleCorpus()} now also honors a logical \code{removePunctuation} control option (default: false). } } \subsection{BUG FIXES}{ \itemize{ \item Sync encoding fixes in \code{TermDocumentMatrix.SimpleCorpus()} with \code{Boost_tokenizer()}. } } } \section{Changes in tm version 0.7-5}{ \subsection{BUG FIXES}{ \itemize{ \item Handle \code{NA}s consistently in tokenizers. } } } \section{Changes in tm version 0.7-4}{ \subsection{BUG FIXES}{ \itemize{ \item Keep document names in \code{tm_map.SimpleCorpus()}. \item Fix encoding problems in \code{scan_tokenizer()} and \code{Boost_tokenizer()}. } } } \section{Changes in tm version 0.7-3}{ \subsection{BUG FIXES}{ \itemize{ \item \code{scan_tokenizer()} now works with character vectors and character strings. \item \code{removePunctuation()} now works again in \code{latin1} locales. \item Handle empty term-document matrices gracefully. } } } \section{Changes in tm version 0.7-2}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item \code{DataframeSource} now only processes data frames with the two mandatory columns \code{"doc_id"} and \code{"text"}. Additional columns are used as document level metadata. This implements compatibility with \emph{Text Interchange Formats} corpora (\url{https://github.com/ropenscilabs/tif}). \item \code{readTabular()} has been removed. Use \code{DataframeSource} instead. \item \code{removeNumbers()} and \code{removePunctuation()} now have an argument \code{ucp} to check for Unicode general categories \code{Nd} (decimal digits) and \code{P} (punctuation), respectively. Contributed by Kurt Hornik. \item The package \pkg{xml2} is now imported for \acronym{XML} functionality instead of the (\acronym{CRAN} maintainer orphaned) package \pkg{XML}. } } \subsection{NEW FEATURES}{ \itemize{ \item \code{Boost_tokenizer} provides a tokenizer based on the Boost (\url{https://www.boost.org}) Tokenizer. } } \subsection{BUG FIXES}{ \itemize{ \item Correctly handle the \code{dictionary} argument when constructing a term-document matrix from a \code{SimpleCorpus} (reported by Joe Corrigan) or from a \code{VCorpus} (reported by Mark Rosenstein). } } } \section{Changes in tm version 0.7-1}{ \subsection{BUG FIXES}{ \itemize{ \item Compilation fixes for Clang's libc++. } } } \section{Changes in tm version 0.7}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item \code{inspect.TermDocumentMatrix()} now displays a sample instead of the full matrix. The full dense representation is available via \code{as.matrix()}. } } \subsection{NEW FEATURES}{ \itemize{ \item \code{SimpleCorpus} provides a corpus which is optimized for the most common usage scenario: importing plain texts from files in a directory or directly from a vector in \R, preprocessing and transforming the texts, and finally exporting them to a term-document matrix. The aim is to boost performance and minimize memory pressure. It loads all documents into memory, and is designed for medium-sized to large data sets. 
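\item A minimal sketch of the \code{SimpleCorpus} workflow just described (an illustrative example, assuming the plain text files shipped with \pkg{tm}; the variable names are arbitrary): \preformatted{  library("tm")
  txt <- system.file("texts", "txt", package = "tm")
  corpus <- SimpleCorpus(DirSource(txt))
  dtm <- DocumentTermMatrix(corpus)
  inspect(dtm)}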
\item \code{inspect()} on text documents as a shorthand for \code{writeLines(as.character())}. \item \code{findMostFreqTerms()} finds most frequent terms in a document-term or term-document matrix, or a vector of term frequencies. \item \code{tm_parLapply()} is now internally used for the parallelization of transformations, filters, and term-document matrix construction. The preferred parallelization engine can be registered via \code{tm_parLapply_engine()}. The default is to use no parallelization (instead of \code{\link[parallel]{mclapply}} (package \pkg{parallel}) in previous versions). } } } \section{Changes in tm version 0.6-2}{ \subsection{BUG FIXES}{ \itemize{ \item \code{format.PlainTextDocument()} now reports only one character count for a whole document. } } } \section{Changes in tm version 0.6-1}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item \code{format.PlainTextDocument()} now displays a compact representation instead of the content. Use \code{as.character()} to obtain the character content (which in turn can be applied to a corpus via \code{lapply()}). } } \subsection{NEW FEATURES}{ \itemize{ \item \code{ZipSource()} for processing ZIP files. \item Sources now provide \code{open()} and \code{close()}. \item \code{termFreq()} now accepts \code{Span_Tokenizer} and \code{Token_Tokenizer} (both from package \pkg{NLP}) objects as tokenizers. \item \code{readTagged()}, a reader for text documents containing POS-tagged words. } } \subsection{BUG FIXES}{ \itemize{ \item The function \code{removeWords()} now correctly processes words being truncations of others. Reported by Александр Труфанов. } } } \section{Changes in tm version 0.6}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item \code{DirSource()} and \code{URISource()} now use the argument \code{encoding} for conversion via \code{iconv()} to \code{"UTF-8"}. \item \code{termFreq()} now uses \code{words()} as the default tokenizer. \item Text documents now provide the functions \code{content()} and \code{as.character()} to access the (possibly raw) document content and the natural language text in a suitable (not necessarily structured) form. \item The internal representation of corpora, sources, and text documents changed. Saved objects created with older \pkg{tm} versions are incompatible and need to be rebuilt. } } \subsection{NEW FEATURES}{ \itemize{ \item \code{DirSource()} and \code{URISource()} now have a \code{mode} argument specifying how elements should be read (no read, binary, text). \item Improved high-level documentation on corpora (\code{?Corpus}), text documents (\code{?TextDocument}), sources (\code{?Source}), and readers (\code{?Reader}). \item Integration with package \pkg{NLP}. \item Romanian stopwords. Suggested by Cristian Chirita. \item \code{words.PlainTextDocument()} delivers word tokens in the document. } } \subsection{BUG FIXES}{ \itemize{ \item The function \code{stemCompletion()} now avoids spurious duplicate results. Reported by Seong-Hyeon Kim. } } \subsection{DEPRECATED & DEFUNCT}{ \itemize{ \item Following functions have been removed: \itemize{ \item \code{Author()}, \code{DateTimeStamp()}, \code{CMetaData()}, \code{content_meta()}, \code{DMetaData()}, \code{Description()}, \code{Heading()}, \code{ID()}, \code{Language()}, \code{LocalMetaData()}, \code{Origin()}, \code{prescindMeta()}, \code{sFilter()} (use \code{meta()} instead). \item \code{dissimilarity()} (use \code{proxy::dist()} instead). \item \code{makeChunks()} (use \code{[} and \code{[[} manually). 
\item \code{summary.Corpus()} and \code{summary.TextRepository()} (\code{print()} now gives a more informative but succinct overview). \item \code{TextRepository()} and \code{RepoMetaData()} (use e.g. a list to store multiple corpora instead). } } } } \section{Changes in tm version 0.5-10}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item License changed to GPL-3 (from GPL-2 | GPL-3). \item Following functions have been renamed: \itemize{ \item \code{tm_tag_score()} to \code{tm_term_score()}. } } } \subsection{DEPRECATED & DEFUNCT}{ \itemize{ \item Following functions have been removed: \itemize{ \item \code{Dictionary()} (use a character vector instead; use \code{Terms()} to extract terms from a document-term or term-document matrix), \item \code{GmaneSource()} (but still available via an example in \code{XMLSource()}), \item \code{preprocessReut21578XML()} (moved to package \pkg{tm.corpus.Reuters21578}), \item \code{readGmane()} (but still available via an example in \code{readXML()}), \item \code{searchFullText()} and \code{tm_intersect()} (use \code{grep()} instead). } \item Following S3 classes are no longer registered as S4 classes: \itemize{ \item \code{VCorpus} and \code{PlainTextDocument}. } } } } \section{Changes in tm version 0.5-9}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item Stemming functionality is now provided by the package \pkg{SnowballC} replacing packages \pkg{Snowball} and \pkg{RWeka}. \item All stopword lists (besides Catalan and SMART) available via \code{stopwords()} now come from the Snowball stemmer project. \item Transformations, filters, and term-document matrix construction now use \code{\link[parallel]{mclapply}} (package \pkg{parallel}). Packages \pkg{snow} and \pkg{Rmpi} are no longer used. } } \subsection{DEPRECATED & DEFUNCT}{ \itemize{ \item Following functions have been removed: \itemize{ \item \code{tm_startCluster()} and \code{tm_stopCluster()}. } } } } \section{Changes in tm version 0.5-8}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item The function \code{termFreq()} now processes the \code{tolower} and \code{tokenize} options first. } } \subsection{NEW FEATURES}{ \itemize{ \item Catalan stopwords. Requested by Xavier Fernández i Marín. } } \subsection{BUG FIXES}{ \itemize{ \item The function \code{termFreq()} now correctly accepts user-provided stopwords. Reported by Bettina Grün. \item The function \code{termFreq()} now correctly handles the lower bound of the option \code{wordLength}. Reported by Steven C. Bagley. } } } \section{Changes in tm version 0.5-7}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item The function \code{termFreq()} provides two new arguments for generalized bounds checking of term frequencies and word lengths. This replaces the arguments minDocFreq and minWordLength. \item The function \code{termFreq()} is now sensitive to the order of control options. } } \subsection{NEW FEATURES}{ \itemize{ \item Weighting schemata for term-document matrices in SMART notation. \item Local and global options for term-document matrix construction. \item SMART stopword list was added. } } } \section{Changes in tm version 0.5-5}{ \subsection{NEW FEATURES}{ \itemize{ \item Access documents in a corpus by names (fallback to IDs if names are not set), i.e., allow a string for the corpus operator `[[`. } } \subsection{BUG FIXES}{ \itemize{ \item The function \code{findFreqTerms()} now checks bounds on a global level (to comply with the manual page) instead per document. 
Reported and fixed by Thomas Zapf-Schramm. } } } \section{Changes in tm version 0.5-4}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item Use IETF language tags for language codes (instead of ISO 639-2). } } \subsection{NEW FEATURES}{ \itemize{ \item The function \code{tm_tag_score()} provides functionality to score documents based on the number of tags found. This is useful for sentiment analysis. \item The weighting function for term frequency-inverse document frequency \code{weightTfIdf()} has now an option for term normalization. \item Plotting functions to test for Zipf's and Heaps' law on a term-document matrix were added: \code{Zipf_plot()} and \code{Heaps_plot()}. Contributed by Kurt Hornik. } } } \section{Changes in tm version 0.5-3}{ \subsection{NEW FEATURES}{ \itemize{ \item The reader function \code{readRCV1asPlain()} was added and combines the functionality of \code{readRCV1()} and \code{as.PlainTextDocument()}. \item The function \code{stemCompletion()} has a set of new heuristics. } } } \section{Changes in tm version 0.5-2}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item The function \code{termFreq()} which is used for building a term-document matrix now uses a whitespace oriented tokenizer as default. } } \subsection{NEW FEATURES}{ \itemize{ \item A combine method for merging multiple term-document matrices was added (\code{c.TermDocumentMatrix()}). \item The function \code{termFreq()} has now an option to remove punctuation characters. } } \subsection{DEPRECATED & DEFUNCT}{ \itemize{ \item Following functions have been removed: \itemize{ \item \code{CSVSource()} (use \code{DataframeSource(read.csv(..., stringsAsFactors = FALSE))} instead), and \item \code{TermDocMatrix()} (use \code{DocumentTermMatrix()} instead). } } } \subsection{BUG FIXES}{ \itemize{ \item \code{removeWords()} no longer skips words at the beginning or the end of a line. Reported by Mark Kimpel. } } } \section{Changes in tm version 0.5-1}{ \subsection{BUG FIXES}{ \itemize{ \item \code{preprocessReut21578XML()} no longer generates invalid file names. } } } \section{Changes in tm version 0.5}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item All classes, functions, and generics are reimplemented using the S3 class system. \item Following functions have been renamed: \itemize{ \item \code{activateCluster()} to \code{tm_startCluster()}, \item \code{asPlain()} to \code{as.PlainTextDocument()}, \item \code{deactivateCluster()} to \code{tm_stopCluster()}, \item \code{tmFilter()} to \code{tm_filter()}, \item \code{tmIndex()} to \code{tm_index()}, \item \code{tmIntersect()} to \code{tm_intersect()}, and \item \code{tmMap()} to \code{tm_map()}. } \item Mail handling functionality is factored out to the \pkg{tm.plugin.mail} package. } } \subsection{DEPRECATED & DEFUNCT}{ \itemize{ \item Following functions have been removed: \itemize{ \item \code{tmTolower()} (use \code{tolower()} instead), and \item \code{replacePatterns()} (use \code{gsub()} instead). } } } } \section{Changes in tm version 0.4}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item The Corpus class is now virtual providing an abstract interface. \item VCorpus, the default implementation of the abstract corpus interface (by subclassing), provides a corpus with volatile (= standard \R object) semantics. It loads all documents into memory, and is designed for small to medium-sized data sets. 
\item PCorpus, an implementation of the abstract corpus interface (by subclassing), provides a corpus with permanent storage semantics. The actual data is stored in an external database (file) object (as supported by the \pkg{filehash} package), with automatic (un-)loading into memory. It is designed for systems with small memory. \item Language codes are now in ISO 639-2 (instead of ISO 639-1). \item Reader functions no longer have a load argument for lazy loading. } } \subsection{NEW FEATURES}{ \itemize{ \item The reader function \code{readReut21578XMLasPlain()} was added and combines the functionality of \code{readReut21578XML()} and \code{asPlain()}. } } \subsection{BUG FIXES}{ \itemize{ \item \code{weightTfIdf()} no longer applies a binary weighting to an input matrix in term frequency format (which happened only in 0.3-4). } } } \section{Changes in tm version 0.3-4}{ \subsection{SIGNIFICANT USER-VISIBLE CHANGES}{ \itemize{ \item \code{.onLoad()} no longer tries to start a MPI cluster (which often failed in misconfigured environments). Use \code{activateCluster()} and \code{deactivateCluster()} instead. \item DocumentTermMatrix (the improved reimplementation of defunct TermDocMatrix) does not use the \pkg{Matrix} package anymore. } } \subsection{NEW FEATURES}{ \itemize{ \item The \code{DirSource()} constructor now accepts the two new (optional) arguments pattern and ignore.case. With pattern one can define a regular expression for selecting only matching files, and ignore.case specifies whether pattern-matching is case-sensitive. \item The \code{readNewsgroup()} reader function can now be configured for custom date formats (via the DateFormat argument). \item The \code{readPDF()} reader function can now be configured (via the PdfinfoOptions and PdftotextOptions arguments). \item The \code{readDOC()} reader function can now be configured (via the AntiwordOptions argument). \item Sources now can be vectorized. This allows faster corpus construction. \item New XMLSource class for arbitrary XML files. \item The new \code{readTabular()} reader function allows to create a custom tailor-made reader configured via mappings from a tabular data structure. \item The new \code{readXML()} reader function allows to read in arbitrary XML files which are described with a specification. \item The new \code{tmReduce()} transformation allows to combine multiple maps into one transformation. } } \subsection{DEPRECATED & DEFUNCT}{ \itemize{ \item CSVSource is defunct (use DataframeSource instead). \item weightLogical is defunct. \item TermDocMatrix is defunct (use DocumentTermMatrix or TermDocumentMatrix instead). } } } \section{Changes in tm version 0.3-3}{ \subsection{NEW FEATURES}{ \itemize{ \item The abstract Source class gets a default implementation for the \code{stepNext()} method. It increments the position counter by one, a reasonable value for most sources. For special purposes custom methods can be created via overloading \code{stepNext()} of the subclass. \item New URISource class for a single document identified by a Uniform Resource Identifier. \item New DataframeSource for documents stored in a data frame. Each row is interpreted as a single document. } } \subsection{BUG FIXES}{ \itemize{ \item Fix off-by-one error in \code{convertMboxEml()} function. Reported by Angela Bohn. \item Sort row indices in sparse term-document matrices. Kudos to Martin Mächler for his suggestions. \item Sources and readers no longer evaluate calls in a non-standard way. 
} } } \section{Changes in tm version 0.3-2}{ \subsection{NEW FEATURES}{ \itemize{ \item Weighting functions now have an Acronym slot containing abbreviations of the weighting functions' names. This is highly useful when generating tables with indications which weighting scheme was actually used for your experiments. \item The functions \code{tmFilter()}, \code{tmIndex()}, \code{tmMap()} and \code{TermDocMatrix()} now can use a MPI cluster (via the \pkg{snow} and \pkg{Rmpi} packages) if available. Use \code{(de)activateCluster()} to manually override cluster usage settings. Special thanks to Stefan Theussl for his constructive comments. \item The Source class receives a new Length slot. It contains the number of elements provided by the source (although there might be rare cases where the number cannot be determined in advance---then it should be set to zero). } } } tm/inst/texts/0000755000175100001440000000000012213264557013043 5ustar hornikuserstm/inst/texts/reuters-21578.xml0000644000175100001440000004050412074065307015742 0ustar hornikusers 26-FEB-1987 15:01:01.79 cocoa el-salvadorusauruguay C T f0704reute u f BC-BAHIA-COCOA-REVIEW 02-26 0105 BAHIA COCOA REVIEW SALVADOR, Feb 26 - Showers continued throughout the week in the Bahia cocoa zone, alleviating the drought since early January and improving prospects for the coming temporao, although normal humidity levels have not been restored, Comissaria Smith said in its weekly review. The dry period means the temporao will be late this year. Arrivals for the week ended February 22 were 155,221 bags of 60 kilos making a cumulative total for the season of 5.93 mln against 5.81 at the same stage last year. Again it seems that cocoa delivered earlier on consignment was included in the arrivals figures. Comissaria Smith said there is still some doubt as to how much old crop cocoa is still available as harvesting has practically come to an end. With total Bahia crop estimates around 6.4 mln bags and sales standing at almost 6.2 mln there are a few hundred thousand bags still in the hands of farmers, middlemen, exporters and processors. There are doubts as to how much of this cocoa would be fit for export as shippers are now experiencing dificulties in obtaining +Bahia superior+ certificates. In view of the lower quality over recent weeks farmers have sold a good part of their cocoa held on consignment. Comissaria Smith said spot bean prices rose to 340 to 350 cruzados per arroba of 15 kilos. Bean shippers were reluctant to offer nearby shipment and only limited sales were booked for March shipment at 1,750 to 1,780 dlrs per tonne to ports to be named. New crop sales were also light and all to open ports with June/July going at 1,850 and 1,880 dlrs and at 35 and 45 dlrs under New York july, Aug/Sept at 1,870, 1,875 and 1,880 dlrs per tonne FOB. Routine sales of butter were made. March/April sold at 4,340, 4,345 and 4,350 dlrs. April/May butter went at 2.27 times New York May, June/July at 4,400 and 4,415 dlrs, Aug/Sept at 4,351 to 4,450 dlrs and at 2.27 and 2.28 times New York Sept and Oct/Dec at 4,480 dlrs and 2.27 times New York Dec, Comissaria Smith said. Destinations were the U.S., Covertible currency areas, Uruguay and open ports. Cake sales were registered at 785 to 995 dlrs for March/April, 785 dlrs for May, 753 dlrs for Aug and 0.39 times New York Dec for Oct/Dec. Buyers were the U.S., Argentina, Uruguay and convertible currency areas. 
Liquor sales were limited with March/April selling at 2,325 and 2,380 dlrs, June/July at 2,375 dlrs and at 1.25 times New York July, Aug/Sept at 2,400 dlrs and at 1.25 times New York Sept and Oct/Dec at 1.25 times New York Dec, Comissaria Smith said. Total Bahia sales are currently estimated at 6.13 mln bags against the 1986/87 crop and 1.06 mln bags against the 1987/88 crop. Final figures for the period to February 28 are expected to be published by the Brazilian Cocoa Trade Commission after carnival which ends midday on February 27. Reuter 26-FEB-1987 15:02:20.00 usa F Y f0708reute d f BC-STANDARD-OIL-<SRD>-TO 02-26 0082 STANDARD OIL <SRD> TO FORM FINANCIAL UNIT CLEVELAND, Feb 26 - Standard Oil Co and BP North America Inc said they plan to form a venture to manage the money market borrowing and investment activities of both companies. BP North America is a subsidiary of British Petroleum Co Plc <BP>, which also owns a 55 pct interest in Standard Oil. The venture will be called BP/Standard Financial Trading and will be operated by Standard Oil under the oversight of a joint management committee. Reuter 26-FEB-1987 15:03:27.51 usa F A f0714reute d f BC-TEXAS-COMMERCE-BANCSH 02-26 0064 TEXAS COMMERCE BANCSHARES <TCB> FILES PLAN HOUSTON, Feb 26 - Texas Commerce Bancshares Inc's Texas Commerce Bank-Houston said it filed an application with the Comptroller of the Currency in an effort to create the largest banking network in Harris County. The bank said the network would link 31 banks having 13.5 billion dlrs in assets and 7.5 billion dlrs in deposits. Reuter 26-FEB-1987 15:07:13.72 usabrazil F f0725 reute u f BC-TALKING-POINT/BANKAME 02-26 0105 TALKING POINT/BANKAMERICA <BAC> EQUITY OFFER by Janie Gabbett, Reuters LOS ANGELES, Feb 26 - BankAmerica Corp is not under pressure to act quickly on its proposed equity offering and would do well to delay it because of the stock's recent poor performance, banking analysts said. Some analysts said they have recommended BankAmerica delay its up to one-billion-dlr equity offering, which has yet to be approved by the Securities and Exchange Commission. BankAmerica stock fell this week, along with other banking issues, on the news that Brazil has suspended interest payments on a large portion of its foreign debt. The stock traded around 12, down 1/8, this afternoon, after falling to 11-1/2 earlier this week on the news. Banking analysts said that with the immediate threat of the First Interstate Bancorp <I> takeover bid gone, BankAmerica is under no pressure to sell the securities into a market that will be nervous on bank stocks in the near term. BankAmerica filed the offer on January 26. It was seen as one of the major factors leading the First Interstate withdrawing its takeover bid on February 9. A BankAmerica spokesman said SEC approval is taking longer than expected and market conditions must now be re-evaluated. "The circumstances at the time will determine what we do," said Arthur Miller, BankAmerica's Vice President for Financial Communications, when asked if BankAmerica would proceed with the offer immediately after it receives SEC approval. "I'd put it off as long as they conceivably could," said Lawrence Cohn, analyst with Merrill Lynch, Pierce, Fenner and Smith. Cohn said the longer BankAmerica waits, the longer they have to show the market an improved financial outlook. Although BankAmerica has yet to specify the types of equities it would offer, most analysts believed a convertible preferred stock would encompass at least part of it. 
Such an offering at a depressed stock price would mean a lower conversion price and more dilution to BankAmerica stock holders, noted Daniel Williams, analyst with Sutro Group. Several analysts said that while they believe the Brazilian debt problem will continue to hang over the banking industry through the quarter, the initial shock reaction is likely to ease over the coming weeks. Nevertheless, BankAmerica, which holds about 2.70 billion dlrs in Brazilian loans, stands to lose 15-20 mln dlrs if the interest rate is reduced on the debt, and as much as 200 mln dlrs if Brazil pays no interest for a year, said Joseph Arsenio, analyst with Birr, Wilson and Co. He noted, however, that any potential losses would not show up in the current quarter. With other major banks standing to lose even more than BankAmerica if Brazil fails to service its debt, the analysts said they expect the debt will be restructured, similar to way Mexico's debt was, minimizing losses to the creditor banks. Reuter 26-FEB-1987 15:10:44.60 grainwheatcornbarleyoatsorghum usa C G f0738 reute u f BC-average-prices 02-26 0095 NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESERVE WASHINGTON, Feb 26 - The U.S. Agriculture Department reported the farmer-owned reserve national five-day average price through February 25 as follows (Dlrs/Bu-Sorghum Cwt) - Natl Loan Release Call Avge Rate-X Level Price Price Wheat 2.55 2.40 IV 4.65 -- V 4.65 -- VI 4.45 -- Corn 1.35 1.92 IV 3.15 3.15 V 3.25 -- X - 1986 Rates. Natl Loan Release Call Avge Rate-X Level Price Price Oats 1.24 0.99 V 1.65 -- Barley n.a. 1.56 IV 2.55 2.55 V 2.65 -- Sorghum 2.34 3.25-Y IV 5.36 5.36 V 5.54 -- Reserves I, II and III have matured. Level IV reflects grain entered after Oct 6, 1981 for feedgrain and after July 23, 1981 for wheat. Level V wheat/barley after 5/14/82, corn/sorghum after 7/1/82. Level VI covers wheat entered after January 19, 1984. X-1986 rates. Y-dlrs per CWT (100 lbs). n.a.-not available. Reuter 26-FEB-1987 15:14:36.41 veg-oillinseedlin-oilsoy-oilsun-oilsoybeanoilseedcornsunseedgrainsorghumwheat argentina G f0754 reute r f BC-ARGENTINE-1986/87-GRA 02-26 0066 ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS BUENOS AIRES, Feb 26 - Argentine grain board figures show crop registrations of grains, oilseeds and their products to February 11, in thousands of tonnes, showing those for futurE shipments month, 1986/87 total and 1985/86 total to February 12, 1986, in brackets: Bread wheat prev 1,655.8, Feb 872.0, March 164.6, total 2,692.4 (4,161.0). Maize Mar 48.0, total 48.0 (nil). Sorghum nil (nil) Oilseed export registrations were: Sunflowerseed total 15.0 (7.9) Soybean May 20.0, total 20.0 (nil) The board also detailed export registrations for subproducts, as follows, SUBPRODUCTS Wheat prev 39.9, Feb 48.7, March 13.2, Apr 10.0, total 111.8 (82.7) . Linseed prev 34.8, Feb 32.9, Mar 6.8, Apr 6.3, total 80.8 (87.4). Soybean prev 100.9, Feb 45.1, MAr nil, Apr nil, May 20.0, total 166.1 (218.5). Sunflowerseed prev 48.6, Feb 61.5, Mar 25.1, Apr 14.5, total 149.8 (145.3). Vegetable oil registrations were : Sunoil prev 37.4, Feb 107.3, Mar 24.5, Apr 3.2, May nil, Jun 10.0, total 182.4 (117.6). Linoil prev 15.9, Feb 23.6, Mar 20.4, Apr 2.0, total 61.8, (76.1). Soybean oil prev 3.7, Feb 21.1, Mar nil, Apr 2.0, May 9.0, Jun 13.0, Jul 7.0, total 55.8 (33.7). 
REUTER 26-FEB-1987 15:14:42.83 usa F f0755 reute d f BC-RED-LION-INNS-FILES-P 02-26 0082 RED LION INNS FILES PLANS OFFERING PORTLAND, Ore., Feb 26 - Red Lion Inns Limited Partnership said it filed a registration statement with the Securities and Exchange Commission covering a proposed offering of 4,790,000 units of limited partnership interests. The company said it expects the offering to be priced at 20 dlrs per unit. It said proceeds from the offering, along with a 102.5 mln dlr mortgage loan, will be used to finance its planned acquisition of 10 Red Lion hotels. Reuter 26-FEB-1987 15:15:40.12 usa F A RM f0758 reute u f BC-USX-<X>-DEBT-DOWGRADE 02-26 0103 USX <X> DEBT DOWGRADED BY MOODY'S NEW YORK, Feb 26 - Moody's Investors Service Inc said it lowered the debt and preferred stock ratings of USX Corp and its units. About seven billion dlrs of securities is affected. Moody's said Marathon Oil Co's recent establishment of up to one billion dlrs in production payment facilities on its prolific Yates Field has significant negative implications for USX's unsecured creditors. The company appears to have positioned its steel segment for a return to profit by late 1987, Moody's added. Ratings lowered include those on USX's senior debt to BA-1 from BAA-3. Reuter 26-FEB-1987 15:17:11.20 earn usa F f0762 reute r f BC-CHAMPION-PRODUCTS-<CH 02-26 0067 CHAMPION PRODUCTS <CH> APPROVES STOCK SPLIT ROCHESTER, N.Y., Feb 26 - Champion Products Inc said its board of directors approved a two-for-one stock split of its common shares for shareholders of record as of April 1, 1987. The company also said its board voted to recommend to shareholders at the annual meeting April 23 an increase in the authorized capital stock from five mln to 25 mln shares. Reuter 26-FEB-1987 15:18:06.67 acq usa F f0767 reute d f BC-COMPUTER-TERMINAL-SYS 02-26 0107 COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SALE COMMACK, N.Y., Feb 26 - Computer Terminal Systems Inc said it has completed the sale of 200,000 shares of its common stock, and warrants to acquire an additional one mln shares, to <Sedio N.V.> of Lugano, Switzerland for 50,000 dlrs. The company said the warrants are exercisable for five years at a purchase price of .125 dlrs per share. Computer Terminal said Sedio also has the right to buy additional shares and increase its total holdings up to 40 pct of the Computer Terminal's outstanding common stock under certain circumstances involving change of control at the company. The company said if the conditions occur the warrants would be exercisable at a price equal to 75 pct of its common stock's market price at the time, not to exceed 1.50 dlrs per share. Computer Terminal also said it sold the technolgy rights to its Dot Matrix impact technology, including any future improvements, to <Woodco Inc> of Houston, Tex. for 200,000 dlrs. But, it said it would continue to be the exclusive worldwide licensee of the technology for Woodco. The company said the moves were part of its reorganization plan and would help pay current operation costs and ensure product delivery. Computer Terminal makes computer generated labels, forms, tags and ticket printers and terminals. Reuter tm/inst/texts/txt/0000755000175100001440000000000012213264556013661 5ustar hornikuserstm/inst/texts/txt/ovid_3.txt0000644000175100001440000000134412074065306015604 0ustar hornikusers vera canam: coeptis, mater Amoris, ades! este procul, vittae tenues, insigne pudoris, quaeque tegis medios, instita longa, pedes. 
nos venerem tutam concessaque furta canemus, inque meo nullum carmine crimen erit. principio, quod amare velis, reperire labora, qui nova nunc primum miles in arma venis. proximus huic labor est placitam exorare puellam: tertius, ut longo tempore duret amor. hic modus, haec nostro signabitur area curru: haec erit admissa meta terenda rota. dum licet, et loris passim potes ire solutis, elige cui dicas 'tu mihi sola places.' haec tibi non tenues veniet delapsa per auras: quaerenda est oculis apta puella tuis. tm/inst/texts/txt/ovid_1.txt0000644000175100001440000000126412074065306015603 0ustar hornikusers Si quis in hoc artem populo non novit amandi, hoc legat et lecto carmine doctus amet. arte citae veloque rates remoque moventur, arte leves currus: arte regendus amor. curribus Automedon lentisque erat aptus habenis, Tiphys in Haemonia puppe magister erat: me Venus artificem tenero praefecit Amori; Tiphys et Automedon dicar Amoris ego. ille quidem ferus est et qui mihi saepe repugnet: sed puer est, aetas mollis et apta regi. Phillyrides puerum cithara perfecit Achillem, atque animos placida contudit arte feros. qui totiens socios, totiens exterruit hostes, creditur annosum pertimuisse senem. tm/inst/texts/txt/ovid_2.txt0000644000175100001440000000131612074065306015602 0ustar hornikusers quas Hector sensurus erat, poscente magistro verberibus iussas praebuit ille manus. Aeacidae Chiron, ego sum praeceptor Amoris: saevus uterque puer, natus uterque dea. sed tamen et tauri cervix oneratur aratro, frenaque magnanimi dente teruntur equi; et mihi cedet Amor, quamvis mea vulneret arcu pectora, iactatas excutiatque faces. quo me fixit Amor, quo me violentius ussit, hoc melior facti vulneris ultor ero: non ego, Phoebe, datas a te mihi mentiar artes, nec nos aëriae voce monemur avis, nec mihi sunt visae Clio Cliusque sorores servanti pecudes vallibus, Ascra, tuis: usus opus movet hoc: vati parete perito; tm/inst/texts/txt/ovid_4.txt0000644000175100001440000000137412074065306015610 0ustar hornikusers scit bene venator, cervis ubi retia tendat, scit bene, qua frendens valle moretur aper; aucupibus noti frutices; qui sustinet hamos, novit quae multo pisce natentur aquae: tu quoque, materiam longo qui quaeris amori, ante frequens quo sit disce puella loco. non ego quaerentem vento dare vela iubebo, nec tibi, ut invenias, longa terenda via est. Andromedan Perseus nigris portarit ab Indis, raptaque sit Phrygio Graia puella viro, tot tibi tamque dabit formosas Roma puellas, 'Haec habet' ut dicas 'quicquid in orbe fuit.' Gargara quot segetes, quot habet Methymna racemos, aequore quot pisces, fronde teguntur aves, quot caelum stellas, tot habet tua Roma puellas: tm/inst/texts/txt/ovid_5.txt0000644000175100001440000000131612074065306015605 0ustar hornikusers mater in Aeneae constitit urbe sui. seu caperis primis et adhuc crescentibus annis, ante oculos veniet vera puella tuos: sive cupis iuvenem, iuvenes tibi mille placebunt. cogeris voti nescius esse tui: seu te forte iuvat sera et sapientior aetas, hoc quoque, crede mihi, plenius agmen erit. tu modo Pompeia lentus spatiare sub umbra, cum sol Herculei terga leonis adit: aut ubi muneribus nati sua munera mater addidit, externo marmore dives opus. nec tibi vitetur quae, priscis sparsa tabellis, porticus auctoris Livia nomen habet: quaque parare necem miseris patruelibus ausae Belides et stricto stat ferus ense pater. 
tm/inst/texts/loremipsum.txt0000644000175100001440000000622212074065307015777 0ustar hornikusersLorem ipsum dolor sit amet, consectetur adipiscing elit. Sed at ante. Mauris eleifend, quam a vulputate dictum, massa quam dapibus leo, eget vulputate orci purus ut lorem. In fringilla mi in ligula. Pellentesque aliquam quam vel dolor. Nunc adipiscing. Sed quam odio, tempus ac, aliquam molestie, varius ac, tellus. Vestibulum ut nulla aliquam risus rutrum interdum. Pellentesque lorem. Curabitur sit amet erat quis risus feugiat viverra. Pellentesque augue justo, sagittis et, lacinia at, venenatis non, arcu. Nunc nec libero. In cursus dictum risus. Etiam tristique nisl a nulla. Ut a orci. Curabitur dolor nunc, egestas at, accumsan at, malesuada nec, magna. Nulla facilisi. Nunc volutpat. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Ut sit amet orci vel mauris blandit vehicula. Nullam quis enim. Integer dignissim viverra velit. Curabitur in odio. In hac habitasse platea dictumst. Ut consequat, tellus eu volutpat varius, justo orci elementum dolor, sed imperdiet nulla tellus ut diam. Vestibulum ipsum ante, malesuada quis, tempus ac, placerat sit amet, elit. Sed eget turpis a pede tempor malesuada. Vivamus quis mi at leo pulvinar hendrerit. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Pellentesque aliquet lacus vitae pede. Nullam mollis dolor ac nisi. Phasellus sit amet urna. Praesent pellentesque sapien sed lacus. Donec lacinia odio in odio. In sit amet elit. Maecenas gravida interdum urna. Integer pretium, arcu vitae imperdiet facilisis, elit tellus tempor nisi, vel feugiat ante velit sit amet mauris. Vivamus arcu. Integer pharetra magna ac lacus. Aliquam vitae sapien in nibh vehicula auctor. Suspendisse leo mauris, pulvinar sed, tempor et, consequat ac, lacus. Proin velit. Nulla semper lobortis mauris. Duis urna erat, ornare et, imperdiet eu, suscipit sit amet, massa. Nulla nulla nisi, pellentesque at, egestas quis, fringilla eu, diam. Donec semper, sem nec tristique tempus, justo neque commodo nisl, ut gravida sem tellus suscipit nunc. Aliquam erat volutpat. Ut tincidunt pretium elit. Aliquam pulvinar. Nulla cursus. Suspendisse potenti. Etiam condimentum hendrerit felis. Duis iaculis aliquam enim. Donec dignissim augue vitae orci. Curabitur luctus felis a metus. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. In varius neque at enim. Suspendisse massa nulla, viverra in, bibendum vitae, tempor quis, lorem. Donec dapibus orci sit amet elit. Maecenas rutrum ultrices lectus. Aliquam suscipit, lacus a iaculis adipiscing, eros orci pellentesque nisl, non pharetra dolor urna nec dolor. Integer cursus dolor vel magna. Integer ultrices feugiat sem. Proin nec nibh. Duis eu dui quis nunc sagittis lobortis. Fusce pharetra, enim ut sodales luctus, lectus arcu rhoncus purus, in fringilla augue elit vel lacus. In hac habitasse platea dictumst. Aliquam erat volutpat. Fusce iaculis elit id tellus. Ut accumsan malesuada turpis. Suspendisse potenti. Vestibulum lacus augue, lobortis mattis, laoreet in, varius at, nisi. Nunc gravida. Phasellus faucibus. In hac habitasse platea dictumst. Integer tempor lacus eget lectus. Praesent fringilla augue fringilla dui. tm/inst/texts/custom.xml0000644000175100001440000000070512074065307015076 0ustar hornikusers Ano Nymous The Invisible Man A story about an invisible man. Science fiction Sokrates Scio Nescio I know that I know nothing. 
Classics tm/inst/texts/rcv1_2330.xml0000644000175100001440000000265512074065307015114 0ustar hornikusers USA: Tylan stock jumps; weighs sale of company. Tylan stock jumps; weighs sale of company. SAN DIEGO

The stock of Tylan General Inc. jumped Tuesday after the maker of process-management equipment said it is exploring the sale of the company and added that it has already received some inquiries from potential buyers.

Tylan was up $2.50 to $12.75 in early trading on the Nasdaq market.

The company said it has set up a committee of directors to oversee the sale and that Goldman, Sachs & Co. has been retained as its financial adviser.

(c) Reuters Limited 1996
tm/inst/texts/crude/0000755000175100001440000000000012213264556014144 5ustar hornikuserstm/inst/texts/crude/reut-00004.xml0000644000175100001440000000151512074065306016305 0ustar hornikusers 26-FEB-1987 18:18:00.84 crude canada Y E f0308 reute u f BC-TEXACO-CANADA-<TXC>-L 02-26 0064 TEXACO CANADA <TXC> LOWERS CRUDE POSTINGS NEW YORK, Feb 26 - Texaco Canada said it lowered the contract price it will pay for crude oil 64 Canadian cts a barrel, effective today. The decrease brings the company's posted price for the benchmark grade, Edmonton/Swann Hills Light Sweet, to 22.26 Canadian dlrs a bbl. Texaco Canada last changed its crude oil postings on Feb 19. Reuter tm/inst/texts/crude/reut-00010.xml0000644000175100001440000000511712074065306016304 0ustar hornikusers 1-MAR-1987 08:22:30.94 crude qatar RM f0413 reute u f BC-QATAR-UNVEILS-BUDGET 03-01 0111 QATAR UNVEILS BUDGET FOR FISCAL 1987/88 DOHA, March 1 - The Gulf oil state of Qatar, recovering slightly from last year's decline in world oil prices, announced its first budget since early 1985 and projected a deficit of 5.472 billion riyals. The deficit compared with a shortfall of 7.3 billion riyals in the last published budget for 1985/86. In a statement outlining the budget for the fiscal year 1987/88 beginning today, Finance and Petroleum Minister Sheikh Abdul-Aziz bin Khalifa al-Thani said the government expected to spend 12.217 billion riyals in the period. Projected expenditure in the 1985/86 budget had been 15.6 billion riyals. Sheikh Abdul-Aziz said government revenue would be about 6.745 billion riyals, down by about 30 pct on the 1985/86 projected revenue of 9.7 billion. The government failed to publish a 1986/87 budget due to uncertainty surrounding oil revenues. Sheikh Abdul-Aziz said that during that year the government decided to limit recurrent expenditure each month to one-twelfth of the previous fiscal year's allocations minus 15 pct. He urged heads of government departments and public institutions to help the government rationalise expenditure. He did not say how the 1987/88 budget shortfall would be covered. Sheikh Abdul-Aziz said plans to limit expenditure in 1986/87 had been taken in order to relieve the burden placed on the country's foreign reserves. He added in 1987/88 some 2.766 billion riyals had been allocated for major projects including housing and public buildings, social services, health, education, transport and communications, electricity and water, industry and agriculture. No figure was revealed for expenditure on defence and security. There was also no projection for oil revenue. Qatar, an OPEC member, has an output ceiling of 285,000 barrels per day. Sheikh Abdul-Aziz said: "Our expectations of positive signs regarding (oil) price trends, foremost among them OPEC's determination to shoulder its responsibilites and protect its wealth, have helped us make reasonable estimates for the coming year's revenue on the basis of our assigned quota." REUTER tm/inst/texts/crude/reut-00012.xml0000644000175100001440000000543412074065306016310 0ustar hornikusers 2-MAR-1987 01:05:49.72 crude saudi-arabia uae opec RM f0600 reute b f BC-SAUDI-FEBRUARY-CRUDE 03-02 0095 SAUDI FEBRUARY CRUDE OUTPUT PUT AT 3.5 MLN BPD ABU DHABI, March 2 - Saudi crude oil output last month fell to an average of 3.5 mln barrels per day (bpd) from 3.8 mln bpd in January, Gulf oil sources said. 
They said exports from the Ras Tanurah and Ju'aymah terminals in the Gulf fell to an average 1.9 mln bpd last month from 2.2 mln in January because of lower liftings by some customers. But the drop was much smaller than expected after Gulf exports rallied in the fourth week of February to 2.5 mln bpd from 1.2 mln in the third week, the sources said. The production figures include neutral zone output but not sales from floating storage, which are generally considered part of a country's output for Opec purposes. Saudi Arabia has an Opec quota of 4.133 mln bpd under a production restraint scheme approved by the 13-nation group last December to back new official oil prices averaging 18 dlrs a barrel. The sources said the two-fold jump in exports last week appeared to be the result of buyers rushing to lift February entitlements before the month-end. Last week's high export levels appeared to show continued support for official Opec prices from Saudi Arabia's main crude customers, the four ex-partners of Aramco, the sources said. The four -- Exxon Corp <XON>, Mobil Corp <MOB>, Texaco Inc <TX> and Chevron Corp <CHV> -- signed a long-term agreement last month to buy Saudi crude for 17.52 dlrs a barrel. However the sources said the real test of Saudi Arabia's ability to sell crude at official prices in a weak market will come this month, when demand for petroleum products traditionally tapers off. Spot prices have fallen in recent weeks to more than one dlr below Opec levels. Saudi Arabian oil minister Hisham Nazer yesterday reiterated the kingdom's commitment to the December OPEC accord and said it would never sell below official prices. The sources said total Saudi refinery throughput fell slightly in February to an average 1.1 mln bpd from 1.2 mln in January because of cuts at the Yanbu and Jubail export refineries. They put crude oil exports through Yanbu at 100,000 bpd last month, compared to zero in January, while throughput at Bahrain's refinery and neutral zone production remained steady at around 200,000 bpd each. REUTER tm/inst/texts/crude/reut-00016.xml0000644000175100001440000000216612074065306016313 0ustar hornikusers 2-MAR-1987 08:25:42.14 crude ship usa Y F f0300 reute u f BC-PHILADELPHIA-PORT-CLO 03-02 0115 PHILADELPHIA PORT CLOSED BY TANKER CRASH PHILADELPHIA, March 2 - The port of Philadelphia was closed when a Cypriot oil tanker, Seapride II, ran aground after hitting a 200-foot tower supporting power lines across the river, a Coast Guard spokesman said. He said there was no oil spill but the ship is lodged on rocks opposite the Hope Creek nuclear power plant in New Jersey. He said the port would be closed until today when they hoped to refloat the ship on the high tide. After delivering oil to a refinery in Paulsboro, New Jersey, the ship apparently lost its steering and hit the power transmission line carrying power from the nuclear plant to the state of Delaware. Reuter tm/inst/texts/crude/reut-00002.xml0000644000175100001440000000634512074065306016311 0ustar hornikusers 26-FEB-1987 17:34:11.89 crude usa opec Y f0189 reute r f BC-/OPEC-MAY-HAVE-TO-MEE 02-26 0105 OPEC MAY HAVE TO MEET TO FIRM PRICES - ANALYSTS BY TED D'AFFLISIO, Reuters NEW YORK, Feb 26 - OPEC may be forced to meet before a scheduled June session to readdress its production cutting agreement if the organization wants to halt the current slide in oil prices, oil industry analysts said. "The movement to higher oil prices was never to be as easy as OPEC thought. 
They may need an emergency meeting to sort out the problems," said Daniel Yergin, director of Cambridge Energy Research Associates, CERA. Analysts and oil industry sources said the problem OPEC faces is excess oil supply in world oil markets. "OPEC's problem is not a price problem but a production issue and must be addressed in that way," said Paul Mlotok, oil analyst with Salomon Brothers Inc. He said the market's earlier optimism about OPEC and its ability to keep production under control have given way to a pessimistic outlook that the organization must address soon if it wishes to regain the initiative in oil prices. But some other analysts were uncertain that even an emergency meeting would address the problem of OPEC production above the 15.8 mln bpd quota set last December. "OPEC has to learn that in a buyers market you cannot have deemed quotas, fixed prices and set differentials," said the regional manager for one of the major oil companies who spoke on condition that he not be named. "The market is now trying to teach them that lesson again," he added. David T. Mizrahi, editor of Mideast reports, expects OPEC to meet before June, although not immediately. However, he is not optimistic that OPEC can address its principal problems. "They will not meet now as they try to take advantage of the winter demand to sell their oil, but in late March and April when demand slackens," Mizrahi said. But Mizrahi said that OPEC is unlikely to do anything more than reiterate its agreement to keep output at 15.8 mln bpd." Analysts said that the next two months will be critical for OPEC's ability to hold together prices and output. "OPEC must hold to its pact for the next six to eight weeks since buyers will come back into the market then," said Dillard Spriggs of Petroleum Analysis Ltd in New York. But Bijan Moussavar-Rahmani of Harvard University's Energy and Environment Policy Center said that the demand for OPEC oil has been rising through the first quarter and this may have prompted excesses in its production. "Demand for their (OPEC) oil is clearly above 15.8 mln bpd and is probably closer to 17 mln bpd or higher now so what we are seeing characterized as cheating is OPEC meeting this demand through current production," he told Reuters in a telephone interview. Reuter tm/inst/texts/crude/reut-00007.xml0000644000175100001440000000647212074065306016317 0ustar hornikusers 1-MAR-1987 03:25:46.85 crude kuwait ecuador opec RM f0374 reute b f BC-KUWAIT-SAYS-NO-PLANS 03-01 0091 KUWAIT SAYS NO PLANS FOR EMERGENCY OPEC TALKS KUWAIT, March 1 - Kuwait"s Oil Minister, in remarks published today, said there were no plans for an emergency OPEC meeting to review oil policies after recent weakness in world oil prices. Sheikh Ali al-Khalifa al-Sabah was quoted by the local daily al-Qabas as saying: "None of the OPEC members has asked for such a meeting." He denied Kuwait was pumping above its quota of 948,000 barrels of crude daily (bpd) set under self-imposed production limits of the 13-nation organisation. Traders and analysts in international oil markets estimate OPEC is producing up to one mln bpd above a ceiling of 15.8 mln bpd agreed in Geneva last December. They named Kuwait and the United Arab Emirates, along with the much smaller producer Ecuador, among those producing above quota. Kuwait, they said, was pumping 1.2 mln bpd. "This rumour is baseless. It is based on reports which said Kuwait has the ability to exceed its share. 
They suppose that because Kuwait has the ability, it will do so," the minister said. Sheikh Ali has said before that Kuwait had the ability to produce up to 4.0 mln bpd. "If we can sell more than our quota at official prices, while some countries are suffering difficulties marketing their share, it means we in Kuwait are unusually clever," he said. He was referring apparently to the Gulf state of Qatar, which industry sources said was selling less than 180,000 bpd of its 285,000 bpd quota, because buyers were resisting official prices restored by OPEC last month pegged to a marker of 18 dlrs per barrel. Prices in New York last week dropped to their lowest levels this year and almost three dollars below a three-month high of 19 dollars a barrel. Sheikh Ali also delivered "a challenge to any international oil company that declared Kuwait sold below official prices." Because it was charging its official price, of 16.67 dlrs a barrel, it had lost custom, he said, but did not elaborate. However, Kuwait had guaranteed markets for its oil because of its local and international refining facilities and its own distribution network abroad, he added. He reaffirmed that the planned meeting March 7 of OPEC's differentials committee has been postponed until the start of April at the request of certain of the body's members. Ecuador's deputy energy minister Fernando Santos Alvite said last Wednesday his debt-burdened country wanted OPEC to assign a lower official price for its crude, and was to seek this at talks this month of OPEC's pricing committee. Referring to pressure by oil companies on OPEC members, in apparent reference to difficulties faced by Qatar, he said: "We expected such pressure. It will continue through March and April." But he expected the situation would later improve. REUTER tm/inst/texts/crude/reut-00006.xml0000644000175100001440000000204712074065306016310 0ustar hornikusers 26-FEB-1987 19:00:57.33 crude usa F Y f0379 reute d f BC-HOUSTON-OIL-<HO>-RESE 02-26 0101 HOUSTON OIL <HO> RESERVES STUDY COMPLETED HOUSTON, Feb 26 - Houston Oil Trust said that independent petroleum engineers completed an annual study that estimates the trust's future net revenues from total proved reserves at 88 mln dlrs and its discounted present value of the reserves at 64 mln dlrs. Based on the estimate, the trust said there may be no money available for cash distributions to unitholders for the remainder of the year. It said the estimates reflect a decrease of about 44 pct in net reserve revenues and 39 pct in discounted present value compared with the study made in 1985. Reuter tm/inst/texts/crude/reut-00014.xml0000644000175100001440000000227212074065306016307 0ustar hornikusers 2-MAR-1987 07:43:22.81 crude saudi-arabia bahrain hisham-nazer opec F f0161 reute r f AM-OIL-SAUDI 03-02 0114 SAUDI ARABIA REITERATES COMMITMENT TO OPEC ACCORD BAHRAIN, March 2 - Saudi Arabian Oil Minister Hisham Nazer reiterated the kingdom's commitment to last December's OPEC accord to boost world oil prices and stabilize the market, the official Saudi Press Agency SPA said. Asked by the agency about the recent fall in free market oil prices, Nazer said Saudi Arabia "is fully adhering by the ... accord and it will never sell its oil at prices below the pronounced prices under any circumstance." Saudi Arabia was a main architect of the December pact under which OPEC agreed to cut its total oil output ceiling by 7.25 pct and return to fixed prices of around 18 dollars a barrel.
Reuter tm/inst/texts/crude/reut-00022.xml0000644000175100001440000000453612074065306016313 0ustar hornikusers 2-MAR-1987 14:38:34.72 crude usa nymex Y f0753 reute r f BC-NYMEX-WILL-EXPAND-OFF 03-02 0103 NYMEX WILL EXPAND OFF-HOUR TRADING APRIL ONE By BERNICE NAPACH, Reuters NEW YORK, March 2 - The New York Mercantile Exchange set April one for the debut of a new procedure in the energy complex that will increase the use of energy futures worldwide. On April one, NYMEX will allow oil traders that do not hold a futures position to initiate, after the exchange closes, a transaction that can subsequently be hedged in the futures market, according to an exchange spokeswoman. "This will change the way oil is transacted in the real world," said Thomas McKiernan, McKiernan and Co chairman. Foreign traders will be able to hedge trades against NYMEX prices before the exchange opens and negotiate prices at a differential to NYMEX prices, McKiernan explained. The expanded program "will serve the industry because the oil market does not close when NYMEX does," said Frank Capozza, secretary of Century Resources Inc. The rule change, which has already taken effect for platinum futures on NYMEX, is expected to increase the open interest and liquidity in U.S. energy futures, according to traders and analysts. Currently, at least one trader in this transaction, called an exchange for physical or EFP, must hold a futures position before entering into the transaction. Under the new arrangement, neither party has to hold a futures position before entering into an EFP and one or both parties can offset their cash transaction with a futures contract the next day, according to exchange officials. When NYMEX announced its proposed rule change in December, NYMEX President Rosemary McFadden said, "Expansion of the EFP provision will add to globalization of the energy markets by providing for, in effect, 24-hour trading." The Commodity Futures Trading Commission approved the rule change in February, according to a CFTC spokeswoman. Reuter tm/inst/texts/crude/reut-00005.xml0000644000175100001440000000157412074065306016313 0ustar hornikusers 26-FEB-1987 18:21:01.50 crude usa Y f0313 reute u f BC-MARATHON-PETROLEUM-RE 02-26 0075 MARATHON PETROLEUM REDUCES CRUDE POSTINGS NEW YORK, Feb 26 - Marathon Petroleum Co said it reduced the contract price it will pay for all grades of crude oil one dlr a barrel, effective today. The decrease brings Marathon's posted price for both West Texas Intermediate and West Texas Sour to 16.50 dlrs a bbl. The South Louisiana Sweet grade of crude was reduced to 16.85 dlrs a bbl. The company last changed its crude postings on Jan 12. Reuter tm/inst/texts/crude/reut-00015.xml0000644000175100001440000000213712074065306016310 0ustar hornikusers 2-MAR-1987 07:43:41.57 crude kuwait opec V f0163 reute r f BC-OIL-KUWAIT 03-02 0109 KUWAIT MINISTER SAYS NO EMERGENCY OPEC TALKS SET KUWAIT, March 2 - Kuwait's oil minister said in a newspaper interview that there were no plans for an emergency OPEC meeting after the recent weakness in world oil prices. Sheikh Ali al-Khalifa al-Sabah was quoted by the local daily al-Qabas as saying that "none of the OPEC members has asked for such a meeting." He also denied that Kuwait was pumping above its OPEC quota of 948,000 barrels of crude daily (bpd). Crude oil prices fell sharply last week as international oil traders and analysts estimated the 13-nation OPEC was pumping up to one million bpd over its self-imposed limits.
Reuter tm/inst/texts/crude/reut-00008.xml0000644000175100001440000000654412074065306016320 0ustar hornikusers 1-MAR-1987 03:39:14.63 crude indonesia usa worldbank RM f0379 reute u f BC-INDONESIA-SEEN-AT-CRO 03-01 0107 INDONESIA SEEN AT CROSSROADS OVER ECONOMIC CHANGE By Jeremy Clift, Reuters JAKARTA, March 1 - Indonesia appears to be nearing a political crossroads over measures to deregulate its protected economy, the U.S. Embassy says in a new report. To counter falling oil revenues, the government has launched a series of measures over the past nine months to boost exports outside the oil sector and attract new investment. Indonesia, the only Asian member of OPEC and a leading primary commodity producer, has been severely hit by last year's fall in world oil prices, which forced it to devalue its currency by 31 pct in September. But the U.S. Embassy report says President Suharto's government appears to be divided over what direction to lead the economy. "(It) appears to be nearing a crossroads with regard to deregulation, both as it pertains to investments and imports," the report says. It primarily assesses Indonesia's agricultural sector, but also reviews the country's general economic performance. It says that while many government officials and advisers are recommending further relaxation, "there are equally strong pressures being exerted to halt all such moves." "This group strongly favours an import substitution economy," the report says. Indonesia's economic changes have been welcomed by the World Bank and international bankers as steps in the right direction, though they say crucial areas of the economy like plastics and steel remain highly protected, and virtual monopolies. Three sets of measures have been announced since last May, which broadened areas for foreign investment, reduced trade restrictions and liberalised imports. The report says Indonesia's economic growth in calendar 1986 was probably about zero, and the economy may even have contracted a bit. "This is the lowest rate of growth since the mid-1960s," the report notes. Indonesia, the largest country in South-East Asia with a population of 168 million, is facing general elections in April. But the report holds out little hope for swift improvement in the economic outlook. "For 1987 early indications point to a slightly positive growth rate not exceeding one pct. Economic activity continues to suffer due to the sharp fall in export earnings from the petroleum industry." "Growth in the non-oil sector is low because of weak domestic demand coupled with excessive plant capacity, real declines in construction and trade, and a reduced level of growth in agriculture," the report states. Bankers say continuation of present economic reforms is crucial for the government to get the international lending it needs. A new World Bank loan of 300 mln dlrs last month in balance of payments support was given partly to help the government maintain the momentum of reform, the Bank said. REUTER tm/inst/texts/crude/reut-00013.xml0000644000175100001440000000225012074065306016302 0ustar hornikusers 2-MAR-1987 07:39:23.30 crude uae bahrain saudi-arabia kuwait qatar opec V f0149 reute r f BC-GULF-ARAB-DEPUTY-OIL 03-02 0110 GULF ARAB DEPUTY OIL MINISTERS TO MEET IN BAHRAIN ABU DHABI, March 2 - Deputy oil ministers from six Gulf Arab states will meet in Bahrain today to discuss coordination of crude oil marketing, the official Emirates news agency WAM reported.
WAM said the officials would be discussing implementation of last Sunday's agreement in Doha by Gulf Cooperation Council (GCC) oil ministers to help each other market their crude oil. Four of the GCC states - Saudi Arabia, the United Arab Emirates (UAE), Kuwait and Qatar - are members of the Organisation of Petroleum Exporting Countries (OPEC) and some face stiff buyer resistance to official OPEC prices. Reuter tm/inst/texts/crude/reut-00021.xml0000644000175100001440000000173512074065306016310 0ustar hornikusers 2-MAR-1987 12:13:46.82 crude usa Y F f0206 reute r f BC-UNOCAL-<UCL>-UNIT-CUT 03-02 0088 UNOCAL <UCL> UNIT CUTS CRUDE OIL POSTED PRICES LOS ANGELES, March 2 - Unocal Corp's Union Oil Co said it lowered its posted prices for crude oil one to 1.50 dlrs a barrel in the eastern region of the U.S., effective Feb 26. Union said a 1.50 dlrs cut brings its posted price for the U.S. benchmark grade, West Texas Intermediate, to 16 dlrs. Louisiana Sweet also was lowered 1.50 dlrs to 16.35 dlrs, the company said. No changes were made in Union's posted prices for West Coast grades of crude oil, the company said. Reuter tm/inst/texts/crude/reut-00018.xml0000644000175100001440000000254012074065306016311 0ustar hornikusers 2-MAR-1987 11:20:05.52 crude usa C f0937 reute d f BC-STUDY-GROUP-URGES-INC 03-02 0156 STUDY GROUP URGES INCREASED U.S. OIL RESERVES WASHINGTON, March 2 - A study group said the United States should increase its strategic petroleum reserve to one billion barrels as one way to deal with the present and future impact of low oil prices on the domestic oil industry. U.S. policy now is to raise the strategic reserve to 750 mln barrels, from its present 500 mln, to help protect the economy from an overseas embargo or a sharp price rise. The Aspen Institute for Humanistic Studies, a private group, also called for new research for oil exploration and development techniques. It predicted prices would remain at about 15-18 dlrs a barrel for several years and then rise to the mid 20s, with imports at about 30 pct of U.S. consumption. It said instead that such moves as increasing oil reserves and more exploration and development research would help to guard against or mitigate the risks of increased imports. Reuter tm/inst/texts/crude/reut-00011.xml0000644000175100001440000000521412074065306016303 0ustar hornikusers 1-MAR-1987 18:31:44.74 crude bahrain saudi-arabia hisham-nazer opec RM f0427 reute b f BC-SAUDI-ARABIA-REITERAT 03-01 0084 SAUDI ARABIA REITERATES COMMITMENT TO OPEC PACT BAHRAIN, March 1 - Saudi Arabian Oil Minister Hisham Nazer reiterated the kingdom's commitment to last December's OPEC accord to boost world oil prices and stabilise the market, the official Saudi Press Agency SPA said. Asked by the agency about the recent fall in free market oil prices, Nazer said Saudi Arabia "is fully adhering by the ... Accord and it will never sell its oil at prices below the pronounced prices under any circumstance." Nazer, quoted by SPA, said recent pressure on free market prices "may be because of the end of the (northern hemisphere) winter season and the glut in the market." Saudi Arabia was a main architect of the December accord, under which OPEC agreed to lower its total output ceiling by 7.25 pct to 15.8 mln barrels per day (bpd) and return to fixed prices of around 18 dlrs a barrel. The agreement followed a year of turmoil on oil markets, which saw prices slump briefly to under 10 dlrs a barrel in mid-1986 from about 30 dlrs in late 1985.
Free market prices are currently just over 16 dlrs. Nazer was quoted by the SPA as saying Saudi Arabia's adherence to the accord was shown clearly in the oil market. He said contacts among members of OPEC showed they all wanted to stick to the accord. In Jamaica, OPEC President Rilwanu Lukman, who is also Nigerian Oil Minister, said the group planned to stick with the pricing agreement. "We are aware of the negative forces trying to manipulate the operations of the market, but we are satisfied that the fundamentals exist for stable market conditions," he said. Kuwait's Oil Minister, Sheikh Ali al-Khalifa al-Sabah, said in remarks published in the emirate's daily Al-Qabas there were no plans for an emergency OPEC meeting to review prices. Traders and analysts in international oil markets estimate OPEC is producing up to one mln bpd above the 15.8 mln ceiling. They named Kuwait and the United Arab Emirates, along with the much smaller producer Ecuador, among those producing above quota. Sheikh Ali denied that Kuwait was over-producing. REUTER tm/inst/texts/crude/reut-00009.xml0000644000175100001440000000270012074065306016307 0ustar hornikusers 1-MAR-1987 05:27:27.17 crude bahrain saudi-arabia opec RM f0401 reute u f BC-SAUDI-RIYAL-DEPOSIT-R 03-01 0108 SAUDI RIYAL DEPOSIT RATES REMAIN FIRM BAHRAIN, March 1 - Saudi riyal interbank deposits were steady at yesterday's higher levels in a quiet market. Traders said they were reluctant to take out new positions amidst uncertainty over whether OPEC will succeed in halting the current decline in oil prices. Oil industry sources said yesterday several Gulf Arab producers had had difficulty selling oil at official OPEC prices but Kuwait has said there are no plans for an emergency meeting of the 13-member organisation. A traditional Sunday lull in trading due to the European weekend also contributed to the lack of market activity. Spot-next and one-week rates were put at 6-1/4, 5-3/4 pct after quotes ranging between seven, six yesterday. One, three, and six-month deposits were quoted unchanged at 6-5/8, 3/8, 7-1/8, 6-7/8 and 7-3/8, 1/8 pct respectively. The spot riyal was quietly firmer at 3.7495/98 to the dollar after quotes of 3.7500/03 yesterday. REUTER tm/inst/texts/crude/reut-00001.xml0000644000175100001440000000200512074065306016275 0ustar hornikusers 26-FEB-1987 17:00:56.04 crude usa Y f0119 reute u f BC-DIAMOND-SHAMROCK-(DIA 02-26 0097 DIAMOND SHAMROCK (DIA) CUTS CRUDE PRICES NEW YORK, FEB 26 - Diamond Shamrock Corp said that effective today it had cut its contract prices for crude oil by 1.50 dlrs a barrel. The reduction brings its posted price for West Texas Intermediate to 16.00 dlrs a barrel, the company said. "The price reduction today was made in the light of falling oil product prices and a weak crude oil market," a company spokeswoman said. Diamond is the latest in a line of U.S. oil companies that have cut their contract, or posted, prices over the last two days citing weak oil markets. Reuter tm/inst/texts/crude/reut-00019.xml0000644000175100001440000000320212074065306016310 0ustar hornikusers 2-MAR-1987 11:28:26.03 crude usa Y f0976 reute d f BC-STUDY-GROUP-URGES-INC 03-02 0099 STUDY GROUP URGES INCREASED U.S. OIL RESERVES WASHINGTON, March 2 - A study group said the United States should increase its strategic petroleum reserve to one billion barrels as one way to deal with the present and future impact of low oil prices on the domestic oil industry. U.S.
policy now is to raise the strategic reserve to 750 mln barrels, from its present 500 mln, to help protect the economy from an overseas embargo or a sharp price rise. The Aspen Institute for Humanistic Studies, a private group, also called for new research for oil exploration and development techniques. It predicted prices would remain at about 15-18 dlrs a barrel for several years and then rise to the mid 20s, with imports at about 30 pct of U.S. consumption. The study cited two basic policy paths for the nation: to protect the U.S. industry through an import fee or other such device or to accept the full economic benefits of cheap oil. But the group did not strongly back either option, saying there were benefits and drawbacks to both. It said instead that such moves as increasing oil reserves and more exploration and development research would help to guard against or mitigate the risks of increased imports. Reuter tm/inst/texts/crude/reut-00023.xml0000644000175100001440000000156512074065306016313 0ustar hornikusers 2-MAR-1987 14:49:06.33 crude nat-gas argentina Y f0783 reute u f BC-ARGENTINE-OIL-PRODUCT 03-02 0071 ARGENTINE OIL PRODUCTION DOWN IN JANUARY 1987 BUENOS AIRES, March 2 - Argentine crude oil production was down 10.8 pct in January 1987 to 12.32 mln barrels, from 13.81 mln barrels in January 1986, Yacimientos Petroliferos Fiscales said. January 1987 natural gas output totalled 1.15 billion cubic metres, 3.6 pct higher than 1.11 billion cubic metres produced in January 1986, Yacimientos Petroliferos Fiscales added. Reuter tm/inst/texts/acq/0000755000175100001440000000000012213264556013606 5ustar hornikuserstm/inst/texts/acq/reut-00003.xml0000644000175100001440000000203612074065306015745 0ustar hornikusers 26-FEB-1987 15:49:56.01 acq ship usa F f0874 reute r f BC-MCLEAN'S-<MII>-U.S.-L 02-26 0094 MCLEAN'S <MII> U.S. LINES SETS ASSET TRANSFER CRANFORD, N.J., Feb 26 - McLean Industries Inc's United States Lines Inc subsidiary said it has agreed in principle to transfer its South American service by arranging for the transfer of certain charters and assets to <Crowley Maritime Corp>'s American Transport Lines Inc subsidiary. U.S. Lines said negotiations on the contract are expected to be completed within the next week. Terms and conditions of the contract would be subject to approval of various regulatory bodies, including the U.S. Bankruptcy Court. Reuter tm/inst/texts/acq/reut-00056.xml0000644000175100001440000000313212074065306015753 0ustar hornikusers 2-MAR-1987 11:29:26.84 acq usa F f0981 reute r f BC-CARBIDE-<UK>-LOOKS-TO 03-02 0095 CARBIDE <UK> LOOKS TO ACQUISITIONS FOR GROWTH NEW YORK, March 2 - Union Carbide Corp is looking to acquisitions and joint ventures to aid its chemicals and plastics growth, according to H.W. Lichtenberger, president of Chemicals and Plastics. Describing this as a major departure in the company's approach to commercial development, he told the annual new business forum of the Commercial Development Association "We are looking to acquisitions and joint ventures when they look like the fastest and most promising routes to the growth markets we've identified." Not very long ago Union Carbide had the attitude "that if we couldn't do it ourselves, it wasn't worth doing. Or, if it was worth doing, we had to go it alone," Lichtenberger explained. He said "there are times when exploiting a profitable market is done best with a partner.
Nor do we see any need to plow resources into a technology we may not have if we can link up profitably with someone who is already there." He said Carbide has extended its catalyst business that way and is now extending its specialty chemicals business in the same way. Reuter tm/inst/texts/acq/reut-00055.xml0000644000175100001440000000152312074065306015754 0ustar hornikusers 2-MAR-1987 11:24:06.09 acq usa F f0958 reute r f BC-UTILICORP-<UCU>-COMPL 03-02 0066 UTILICORP <UCU> COMPLETES ACQUISITION KANSAS CITY, March 2 - UtiliCorp United Inc said it completed the acquisition of West Virginia Power from Dominion Resources for about 21 mln dlrs. The sale was approved by the West Virginia Public Service Commission in January and became effective March one. West Virginia's management will continue to be responsible for operating the utility, it said. Reuter tm/inst/texts/acq/reut-00053.xml0000644000175100001440000000667512074065306015763 0ustar hornikusers 2-MAR-1987 11:23:31.27 acq usa F f0955 reute u f BC-VIACOM 03-02 0104 REDSTONE DETAILS SWEETENED VIACOM <VIA> OFFER WASHINGTON, March 2 - Investor Sumner Redstone, who leads one of the two groups vying for control of Viacom International Inc, offered to sweeten his bid for the company by 1.50 dlrs a share cash and 1.50 dlrs in securities. In a filing with the Securities and Exchange Commission, Redstone, who controls Dedham, Mass.-based National Amusements Inc, a theater chain operator, offered to raise the cash portion of its Viacom offer to 42 dlrs a share from 40.50 dlrs. Redstone also raised the face value of the preferred stock he is offering to 7.50 dlrs from six dlrs. The Redstone offer, which is being made through Arsenal Holdings Inc, a National Amusements subsidiary set up for that purpose, would also give Viacom shareholders one-fifth of a share of Arsenal common stock after the takeover. Viacom said earlier today it received revised takeover bids from Redstone and MCV Holdings Inc, a group led by Viacom management which is competing with Redstone for control of the company and already has a formal merger agreement with Viacom. The company did not disclose the details of the revised offers, but said a special committee of its board would review them later today. The Redstone group, which has a 19.5 pct stake in Viacom, and the management group, which has a 5.4 pct stake, have both agreed not to buy more shares of the company until a merger is completed, unless the purchases are part of a tender offer for at least half of the outstanding stock. The two rivals also signed confidentiality agreements, which give them access to Viacom's financial records provided they keep the information secret. In his SEC filing, Redstone, who estimated his cost of completing the takeover at 2.95 billion dlrs, said Bank of America is confident it can raise 2.275 billion dlrs. Besides the financing it would raise through a bank syndicate, Bank of America has also agreed to provide a separate 25 mln dlrs for the limited purpose of partial financing and has committed to provide another 592 mln dlrs, Redstone said. Merrill Lynch, Pierce Fenner and Smith Inc has increased its underwriting commitment to 175 mln dlrs of subordinated financing debt for the Viacom takeover, from the 150 mln dlrs it agreed to underwrite earlier, Redstone said. Redstone said his group would contribute more than 475 mln dlrs in equity toward the takeover.
The Redstone equity contribution to the takeover would consist of all of his group's 6,881,800 Viacom common shares and at least 118 mln dlrs cash, he said. The new offer, the second sweetened deal Redstone has proposed in his month-long bidding war with management, also contains newly drawn up proposed merger documents, he said. Last week, the management group submitted what it called its last offer for the company, valued at 3.1 billion dlrs and consisting of 38.50 dlrs a share cash, preferred stock valued at eight dlrs a share and equity in the new company. Redstone's previous offer had been valued at 3.2 billion dlrs. Reuter tm/inst/texts/acq/reut-00004.xml0000644000175100001440000000562412074065306015754 0ustar hornikusers 26-FEB-1987 15:51:17.84 acq usa F f0881 reute u f BC-CHEMLAWN-<CHEM>-RISES 02-26 0106 CHEMLAWN <CHEM> RISES ON HOPES FOR HIGHER BIDS By Cal Mankowski, Reuters NEW YORK, Feb 26 - ChemLawn Corp <CHEM> could attract a higher bid than the 27 dlrs per share offered by Waste Management Inc <WNX>, Wall Street arbitrageurs said. Shares of ChemLawn shot up 11-5/8 to 29-3/8 in over-the-counter trading with 3.8 mln of the company's 10.1 mln shares changing hands by late afternoon. "This company could go for 10 times cash flow or 30 dlrs, maybe 32 dollars depending on whether there is a competing bidder," an arbitrageur said. Waste Management's tender offer, announced before the opening today, expires March 25. "This is totally by surprise," said Debra Strohmaier, a ChemLawn spokeswoman. The company's board held a regularly scheduled meeting today and was discussing the Waste Management announcement. She said a statement was expected but it was not certain when it would be ready. She was unable to say if there had been any prior contact between Waste Management and ChemLawn officials. "I think they will resist it," said Elliott Schlang, analyst at Prescott, Ball and Turben Inc. "Any company that doesn't like a surprise attack would." Arbitrageurs pointed out it is difficult to resist tender offers for any and all shares for cash. Schlang said ChemLawn could try to find a white knight if it does not want to be acquired by Waste Management. Analyst Rosemarie Morbelli of Ingalls and Snyder said ServiceMaster Companies L.P. <SVM> or Rollins Inc <ROL> were examples of companies that could be interested. ChemLawn, with about two mln customers, is the largest U.S. company involved in application of fertilizers, pesticides and herbicides on lawns. Waste Management is involved in removal of wastes. Schlang said ChemLawn's customer base could be valuable to another company that wants to capitalize on a strong residential and commercial distribution system. Both Schlang and Morbelli noted that high growth rates had catapulted ChemLawn's share price into the mid-30's in 1983 but the stock languished as the rate of growth slowed. Schlang said the company's profits are concentrated in the fourth quarter. In 1986 ChemLawn earned 1.19 dlrs per share for the full year, and 2.58 dlrs in the fourth quarter. Morbelli noted ChemLawn competes with thousands of individual entrepreneurs who offer lawn and garden care service. Reuter tm/inst/texts/acq/reut-00010.xml0000644000175100001440000000133512074065306015744 0ustar hornikusers 26-FEB-1987 17:08:27.52 acq usa F f0143 reute d f BC-GULF-APPLIED-TECHNOLO 02-26 0049 GULF APPLIED TECHNOLOGIES <GATS> SELLS UNITS HOUSTON, Feb 26 - Gulf Applied Technologies Inc said it sold its subsidiaries engaged in pipeline and terminal operations for 12.2 mln dlrs.
The company said the sale is subject to certain post-closing adjustments, which it did not explain. Reuter tm/inst/texts/acq/reut-00012.xml0000644000175100001440000000211212074065306015740 0ustar hornikusers 26-FEB-1987 17:36:22.14 acq usa F f0204 reute r f BC-EPSILON-DATA 02-26 0110 DREXEL OFFICIAL HAS STAKE IN EPSILON DATA <EPSI> WASHINGTON, Feb 26 - A senior official of Drexel Burnham Lambert Inc and his father told the Securities and Exchange Commission they have acquired 258,591 shares of Epsilon Data Management Inc, or 9.4 pct of the total outstanding. Kenneth Thomas, senior vice president-investments at Drexel's Los Angeles office, and his father, retired university professor C.A. Thomas, said they bought the stake for 2.1 mln dlrs primarily for investment purposes. They said they may buy more stock or sell some or all of their stake, depending on market conditions, but have no plans to seek control of the company. Reuter tm/inst/texts/acq/reut-00016.xml0000644000175100001440000000122012074065306015743 0ustar hornikusers 26-FEB-1987 18:12:51.94 acq canada E f0301 reute r f BC-VIDEOTRON-BUYS-INTO-E 02-26 0036 VIDEOTRON BUYS INTO EXHIBIT COMPANY MONTREAL, Feb 26 - (Groupe Videotron Ltd) said it agreed to buy 50 pct of (Groupe Promexpo Inc), a company which specializes in product exhibits, for three mln dlrs. Reuter tm/inst/texts/acq/reut-00048.xml0000644000175100001440000000417412074065306015763 0ustar hornikusers 2-MAR-1987 10:36:13.53 gold acq platinum canada brazil E F f0710 reute r f BC-cons-tvx-to-buy 03-02 0090 CONSOLIDATED TVX TO BUY BRAZIL GOLD MINE STAKES TORONTO, March 2 - <Consolidated TVX Mining Corp> said it agreed to issue 7.8 mln treasury shares to acquire interests in three gold mining companies in Brazil and an option to increase the company's interest in a platinum property. The company said the transactions will bring immediate production and earnings to Consolidated TVX, enhance its precious metal potential and are expected to improve cash flow and earnings on a per share basis. The company did not give specific figures. Consolidated TVX said it will acquire 29 pct of CMP, a public gold mining company in which TVX already holds a 15 pct interest, making TVX the largest single shareholder. The company also agreed to acquire a 19 pct stake in Novo Astro, a private company, and a 16 pct interest in Teles Pires Mining, increasing TVX's ownership to 51 pct. In addition, Consolidated TVX said it will acquire the right to add a 10 pct interest to a platinum property in which it already owns a 29.4 pct stake. CMP earned 11 mln Canadian dlrs in 1986 and expects to produce 42,000 ounces of gold in 1987 at a cost of 160 U.S. dlrs an ounce, Consolidated TVX said. Novo Astro operates Brazil's richest gold mine located in Amapa State, with an average grade of 0.8 ounces of gold a ton in a hardrock quartz vein, Consolidated TVX said. Mining of eluvial surface material produced 25,000 ounces in 1986 and is expected to produce 60,000 ounces in 1987. It also said Teles Pires Mining controls rights to a 350 kilometer section of the Teles Pires River, where one dredge is expected to produce 10,000 ounces of gold in 1987.
Reuter tm/inst/texts/acq/reut-00002.xml0000644000175100001440000000243712074065306015751 0ustar hornikusers 26-FEB-1987 15:19:15.45 earn acq usa F f0773 reute u f BC-OHIO-MATTRESS-<OMT>-M 02-26 0095 OHIO MATTRESS <OMT> MAY HAVE LOWER 1ST QTR NET CLEVELAND, Feb 26 - Ohio Mattress Co said its first quarter, ending February 28, profits may be below the 2.4 mln dlrs, or 15 cts a share, earned in the first quarter of fiscal 1986. The company said any decline would be due to expenses related to the acquisitions in the middle of the current quarter of seven licensees of Sealy Inc, as well as 82 pct of the outstanding capital stock of Sealy. Because of these acquisitions, it said, first quarter sales will be substantially higher than last year's 67.1 mln dlrs. Noting that it typically reports first quarter results in late March, it said the report is likely to be issued in early April this year. It said the delay is due to administrative considerations, including conducting appraisals, in connection with the acquisitions. Reuter tm/inst/texts/acq/reut-00027.xml0000644000175100001440000000255312074065306015757 0ustar hornikusers 2-MAR-1987 08:22:40.30 acq usa F f0290 reute r f BC-ROPAK-<ROPK>-HAS-34-P 03-02 0109 ROPAK <ROPK> HAS 34 PCT OF BUCKHORN <BKN> FULLERTON, Calif., March 2 - Ropak Corp said it received and accepted about 456,968 common shares and 527,035 Series A convertible preferred shares of Buckhorn Inc at four dlrs and 5.75 dlrs each respectively in response to its tender offer that expired Friday, and it now owns 34.4 pct of Buckhorn voting power. The company had owned 63,000 common and 25,100 preferred shares before starting the hostile tender. Ropak said it is borrowing the funds needed to buy the Buckhorn shares from its bank lender and will not need to use any funds that another bank had committed to provide under a margin loan. Ropak said it waived minimum acceptance requirements to buy the shares and intends to evaluate a number of possible ways of completing an acquisition of Buckhorn. It said it hopes that Buckhorn's board will reevaluate its position and enter into meaningful negotiations. Reuter tm/inst/texts/acq/reut-00007.xml0000644000175100001440000001026112074065306015750 0ustar hornikusers 26-FEB-1987 16:43:13.65 acq usa F f0061 reute u f BC-AMERICAN-EXPRESS-<AXP 02-26 0108 AMERICAN EXPRESS <AXP> SEEN IN POSSIBLE SPINOFF By Patti Domm, Reuter New York, Feb 26 - American Express Co remained silent on market rumors it would spin off all or part of its Shearson Lehman Brothers Inc, but some analysts said the company may be considering such a move because it is unhappy with the market value of its stock. American Express stock got a lift from the rumor, as the market calculated a partially public Shearson may command a good market value, thereby boosting the total value of American Express. The rumor also was accompanied by talk the financial services firm would split its stock and boost its dividend. American Express closed on the New York Stock Exchange at 72-5/8, up 4-1/8 on heavy volume. American Express would not comment on the rumors or its stock activity. Analysts said comments by the company at an analysts' meeting Tuesday helped fuel the rumors as did an announcement yesterday of management changes. At the meeting, company officials said American Express stock is undervalued and does not fully reflect the performance of Shearson, according to analysts.
Yesterday, Shearson said it was elevating its chief operating officer, Jeffery Lane, to the added position of president, which had been vacant. It also created four new positions for chairmen of its operating divisions. Analysts speculated a partial spinoff would make most sense, contrary to one variation on market rumors of a total spinoff. Some analysts, however, disagreed that any spinoff of Shearson would be good since it is a strong profit center for American Express, contributing about 20 pct of earnings last year. "I think it is highly unlikely that American Express is going to sell Shearson," said Perrin Long of Lipper Analytical. He questioned what would be a better investment than "a very profitable securities firm." Several analysts said American Express is not in need of cash, which might be the only reason to sell a part of a strong asset. But others believe the company could very well have considered the option of spinning out part of Shearson, and one rumor suggests selling about 20 pct of it in the market. Larry Eckenfelder of Prudential-Bache Securities said he believes American Express could have considered a partial spinoff in the past. "Shearson being as profitable as it is would have fetched a big premium in the market place. Shearson's book value is in the 1.4 billion dlr range. Shearson in the market place would probably be worth three to 3.5 billion dlrs in terms of market capitalization," said Eckenfelder. Some analysts said American Express could use capital since it plans to expand globally. "They have enormous internal growth plans that takes capital. You want your stock to reflect realistic valuations to enhance your ability to make all kinds of endeavors down the road," said E.F. Hutton Group analyst Michael Lewis. "They've outlined the fact that they're investing heavily in the future, which goes heavily into the international arena," said Lewis. "...That does not preclude acquisitions and divestitures along the way," he said. Lewis said if American Express reduced its exposure to the brokerage business by selling part of Shearson, its stock might better reflect other assets, such as the travel related services business. "It could find its true water mark with a lesser exposure to brokerage. The value of the other components could command a higher multiple because they constitute a higher percentage of the total operating earnings of the company," he said. Lewis said Shearson contributed 316 mln dlrs in after-tax operating earnings, up from about 200 mln dlrs in 1985. Reuter tm/inst/texts/acq/reut-00034.xml0000644000175100001440000000166612074065306015751 0ustar hornikusers 2-MAR-1987 09:02:51.89 acq usa F f0411 reute u f BC-LAROCHE-STARTS-BID-FO 03-02 0058 LAROCHE STARTS BID FOR NECO <NPT> SHARES NEW YORK, March 2 - Investor David F. La Roche of North Kingstown, R.I., said he is offering to purchase 170,000 common shares of NECO Enterprises Inc at 26 dlrs each. He said the successful completion of the offer, plus shares he already owns, would give him 50.5 pct of NECO's 962,016 common shares. La Roche said he may buy more, and possibly all NECO shares. He said the offer and withdrawal rights will expire at 1630 EST/2130 gmt, March 30, 1987.
Reuter tm/inst/texts/acq/reut-00006.xml0000644000175100001440000000154012074065306015747 0ustar hornikusers 26-FEB-1987 16:32:37.30 acq usa F f0024 reute u f BC-CYCLOPS 02-26 0073 INVESTMENT FIRMS CUT CYCLOPS <CYL> STAKE WASHINGTON, Feb 26 - A group of affiliated New York investment firms said they lowered their stake in Cyclops Corp to 260,500 shares, or 6.4 pct of the total outstanding common stock, from 370,500 shares, or 9.2 pct. In a filing with the Securities and Exchange Commission, the group, led by Mutual Shares Corp, said it sold 110,000 Cyclops common shares on Feb 17 and 19 for 10.0 mln dlrs. Reuter tm/inst/texts/acq/reut-00039.xml0000644000175100001440000000241612074065306015760 0ustar hornikusers 2-MAR-1987 09:28:21.66 acq usa F f0482 reute u f BC-MILLER-TABAK-HAS-91.8 03-02 0057 MILLER TABAK HAS 91.8 PCT OF PENN TRAFFIC <PNF> NEW YORK, March 2 - <Miller Tabak Hirsch and Co> said it has received and accepted 3,424,729 common shares of Penn Traffic Co in response to its 31.60 dlr per share tender offer that expired Friday, and together with the 380,728 shares it already owned, it now has about 91.8 pct of Penn Traffic. The company said Penn Traffic is expected to hold a special shareholders' meeting later this month to approve a merger into Miller Tabak at the tender price. It said two Miller Tabak representatives will be named to the Penn Traffic board on March Four to serve as the only directors with Penn Traffic president and chief executive officer Guido Malacarne. The company said it received financing for the transaction from First National Bank of Minneapolis and Salomon Inc <SB>. Reuter tm/inst/texts/acq/reut-00024.xml0000644000175100001440000000212512074065306015747 0ustar hornikusers 2-MAR-1987 06:58:00.68 acq usa uk F f0032 reute u f BC-COLOROLL-AGREES-TO-BU 03-02 0109 COLOROLL AGREES TO BUY U.S. WALLCOVERINGS COMPANY LONDON, March 2 - <Coloroll Group Plc> said it has entered into a conditional agreement to acquire the business and assets of <Wallco Inc> and related companies for 14.5 mln dlrs. Miami-based Wallco manufactures and distributes wallcoverings and showed a pretax profit of 1.5 mln dlrs on turnover of 37 mln in the year ending June 1986. The total U.S. market was estimated to be worth 840 mln dlrs in 1986, having grown by 47 pct in the previous five years, Coloroll said. The combined sales and profit of the enlarged Coloroll U.S. business would be 67 mln and four mln dlrs respectively. REUTER tm/inst/texts/acq/reut-00052.xml0000644000175100001440000000205512074065306015752 0ustar hornikusers 2-MAR-1987 11:09:06.82 acq canada E F f0882 reute r f BC-FOUR-SEASONS-BUYING-M 03-02 0100 FOUR SEASONS BUYING MARRIOTT <MHS> HOTEL TORONTO, March 2 - <Four Seasons Hotels Inc> and VMS Realty Partners said they agreed to acquire the Santa Barbara Biltmore Hotel in California from Marriott Corp, for undisclosed terms. Closing was expected by March 31, they added. The companies said they would jointly own the hotel and rename it the Four Seasons Biltmore at Santa Barbara. They said they would spend more than 13 mln U.S. dlrs "to enhance the Biltmore's position as one of the finest resort hotels in North America." Chicago-based VMS Realty is a real estate and development firm.
Reuter tm/inst/texts/acq/reut-00045.xml0000644000175100001440000000245612074065306015761 0ustar hornikusers 2-MAR-1987 10:20:41.80 acq usa F A RM f0657 reute u f BC-BANK-OF-NEW-YORK-<BK> 03-02 0054 BANK OF NEW YORK <BK> TO HAVE GAIN ON UNIT SALE NEW YORK, March 2 - Bank of New York Co said it and the management of RMJ Securities Corp have agreed to sell 80 pct of their interests in RMJ Holding Corp to <British and Commonwealth Holdings PLC> and Bank of New York expects to realize a substantial gain on the transaction. RMJ Holding is the holding company for RMJ Securities, a large broker of U.S. government securities and agency obligations. Bank of New York owns a majority interest in RMJ Holding and management of RMJ Securities the remainder. Bank of New York said the sale is expected to be completed during the second quarter. It said it and RMJ Securities management will continue to own 20 pct of RMJ Holding for now, but the agreement provides for the sale of that remaining interest to British and Commonwealth over the next six years. Reuter tm/inst/texts/acq/reut-00036.xml0000644000175100001440000000373212074065306015757 0ustar hornikusers 2-MAR-1987 09:16:08.70 acq usa F f0448 reute b f BC-/VIACOM-<VIA>-RECEIVE 03-02 0045 VIACOM <VIA> RECEIVES TWO REVISED OFFERS NEW YORK, March 2 - Viacom International Inc said it received revised merger offers from <National Amusements Inc> and <MCV Holdings Inc>. The company said the special committee plans to meet later today to review both offers. Viacom said National Amusements' Arsenal Holdings Inc raised the value of its offer for the Viacom shares not held by National Amusements in three areas. National Amusements holds 19.6 pct of Viacom's stock. The cash value of the offer was raised to 42.00 dlrs from the 40.50 dlrs a Viacom share offered February 23 while the value of the fraction of a share of exchangeable preferred being offered was increased to 7.50 dlrs a share from six dlrs. The interest rate to be used to increase the cash value of the merger, if delayed beyond April 30, was raised to nine pct from eight pct and 12 pct after May 31. A Viacom spokesman said the Arsenal Holdings' offer continues to include a 20 pct interest in Arsenal for present Viacom shareholders. Viacom said MCV Holdings, a group which includes the company's senior management and the Equitable Life Assurance Society of the United States, raised the value of its offer by increasing the value of the preferred being offered to 8.50 dlrs from 8.00 dlrs a share and raising the ownership in the new company to be held by present Viacom shareholders to 45 pct from 25 pct. MCV called its previous offer, made February 26, the "final" proposed revision of its agreement with Viacom. Reuter tm/inst/texts/acq/reut-00014.xml0000644000175100001440000000136312074065306015751 0ustar hornikusers 26-FEB-1987 17:43:59.12 acq usa F f0235 reute h f BC-SUFFIELD-FINANCIAL-<S 02-26 0050 SUFFIELD FINANCIAL <SSBK> GETS FED APPROVAL SUFFIELD, Conn., Feb 26 - Suffield Financial Corp said the Federal Reserve Board approved its application to acquire Coastal Bancorp <CSBK>, Portland, Me. Suffield said it still needs the approval of the superintendent of Maine's banking department.
Reuter tm/inst/texts/acq/reut-00031.xml0000644000175100001440000000133712074065306015751 0ustar hornikusers 2-MAR-1987 08:41:41.32 acq usa F f0358 reute r f BC-FINANCIAL-SANTA-BARBA 03-02 0048 FINANCIAL SANTA BARBARA <FSB> TO MAKE PURCHASE SANTA BARBARA, Calif., March 2 - Financial Corp of Santa Barbara said it has signed a definitive agreement to purchase Stanwell Financial, the lending operations unit of mortgage banking company <Stanwell Mortgage>, for undisclosed terms. Reuter tm/inst/texts/acq/reut-00022.xml0000644000175100001440000000166512074065306015745 0ustar hornikusers 2-MAR-1987 05:48:46.98 acq usa uk F f0923 reute u f BC-SALE-TILNEY-BUYS-STAK 03-02 0083 SALE TILNEY BUYS STAKE IN U.S. INSURANCE BROKER LONDON, March 2 - <Sale Tilney Plc> said it has purchased 80 pct of the ordinary share capital of <B and R International Inc.>, a U.S. insurance broker, for 5.6 mln dlrs. Sale is paying 3.6 mln dlrs in cash on completion, with the balance plus interest to be paid in equal instalments over the next six years. B and R posted pretax profit of 855,000 dlrs in the year to Dec 31, 1986, when it had net tangible assets of 563,000 dlrs. REUTER tm/inst/texts/acq/reut-00047.xml0000644000175100001440000000155012074065306015755 0ustar hornikusers 2-MAR-1987 10:36:04.57 acq usa F f0709 reute r f BC-BALLY-<BLY>-COMPLETES 03-02 0071 BALLY <BLY> COMPLETES PURCHASE OF GOLDEN NUGGET CHICAGO, March 2 - Bally Manufacturing Corp said it completed the acquisition of the Golden Nugget Casino Hotel in Atlantic City, New Jersey from Golden Nugget Inc. Bally also acquired from Golden Nugget various parcels of real estate in Atlantic City, it noted. The transaction included 140 mln dlrs in cash and stock and the assumption of a 299 mln dlrs mortgage. Reuter tm/inst/texts/acq/reut-00032.xml0000644000175100001440000000150612074065306015750 0ustar hornikusers 2-MAR-1987 08:43:25.91 acq usa F f0362 reute d f BC-MARRIOTT-<MHS>-TO-SEL 03-02 0063 MARRIOTT <MHS> TO SELL HOTEL TORONTO, March 2 - <Four Seasons Hotels> said it and <VMS Realty Partners> of Chicago have agreed to purchase the Santa Barbara Biltmore Hotel from Marriott Corp for an undisclosed amount. It said the venture will rename the hotel the Four Seasons Biltmore at Santa Barbara and invest over 13 mln dlrs in improvements on the 228-room property. Reuter tm/inst/texts/acq/reut-00005.xml0000644000175100001440000000152112074065306015745 0ustar hornikusers 26-FEB-1987 16:08:33.15 acq usa F f0949 reute r f BC-<COFAB-INC>-BUYS-GULF 02-26 0066 <COFAB INC> BUYS GULFEX FOR UNDISCLOSED AMOUNT HOUSTON, Feb 26 - CoFAB Inc said it acquired <Gulfex Inc>, a Houston-based fabricator of custom high-pressure process vessels for the energy and petrochemical industries. CoFAB said its group of companies manufacture specialized cooling and lubricating systems for the oil and gas, petrochemical, utility, pulp and paper and marine industries. Reuter tm/inst/texts/acq/reut-00015.xml0000644000175100001440000000134412074065306015751 0ustar hornikusers 26-FEB-1987 18:12:35.70 acq canada E F f0300 reute r f BC-VERSATILE-TO-SELL-UNI 02-26 0049 VERSATILE TO SELL UNIT TO VICON VANCOUVER, British Columbia, Feb 26 - <Versatile Corp> said it agreed in principle to sell its Alberta-based Versatile Noble Cultivators Co division to Vicon Inc, of Ontario, for undisclosed terms. The division manufactures tillage and spraying equipment.
Reuter tm/inst/texts/acq/reut-00049.xml0000644000175100001440000000146112074065306015760 0ustar hornikusers 2-MAR-1987 10:50:34.12 acq usa F f0802 reute w f BC-AMERICAN-NURSERY-<ANS 03-02 0060 AMERICAN NURSERY <ANSY> BUYS FLORIDA NURSERY TAHLEQUAH, OKLA., March 2 - American Nursery Products Inc said it purchased Miami-based Heinl's Nursery Inc, for undisclosed terms. Heinl's Nursery has sales of about 4.5 mln dlrs and owns 100 acres, of which 75 are in shade houses and about 58,300 square feet cover greenhouses, shipping and office facilities. Reuter tm/inst/texts/acq/reut-00046.xml0000644000175100001440000000166712074065306015763 0ustar hornikusers 2-MAR-1987 10:29:07.31 acq usa F f0682 reute b f BC-CORNING-<GLW>,-HAZLET 03-02 0083 CORNING <GLW>, HAZLETON <HLC> SET EXCHANGE RATIO CORNING, N.Y., March 2 - Corning Glass Works said the exchange ratio for its previously announced acquisition of Hazleton Laboratories Corp has been established at 0.5165 Corning common share for each Hazleton common share. Corning said the prospectus regarding the merger is expected to be mailed tomorrow to all Hazleton holders of record February 18. Hazleton shareholders will vote on the proposed merger at a special meeting on March 31. Reuter tm/inst/texts/acq/reut-00008.xml0000644000175100001440000000207412074065306015754 0ustar hornikusers 26-FEB-1987 16:59:25.38 acq usa F f0116 reute d f BC-WRATHER 02-26 0109 HONG KONG FIRM UPS WRATHER <WCO> STAKE TO 11 PCT WASHINGTON, Feb 26 - Industrial Equity (Pacific) Ltd, a Hong Kong investment firm, said it raised its stake in Wrather Corp to 816,000 shares, or 11.3 pct of the total outstanding common stock, from 453,300 shares, or 6.3 pct. In a filing with the Securities and Exchange Commission, Industrial Equity, which is principally owned by Brierley Investments Ltd, a publicly held New Zealand company, said it bought 362,700 Wrather common shares between Feb 13 and 24 for 6.6 mln dlrs. When it first disclosed its stake in Wrather earlier this month, it said it bought the stock for investment purposes. Reuter tm/inst/texts/acq/reut-00040.xml0000644000175100001440000000303612074065306015747 0ustar hornikusers 2-MAR-1987 09:33:32.93 acq usa F f0501 reute u f BC-PITTSTON-<PCO>-AGREES 03-02 0111 PITTSTON <PCO> AGREES TO ACQUIRE WTC <WAF> STAMFORD, Conn., March 2 - Pittston Co said it has tentatively agreed to acquire WTC International N.V. in a tax-free exchange of stock. Pittston said it agreed to exchange 0.523 common share for each of the about 8,612,000 WTC common shares outstanding. Pittston said WTC's three principal shareholders, who own 62 pct of its stock, are parties to this agreement. They have granted Pittston the right of first refusal to their shares. WTC has granted Pittston an option to buy WTC shares equal to 18.5 pct of its outstanding stock. The agreement is subject to approval of both boards and WTC shareholders. Pittston described WTC as a fast growing air freight forwarding company with operations throughout the world. Its revenues totaled nearly 200 mln dlrs in the year ended November 30 and for the quarter ended on that date it earned 1.3 mln dlrs on revenues of 55.8 mln dlrs. Pittston said its Burlington Air Express subsidiary generates about two-thirds of its 450 mln dlrs in annual revenues with its domestic air freight services.
Reuter tm/inst/texts/acq/reut-00013.xml0000644000175100001440000000257212074065306015753 0ustar hornikusers 26-FEB-1987 17:38:47.04 acq canada F E f0214 reute d f BC-<NOVA>-WINS-GOVERNMEN 02-26 0106 <NOVA> WINS GOVERNMENT OKAY FOR HUSKY <HYO> DEAL CALGARY, Alberta, Feb 26 - Nova, the Canadian company that owns 56 pct of Husky Oil Ltd, said it received government approval for a transaction under which <Union Faith Canada Holding Ltd> would buy a 43 pct stake in Husky. Nova said the Minister of Regional and Industrial Expansion, Michel Cote, ruled that Union Faith's purchase of the Husky stake would not result in Husky ceding control to a non-Canadian company. It said this ruling was a key condition in completing the deal. Union Faith is equally owned by <Hutchison Whampoa Ltd> and <Hong Kong Electric Holdings Ltd>. Under the agreement with Union Faith, Husky will become a private company with Union Faith and Nova each holding 43 pct of its stock. Nine pct of Husky would be owned by relatives of Li Ka-Shing, chairman of Hutchison, and five pct by the Canadian Imperial Bank of Commerce. Reuter tm/inst/texts/acq/reut-00021.xml0000644000175100001440000000275412074065306015744 0ustar hornikusers 2-MAR-1987 04:52:58.27 acq uk F f0825 reute b f BC-SHV-SAYS-IT-MAKING-TE 03-02 0061 SHV SAYS IT MAKING TENDER OFFER FOR IC GAS LONDON, March 2 - <SHV (United Kingdom) Holding Co Ltd> said it was making a tender offer for up to 33 mln ordinary shares in Imperial Continental Gas Association <ICGS.L>. It said in a statement the offer was on the basis of 700p for each IC Gas ordinary and 252p for every one stg nominal of IC Gas loan stock. SHV already holds 6.8 mln IC Gas ordinary stock units representing around 4.9 pct of the current issued share capital. Successful completion of the offer would increase SHV's stake in IC Gas to 39.8 mln shares, representing around 27.9 pct of issued share capital, it said. The offer capitalises IC Gas at around one billion stg. It said it was tendering for both ordinary stock and loan stock, which, when fully converted, give a total of 33 mln IC Gas ordinary. It is making the tender offer through N.M. Rothschilds. IC Gas said in a statement it noted the SHV tender offer and the terms were being considered. It said a further statement would be made as soon as possible. REUTER... tm/inst/texts/acq/reut-00042.xml0000644000175100001440000000217712074065306015756 0ustar hornikusers 2-MAR-1987 09:49:48.14 acq usa F f0554 reute u f BC-DIAGNOSTIC-<DRS>-MAKE 03-02 0115 DIAGNOSTIC <DRS> MAKES A BID FOR ROSPATCH <RPCH> OAKLAND, N.J., March 2 - Diagnostic Retrieval Systems Inc said it has made an offer to acquire, through a wholly owned unit, all outstanding shares of Rospatch Corp's common stock for 22 dlrs a share cash, or about 53 mln dlrs. DRS, a warfare systems producer, said it would make the transaction through a cash tender offer for all, but not less than 51 pct, of Rospatch's outstanding common stock followed by a merger with Rospatch, a labels, high technology and wood producer, at the same purchase price per share. DRS said the deal is subject to approval by the Rospatch board, and the tender offer expires on March 6, 1987.
Reuter tm/inst/texts/acq/reut-00018.xml0000644000175100001440000000256612074065306015763 0ustar hornikusers 1-MAR-1987 22:20:43.45 acq japan M C f0515 reute u f BC-NIPPON-KOKAN-STEEL-AF 03-01 0113 NIPPON KOKAN STEEL AFFILIATES CONSIDERING MERGER TOKYO, March 2 - Toshin Steel Co Ltd <TOSS.T> and <Azuma Steel Co Ltd>, affiliates of Nippon Kokan KK <NKKT.T>, are considering a merger, company spokesmen said. Toshin Steel, owned 41.9 pct by Nippon Kokan, and Azuma Steel, owned 41.3 pct by Nippon Kokan, are expected to decide by the end of March, they said. Both firms have been struggling with losses caused by the recession in the steel industry and the yen's appreciation. Azuma Steel's current losses are estimated at 3.1 billion yen in the year ending March 31 against a 6.99 billion loss a year earlier, a spokesman said. The firm employs 1,100 workers. Toshin Steel, with 1,700 workers, has given no forecast for the year ending March 31. But industry sources said they expected the company to show current losses of about five billion yen or more in 1986/87 compared with a 2.98 billion loss in 1985/86. REUTER tm/inst/texts/acq/reut-00029.xml0000644000175100001440000000227012074065306015755 0ustar hornikusers 2-MAR-1987 08:26:35.85 acq usa F f0305 reute d f BC-<DALE-BURDETT-INC>-FA 03-02 0126 <DALE BURDETT INC> FACES DAMAGE CLAIM WESTMINSTER, Calif., March 2 - Dale Burdett Inc said it faces damages claims totalling about 420,000 dlrs from the former owners of Burdett Publications Inc. The company said on February 20, 1986, its predecessor Nolex Development Inc acquired Burdett Publications Inc in an exchange of 17 mln common shares for all Burdett Publications shares, but the transaction was not qualified with the California Department of Corporations. As a result, it said, the former Burdett Publications owners have a claim for damages against Dale Burdett as successor to Nolex for one year starting January 21, 1987, with the damages measured by the difference in values of shares exchanged plus interest from February 20, 1986. Reuter tm/inst/texts/acq/reut-00020.xml0000644000175100001440000000573112074065306015751 0ustar hornikusers 2-MAR-1987 04:45:57.78 acq sweden F f0812 reute b f BC-WALLENBERGS-FIGHT-BID 03-02 0115 WALLENBERGS FIGHT BID FOR SWEDISH MATCH STAKE STOCKHOLM, March 2 - Sweden's Wallenberg group fought back a bid by the London-based Swedish financier Erik Penser to secure a large stake in Swedish Match <SMBS ST>, one of the companies at the core of their business empire. A statement issued by the Wallenberg holding companies AB Investor and Forvaltnings AB Providentia said they had taken over an option held by Nobel Industrier Sweden AB to acquire 33 pct of the voting rights in Swedish Match. The Wallenbergs paid Nobel Industrier <NOBL ST>, in which the Penser group has a 72 pct stake, about 20 pct over the market price for the Swedish Match option, the statement said. Swedish Match's B shares open to foreign buyers closed at 424 crowns on Friday. The A shares -- with increased voting rights -- closed at 450 crowns for the restricted and 455 for the free shares. The statement said the deal increased Investor's stake to 49.4 pct of the voting rights and 14.8 pct of the share capital while Providentia is left holding 34.1 pct of the voting rights and 14.5 pct of the share capital in Swedish Match. The Wallenbergs' stake in Swedish Match had previously amounted to 52 pct of the voting rights in the company.
The Swedish Match deal will cost the Wallenbergs about 400 mln crowns, share analysts said, making it one of the most expensive moves the group has undertaken in the last four years to defend its far-flung interests from outside predators. The Wallenbergs originally sold Nobel Industrier, an arms and chemicals group, to Penser in 1984 to pay for buying Volvo <VOLV ST> out of two other key group companies, Atlas Copco <ASTS ST> and Stora Kopparbergs <SKPS ST>. Since then, the Wallenbergs were ousted as the largest shareholders in SKF <SKFR ST> by Skanska AB <SKBS ST>, and Frederik Lundberg wrested control of Incentive AB from them. Lundberg, a Zurich-based Swedish property tycoon, also managed to acquire a 25 pct stake in another Wallenberg company, the dairy equipment firm Alfa-Laval AB <ALFS ST>. During 1986, the Wallenbergs concentrated on building up their stake in Investor and Providentia to prevent any raid on the heart of their business empire. But analysts say the Wallenbergs' position in the electrical engineering firm ASEA AB <ASEA ST> is also too small at 12.6 pct of the voting rights, and there has been growing speculation that the group will be forced to sell off fringe interests to protect its core activities. REUTER tm/inst/texts/acq/reut-00054.xml0000644000175100001440000000201012074065306015753 0ustar hornikusers 2-MAR-1987 11:23:45.24 acq italy spain F f0956 reute d f BC-MONTEDISON-CONCLUDES 03-02 0093 MONTEDISON CONCLUDES TALKS WITH ANTIBIOTICOS MILAN, March 2 - Montedison Spa <MONI.MI> said it has concluded its negotiations with Spanish pharmaceuticals company <Antibioticos SA>. A company spokesman told Reuters "We have concluded the talks and we are now awaiting authorization from Spanish authorities." He declined to comment further. Earlier today the Italian company postponed a scheduled press conference on its talks with Antibioticos. An Italian press report today said Montedison has agreed to acquire Antibioticos for 500 billion lire. REUTER tm/inst/texts/acq/reut-00017.xml0000644000175100001440000000200712074065306015750 0ustar hornikusers 26-FEB-1987 18:27:56.14 acq usa F f0324 reute d f BC-CIRCUIT-SYSTEMS-<CSYI 02-26 0098 CIRCUIT SYSTEMS <CSYI> BUYS BOARD MAKER ADDISON, Ill., Feb 26 - Circuit Systems Inc said it has bought all of the stock of (Ionic Industries Inc) in exchange for 3,677,272 shares of its common. Following the exchange there will be 4,969,643 shares of Circuit Systems stock outstanding. Ionic holders will own about 74 pct of the outstanding stock of Circuit Systems, it said. Ionic, a maker of circuit boards, had revenues of 8.4 mln dlrs and pretax profits of 232,000 dlrs in 1986, up from revenues of 5.9 mln and pretax profits of 204,000 dlrs in 1985, Circuit Systems said. Reuter tm/inst/texts/acq/reut-00011.xml0000644000175100001440000000213112074065306015740 0ustar hornikusers 26-FEB-1987 17:09:47.78 acq usa F f0146 reute r f BC-ROBESON 02-26 0113 INVESTMENT GROUP RAISES ROBESON <RBSN> STAKE WASHINGTON, Feb 26 - A group of affiliated Miami-based investment firms led by Fundamental Management Corp said it raised its stake in Robeson Industries Corp to 238,000 shares, or 14.6 pct of the total, from 205,000, or 12.8 pct. In a filing with the Securities and Exchange Commission, the group said it bought 32,800 Robeson common shares between Jan 26 and Feb 9 for 175,691 dlrs. The group said it may buy more shares and plans to study Robeson's operations. Afterwards it may recommend that management make changes in its operations. 
Fundamental Management Chairman Carl Singer was recently elected to the Robeson board. Reuter tm/inst/texts/acq/reut-00050.xml0000644000175100001440000000275612074065306015750 0ustar hornikusers 2-MAR-1987 10:59:16.80 earn acq E F f0832 reute r f BC-multi-step-to-sell 03-02 0108 MULTI-STEP TO SELL LADDER UNIT, CANCEL SHARES TORONTO, March 2 - <Multi-Step Products Inc>, earlier reporting an initial six-month loss, said it agreed to sell wholly owned Multi-Step Manufacturing Inc for 100,000 dlrs cash, subject to shareholder and regulatory approval. Multi-Step also said it will pay 900,000 dlrs to cancel 711,192 of its own shares, which will be acquired from Michael Penhale and his beneficiaries. Penhale will control and manage Multi-Step Manufacturing following the transactions. Multi-Step had a 739,146 dlr loss for the six months ended December 31. The company received its initial public listing in December. The company said its ladder-making unit has been losing 300,000 dlrs quarterly. The sale, expected to close in April, also calls for retirement of the unit's 400,000 dlr bank debt, Multi-Step said. The unit also has agreed to pay a debt of 400,000 dlrs to Tarxien Company Ltd, which is 40 pct owned by Multi-Step. Multi-Step previously said it agreed to acquire the remaining 60 pct of Tarxien it does not already own. Reuter tm/inst/texts/acq/reut-00009.xml0000644000175100001440000000136512074065306015757 0ustar hornikusers 26-FEB-1987 17:01:28.10 acq usa F f0121 reute u f BC-LIEBERT-CORP-<LIEB>-A 02-26 0051 LIEBERT CORP <LIEB> APPROVES MERGER COLUMBUS, Ohio, Feb 26 - Liebert Corp said its shareholders approved the merger with a wholly-owned subsidiary of Emerson Electric Co <EMR>. Under the terms of the merger, each Liebert shareholder will receive .3322 shares of Emerson stock for each Liebert share. Reuter tm/inst/texts/acq/reut-00025.xml0000644000175100001440000000212312074065306015746 0ustar hornikusers 2-MAR-1987 08:16:59.80 acq usa F f0267 reute r f BC-SCIENTIFIC-MICRO-SYST 03-02 0111 SCIENTIFIC MICRO SYSTEMS <SMSI> ACQUIRES SUPERMAC NEW YORK, March 2 - Scientific Micro Systems Inc said it has acquired Supermac Technology, a rapidly growing supplier of enhancement products and disc drive subsystems for the Apple personal computer market. Scientific Micro said it acquired all the common stock of Supermac in exchange for 1.05 mln shares of its own common stock. The stock closed at 5.50 dlrs bid on Friday. Supermac, a privately held firm based in Mountain View, California, as is Scientific Micro, reported a net profit of 300,000 dlrs on revenue of 9.5 mln dlrs in fiscal 1986. It expects its revenue to approximately double in 1987. Reuter tm/inst/texts/acq/reut-00030.xml0000644000175100001440000000714112074065306015747 0ustar hornikusers 2-MAR-1987 08:29:05.15 acq usa F f0315 reute u f PM-PUROLATOR 03-02 0102 PUROLATOR <PCC> IN BUYOUT WITH HUTTON <EFH> By Patti Domm NEW YORK, March 2 - New Jersey-based overnight messenger Purolator Courier Corp said it has agreed to be acquired for about 265 mln dlrs by a company formed by E.F. Hutton LBO Inc and certain managers of Purolator's U.S. courier business. Analysts have said that Purolator has been for sale for some time. Purolator announced earlier it was mulling a takeover bid, but analysts wrongly predicted the offer was from another courier company. Hutton LBO, a wholly owned subsidiary of E.F. Hutton Group Inc, will be majority owner of the company. 
Hutton said the acquiring company, PC Acquisition Inc, is paying 35 dlrs cash per share for 83 pct of Purolator's stock in a tender offer to begin Thursday. The rest of the shares will be purchased for securities and warrants to buy stock in a subsidiary of PC Acquisition, containing Purolator's U.S. courier operations. If all the shares of Purolator are tendered, shareholders would receive for each share 29 dlrs cash, six dlrs in debentures, and a warrant to buy shares in a subsidiary of PC Acquisition containing the U.S. courier operations. Hutton said that in the merger shareholders would get 46 mln dlrs aggregate amount of guaranteed debentures due 2002 of PC Acquisition and warrants to buy 15 pct of the common stock of the PC courier subsidiary. Hutton said the company has valued the warrants at two to three dlrs per share. Purolator's stock price closed at 35.125 dlrs on Friday. While some analysts estimated the company was worth in the mid-30s, at least one said it would be worth 38 to 42 dlrs. This follows sales of two other Purolator units. It agreed recently to sell its Canadian Courier unit to Onex Capital for 170 mln dlrs, and previously sold its auto filters business. Purolator retains its Stant division, which makes closure caps for radiators and gas tanks. A Hutton spokesman said the firm is reviewing its options on Stant. Purolator's courier business has been lagging that of its U.S. rivals because of the high price it paid in the past several years to add air delivery to its ground fleet. E.F. Hutton will provide 279 mln dlrs of its funds to complete the transaction. This so-called "bridge" financing will be replaced later with long-term debt, most likely in the form of bank loans, Hutton said. Hutton LBO is committed to keeping the courier business, its president Warren Idsal said. "Purolator lost 120 mln dlrs over the last two years largely due to U.S. courier operations, which we believe the management is turning around. We believe it will be a very serious competitor in the future," said Idsal. William Taggart, chief executive officer of the U.S. Courier division, will be chief executive officer of the new company. The tender offer will be conditioned on a minimum of two-thirds of the common stock being tendered and not withdrawn prior to the expiration of the offer as well as certain other conditions. The offer will begin Thursday, subject to clearances from the staff of the Interstate Commerce Commission, and will expire 20 business days after commencement unless extended. Reuter tm/inst/texts/acq/reut-00001.xml0000644000175100001440000000343012074065306015742 0ustar hornikusers 26-FEB-1987 15:18:06.67 acq usa F f0767 reute d f BC-COMPUTER-TERMINAL-SYS 02-26 0107 COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SALE COMMACK, N.Y., Feb 26 - Computer Terminal Systems Inc said it has completed the sale of 200,000 shares of its common stock, and warrants to acquire an additional one mln shares, to <Sedio N.V.> of Lugano, Switzerland, for 50,000 dlrs. The company said the warrants are exercisable for five years at a purchase price of .125 dlrs per share. Computer Terminal said Sedio also has the right to buy additional shares and increase its total holdings up to 40 pct of Computer Terminal's outstanding common stock under certain circumstances involving change of control at the company. The company said if the conditions occur the warrants would be exercisable at a price equal to 75 pct of its common stock's market price at the time, not to exceed 1.50 dlrs per share. 
Computer Terminal also said it sold the technology rights to its Dot Matrix impact technology, including any future improvements, to <Woodco Inc> of Houston, Tex., for 200,000 dlrs. But it said it would continue to be the exclusive worldwide licensee of the technology for Woodco. The company said the moves were part of its reorganization plan and would help pay current operation costs and ensure product delivery. Computer Terminal makes computer-generated labels, forms, tags and ticket printers and terminals. Reuter tm/inst/texts/acq/reut-00051.xml0000644000175100001440000000203412074065306015746 0ustar hornikusers 2-MAR-1987 10:59:28.36 acq usa sweden F f0833 reute r f BC-ESSELTE-BUSINESS-<ESB 03-02 0097 ESSELTE BUSINESS <ESB> UNIT BUYS ANTONSON UNIT GARDEN CITY, N.Y., March 2 - Esselte Business Systems Inc's Esselte Meto division said it has acquired the Antonson America Co, a subsidiary of <Antonson Machines AB>, of Sweden. Esselte said the Antonson unit, based in LaPorte, Indiana, manufactures scales and label printers. The company said the purchase is part of a plan to increase the range of retail electronic scales being offered by Esselte in the U.S. It said the acquisition will enable Esselte to increase its distribution base in its effort to grow in the U.S. Reuter tm/inst/texts/acq/reut-00028.xml0000644000175100001440000000221712074065306015755 0ustar hornikusers 2-MAR-1987 08:25:56.49 acq usa F f0301 reute r f BC-PENRIL-<PNL>-SEEKS-TO 03-02 0101 PENRIL <PNL> SEEKS TO SELL TWO UNITS ROCKVILLE, Md., March 2 - Penril Corp said it is seeking to sell its Triplett Electrical Instrument Corp subsidiary in Bluffton, Ohio, and Triplett's Alltest division in Hoffman Estates, Ill., as part of a plan to concentrate on its three profitable divisions and reduce its debt load. The company also said it is evaluating a plan to satisfy its obligations under its 10-7/8 pct subordinated notes but gave no details. Interest on the notes is due today. Penril further said director Clifford L. Alexander Jr. has resigned from the board. It gave no reason. Penril said shareholders at the annual meeting approved the limitation of directors' liability. Reuter tm/inst/texts/acq/reut-00043.xml0000644000175100001440000000170612074065306015754 0ustar hornikusers 2-MAR-1987 10:06:32.63 acq usa F f0625 reute u f BC-THE-JAPAN-FUND-<JPN> 03-02 0085 THE JAPAN FUND <JPN> GETS BUYOUT OFFER NEW YORK, March 2 - The Japan Fund Inc said it has received an unsolicited offer from <Sterling Grace Capital Management LP>, acting together with certain other persons and entities, to purchase all the assets of the fund at five pct below its aggregate net asset value. The Japan Fund said the deal is subject to obtaining satisfactory financing and a due diligence review. It added that the proposal has been referred to its Board of Directors for consideration. Reuter tm/inst/texts/acq/reut-00023.xml0000644000175100001440000000457012074065306015754 0ustar hornikusers 2-MAR-1987 06:54:19.43 acq uk usa RM F f0026 reute u f BC-EXCO-BUYS-U.S.-GOVERN 03-02 0114 EXCO BUYS U.S. GOVERNMENT SECURITIES BROKER LONDON, Mar 2 - <Exco International Plc>, a subsidiary of British and Commonwealth Shipping Co Plc <BCOM.L>, said it had agreed in principle to buy an 80 pct stake in <RMJ Holdings Corp> for about 79 mln dlrs. Exco Chairman Richard Lacy told Reuters the acquisition was being made from Bank of New York Co Inc <BK.N>, which currently holds a 50.1 pct stake, and from RMJ partners who hold the remainder. 
Bank of New York and the partners will retain about 10 pct each and these stakes will be bought over the next six years. RMJ is the holding company of RMJ Securities, one of the largest U.S. Government securities brokers. It is also involved in broking notes, obligations and other instruments sponsored by U.S. Federal agencies. Lacy said Exco had been considering buying a U.S. Government securities broker for the past four years and had made an offer for RMJ when it was sold by Security Pacific Corp <SPC.N> in 1985. RMJ was then valued at about 50 mln dlrs. B and C managing director Peter Goldie said RMJ would be bought at about the same multiple as Exco, suggesting net income of around 16 mln dlrs. The company's earnings had not been hit by the halving of brokerage fees some 14 months ago as volumes had since doubled. Lacy said that RMJ employed some 300 people, with 200 in the brokerage business and about 70 in its <SMS> unit, which provided computer software for the financial services community. RMJ Securities had offices in New York, where total market turnover of U.S. Government securities was 110 billion dlrs a day, and in London, where it had 15 billion. It was also given permission last week to open an office in Tokyo, where total market turnover had lifted rapidly to about five billion dlrs a day. The acquisition would contribute between five and 10 pct of B and C's share earnings in 1987 on a pro forma basis. REUTER tm/inst/texts/acq/reut-00026.xml0000644000175100001440000000777612074065306015766 0ustar hornikusers 2-MAR-1987 08:17:56.66 acq usa F f0274 reute u f PM-SHEARSON 03-02 0105 AMERICAN EXPRESS <AXP> VIEWING SHEARSON OPTIONS By Patti Domm, Reuters NEW YORK, March 2 - American Express Co, rumored to be considering a spinoff of part of Shearson Lehman Brothers Inc, said it is studying a range of options for its brokerage unit that could improve Shearson's access to capital and help it meet broadening international competition. In a joint statement, American Express and Shearson said the actions under consideration are an integral part of American Express' worldwide financial services strategy and that the two companies have been having both internal and external discussions on the matters. American Express said no decision has been reached on the strategic options and that it and Shearson could ultimately decide to follow growth plans already in place. Last week, rumors circulated on Wall Street that the financial services giant was considering a spinoff of part of Shearson and there was speculation it may be considering selling a stake to a Japanese firm. Analysts said the speculation also focused on American Express selling 20 pct of the profitable brokerage firm to the public. There was some speculation that American Express had also considered a total spinoff of Shearson, but the plan was considered highly unlikely, analysts said. American Express said in the statement on Sunday that it will not comment on rumors and speculation and a spokesman would not go beyond the statement. The company also remained silent last Thursday and Friday, as rumors drove American Express stock up a total of 5-1/2 dlrs in two days to bring it to a Friday close at 74. It said it issued the statement on Sunday because a similar statement was being circulated to employees. Analysts have been divided on whether it makes sense for American Express to give up a stake in the wholly-owned brokerage, which improved its after-tax earnings by about 50 pct in the last year. 
Some analysts said American Express may consider spinning off part of Shearson because it is concerned that its stock price does not fully reflect the value of the brokerage firm. Shearson contributed 316 mln dlrs of American Express' 1.25 billion dlr net in 1986. American Express' ambitious plans for international growth may also be enhanced by the added cash that spinning out part of Shearson would bring. Analysts speculated that all of Shearson would have a market value of about 3.5 billion dlrs. To some, however, the need for added capital is puzzling. "(American) Express is in a position where they can raise capital if they need to," said Larry Eckenfelder of Prudential-Bache Securities. Analysts said rumors were fed by the reorganization of Shearson management Wednesday. Chief operating officer Jeffrey Lane got the added, previously vacant, post of president. The reorganization also created four new positions for chairmen of Shearson's operating divisions, a move analysts speculated would allow Shearson to be a stand-alone company. Analysts contacted on Sunday said the statement does little to clarify last week's market speculation. It does confirm, however, that the financial services firm, which unsuccessfully attempted to expand Shearson with a major acquisition last year, is looking beyond its own walls for growth and positioning in the global market competition. Late last year, Shearson's takeover offer to the E.F. Hutton Group Inc was rejected by Hutton, and analysts said there had been speculation that Shearson also was rebuffed when it approached another major Wall Street brokerage. Reuter tm/inst/texts/acq/reut-00035.xml0000644000175100001440000000222212074065306015747 0ustar hornikusers 2-MAR-1987 09:03:18.94 acq uk usa F f0414 reute d f BC-SENIOR-ENGINEERING-MA 03-02 0117 SENIOR ENGINEERING MAKES 12.5 MLN DLR US PURCHASE LONDON, March 2 - <Senior Engineering Group Plc> said it reached agreement with <Cronus Industries Inc> to acquire the whole share capital of <South Western Engineering Co> for 12.5 mln dlrs cash. This sum is being financed by a term loan. South Western is one of the U.S.'s leading manufacturers of heat transfer equipment, with a turnover of 54.86 mln dlrs and pre-tax profits of 1.72 mln in 1986. Completion of the deal is conditional on approval under U.S. Hart-Scott-Rodino regulations, which is expected within 30 days. Some 350,000 dlrs is payable immediately, 12 mln dlrs payable on completion, with the balance due by June 30, 1987. Reuter