Lingua-StopWords-0.09000755 001751 001751 00000000000 11053555133 015667 5ustar00creamygcreamyg000000 000000 Lingua-StopWords-0.09/lib000755 001751 001751 00000000000 11053555133 016435 5ustar00creamygcreamyg000000 000000 Lingua-StopWords-0.09/lib/Lingua000755 001751 001751 00000000000 11053555133 017654 5ustar00creamygcreamyg000000 000000 Lingua-StopWords-0.09/lib/Lingua/StopWords000755 001751 001751 00000000000 11053555133 021620 5ustar00creamygcreamyg000000 000000 Lingua-StopWords-0.09/lib/Lingua/StopWords/SV.pm000644 001751 001751 00000003512 11053552117 022565 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::SV; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( och det att i en jag hon som han på den med var sig för så till är men ett om hade de av icke mig du henne då sin nu har inte hans honom skulle hennes där min man ej vid kunde något från ut när efter upp vi dem vara vad över än dig kan sina här ha mot alla under någon eller allt mycket sedan ju denna själv detta åt utan varit hur ingen mitt ni bli blev oss din dessa några deras blir mina samma vilken er sådan vår blivit dess inom mellan sådant varför varje vilka ditt vem vilket sitta sådana vart dina vars vårt våra ert era vilkas ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( och det att i en jag hon som han p den med var sig fr s till r men ett om hade de av icke mig du henne d sin nu har inte hans honom skulle hennes dr min man ej vid kunde ngot frn ut nr efter upp vi dem vara vad ver n dig kan sina hr ha mot alla under ngon eller allt mycket sedan ju denna sjlv detta t utan varit hur ingen mitt ni bli blev oss din dessa ngra deras blir mina samma vilken er sdan vr blivit dess 
inom mellan sdant varfr varje vilka ditt vem vilket sitta sdana vart dina vars vrt vra ert era vilkas ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/RU.pm000644 001751 001751 00000006022 11053552117 022562 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::RU; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( É × ×Ï ÎÅ ÞÔÏ ÏÎ ÎÁ Ñ Ó ÓÏ ËÁË Á ÔÏ ×ÓÅ ÏÎÁ ÔÁË ÅÇÏ ÎÏ ÄÁ ÔÙ Ë Õ ÖÅ ×Ù ÚÁ ÂÙ ÐÏ ÔÏÌØËÏ ÅÅ ÍÎÅ ÂÙÌÏ ×ÏÔ ÏÔ ÍÅÎÑ ÅÝÅ ÎÅÔ Ï ÉÚ ÅÍÕ ÔÅÐÅÒØ ËÏÇÄÁ ÄÁÖÅ ÎÕ ×ÄÒÕÇ ÌÉ ÅÓÌÉ ÕÖÅ ÉÌÉ ÎÉ ÂÙÔØ ÂÙÌ ÎÅÇÏ ÄÏ ×ÁÓ ÎÉÂÕÄØ ÏÐÑÔØ ÕÖ ×ÁÍ ÓËÁÚÁÌ ×ÅÄØ ÔÁÍ ÐÏÔÏÍ ÓÅÂÑ ÎÉÞÅÇÏ ÅÊ ÍÏÖÅÔ ÏÎÉ ÔÕÔ ÇÄÅ ÅÓÔØ ÎÁÄÏ ÎÅÊ ÄÌÑ ÍÙ ÔÅÂÑ ÉÈ ÞÅÍ ÂÙÌÁ ÓÁÍ ÞÔÏ ÂÅÚ ÂÕÄÔÏ ÞÅÌÏ×ÅË ÞÅÇÏ ÒÁÚ ÔÏÖÅ ÓÅÂÅ ÐÏÄ ÖÉÚÎØ ÂÕÄÅÔ Ö ÔÏÇÄÁ ËÔÏ ÜÔÏÔ ÇÏ×ÏÒÉÌ ÔÏÇÏ ÐÏÔÏÍÕ ÜÔÏÇÏ ËÁËÏÊ ÓÏ×ÓÅÍ ÎÉÍ ÚÄÅÓØ ÜÔÏÍ ÏÄÉÎ ÐÏÞÔÉ ÍÏÊ ÔÅÍ ÞÔÏÂÙ ÎÅÅ ËÁÖÅÔÓÑ ÓÅÊÞÁÓ ÂÙÌÉ ËÕÄÁ ÚÁÞÅÍ ÓËÁÚÁÔØ ×ÓÅÈ ÎÉËÏÇÄÁ ÓÅÇÏÄÎÑ ÍÏÖÎÏ ÐÒÉ ÎÁËÏÎÅà Ä×Á Ï ÄÒÕÇÏÊ ÈÏÔØ ÐÏÓÌÅ ÎÁÄ ÂÏÌØÛÅ ÔÏÔ ÞÅÒÅÚ ÜÔÉ ÎÁÓ ÐÒÏ ×ÓÅÇÏ ÎÉÈ ËÁËÁÑ ÍÎÏÇÏ ÒÁÚ×Å ÓËÁÚÁÌÁ ÔÒÉ ÜÔÕ ÍÏÑ ×ÐÒÏÞÅÍ ÈÏÒÏÛÏ Ó×ÏÀ ÜÔÏÊ ÐÅÒÅÄ ÉÎÏÇÄÁ ÌÕÞÛÅ ÞÕÔØ ÔÏÍ ÎÅÌØÚÑ ÔÁËÏÊ ÉÍ ÂÏÌÅÅ ×ÓÅÇÄÁ ËÏÎÅÞÎÏ ×ÓÀ ÍÅÖÄÕ ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/NO.pm000644 001751 001751 00000005020 11053552117 022545 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::NO; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } 
qw( og i jeg det at en et den til er som på de med han av ikke ikkje der så var meg seg men ett har om vi min mitt ha hadde hun nå over da ved fra du ut sin dem oss opp man kan hans hvor eller hva skal selv sjøl her alle vil bli ble blei blitt kunne inn når være kom noen noe ville dere som deres kun ja etter ned skulle denne for deg si sine sitt mot å meget hvorfor dette disse uten hvordan ingen din ditt blir samme hvilken hvilke sånn inni mellom vår hver hvem vors hvis både bare enn fordi før mange også slik vært være båe begge siden dykk dykkar dei deira deires deim di då eg ein eit eitt elles honom hjå ho hoe henne hennar hennes hoss hossen ikkje ingi inkje korleis korso kva kvar kvarhelst kven kvi kvifor me medan mi mine mykje no nokon noka nokor noko nokre si sia sidan so somt somme um upp vere vore verte vort varte vart ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( og i jeg det at en et den til er som p de med han av ikke ikkje der s var meg seg men ett har om vi min mitt ha hadde hun n over da ved fra du ut sin dem oss opp man kan hans hvor eller hva skal selv sjl her alle vil bli ble blei blitt kunne inn nr vre kom noen noe ville dere som deres kun ja etter ned skulle denne for deg si sine sitt mot meget hvorfor dette disse uten hvordan ingen din ditt blir samme hvilken hvilke snn inni mellom vr hver hvem vors hvis bde bare enn fordi fr mange ogs slik vrt vre be begge siden dykk dykkar dei deira deires deim di d eg ein eit eitt elles honom hj ho hoe henne hennar hennes hoss hossen ikkje ingi inkje korleis korso kva kvar kvarhelst kven kvi kvifor me medan mi mine mykje no nokon noka nokor noko nokre si sia sidan so somt somme um upp vere vore verte vort varte vart ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/ES.pm000644 001751 001751 00000012667 11053555042 022557 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::ES; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS 
= ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( de la que el en y a los del se las por un para con no una su al lo como más pero sus le ya o este sí porque esta entre cuando muy sin sobre también me hasta hay donde quien desde todo nos durante todos uno les ni contra otros ese eso ante ellos e esto mí antes algunos qué unos yo otro otras otra él tanto esa estos mucho quienes nada muchos cual poco ella estar estas algunas algo nosotros mi mis tú te ti tu tus ellas nosotras vosotros vosotras os mío mía míos mías tuyo tuya tuyos tuyas suyo suya suyos suyas nuestro nuestra nuestros nuestras vuestro vuestra vuestros vuestras esos esas estoy estás está estamos estáis están esté estés estemos estéis estén estaré estarás estará estaremos estaréis estarán estaría estarías estaríamos estaríais estarían estaba estabas estábamos estabais estaban estuve estuviste estuvo estuvimos estuvisteis estuvieron estuviera estuvieras estuviéramos estuvierais estuvieran estuviese estuvieses estuviésemos estuvieseis estuviesen estando estado estada estados estadas estad he has ha hemos habéis han haya hayas hayamos hayáis hayan habré habrás habrá habremos habréis habrán habría habrías habríamos habríais habrían había habías habíamos habíais habían hube hubiste hubo hubimos hubisteis hubieron hubiera hubieras hubiéramos hubierais hubieran hubiese hubieses hubiésemos hubieseis hubiesen habiendo habido habida habidos habidas soy eres es somos sois son sea seas seamos seáis sean seré serás será seremos seréis serán sería serías seríamos seríais serían era eras éramos erais eran fui fuiste fue fuimos fuisteis fueron fuera fueras fuéramos fuerais fueran fuese fueses fuésemos fueseis fuesen siendo sido tengo tienes tiene tenemos tenéis tienen tenga tengas tengamos tengáis tengan tendré 
tendrás tendrá tendremos tendréis tendrán tendría tendrías tendríamos tendríais tendrían tenía tenías teníamos teníais tenían tuve tuviste tuvo tuvimos tuvisteis tuvieron tuviera tuvieras tuviéramos tuvierais tuvieran tuviese tuvieses tuviésemos tuvieseis tuviesen teniendo tenido tenida tenidos tenidas tened ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( de la que el en y a los del se las por un para con no una su al lo como ms pero sus le ya o este s porque esta entre cuando muy sin sobre tambin me hasta hay donde quien desde todo nos durante todos uno les ni contra otros ese eso ante ellos e esto m antes algunos qu unos yo otro otras otra l tanto esa estos mucho quienes nada muchos cual poco ella estar estas algunas algo nosotros mi mis t te ti tu tus ellas nosotras vosotros vosotras os mo ma mos mas tuyo tuya tuyos tuyas suyo suya suyos suyas nuestro nuestra nuestros nuestras vuestro vuestra vuestros vuestras esos esas estoy ests est estamos estis estn est ests estemos estis estn estar estars estar estaremos estaris estarn estara estaras estaramos estarais estaran estaba estabas estbamos estabais estaban estuve estuviste estuvo estuvimos estuvisteis estuvieron estuviera estuvieras estuviramos estuvierais estuvieran estuviese estuvieses estuvisemos estuvieseis estuviesen estando estado estada estados estadas estad he has ha hemos habis han haya hayas hayamos hayis hayan habr habrs habr habremos habris habrn habra habras habramos habrais habran haba habas habamos habais haban hube hubiste hubo hubimos hubisteis hubieron hubiera hubieras hubiramos hubierais hubieran hubiese hubieses hubisemos hubieseis hubiesen habiendo habido habida habidos habidas soy eres es somos sois son sea seas seamos seis sean ser sers ser seremos seris sern sera seras seramos serais seran era eras ramos erais eran fui fuiste fue fuimos fuisteis fueron fuera fueras furamos fuerais fueran fuese fueses fusemos fueseis fuesen siendo sido tengo tienes tiene tenemos tenis 
tienen tenga tengas tengamos tengis tengan tendr tendrs tendr tendremos tendris tendrn tendra tendras tendramos tendrais tendran tena tenas tenamos tenais tenan tuve tuviste tuvo tuvimos tuvisteis tuvieron tuviera tuvieras tuviramos tuvierais tuvieran tuviese tuvieses tuvisemos tuvieseis tuviesen teniendo tenido tenida tenidos tenidas tened ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/FR.pm000644 001751 001751 00000004623 11053552117 022550 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::FR; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( au aux avec ce ces dans de des du elle en et eux il je la le leur lui ma mais me même mes moi mon ne nos notre nous on ou par pas pour qu que qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l à m n s t y été étée étées étés étant étante étants étantes suis es est sommes êtes sont serai seras sera serons serez seront serais serait serions seriez seraient étais était étions étiez étaient fus fut fûmes fûtes furent sois soit soyons soyez soient fusse fusses fût fussions fussiez fussent ayant ayante ayantes ayants eu eue eues eus ai as avons avez ont aurai auras aura aurons aurez auront aurais aurait aurions auriez auraient avais avait avions aviez avaient eut eûmes eûtes eurent aie aies ait ayons ayez aient eusse eusses eût eussions eussiez eussent ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( au aux avec ce ces dans de des du elle en et eux il je la le leur lui ma mais me mme mes moi mon ne nos notre nous on ou par pas pour qu que qui sa se ses son sur ta te tes toi ton tu un une vos votre vous c d j l m n s t y t te tes ts tant tante tants tantes suis es 
est sommes tes sont serai seras sera serons serez seront serais serait serions seriez seraient tais tait tions tiez taient fus fut fmes ftes furent sois soit soyons soyez soient fusse fusses ft fussions fussiez fussent ayant ayante ayantes ayants eu eue eues eus ai as avons avez ont aurai auras aura aurons aurez auront aurais aurait aurions auriez auraient avais avait avions aviez avaient eut emes etes eurent aie aies ait ayons ayez aient eusse eusses et eussions eussiez eussent ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/EN.pm000644 001751 001751 00000005437 11053552117 022547 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::EN; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing would should could ought i'm you're he's she's it's we're they're i've you've we've they've i'd you'd he'd she'd we'd they'd i'll you'll he'll she'll we'll they'll isn't aren't wasn't weren't hasn't haven't hadn't doesn't don't didn't won't wouldn't shan't shouldn't can't cannot couldn't mustn't let's that's who's what's here's there's when's where's why's how's a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very ); return 
\%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( i me my myself we our ours ourselves you your yours yourself yourselves he him his himself she her hers herself it its itself they them their theirs themselves what which who whom this that these those am is are was were be been being have has had having do does did doing would should could ought i'm you're he's she's it's we're they're i've you've we've they've i'd you'd he'd she'd we'd they'd i'll you'll he'll she'll we'll they'll isn't aren't wasn't weren't hasn't haven't hadn't doesn't don't didn't won't wouldn't shan't shouldn't can't cannot couldn't mustn't let's that's who's what's here's there's when's where's why's how's a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such no nor not only own same so than too very ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/DE.pm000644 001751 001751 00000007311 11053552117 022526 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::DE; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( aber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei bin bis bist da damit dann der den des dem die das daß derselbe derselben denselben desselben demselben dieselbe dieselben dasselbe dazu dein deine deinem deinen deiner deines denn derer dessen dich dir du dies diese diesem diesen dieser dieses doch dort durch ein eine einem einen einer eines einig einige 
einigem einigen einiger einiges einmal er ihn ihm es etwas euer eure eurem euren eurer eures für gegen gewesen hab habe haben hat hatte hatten hier hin hinter ich mich mir ihr ihre ihrem ihren ihrer ihres euch im in indem ins ist jede jedem jeden jeder jedes jene jenem jenen jener jenes jetzt kann kein keine keinem keinen keiner keines können könnte machen man manche manchem manchen mancher manches mein meine meinem meinen meiner meines mit muss musste nach nicht nichts noch nun nur ob oder ohne sehr sein seine seinem seinen seiner seines selbst sich sie ihnen sind so solche solchem solchen solcher solches soll sollte sondern sonst über um und uns unse unsem unsen unser unses unter viel vom von vor während war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte würde würden zu zum zur zwar zwischen ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( aber alle allem allen aller alles als also am an ander andere anderem anderen anderer anderes anderm andern anderr anders auch auf aus bei bin bis bist da damit dann der den des dem die das da derselbe derselben denselben desselben demselben dieselbe dieselben dasselbe dazu dein deine deinem deinen deiner deines denn derer dessen dich dir du dies diese diesem diesen dieser dieses doch dort durch ein eine einem einen einer eines einig einige einigem einigen einiger einiges einmal er ihn ihm es etwas euer eure eurem euren eurer eures fr gegen gewesen hab habe haben hat hatte hatten hier hin hinter ich mich mir ihr ihre ihrem ihren ihrer ihres euch im in indem ins ist jede jedem jeden jeder jedes jene jenem jenen jener jenes jetzt kann kein keine keinem keinen keiner keines knnen knnte machen man manche manchem manchen mancher manches mein meine meinem meinen meiner meines mit muss musste nach nicht nichts noch nun nur ob oder ohne sehr sein seine seinem seinen seiner seines selbst sich sie ihnen sind so solche solchem 
solchen solcher solches soll sollte sondern sonst ber um und uns unse unsem unsen unser unses unter viel vom von vor whrend war waren warst was weg weil weiter welche welchem welchen welcher welches wenn werde werden wie wieder will wir wird wirst wo wollen wollte wrde wrden zu zum zur zwar zwischen ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/PT.pm000644 001751 001751 00000006745 11053552117 022573 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::PT; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( de a o que e do da em um para com não uma os no se na por mais as dos como mas ao ele das à seu sua ou quando muito nos já eu também só pelo pela até isso ela entre depois sem mesmo aos seus quem nas me esse eles você essa num nem suas meu às minha numa pelos elas qual nós lhe deles essas esses pelas este dele tu te vocês vos lhes meus minhas teu tua teus tuas nosso nossa nossos nossas dela delas esta estes estas aquele aquela aqueles aquelas isto aquilo estou está estamos estão estive esteve estivemos estiveram estava estávamos estavam estivera estivéramos esteja estejamos estejam estivesse estivéssemos estivessem estiver estivermos estiverem hei há havemos hão houve houvemos houveram houvera houvéramos haja hajamos hajam houvesse houvéssemos houvessem houver houvermos houverem houverei houverá houveremos houverão houveria houveríamos houveriam sou somos são era éramos eram fui foi fomos foram fora fôramos seja sejamos sejam fosse fôssemos fossem for formos forem serei será seremos serão seria seríamos seriam tenho tem temos tém tinha tínhamos tinham tive teve tivemos tiveram tivera tivéramos tenha tenhamos tenham tivesse tivéssemos 
tivessem tiver tivermos tiverem terei terá teremos terão teria teríamos teriam ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( de a o que e do da em um para com no uma os no se na por mais as dos como mas ao ele das seu sua ou quando muito nos j eu tambm s pelo pela at isso ela entre depois sem mesmo aos seus quem nas me esse eles voc essa num nem suas meu s minha numa pelos elas qual ns lhe deles essas esses pelas este dele tu te vocs vos lhes meus minhas teu tua teus tuas nosso nossa nossos nossas dela delas esta estes estas aquele aquela aqueles aquelas isto aquilo estou est estamos esto estive esteve estivemos estiveram estava estvamos estavam estivera estivramos esteja estejamos estejam estivesse estivssemos estivessem estiver estivermos estiverem hei h havemos ho houve houvemos houveram houvera houvramos haja hajamos hajam houvesse houvssemos houvessem houver houvermos houverem houverei houver houveremos houvero houveria houveramos houveriam sou somos so era ramos eram fui foi fomos foram fora framos seja sejamos sejam fosse fssemos fossem for formos forem serei ser seremos sero seria seramos seriam tenho tem temos tm tinha tnhamos tinham tive teve tivemos tiveram tivera tivramos tenha tenhamos tenham tivesse tivssemos tivessem tiver tivermos tiverem terei ter teremos tero teria teramos teriam ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/IT.pm000644 001751 001751 00000010624 11053552117 022553 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::IT; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( ad al allo ai agli all agl alla alle con col coi da dal dallo dai dagli dall dagl dalla dalle di del dello dei degli 
dell degl della delle in nel nello nei negli nell negl nella nelle su sul sullo sui sugli sull sugl sulla sulle per tra contro io tu lui lei noi voi loro mio mia miei mie tuo tua tuoi tue suo sua suoi sue nostro nostra nostri nostre vostro vostra vostri vostre mi ti ci vi lo la li le gli ne il un uno una ma ed se perché anche come dov dove che chi cui non più quale quanto quanti quanta quante quello quelli quella quelle questo questi questa queste si tutto tutti a c e i l o ho hai ha abbiamo avete hanno abbia abbiate abbiano avrò avrai avrà avremo avrete avranno avrei avresti avrebbe avremmo avreste avrebbero avevo avevi aveva avevamo avevate avevano ebbi avesti ebbe avemmo aveste ebbero avessi avesse avessimo avessero avendo avuto avuta avuti avute sono sei è siamo siete sia siate siano sarò sarai sarà saremo sarete saranno sarei saresti sarebbe saremmo sareste sarebbero ero eri era eravamo eravate erano fui fosti fu fummo foste furono fossi fosse fossimo fossero essendo faccio fai facciamo fanno faccia facciate facciano farò farai farà faremo farete faranno farei faresti farebbe faremmo fareste farebbero facevo facevi faceva facevamo facevate facevano feci facesti fece facemmo faceste fecero facessi facesse facessimo facessero facendo sto stai sta stiamo stanno stia stiate stiano starò starai starà staremo starete staranno starei staresti starebbe staremmo stareste starebbero stavo stavi stava stavamo stavate stavano stetti stesti stette stemmo steste stettero stessi stesse stessimo stessero stando ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( ad al allo ai agli all agl alla alle con col coi da dal dallo dai dagli dall dagl dalla dalle di del dello dei degli dell degl della delle in nel nello nei negli nell negl nella nelle su sul sullo sui sugli sull sugl sulla sulle per tra contro io tu lui lei noi voi loro mio mia miei mie tuo tua tuoi tue suo sua suoi sue nostro nostra nostri nostre vostro vostra vostri vostre mi ti ci vi lo la li le gli 
ne il un uno una ma ed se perch anche come dov dove che chi cui non pi quale quanto quanti quanta quante quello quelli quella quelle questo questi questa queste si tutto tutti a c e i l o ho hai ha abbiamo avete hanno abbia abbiate abbiano avr avrai avr avremo avrete avranno avrei avresti avrebbe avremmo avreste avrebbero avevo avevi aveva avevamo avevate avevano ebbi avesti ebbe avemmo aveste ebbero avessi avesse avessimo avessero avendo avuto avuta avuti avute sono sei siamo siete sia siate siano sar sarai sar saremo sarete saranno sarei saresti sarebbe saremmo sareste sarebbero ero eri era eravamo eravate erano fui fosti fu fummo foste furono fossi fosse fossimo fossero essendo faccio fai facciamo fanno faccia facciate facciano far farai far faremo farete faranno farei faresti farebbe faremmo fareste farebbero facevo facevi faceva facevamo facevate facevano feci facesti fece facemmo faceste fecero facessi facesse facessimo facessero facendo sto stai sta stiamo stanno stia stiate stiano star starai star staremo starete staranno starei staresti starebbe staremmo stareste starebbero stavo stavi stava stavamo stavate stavano stetti stesti stette stemmo steste stettero stessi stesse stessimo stessero stando ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/HU.pm000644 001751 001751 00000006516 11053555042 022560 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::HU; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( a ahogy ahol aki akik akkor alatt által általában amely amelyek amelyekben amelyeket amelyet amelynek ami amit amolyan amíg amikor át abban ahhoz annak arra arról az azok azon azt azzal azért aztán azután azonban bár be belül benne 
cikk cikkek cikkeket csak de e eddig egész egy egyes egyetlen egyéb egyik egyre ekkor el elég ellen elõ elõször elõtt elsõ én éppen ebben ehhez emilyen ennek erre ez ezt ezek ezen ezzel ezért és fel felé hanem hiszen hogy hogyan igen így illetve ill. ill ilyen ilyenkor ison ismét itt jó jól jobban kell kellett keresztül keressünk ki kívül között közül legalább lehet lehetett legyen lenne lenni lesz lett maga magát majd majd már más másik meg még mellett mert mely melyek mi mit míg miért milyen mikor minden mindent mindenki mindig mint mintha mivel most nagy nagyobb nagyon ne néha nekem neki nem néhány nélkül nincs olyan ott össze õ õk õket pedig persze rá s saját sem semmi sok sokat sokkal számára szemben szerint szinte talán tehát teljes tovább továbbá több úgy ugyanis új újabb újra után utána utolsó vagy vagyis valaki valami valamint való vagyok van vannak volt voltam voltak voltunk vissza vele viszont volna ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( a ahogy ahol aki akik akkor alatt ltal ltalban amely amelyek amelyekben amelyeket amelyet amelynek ami amit amolyan amg amikor t abban ahhoz annak arra arrl az azok azon azt azzal azrt aztn azutn azonban br be bell benne cikk cikkek cikkeket csak de e eddig egsz egy egyes egyetlen egyb egyik egyre ekkor el elg ellen el elszr eltt els n ppen ebben ehhez emilyen ennek erre ez ezt ezek ezen ezzel ezrt s fel fel hanem hiszen hogy hogyan igen gy illetve ill. 
ill ilyen ilyenkor ison ismt itt j jl jobban kell kellett keresztl keressnk ki kvl kztt kzl legalbb lehet lehetett legyen lenne lenni lesz lett maga magt majd majd mr ms msik meg mg mellett mert mely melyek mi mit mg mirt milyen mikor minden mindent mindenki mindig mint mintha mivel most nagy nagyobb nagyon ne nha nekem neki nem nhny nlkl nincs olyan ott ssze k ket pedig persze r s sajt sem semmi sok sokat sokkal szmra szemben szerint szinte taln teht teljes tovbb tovbb tbb gy ugyanis j jabb jra utn utna utols vagy vagyis valaki valami valamint val vagyok van vannak volt voltam voltak voltunk vissza vele viszont volna ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/FI.pm000644 001751 001751 00000010161 11053552117 022531 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::FI; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( olla olen olet on olemme olette ovat ole oli olisi olisit olisin olisimme olisitte olisivat olit olin olimme olitte olivat ollut olleet en et ei emme ette eivät minä minun minut minua minussa minusta minuun minulla minulta minulle sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle me meidän meidät meitä meissä meistä meihin meillä meiltä meille te teidän teidät teitä teissä teistä teihin teillä teiltä teille he heidän heidät heitä heissä heistä heihin heillä heiltä heille tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi se sen sitä siinä siitä siihen sillä siltä sille sinä siksi nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi 
nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi mitkä joka jonka jota jossa josta johon jolla jolta jolle jona joksi jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi että ja jos koska kuin mutta niin sekä sillä tai vaan vai vaikka kanssa mukaan noin poikki yli kun niin nyt itse ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( olla olen olet on olemme olette ovat ole oli olisi olisit olisin olisimme olisitte olisivat olit olin olimme olitte olivat ollut olleet en et ei emme ette eivt min minun minut minua minussa minusta minuun minulla minulta minulle sin sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle hn hnen hnet hnt hness hnest hneen hnell hnelt hnelle me meidn meidt meit meiss meist meihin meill meilt meille te teidn teidt teit teiss teist teihin teill teilt teille he heidn heidt heit heiss heist heihin heill heilt heille tm tmn tt tss tst thn tall tlt tlle tn tksi tuo tuon tuot tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi se sen sit siin siit siihen sill silt sille sin siksi nm niden nit niss nist nihin nill nilt nille nin niksi nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi ne niiden niit niiss niist niihin niill niilt niille niin niiksi kuka kenen kenet ket keness kenest keneen kenell kenelt kenelle kenen keneksi ketk keiden ketk keit keiss keist keihin keill keilt keille kein keiksi mik mink mink mit miss mist mihin mill milt mille min miksi mitk joka jonka jota jossa josta johon jolla jolta jolle jona joksi jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi ett ja jos koska kuin mutta niin sek sill tai vaan vai vaikka 
kanssa mukaan noin poikki yli kun niin nyt itse ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/NL.pm000644 001751 001751 00000003165 11053552117 022552 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::NL; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { # adding U0 causes the result to be flagged as UTF-8 my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw( de en van ik te dat die in een hij het niet zijn is was op aan met als voor had er maar om hem dan zou of wat mijn men dit zo door over ze zich bij ook tot je mij uit der daar haar naar heb hoe heeft hebben deze u want nog zal me zij nu ge geen omdat iets worden toch al waren veel meer doen toen moet ben zonder kan hun dus alles onder ja eens hier wie werd altijd doch wordt wezen kunnen ons zelf tegen na reeds wil kon niets uw iemand geweest andere ); return \%stoplist; } else { my %stoplist = map { ( $_, 1 ) } qw( de en van ik te dat die in een hij het niet zijn is was op aan met als voor had er maar om hem dan zou of wat mijn men dit zo door over ze zich bij ook tot je mij uit der daar haar naar heb hoe heeft hebben deze u want nog zal me zij nu ge geen omdat iets worden toch al waren veel meer doen toen moet ben zonder kan hun dus alles onder ja eens hier wie werd altijd doch wordt wezen kunnen ons zelf tegen na reeds wil kon niets uw iemand geweest andere ); return \%stoplist; } } 1; Lingua-StopWords-0.09/lib/Lingua/StopWords/DA.pm000644 001751 001751 00000003035 11053552117 022521 0ustar00creamygcreamyg000000 000000 package Lingua::StopWords::DA; use strict; use warnings; use Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] ); our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); our $VERSION = 0.08; sub getStopWords { if ( @_ and $_[0] eq 'UTF-8' ) { 
# Return the stoplist for a language as a hashref (stopword => 1).
#
# Arguments:
#   $language - required; ISO code of a supported language, in any case
#               (it is upper-cased to find Lingua::StopWords::XX).
#   $encoding - optional; passed through to the language module's
#               getStopWords() when true (e.g. 'UTF-8').
#
# Returns undef when no language is given, when the code is not a plain
# identifier, when no matching module can be loaded, or when the loaded
# module does not provide getStopWords().
#
# "return undef" rather than a bare "return" is deliberate: it is the
# documented contract, and existing callers may rely on receiving undef.
sub getStopWords {
    my ( $language, $encoding ) = @_;
    return undef unless $language;

    $language = uc($language);

    # The code is interpolated into a file path and a package name below,
    # so refuse anything that is not a plain identifier.  Without this
    # check a value such as '../../Foo' would make require() load an
    # arbitrary file reachable from @INC.
    return undef unless $language =~ /\A[A-Z0-9_]+\z/;

    # A failed require leaves its message in $@, as before.
    eval { require "Lingua/StopWords/$language.pm"; 1 }
        or return undef;

    # Resolve the sub through can() instead of a symbolic reference, so
    # strict refs stays on and a module lacking the sub yields undef
    # instead of a fatal "Undefined subroutine" error.
    my $getter = "Lingua::StopWords::$language"->can('getStopWords')
        or return undef;

    my @args = $encoding ? ($encoding) : ();
    return $getter->(@args);
}
=head1 SYNOPSIS use Lingua::StopWords qw( getStopWords ); my $stopwords = getStopWords('en'); my @words = qw( i am the walrus goo goo g'joob ); # prints "walrus goo goo g'joob" print join ' ', grep { !$stopwords->{$_} } @words; =head1 DESCRIPTION In keyword search, it is common practice to suppress a collection of "stopwords": words such as "the", "and", "maybe", etc. which exist in a large number of documents and do not tell you anything important about any document which contains them. This module provides such "stoplists" in several languages. =head2 Supported Languages |-----------------------------------------------------------| | Language | ISO code | default encoding | also available | |-----------------------------------------------------------| | Danish | da | ISO-8859-1 | UTF-8 | | Dutch | nl | ISO-8859-1 | UTF-8 | | English | en | ISO-8859-1 | UTF-8 | | Finnish | fi | ISO-8859-1 | UTF-8 | | French | fr | ISO-8859-1 | UTF-8 | | German | de | ISO-8859-1 | UTF-8 | | Hungarian | hu | ISO-8859-1 | UTF-8 | | Italian | it | ISO-8859-1 | UTF-8 | | Norwegian | no | ISO-8859-1 | UTF-8 | | Portuguese | pt | ISO-8859-1 | UTF-8 | | Spanish | es | ISO-8859-1 | UTF-8 | | Swedish | sv | ISO-8859-1 | UTF-8 | | Russian | ru | KOI8-R | UTF-8 | |-----------------------------------------------------------| =head1 FUNCTIONS =head2 getStopWords my $stoplist = getStopWords('en'); my $utf8_stoplist = getStopWords('en', 'UTF-8'); Retrieve a stoplist in the form of a hashref where the keys are all stopwords and the values are all 1. $stoplist = { and => 1, if => 1, # ... }; getStopWords() expects 1-2 arguments. The first, which is required, is an ISO code representing a supported language. If the ISO code cannot be found, getStopWords returns undef. The second argument should be 'UTF-8' if you want the stopwords encoded in UTF-8. The UTF-8 flag will be turned on, so make sure you understand all the implications of that. 
=head1 SEE ALSO The stoplists supplied by this module were created as part of the Snowball project (see L<http://snowball.tartarus.org>, L<Lingua::Stem::Snowball>). L<Lingua::EN::StopWords> provides a different stoplist for English. =head1 AUTHOR Maintained by Marvin Humphrey E<lt>marvin at rectangular dot comE<gt>. Original author Fabien Potencier, E<lt>fabpot at cpan dot orgE<gt>. =head1 COPYRIGHT AND LICENSE Copyright 2004-2008 Fabien Potencier, Marvin Humphrey This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.3 or, at your option, any later version of Perl 5 you may have available. =cut Lingua-StopWords-0.09/Changes000644 001751 001751 00000001640 11053555042 017241 0ustar00creamygcreamyg000000 000000 Revision history for Perl extension Lingua::StopWords 0.09 2008-08-22 * Update Spanish stoplist to latest Snowball version. This is a backwards-incompatible change, but the old list was broken, containing "vosostros" instead of "vosotros" and so on. * Add Hungarian. 0.08 2006-09-19 * Fix version mismatch between main module and submodules. 0.07 2006-09-19 * Fix failing test: skip instead of fail if Test::Pod::Coverage can't be found. 0.06 2006-08-18 * Add Finnish stoplist. 
0.05 2006-02-19 * Fix versioning problem with submodules 0.04 2006-02-19 * Minor mods to documentation 0.03 2006-02-19 * New maintainer Marvin Humphrey * Add UTF-8 option * Add Russian 0.02 2004-04-18 * fix the test file (no need to use Data::Denter) 0.01 Sat Apr 10 19:31:27 2004 * original version; created by h2xs 1.23 with options -X Lingua::StopWords Lingua-StopWords-0.09/t000755 001751 001751 00000000000 11053555133 016132 5ustar00creamygcreamyg000000 000000 Lingua-StopWords-0.09/t/1-basic.t000644 001751 001751 00000001212 11053552117 017610 0ustar00creamygcreamyg000000 000000 use strict; use Test::More tests => 8; BEGIN { use_ok('Lingua::StopWords'); use_ok('Lingua::StopWords::EN'); use_ok('Lingua::StopWords::FR'); }; my $wordlist = Lingua::StopWords::getStopWords('en'); ok($wordlist->{me}); ok(!$wordlist->{moi}); my $wordlist3 = Lingua::StopWords::getStopWords('xx'); is($wordlist3, undef); my $wordlist1 = Lingua::StopWords::getStopWords('fr'); my $wordlist2 = Lingua::StopWords::FR::getStopWords(); is_deeply($wordlist1, $wordlist2); my $text = 'ceci est un texte avec des mots au hasard'; my @words = split / /, $text; my $t = join ' ', grep { !$wordlist1->{$_} } @words; is ($t, 'ceci texte mots hasard'); Lingua-StopWords-0.09/t/2-utf8.t000644 001751 001751 00000001355 11053552117 017426 0ustar00creamygcreamyg000000 000000 use strict; use Test::More; BEGIN { eval "use Encode qw( _utf8_on is_utf8 );"; if ($@) { plan skip_all => "Encode module not available"; } else { plan tests => 5; } use_ok('Lingua::StopWords'); } my $stoplist = Lingua::StopWords::getStopWords( 'fr', 'UTF-8' ); my $utf8_ete = "été"; _utf8_on($utf8_ete); ok( $stoplist->{$utf8_ete}, "UTF-8 encoded version present in stoplist" ); for ( keys %$stoplist ) { ok( is_utf8($_), "the stoplist keys are flagged as UTF-8" ); last; } $stoplist = Lingua::StopWords::getStopWords('fr'); ok( $stoplist->{"t"}, "Non-utf8-flagged version present" ); for ( keys %$stoplist ) { ok( !is_utf8($_), "the stoplist keys are 
not flagged as UTF-8" ); last; } Lingua-StopWords-0.09/t/99-TestPodCoverage.t000644 001751 001751 00000000430 11053552117 021667 0ustar00creamygcreamyg000000 000000 #!/usr/bin/perl use Test::More; if ( eval "use Test::Pod::Coverage; 1" ) { plan( tests => 1 ); pod_coverage_ok( "Lingua::StopWords", "Pod coverage is OK for Lingua::StopWords" ); } else { plan( skip_all => "Test::Pod::Coverage required for testing POD" ); } Lingua-StopWords-0.09/t/98-TestPod.t000644 001751 001751 00000000232 11053552117 020212 0ustar00creamygcreamyg000000 000000 #!/usr/bin/perl use Test::More; plan skip_all => "Test::Pod 1.00 required for testing POD" unless eval "use Test::Pod 1.00; 1"; all_pod_files_ok(); Lingua-StopWords-0.09/MANIFEST000644 001751 001751 00000001037 11053555042 017077 0ustar00creamygcreamyg000000 000000 Changes devel/gen_modules.plx lib/Lingua/StopWords.pm lib/Lingua/StopWords/DA.pm lib/Lingua/StopWords/DE.pm lib/Lingua/StopWords/EN.pm lib/Lingua/StopWords/ES.pm lib/Lingua/StopWords/FI.pm lib/Lingua/StopWords/FR.pm lib/Lingua/StopWords/HU.pm lib/Lingua/StopWords/IT.pm lib/Lingua/StopWords/NL.pm lib/Lingua/StopWords/NO.pm lib/Lingua/StopWords/PT.pm lib/Lingua/StopWords/RU.pm lib/Lingua/StopWords/SV.pm Makefile.PL MANIFEST META.yml Module meta-data (added by MakeMaker) README t/1-basic.t t/2-utf8.t t/98-TestPod.t t/99-TestPodCoverage.t Lingua-StopWords-0.09/devel000755 001751 001751 00000000000 11053555133 016766 5ustar00creamygcreamyg000000 000000 Lingua-StopWords-0.09/devel/gen_modules.plx000755 001751 001751 00000004750 11053555042 022103 0ustar00creamygcreamyg000000 000000 #!/usr/bin/perl # use to automatically generate the Lingua::StopWords::XX modules. 
use lib qw( lib );
use strict;
use warnings;
use Lingua::StopWords;
use Lingua::Stem::Snowball qw( stemmers );
use Encode qw( from_to );
use Text::Wrap qw( wrap );
use Getopt::Long;

# Generates one lib/Lingua/StopWords/XX.pm per language from the Snowball
# stop.txt files.  Run it from the distribution root, because the output
# paths are relative to the current directory.

# tabs are evil
$Text::Wrap::unexpand = 0;

# snowdir should be the snowball_all directory
my $snowdir;
GetOptions( 'snowdir=s' => \$snowdir );
die "Usage: ./devel/gen_modules.plx --snowdir=SNOWDIR\n"
    unless defined $snowdir and -d $snowdir;

# Indentation of the wrapped word lists inside qw( ... ); purely cosmetic.
my $list_indent = ' ' x 8;

# Skeleton of a generated module; the #...# placeholders are filled in
# per language below.
my $template = <<'END_MODULE';
package Lingua::StopWords::#ISO#;

use strict;
use warnings;

use Exporter;
our @ISA = qw(Exporter);
our %EXPORT_TAGS = ( 'all' => [ qw( getStopWords ) ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our $VERSION = #VERSION#;

sub getStopWords {
    if ( @_ and $_[0] eq 'UTF-8' ) {
        # adding U0 causes the result to be flagged as UTF-8
        my %stoplist = map { ( pack("U0a*", $_), 1 ) } qw(
#UTF8#
        );
        return \%stoplist;
    }
    else {
        my %stoplist = map { ( $_, 1 ) } qw(
#PLAIN#
        );
        return \%stoplist;
    }
}

1;
END_MODULE

# ISO code (upper case, used as the module name) => Snowball directory name.
my %languages = (
    DA => "danish",
    NL => "dutch",
    EN => "english",
    FI => "finnish",
    FR => "french",
    DE => "german",
    HU => "hungarian",
    IT => "italian",
    NO => "norwegian",
    PT => "portuguese",
    RU => "russian",
    ES => "spanish",
    SV => "swedish",
);

# Read a Snowball stop.txt file and return its words in file order.
# Everything from a '|' to the end of the line is a comment.
sub read_stop_words {
    my ($file) = @_;

    open( my $stop_fh, "<", $file )
        or die "Couldn't open file '$file': $!";

    my @words;
    while ( my $line = <$stop_fh> ) {
        $line =~ s/\|.*//s;

        # split ' ' skips leading whitespace and returns nothing for a
        # blank line, so no further trimming is needed.
        push @words, split( ' ', $line );
    }

    close($stop_fh) or die "Couldn't close file '$file': $!";
    return @words;
}

# Sorted so that modules are generated in the same order on every run.
for my $iso ( sort keys %languages ) {
    my $lang = $languages{$iso};
    my $file = "$snowdir/algorithms/$lang/stop.txt";
    print STDERR "Generating '$lang' stopword list module\n";

    # extract stoplists from snowball source files; parse
    my @words = read_stop_words($file);

    # Lay out the list in the source encoding first ...
    my $plain = wrap( $list_indent, $list_indent, @words );

    # ... then translate to UTF-8.  The hash is keyed by ISO code and
    # $lang holds the full language name, so the Russian (KOI8-R) list
    # has to be recognised by its ISO code.
    my $source_enc = $iso eq 'RU' ? 'koi8-r' : 'iso-8859-1';
    from_to( $_, $source_enc, 'UTF-8' ) for @words;
    my $utf8 = wrap( $list_indent, $list_indent, @words );

    # sub in the lists
    my $mod = $template;
    $mod =~ s/#VERSION#/$Lingua::StopWords::VERSION/g;
    $mod =~ s/#ISO#/$iso/g;
    $mod =~ s/#PLAIN#/$plain/g;
    $mod =~ s/#UTF8#/$utf8/g;

    # blast it out
    my $out_path = "lib/Lingua/StopWords/$iso.pm";
    open( my $out_fh, ">", $out_path )
        or die "Couldn't open file '$out_path' for writing: $!";
    print {$out_fh} $mod
        or die "Couldn't write to file '$out_path': $!";
    close($out_fh)
        or die "Couldn't close file '$out_path': $!";
}