Lingua-PT-Stemmer-0.01/0040755000076400007640000000000007615425127013343 5ustar xernxernLingua-PT-Stemmer-0.01/MANIFEST0100644000076400007640000000013607615424142014465 0ustar xernxernChanges MANIFEST Makefile.PL README test.pl lib/Lingua/PT/Stemmer.pm lib/Lingua/GL/Stemmer.pm Lingua-PT-Stemmer-0.01/lib/0040755000076400007640000000000007615425127014111 5ustar xernxernLingua-PT-Stemmer-0.01/lib/Lingua/0040755000076400007640000000000007615425127015330 5ustar xernxernLingua-PT-Stemmer-0.01/lib/Lingua/GL/0040755000076400007640000000000007615425127015632 5ustar xernxernLingua-PT-Stemmer-0.01/lib/Lingua/GL/Stemmer.pm0100644000076400007640000003165307615423770017613 0ustar xernxernpackage Lingua::GL::Stemmer; use 5.006; use strict; use warnings; our $VERSION = '0.01'; my $aa = "\xe1"; my $ea = "\xe9"; my $ia = "\xed"; my $oa = "\xf3"; my $ua = "\xfa"; my $at = "\xe3"; my $ot = "\xf5"; my $nt = "\xf1"; my $ac = "\xe2"; my $ec = "\xea"; my $cc = "\xe7"; my %rule; $rule{plural} = { "ns" => [ 1, "n" ], "${ot}es" => [ 3, "${ot}n" ], "${at}es" => [ 1, "${at}o" ], "ais" => [ 1, "al" ], "${ea}is" => [ 2, "el" ], "eis" => [ 2, "el" ], "${oa}is" => [ 2, "ol" ], "ois" => [ 2, "ol" ], "${ia}s" => [ 2, "il" ], "les" => [ 2, "l" ], "res" => [ 3, "r" ], "s" => [ 2, "" ], }; $rule{femin} = { "ona" => [ 3, "${oa}n" ], "oa" => [ 3, "${oa}n" ], "ora" => [ 3, "or" ], "na" => [ 4, "no" ], "inha" => [ 3, "inho" ], "i${nt}a" => [ 3, "i${nt}o" ], "esa" => [ 3, "${ea}s" ], "osa" => [ 3, "oso" ], "${ia}aca" => [ 3, "${ia}aco" ], "ica" => [ 3, "ico" ], "ada" => [ 3, "ado" ], "ida" => [ 3, "ido" ], "${ia}da" => [ 3, "ido" ], "ana" => [ 2, "${aa}n" ], "${aa}ria" => [ 3, "${aa}rio" ], "ima" => [ 3, "imo" ], "iva" => [ 3, "ivo" ], "eira" => [ 3, "eiro" ], "${at}" => [ 2, "${at}o" ], "${aa}" => [ 2, "${at}n" ], }; $rule{augment} = { "d${ia}ssimo" => [ 5, '' ], "d${ia}simo" => [ 5, '' ], "abil${ia}ssimo" => [ 5,'' ], "abil${ia}simo" => [ 5,'' ], "${ia}ssimo" => [ 3,'' ], "${ia}simo" => [ 3,'' ], 
"${ea}simo" => [ 3,'' ], "${ea}sima" => [ 3,'' ], "${ea}rrimo" => [ 4,'' ], "${ea}rrima" => [ 4,'' ], "zinho" => [ 2,'' ], "ci${nt}o" => [ 2,'' ], "a${cc}o" => [ 4, '' ], "a${cc}a" => [ 4, '' ], "azo" => [ 4, '' ], "aza" => [ 4, '' ], "ad${at}o" => [ 4, '' ], "acho" => [ 2, '' ], "acha" => [ 2, '' ], "adinho" => [ 3, '' ], "adi${nt}o" => [ 3, '' ], "alh${aa}m" => [ 4, '' ], "alh${at}o" => [ 4, '' ], "all${aa}n" => [ 4, '' ], "allo" => [ 4, '' ], "alla" => [ 4, '' ], "z${at}o" => [ 2,'' ], "z${oa}n" => [ 2,'' ], "zom" => [ 2,'' ], "${aa}n" => [ 4, '' ], "${oa}n" => [ 3, '' ], "${at}o" => [ 3, '' ], "arra" => [ 3,'' ], "astro" => [ 3,'' ], "${aa}zio" => [ 3,'' ], "echo" => [ 3,'' ], "echa" => [ 3,'' ], "edela" => [ 3,'' ], "ela" => [ 4,'' ], "elo" => [ 4,'' ], "eta" => [ 3,'' ], "ete" => [ 3,'' ], "ica" => [ 3,'' ], "id${at}o" => [ 3,'' ], "quinho" => [ 4, "c" ], "qui${nt}o" => [ 4, "c" ], "uinho" => [ 4,'' ], "ui${nt}o" => [ 4,'' ], "inho" => [ 3,'' ], "i${nt}o" => [ 3,'' ], "ito" => [ 3, '' ], "ocho" => [ 4, '' ], "ocha" => [ 4, '' ], "oide" => [ 3, '' ], "ola" => [ 3, '' ], "olo" => [ 3, '' ], "ote" => [ 3, '' ], "ota" => [ 3, '' ], "u${cc}a" => [ 4,'' ], "ucha" => [ 3,'' ], "ucho" => [ 3,'' ], "uco" => [ 4,'' ], "uza" => [ 4,'' ], "uxa" => [ 3,'' ], }; $rule{noun} = { "abilidade" => [ 5, "" ], "${aa}bel" => [ 2, "" ], "able" => [ 2, "" ], "aci" => [ 3, "" ], "a${cc}" => [ 3, "" ], "adeiro" => [ 3, "" ], "ador" => [ 3, "" ], "ado" => [ 2, "" ], "agem" => [ 3, "" ], "age" => [ 3, "" ], "alismo" => [ 4, "" ], "al${ia}stico" => [ 3, "" ], "alista" => [ 5, "" ], "alizado" => [ 4, "" ], "alizaci" => [ 5, "" ], "aliza${cc}" => [ 5, "" ], "alizaz" => [ 5, "" ], "al" => [ 4, "" ], "ancia" => [ 4, "" ], "${aa}ncia" => [ 4, "" ], "${ac}ncia" => [ 4, "" ], "ano" => [ 4, "" ], "ante" => [ 2, "" ], "ario" => [ 3, "" ], "${aa}rio" => [ 3, "" ], "${aa}stico" => [ 4, "" ], "ativo" => [ 4, "" ], "atizado" => [ 4, "" ], "atizaci" => [ 4, "" ], "atiza${cc}" => [ 4, "" ], "atizaz" => 
[ 4, "" ], "atoria" => [ 5, "" ], "at${oa}ria" => [ 5, "" ], "atorio" => [ 3, "" ], "at${oa}rio" => [ 3, "" ], "${aa}utico" => [ 4, "" ], "ico" => [ 4, "" ], "auta" => [ 5, "" ], "${aa}vel" => [ 2, "" ], "axe" => [ 3, "" ], "az" => [ 3, "" ], "bel" => [ 5, "" ], "bil" => [ 0, "vel" ], "ble" => [ 5, "" ], "cionista" => [ 5, "" ], "edeiro" => [ 3, "" ], "eiro" => [ 3, "" ], "edouro" => [ 3, "" ], "edor" => [ 3, "" ], "dor" => [ 2, "" ], "encialista" => [ 4, "" ], "encial" => [ 5, "" ], "${ec}ncia" => [ 3, "" ], "encia" => [ 3, "" ], "${ea}ncia" => [ 3, "" ], "ense" => [ 3, "" ], "ente" => [ 4, "" ], "erio" => [ 6, "" ], "${ea}rio" => [ 6, "" ], "esco" => [ 4, "" ], "${ec}utico" => [ 4, "" ], "${ea}utico" => [ 4, "" ], "eza" => [ 3, "" ], "ez" => [ 4, "" ], "${ia}aco" => [ 3, "" ], "ial" => [ 3, "" ], "iamento" => [ 4, "" ], "amento" => [ 3, "" ], "imento" => [ 3, "" ], "emento" => [ 3, "" ], "mento" => [ 6, "" ], "${ia}bel" => [ 5, "" ], "ible" => [ 5, "" ], "icionista" => [ 4, "" ], "iza${cc}" => [ 5, "" ], "izaci" => [ 5, "" ], "izaz" => [ 5, "" ], "ice" => [ 4, "" ], "ici" => [ 3, "" ], "i${cc}" => [ 3, "" ], "iz" => [ 3, "" ], "idade" => [ 4, "" ], "ideiro" => [ 3, "" ], "ideira" => [ 3, "" ], "ido" => [ 3, "" ], "idor" => [ 4, "" ], "inal" => [ 3, "" ], "ional" => [ 4, "" ], "ionar" => [ 5, "" ], "ionista" => [ 5, "" ], "ismo" => [ 3, "" ], "ista" => [ 3, "" ], "${ia}vel" => [ 5, "" ], "ividade" => [ 5, "" ], "ivo" => [ 4, "" ], "izado" => [ 5, "" ], "or" => [ 3, "" ], "oria" => [ 3, "" ], "or${ia}a" => [ 4, "" ], "oso" => [ 3, "" ], "queiro" => [ 3, "c" ], "quice" => [ 4, "c" ], "rio" => [ 5, "" ], "sor" => [ 2, "" ], "tico" => [ 3, "" ], "tivo" => [ 4, "" ], "tizado" => [ 4, "" ], "tiza${cc}" => [ 5, "" ], "tizaci" => [ 5, "" ], "tizaz" => [ 5, "" ], "tor" => [ 5, "" ], "ual" => [ 3, "" ], "uoso" => [ 3, "" ], "ura" => [ 4, "" ], "vel" => [ 5, "" ], }; $rule{verb} = { "aba" => [ 2, "" ], "abade" => [ 2, "" ], "${aa}bade" => [ 2, "" ], "abamo" => [ 2, "" ], 
"${aa}bamo" => [ 2, "" ], "aban" => [ 2, "" ], "ache" => [ 2, "" ], "ade" => [ 2, "" ], "ai" => [ 2, "" ], "am" => [ 2, "" ], "amo" => [ 2, "" ], "an" => [ 2, "" ], "ando" => [ 2, "" ], "ar" => [ 2, "" ], "ara" => [ 2, "" ], "ar${aa}" => [ 2, "" ], "arade" => [ 2, "" ], "${aa}rade" => [ 2, "" ], "aram" => [ 2, "" ], "ar${aa}m" => [ 2, "" ], "aramo" => [ 2, "" ], "${aa}ramo" => [ 2, "" ], "ar${aa}n" => [ 2, "" ], "ar${at}o" => [ 2, "" ], "arde" => [ 2, "" ], "are" => [ 2, "" ], "arei" => [ 2, "" ], "${aa}rei" => [ 2, "" ], "arem" => [ 2, "" ], "aremo" => [ 2, "" ], "aria" => [ 2, "" ], "ar${ia}a" => [ 2, "" ], "ariade" => [ 2, "" ], "ar${ia}ade" => [ 2, "" ], "ariam" => [ 2, "" ], "ariamo" => [ 2, "" ], "ar${ia}amo" => [ 2, "" ], "ar${ia}ei" => [ 2, "" ], "armo" => [ 2, "" ], "${aa}rom" => [ 2, "" ], "aron" => [ 2, "" ], "ase" => [ 2, "" ], "asede" => [ 2, "" ], "${aa}sede" => [ 2, "" ], "asemo" => [ 2, "" ], "${aa}semo" => [ 2, "" ], "asen" => [ 2, "" ], "asse" => [ 2, "" ], "${aa}ssei" => [ 2, "" ], "assem" => [ 2, "" ], "${aa}ssemo" => [ 2, "" ], "aste" => [ 2, "" ], "ava" => [ 2, "" ], "avam" => [ 2, "" ], "${aa}vamo" => [ 2, "" ], "avan" => [ 2, "" ], "${aa}vei" => [ 2, "" ], "ear" => [ 4, "" ], "ede" => [ 1, "" ], "ei" => [ 3, "" ], "em" => [ 2, "" ], "emo" => [ 2, "" ], "en" => [ 2, "" ], "endo" => [ 1, "" ], "eou" => [ 5, "" ], "er" => [ 1, "" ], "era" => [ 1, "" ], "er${aa}" => [ 1, "" ], "erade" => [ 1, "" ], "${ea}rade" => [ 1, "" ], "eram" => [ 1, "" ], "er${aa}m" => [ 1, "" ], "eramo" => [ 1, "" ], "${ea}ramo" => [ 1, "" ], "${ec}ramo" => [ 1, "" ], "er${aa}n" => [ 1, "" ], "er${at}o" => [ 1, "" ], "erde" => [ 1, "" ], "ere" => [ 1, "" ], "erei" => [ 1, "" ], "${ec}rei" => [ 1, "" ], "erem" => [ 1, "" ], "eremo" => [ 1, "" ], "eria" => [ 1, "" ], "er${ia}a" => [ 1, "" ], "eriade" => [ 1, "" ], "er${ia}ade" => [ 1, "" ], "eriam" => [ 1, "" ], "eriamo" => [ 1, "" ], "er${ia}amo" => [ 1, "" ], "erian" => [ 1, "" ], "er${ia}an" => [ 1, "" ], "er${ia}ei" => 
[ 1, "" ], "ermo" => [ 1, "" ], "${ec}rom" => [ 1, "" ], "eron" => [ 1, "" ], "ese" => [ 1, "" ], "esedes" => [ 1, "" ], "${ea}sedes" => [ 1, "" ], "esemo" => [ 1, "" ], "${ea}semo" => [ 1, "" ], "esen" => [ 1, "" ], "esse" => [ 1, "" ], "${ec}ssede" => [ 1, "" ], "${ec}ssei" => [ 1, "" ], "essem" => [ 1, "" ], "${ec}ssemo" => [ 1, "" ], "este" => [ 1, "" ], "eu" => [ 1, "" ], "guem" => [ 1, "g" ], "i" => [ 1, "" ], "ia" => [ 1, "" ], "${ia}a" => [ 1, "" ], "iade" => [ 1, "" ], "${ia}ade" => [ 1, "" ], "iam" => [ 1, "" ], "iamo" => [ 1, "" ], "${ia}amo" => [ 1, "" ], "ian" => [ 1, "" ], "${ia}an" => [ 1, "" ], "iava" => [ 1, "" ], "iche" => [ 1, "" ], "ide" => [ 1, "" ], "${ia}do" => [ 3, "" ], "${ia}ei" => [ 1, "" ], "im" => [ 1, "" ], "imo" => [ 3, "" ], "imo" => [ 3, "" ], "in" => [ 3, "" ], "indo" => [ 3, "" ], "iona" => [ 3, "" ], "ir" => [ 3, "" ], "ira" => [ 3, "" ], "ir${aa}" => [ 3, "" ], "irade" => [ 3, "" ], "${ia}rade" => [ 3, "" ], "iram" => [ 3, "" ], "ir${aa}m" => [ 3, "" ], "${ia}ram" => [ 3, "" ], "iramo" => [ 3, "" ], "${ia}ramo" => [ 3, "" ], "ir${aa}n" => [ 3, "" ], "ir${at}o" => [ 2, "" ], "irde" => [ 2, "" ], "ire" => [ 3, "" ], "irei" => [ 3, "" ], "irem" => [ 3, "" ], "iremo" => [ 3, "" ], "iria" => [ 3, "" ], "ir${ia}a" => [ 3, "" ], "iriade" => [ 3, "" ], "ir${ia}ade" => [ 3, "" ], "iriam" => [ 3, "" ], "iriamo" => [ 3, "" ], "ir${ia}amo" => [ 3, "" ], "irian" => [ 3, "" ], "ir${ia}an" => [ 3, "" ], "ir${ia}ei" => [ 3, "" ], "irmo" => [ 3, "" ], "${ia}rom" => [ 3, "" ], "iron" => [ 3, "" ], "ise" => [ 3, "" ], "isede" => [ 3, "" ], "${ia}sede" => [ 3, "" ], "isemo" => [ 3, "" ], "${ia}semo" => [ 3, "" ], "isen" => [ 3, "" ], "isse" => [ 3, "" ], "${ia}ssede" => [ 3, "" ], "${ia}ssei" => [ 3, "" ], "issem" => [ 3, "" ], "${ia}ssemo" => [ 3, "" ], "iste" => [ 4, "" ], "itar" => [ 5, "" ], "iu" => [ 3, "" ], "izar" => [ 3, "" ], "omo" => [ 3, "" ], "ondo" => [ 3, "" ], "ou" => [ 3, "" ], "tizar" => [ 4, "" ], "uei" => [ 3, "" ], "u${ia}a" => 
[ 5, "u" ],
};

# Accented letter => plain letter, used by strip('accent', ...).
$rule{accent} = {
    $aa => 'a',
    $ea => 'e',
    $ia => 'i',
    $oa => 'o',
    $ua => 'u',
    $at => 'a',
    $ot => 'o',
    $ac => 'a',    # a-circumflex appears in the noun rules but was never unaccented
    $ec => 'e',
    $cc => 'c',
    $nt => 'n',
};

# Final-vowel rules: suffix => [ minimum stem length, replacement ].
$rule{vowel} = {
    "bil" => [ 2, "vel" ],
    "gue" => [ 2, "g" ],
    "a"   => [ 3, "" ],
    "e"   => [ 3, "" ],
    "o"   => [ 3, "" ],
};

# strip($cmd, $word)
#
# Apply a single reduction step to $word and return the reduced word.
#
#   $cmd   name of the step: 'accent', 'adv', or any key of %rule
#          ('plural', 'femin', 'augment', 'noun', 'verb', 'vowel').
#   $word  the word to reduce; the caller's variable is not modified.
#
# The ($$) prototype is kept only so existing call sites parse as before.
sub strip($$) {
    my ($cmd, $word) = @_;

    if ($cmd eq 'accent') {
        # Replace every accented letter listed in the accent table.
        my $plain_for = $rule{accent};
        for my $accented (keys %{$plain_for}) {
            $word =~ s/$accented/$plain_for->{$accented}/g;
        }
        return $word;
    }

    if ($cmd eq 'adv') {
        # "-mente" is removed only as a suffix and only when at least four
        # characters remain.  The pattern used to be unanchored, which cut
        # "mente" out of the middle of words such as "experimentemo".
        $word =~ s/^(.{4,})mente$/$1/;
        return $word;
    }

    # Table-driven steps: each entry maps a suffix to
    # [ minimum stem length, replacement ].  Longer suffixes are tried first
    # and the first suffix that matches ends the step.
    my $rules = $rule{$cmd} || {};
    for my $suffix (sort { length $b <=> length $a } keys %{$rules}) {
        my ($min_stem, $replacement) = @{ $rules->{$suffix} };
        my $pattern = '^(.{' . $min_stem . ',})' . $suffix . '$';
        last if $word =~ s/$pattern/$1$replacement/;
    }
    return $word;
}

# stem(@words) or stem(\@words)
#
# Stem every word and return the stems in the same order: a list in list
# context, an array reference otherwise.  The input words are not modified.
sub stem {
    my @words = ref($_[0]) ? @{ $_[0] } : @_;
    my @stems;

    for my $word (@words) {
        # Plural and feminine reduction only apply to words with the
        # matching final letter.
        $word = strip('plural', $word) if $word =~ /s$/;
        $word = strip('femin',  $word) if $word =~ /a$/;

        for my $step (qw/augment adv noun verb vowel accent/) {
            $word = strip($step, $word);
        }
        push @stems, $word;
    }

    return wantarray ? @stems : \@stems;
}

1;
__END__
# Below is stub documentation for your module. You better edit it!

=head1 NAME

Lingua::GL::Stemmer - Galician Stemmer

=head1 SYNOPSIS

  use Lingua::GL::Stemmer;
  Lingua::GL::Stemmer::stem(\@words);
  # or
  Lingua::GL::Stemmer::stem(@words);

=head1 DESCRIPTION

Galician is an endangered language spoken in the northwest region of Spain.
Galician is morphologically similar to Portuguese, but its phonetics differ
greatly. Due to the morphological similarity between Portuguese and Galician,
the Portuguese stemming algorithm can be adapted to stem Galician texts.

=for comment NOTE(review): the targets of the L<...> links below were lost; restore them from the upstream distribution.

See L for a sketch of the stemming algorithm, and L for stemming rules.

=head1 SEE ALSO

L

Stemming rules L

=head1 COPYRIGHT

xern E<lt>xern@cpan.orgE<gt>

This module is free software; you can redistribute it or modify it under the same terms as Perl itself.

=cut Lingua-PT-Stemmer-0.01/lib/Lingua/PT/0040755000076400007640000000000007615425127015653 5ustar xernxernLingua-PT-Stemmer-0.01/lib/Lingua/PT/Stemmer.pm0100644000076400007640000001704107615423770017627 0ustar xernxernpackage Lingua::PT::Stemmer; use 5.006; use strict; use warnings; our $VERSION = '0.01'; my $aa = "\xe1"; my $ea = "\xe9"; my $ia = "\xed"; my $oa = "\xf3"; my $ua = "\xfa"; my $at = "\xe3"; my $ot = "\xf5"; my $ac = "\xe2"; my $ec = "\xea"; my $cc = "\xe7"; my %rule; $rule{plural} = { "ns" => [ 1, "m" ], "${ot}es" => [ 3, "${at}o" ], "${at}es" => [ 1, "${at}o" ], "ais" => [ 1, "al" ], "${ea}is" => [ 2, "el" ], "eis" => [ 2, "el" ], "${oa}is" => [ 2, "ol" ], "is" => [ 2, "il" ], "les" => [ 2, "l" ], "res" => [ 3, "r" ], "s" => [ 2, "" ], }; $rule{femin} = { "ona" => [ 3, "${at}o" ], "${at}" => [ 2, "${at}o" ], "ora" => [ 3, "or" ], "na" => [ 4, "no" ], "inha" => [ 3, "inho" ], "esa" => [ 3, "${ec}s" ], "osa" => [ 3, "oso" ], "${ia}aca" => [ 3, "${ia}aco" ], "ica" => [ 3, "ico" ], "ada" => [ 3, "ado" ], "ida" => [ 3, "ido" ], "${ia}da" => [ 3, "ido" ], "ima" => [ 3, "imo" ], "iva" => [ 3, "ivo" ], "eira" => [ 3, "eiro" ], }; $rule{augment} = { "d${ia}ssimo" => [ 5, '' ], "abil${ia}ssimo" => [ 5,'' ], "${ia}ssimo" => [ 3,'' ], "${ea}simo" => [ 3,'' ], "${ea}rrimo" => [ 4,'' ], "zinho" => [ 2,'' ], "quinho" => [ 4, "c" ], "uinho" => [ 4,'' ], "adinho" => [ 3,'' ], "inho" => [ 3,'' ], "alh${at}o" => [ 4,'' ], "u${cc}a" => [ 4,'' ], "a${cc}o" => [ 4,'' ], "ad${at}o" => [ 4,'' ], "${aa}zio" => [ 3,'' ], "arraz" => [ 4,'' ], "arra" => [ 3,'' ], "z${at}o" => [ 2,'' ], "${at}o" => [ 3,'' ], }; $rule{noun} = { "encialista" => [ 4, '' ], "alista" => [ 5, '' ], "agem" => [ 3, '' ], "iamento" => [ 4, '' ], "amento" => [ 3, '' ], "imento" => [ 3, '' ], "alizado" => [ 4, '' ], "atizado" => [ 4, '' ], "izado" => [ 5, '' ], "ativo" => [ 4, '' ], "tivo" => [ 4, '' ], "ivo" => [ 4, '' ], "ado" => [ 2, '' ], "ido" => [ 3, '' ], "ador" => [ 3,'' ], "edor" => [ 3, '' ], 
"idor" => [ 4, '' ], "at${oa}ria" => [ 5, '' ], "or" => [ 2, '' ], "abilidade" => [ 5,'' ], "icionista" => [ 4, '' ], "cionista" => [ 5, '' ], "ional" => [ 4, '' ], "${ec}ncia" => [ 3, '' ], "${ac}ncia" => [ 4, '' ], "edouro" => [ 3, '' ], "queiro" => [ 3, 'c' ], "eiro" => [ 3, '' ], "oso" => [ 3, '' ], "aliza${cc}" => [ 5, '' ], "ismo" => [ 3, '' ], "iza${cc}" => [ 5, '' ], "a${cc}" => [ 3, '' ], "i${cc}" => [ 3, '' ], "${aa}rio" => [ 3, '' ], "${ea}rio" => [ 6, '' ], "${ec}s" => [ 4, '' ], "eza" => [ 3, '' ], "ez" => [ 4, '' ], "esco" => [ 4, '' ], "ante" => [ 2, '' ], "${aa}stico" => [ 4, '' ], "${aa}tico" => [ 3, '' ], "ico" => [ 4, '' ], "ividade" => [ 5, '' ], "idade" => [ 5, '' ], "oria" => [ 4, '' ], "encial" => [ 5, '' ], "ista" => [ 4, '' ], "quice" => [ 4, 'c' ], "ice" => [ 4, '' ], "${ia}aco" => [ 3, '' ], "ente" => [ 4, '' ], "inal" => [ 3, '' ], "ano" => [ 4, '' ], "${aa}vel" => [ 2, '' ], "${ia}vel" => [ 5, '' ], "ura" => [ 4, '' ], "ual" => [ 3, '' ], "ial" => [ 3, '' ], "al" => [ 4, '' ], }; $rule{verb} = { "ar${ia}amo" => [ 2, ''], "eria" => [ 3, '' ], "${aa}ssemo" => [ 2, '' ], "ermo" => [ 3, '' ], "er${ia}amo" => [ 2, '' ], "esse" => [ 3, '' ], "${ec}ssemo" => [ 2, '' ], "este" => [ 3, '' ], "ir${ia}amo" => [ 3, '' ], "${ia}amo" => [ 3, '' ], "${ia}ssemo" => [ 3, '' ], "iram" => [ 3, '' ], "${aa}ramo" => [ 2, '' ], "${ia}ram" => [ 3, '' ], "${aa}rei" => [ 2, '' ], "irde" => [ 2, '' ], "aremo" => [ 2, '' ], "irei" => [ 3, '' ], "ariam" => [ 2, '' ], "irem" => [ 3, '' ], "ar${ia}ei" => [ 2, '' ], "iria" => [ 3, '' ], "${aa}ssei" => [ 2, '' ], "irmo" => [ 3, '' ], "assem" => [ 2, '' ], "isse" => [ 3, '' ], "${aa}vamo" => [ 2, '' ], "iste" => [ 4, '' ], "${ec}ramo" => [ 3, '' ], "amo" => [ 2, '' ], "eremo" => [ 3, '' ], "ara" => [ 2, '' ], "eriam" => [ 3, '' ], "ar${aa}" => [ 2, '' ], "er${ia}ei" => [ 3, '' ], "are" => [ 2, '' ], "${ec}ssei" => [ 3, '' ], "ava" => [ 2, '' ], "essem" => [ 3, '' ], "emo" => [ 2, '' ], "${ia}ramo" => [ 3, '' ], "era" 
=> [ 3, '' ],
    "iremo" => [ 3, '' ], "er${aa}" => [ 3, '' ],
    "iriam" => [ 3, '' ], "ere" => [ 3, '' ],
    "ir${ia}ei" => [ 3, '' ], "iam" => [ 3, '' ],
    "${ia}ssei" => [ 3, '' ], "${ia}ei" => [ 3, '' ],
    "issem" => [ 3, '' ], "imo" => [ 3, '' ],
    "ando" => [ 2, '' ], "ira" => [ 3, '' ],
    "endo" => [ 3, '' ], "ir${aa}" => [ 3, '' ],
    "indo" => [ 3, '' ], "ire" => [ 3, '' ],
    "ondo" => [ 3, '' ], "omo" => [ 3, '' ],
    "aram" => [ 2, '' ], "ai" => [ 2, '' ],
    "arde" => [ 2, '' ], "am" => [ 2, '' ],
    "arei" => [ 2, '' ], "ear" => [ 4, '' ],
    "arem" => [ 2, '' ], "ar" => [ 2, '' ],
    "aria" => [ 2, '' ], "uei" => [ 3, '' ],
    "armo" => [ 2, '' ], "ei" => [ 3, '' ],
    "asse" => [ 2, '' ], "em" => [ 2, '' ],
    "aste" => [ 2, '' ], "er" => [ 2, '' ],
    "avam" => [ 2, '' ], "eu" => [ 3, '' ],
    "${aa}vei" => [ 2, '' ], "ia" => [ 3, '' ],
    "eram" => [ 3, '' ], "ir" => [ 3, '' ],
    "erde" => [ 3, '' ], "iu" => [ 3, '' ],
    "erei" => [ 3, '' ], "ou" => [ 3, '' ],
    "${ec}rei" => [ 3, '' ], "i" => [ 3, '' ],
    "erem" => [ 3, '' ],
};

# Accented letter => plain letter, used by strip('accent', ...).
$rule{accent} = {
    $aa => 'a',
    $ea => 'e',
    $ia => 'i',
    $oa => 'o',
    $ua => 'u',
    $at => 'a',
    $ot => 'o',
    $ac => 'a',    # a-circumflex appears in the noun rules but was never unaccented
    $ec => 'e',
    $cc => 'c',
};

# strip($cmd, $word)
#
# Apply a single reduction step to $word and return the reduced word.
#
#   $cmd   name of the step: 'accent', 'adv', 'vowel', or any key of %rule
#          ('plural', 'femin', 'augment', 'noun', 'verb').
#   $word  the word to reduce; the caller's variable is not modified.
#
# The ($$) prototype is kept only so existing call sites parse as before.
sub strip($$) {
    my ($cmd, $word) = @_;

    if ($cmd eq 'accent') {
        # Replace every accented letter listed in the accent table.
        my $plain_for = $rule{accent};
        for my $accented (keys %{$plain_for}) {
            $word =~ s/$accented/$plain_for->{$accented}/g;
        }
        return $word;
    }

    if ($cmd eq 'adv') {
        # "-mente" is removed only as a suffix and only when at least four
        # characters remain.  The pattern used to be unanchored, which cut
        # "mente" out of the middle of words such as "experimentemo".
        $word =~ s/^(.{4,})mente$/$1/;
        return $word;
    }

    if ($cmd eq 'vowel') {
        # Remove ONE final a/e/o when at least three characters precede it.
        # The previous loop ran three successive substitutions and could
        # strip up to three trailing vowels (e.g. "...oa" lost both letters).
        $word =~ s/^(.{3,})[aeo]$/$1/;
        return $word;
    }

    # Table-driven steps: each entry maps a suffix to
    # [ minimum stem length, replacement ].  Longer suffixes are tried first
    # and the first suffix that matches ends the step.
    my $rules = $rule{$cmd} || {};
    for my $suffix (sort { length $b <=> length $a } keys %{$rules}) {
        my ($min_stem, $replacement) = @{ $rules->{$suffix} };
        my $pattern = '^(.{' . $min_stem . ',})' . $suffix . '$';
        last if $word =~ s/$pattern/$1$replacement/;
    }
    return $word;
}

# stem(@words) or stem(\@words)
#
# Stem every word and return the stems in the same order: a list in list
# context, an array reference otherwise.  The input words are not modified.
sub stem {
    my @words = ref($_[0]) ? @{ $_[0] } : @_;
    my @stems;

    for my $word (@words) {
        # Plural and feminine reduction only apply to words with the
        # matching final letter.
        $word = strip('plural', $word) if $word =~ /s$/;
        $word = strip('femin',  $word) if $word =~ /a$/;

        for my $step (qw/augment adv noun verb vowel accent/) {
            $word = strip($step, $word);
        }
        push @stems, $word;
    }

    return wantarray ? @stems : \@stems;
}

1;
__END__
# Below is stub documentation for your module. You better edit it!

=head1 NAME

Lingua::PT::Stemmer - Portuguese language stemming

=head1 SYNOPSIS

  use Lingua::PT::Stemmer;
  Lingua::PT::Stemmer::stem(\@words);
  # or
  Lingua::PT::Stemmer::stem(@words);

=head1 DESCRIPTION

This module implements a Portuguese stemming algorithm proposed in the paper
B<A Stemming Algorithm for the Portuguese Language> by B<V. M. Orengo> and
B<C. Huyck>.

The eight steps of the stemming algorithm are listed as follows:

=over 8

=item * Plural Reduction

=item * Feminine Reduction

=item * Adverb Reduction

=item * Augmentative/Diminutive Reduction

=item * Noun Suffix Reduction

=item * Verb Suffix Reduction

=item * Vowel Reduction

=item * Accents Removal

=back

=head1 SEE ALSO

=for comment NOTE(review): the target of the L<...> link below was lost; restore it from the upstream distribution.

L

=head1 COPYRIGHT

xern E<lt>xern@cpan.orgE<gt>

This module is free software; you can redistribute it or modify it under the same terms as Perl itself.

=cut
Lingua-PT-Stemmer-0.01/test.pl0100644000076400007640000000116107615423770014655 0ustar xernxern
use Test;
BEGIN { plan tests => 12 };
use Lingua::PT::Stemmer;
use Lingua::GL::Stemmer;

##########################################################################

@ptword = Lingua::PT::Stemmer::stem(qw(bons chilena pezinho existencialista beberiam));
@ptstem = qw(bom chilen pe exist beb);
ok(1);
ok($ptword[$_], $ptstem[$_]) for (0..$#ptword);

##########################################################################

@glword = Lingua::GL::Stemmer::stem(qw(bons chilena cazola preconceituoso chegou));
@glstem = qw(bon chilen caz preconceit cheg);
ok(1);
ok($glword[$_], $glstem[$_]) for (0..$#glword);
Lingua-PT-Stemmer-0.01/Changes0100644000076400007640000000025307615423770014635 0ustar xernxern
Revision history for Perl extension Lingua::PT::Stemmer.

0.01 Sun Jan 26 02:33:08 2003
    - original version; created by h2xs 1.21 with options
        -XA Lingua::PT::Stemmer

Lingua-PT-Stemmer-0.01/README0100644000076400007640000000050307615423770014220 0ustar xernxern
Lingua::PT::Stemmer version 0.01
================================

Stemmers for Portuguese and Galician

INSTALLATION

To install this module type the following:

   perl Makefile.PL
   make
   make test
   make install

COPYRIGHT AND LICENCE

Copyright (C) 2003 xern <xern@cpan.org>, released under the same terms as Perl itself

Lingua-PT-Stemmer-0.01/Makefile.PL0100644000076400007640000000076507615424222015315 0ustar xernxern
use ExtUtils::MakeMaker;

# See lib/ExtUtils/MakeMaker.pm for details of how to influence
# the contents of the Makefile that is written.
# Each comment sits on its own line so that it cannot swallow the
# WriteMakefile() call that follows it.
WriteMakefile(
    'NAME'         => 'Lingua::PT::Stemmer',
    'VERSION_FROM' => 'lib/Lingua/PT/Stemmer.pm',
    'PREREQ_PM'    => {},    # e.g., Module::Name => 1.1
    # ABSTRACT_FROM and AUTHOR are only passed on perl 5.005 and later.
    ($] >= 5.005 ?    ## Add these new keywords supported since 5.005
      (ABSTRACT_FROM => 'lib/Lingua/PT/Stemmer.pm',    # retrieve abstract from module
       AUTHOR        => 'xern <xern@cpan.org>') : ()),
);