Regexp-Trie-0.02/0000755000076500007650000000000010424052736015157 5ustar dankogaidankogai00000000000000Regexp-Trie-0.02/Changes0000644000076500007650000000052410424052614016446 0ustar dankogaidankogai00000000000000# Revision history for Perl extension Regexp::Trie. # # $Id: Changes,v 0.2 2006/04/27 05:24:23 dankogai Exp dankogai $ # $Revision: 0.2 $ $Date: 2006/04/27 05:24:23 $ - Regexp/Trie.pm + lib/Regexp/Trie.pm be nicer to Module::Build 0.01 Thu Apr 27 11:31:11 2006 - original version; created by h2xs 1.23 with options -XA Regexp::Trie Regexp-Trie-0.02/lib/0000755000076500007650000000000010424052736015725 5ustar dankogaidankogai00000000000000Regexp-Trie-0.02/lib/Regexp/0000755000076500007650000000000010424052736017157 5ustar dankogaidankogai00000000000000Regexp-Trie-0.02/lib/Regexp/Trie.pm0000644000076500007650000000512410424052711020413 0ustar dankogaidankogai00000000000000# # $Id: Trie.pm,v 0.2 2006/04/27 05:24:40 dankogai Exp dankogai $ # package Regexp::Trie; use 5.008001; use strict; use warnings; our $VERSION = sprintf "%d.%02d", q$Revision: 0.2 $ =~ /(\d+)/g; # use overload q("") => sub { shift->regexp }; sub new{ bless {} => shift } sub add{ my $self = shift; my $str = shift; my $ref = $self; for my $char (split //, $str){ $ref->{$char} ||= {}; $ref = $ref->{$char}; } $ref->{''} = 1; # { '' => 1 } as terminator $self; } sub _regexp{ my $self = shift; return if $self->{''} and scalar keys %$self == 1; # terminator my (@alt, @cc); my $q = 0; for my $char (sort keys %$self){ my $qchar = quotemeta $char; if (ref $self->{$char}){ if (defined (my $recurse = _regexp($self->{$char}))){ push @alt, $qchar . $recurse; }else{ push @cc, $qchar; } }else{ $q = 1; } } my $cconly = !@alt; @cc and push @alt, @cc == 1 ? $cc[0] : '['. join('', @cc). ']'; my $result = @alt == 1 ? $alt[0] : '(?:' . join('|', @alt) . ')'; $q and $result = $cconly ? "$result?" : "(?:$result)?"; return $result; } sub regexp{ my $str = shift->_regexp; qr/$str/ } 1; __END__ # Below is stub documentation for your module. You'd better edit it! =head1 NAME Regexp::Trie - builds trie-ized regexp =head1 SYNOPSIS use Regexp::Trie; my $rt = Regexp::Trie->new; for (qw/foobar fooxar foozap fooza/){ $rt->add($_); } print $rt->regexp, "\n" # (?-xism:foo(?:bar|xar|zap?)) =head1 DESCRIPTION This module is a faster but simpler version of L or L. It builds a trie-ized regexp as above. This module is faster than L but you can only add literals. C is treated as C, not "more than one a's followed by b". I wrote this module because I needed something faster than L and L. If you need more minute control, use those instead. =head1 TIPS See t/dict2rx.pl to find how to convert a big dictionary into a single regexp that can be later loaded as: my $rx = do 'dict.rx'; =head2 EXPORT None. =head1 SEE ALSO L, L, L =head1 AUTHOR Dan Kogai, Edankogai@dan.co.jpE =head1 COPYRIGHT AND LICENSE Copyright (C) 2006 by Dan Kogai This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.8 or, at your option, any later version of Perl 5 you may have available. =cut Regexp-Trie-0.02/Makefile.PL0000644000076500007650000000104410424052022017114 0ustar dankogaidankogai00000000000000use 5.008001; use ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( NAME => 'Regexp::Trie', VERSION_FROM => 'lib/Regexp/Trie.pm', # finds $VERSION PREREQ_PM => {}, # e.g., Module::Name => 1.1 ($] >= 5.005 ? ## Add these new keywords supported since 5.005 (ABSTRACT_FROM => 'lib/Regexp/Trie.pm', # retrieve abstract from module AUTHOR => 'Dan Kogai ') : ()), ); Regexp-Trie-0.02/MANIFEST0000644000076500007650000000025310424052736016310 0ustar dankogaidankogai00000000000000Changes MANIFEST Makefile.PL README lib/Regexp/Trie.pm t/00-load.t t/01-dict.t t/dict2rx.pl META.yml Module meta-data (added by MakeMaker) Regexp-Trie-0.02/META.yml0000644000076500007650000000046210424052736016432 0ustar dankogaidankogai00000000000000# http://module-build.sourceforge.net/META-spec.html #XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: Regexp-Trie version: 0.02 version_from: lib/Regexp/Trie.pm installdirs: site requires: distribution_type: module generated_by: ExtUtils::MakeMaker version 6.30 Regexp-Trie-0.02/README0000644000076500007650000000237410424040752016040 0ustar dankogaidankogai00000000000000NAME Regexp::Trie - builds trie-ized regexp SYNOPSIS use Regexp::Trie; my $rt = Regexp::Trie->new; for (qw/foobar fooxar foozap fooza/){ $rt->add($_); } print $rt->regexp, "\n" # (?-xism:foo(?:bar|xar|zap?)) DESCRIPTION This module is a faster but simpler version of Regexp::Assemble or Regexp::Optimizer. It builds a trie-ized regexp as above. This module is faster than Regexp::Assemble but you can only add literals. "a+b" is treated as "a\+b", not "more than one a's followed by b". I wrote this module because I needed something faster than Regexp::Assemble and Regexp::Optimizer. If you need more minute control, use those instead. TIPS See t/dict2rx.pl to find how to convert a big dictionary into a single regexp that can be later loaded as: my $rx = do 'dict.rx'; EXPORT None. SEE ALSO Regexp::Optimizer, Regexp::Assemble, Regex::PreSuf AUTHOR Dan Kogai, COPYRIGHT AND LICENSE Copyright (C) 2006 by Dan Kogai This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.8 or, at your option, any later version of Perl 5 you may have available. Regexp-Trie-0.02/t/0000755000076500007650000000000010424052736015422 5ustar dankogaidankogai00000000000000Regexp-Trie-0.02/t/00-load.t0000644000076500007650000000073110424026357016744 0ustar dankogaidankogai00000000000000# Before `make install' is performed this script should be runnable with # `make test'. After `make install' it should work as `perl Regexp-Trie.t' ######################### # change 'tests => 1' to 'tests => last_test_to_print'; use Test::More tests => 1; BEGIN { use_ok('Regexp::Trie') }; ######################### # Insert your test code below, the Test::More module is use()ed here so read # its man page ( perldoc Test::More ) for help writing this test script. Regexp-Trie-0.02/t/01-dict.t0000644000076500007650000000142510424052614016745 0ustar dankogaidankogai00000000000000# # $Id: 01-dict.t,v 0.1 2006/04/27 04:01:27 dankogai Exp $ # BEGIN { if (@ARGV) { my $dict = shift; symlink( $dict, "t/_dict" ) or die $!; system qw(perl t/dict2rx.pl), "t/_dict"; } if ( !-f 't/_dict.rx' ) { print qq(1..0 # Skip: _dict.rx not found.\n), qq(# "$0 /usr/share/dict/words" to prepare the test\n); exit 0; } } use strict; use warnings; use Test::More qw/no_plan/; use Regexp::Trie; use Time::HiRes qw/time/; $| = 1; my $time2load = time(); print "# loading t/_dict.rx ... "; my $rx = do 't/_dict.rx'; $time2load = time() - $time2load; print "done. took $time2load seconds.\n"; open my $dict, "<:raw", "t/dict" or die "$!"; while ( my $line = <$dict> ) { chomp $line; ok( $line =~ /^$rx$/, $line ); } Regexp-Trie-0.02/t/dict2rx.pl0000644000076500007650000000107510424052612017332 0ustar dankogaidankogai00000000000000#!/usr/bin/env perl # # $Id: dict2rx.pl,v 0.1 2006/04/27 04:01:27 dankogai Exp $ # use strict; use warnings; use Regexp::Trie; my $src = shift || die "$0 src [dst]"; my $dst = shift || "$src.rx"; my $trie = Regexp::Trie->new; my $count; $|=1; open my $in, "<:raw", $src or die "$src : $!"; while(<$in>){ chomp; $trie->add($_); ++$count % 1000 == 0 and print "$count\r"; } close $in; print "$count\n"; system ("ps v$$"); my $qr = $trie->regexp; open my $out, ">:raw", $dst or die "$dst : $!"; print $out 'qr{'.$qr.'}'; close $out; system ("ps v$$"); __END__