DBIx-FullTextSearch-0.73/0040755000076500007650000000000007630504720015363 5ustar tjmathertjmatherDBIx-FullTextSearch-0.73/lib/0040755000076500007650000000000007630504720016131 5ustar tjmathertjmatherDBIx-FullTextSearch-0.73/lib/DBIx/0040755000076500007650000000000007630504720016717 5ustar tjmathertjmatherDBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/0040755000076500007650000000000007630504720021614 5ustar tjmathertjmatherDBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/Phrase.pm0100644000076500007650000001147407630447023023402 0ustar tjmathertjmatherpackage DBIx::FullTextSearch::Phrase; use strict; use DBIx::FullTextSearch::Column; use vars qw! @ISA !; @ISA = qw! DBIx::FullTextSearch::Column !; # Open in the backend just sets the object sub open { my ($class, $fts) = @_; return bless { 'fts' => $fts }, $class; } sub DESTROY { my ($self) = @_; if (defined $self->{'select_wordid_sth'}) { $self->{'select_wordid_sth'}->finish(); } } # Create creates the table(s) according to the parameters sub _create_tables { my ($class, $fts) = @_; my $COUNT_FIELD = ''; my $CREATE_DATA = <{'data_table'} ( word_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null, doc_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'doc_id_bits'}} unsigned not null, idx longblob default '' not null, index (word_id), index (doc_id) ) EOF $fts->{'word_id_table'} = $fts->{'table'}.'_words' unless defined $fts->{'word_id_table'}; my $CREATE_WORD_ID = <{'word_id_table'} ( word varchar($fts->{'word_length'}) binary default '' not null, id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null auto_increment, primary key (id), unique (word) ) EOF my $dbh = $fts->{'dbh'}; $dbh->do($CREATE_DATA) or return $dbh->errstr; push @{$fts->{'created_tables'}}, $fts->{'data_table'}; $dbh->do($CREATE_WORD_ID) or return $dbh->errstr; push @{$fts->{'created_tables'}}, $fts->{'word_id_table'}; return; } sub add_document { my ($self, $id, $words) = @_; # here the value in the %$words hash is an array of word # positions my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $word_id_table = $fts->{'word_id_table'}; if (not defined $self->{'select_wordid_sth'}) { $self->{'select_wordid_sth'} = $dbh->prepare(" select id from $word_id_table where word = ? "); } my $data_table = $fts->{'data_table'}; my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'position_bits'}}; my $num_words = 0; my (@wids,@data,@widshandler,@datahandler); my $wordid; $dbh->do("lock tables $word_id_table write"); my ($maxid) = $dbh->selectrow_array("select max(id) from $word_id_table"); foreach my $word (keys %$words) { if(!defined $self->{'wordids'}->{$word}) { $self->{'select_wordid_sth'}->execute($word); ($wordid) = $self->{'select_wordid_sth'}->fetchrow_array(); unless ($wordid) { $maxid++; push @widshandler, "(?,$maxid)"; push @wids, $word; $wordid = $maxid; } $self->{'wordids'}->{$word} = $wordid; } else { $wordid=$self->{'wordids'}->{$word}; } push @datahandler, "($wordid,$id,?)"; push @data, pack $packstring.'*', @{$words->{$word}}; $num_words++; }; $dbh->do("insert into $word_id_table values " . join (',',@widshandler),undef,@wids) if @wids; $dbh->do("unlock tables"); $dbh->do("insert into $data_table values " . join (',',@datahandler),undef,@data) if @data; return $num_words; } sub update_document { my ($self, $id, $words) = @_; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; $dbh->do("delete from $data_table where doc_id = ?", {}, $id); $self->add_document($id, $words); } sub contains_hashref { my $self = shift; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; my $word_id_table = $fts->{'word_id_table'}; my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'position_bits'}}; my $SQL = qq{ select doc_id, idx from $data_table, $word_id_table where word like ? and id = word_id }; my $out = {}; for my $phrase (@_){ my @words = split(' ', $phrase); my @sths; for (my $i = 0; $i < @words; $i++) { $sths[$i] = $dbh->prepare($SQL); $sths[$i]->execute($words[$i]); } my %prev_pos = (); my %cur_pos = (); # iterate through words in phrase for (my $i = 0; $i < @words; $i++){ if($i > 0){ %prev_pos = %cur_pos; %cur_pos = (); } # get docs that have this word while (my ($doc, $data) = $sths[$i]->fetchrow_array){ # get positions of words in doc my @positions = unpack $packstring.'*', $data; map { $cur_pos{$doc}->{$_} = 1 } @positions; } if($i > 0){ # check to see if word $i comes after word $i-1 for my $doc (keys %cur_pos){ my $isPhrase = 0; for my $position (keys %{$cur_pos{$doc}}){ if ($position > 0 && exists $prev_pos{$doc}{$position - 1}){ $isPhrase = 1; } else { delete $cur_pos{$doc}{$position}; } } delete $cur_pos{$doc} unless $isPhrase; } } } for my $doc (keys %cur_pos){ my @positions = keys %{$cur_pos{$doc}}; $out->{$doc} += scalar (@positions); } } return $out; } *parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_list; 1; DBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/StopList.pm0100644000076500007650000003362107604065676023752 0ustar tjmathertjmatherpackage DBIx::FullTextSearch::StopList; use strict; use Carp; sub create_default { my ($class, $dbh, $TABLE, $language) = @_; croak("Error: no language specified") unless $language; $language = lc $language; my @stopList; if($language eq 'english'){ @stopList = qw/ a about after all also an and any are as at be because been but by can co corp could for from had has have he her his if in inc into is it its last more most mr mrs ms mz no not of on one only or other out over s says she so some such than that the their there they this to up was we were when which who will with would /; } elsif ($language eq 'czech'){ @stopList = qw/ a aby ale ani a¾ bude by byl byla bylo být co èi dal¹í do i jak jako je jeho jejich jen je¹tì ji¾ jsem jsme jsou k kde kdy¾ korun která které který kteøí let mezi má mù¾e na nebo není ne¾ o od pak po podle pouze pro proti první pøed pøi roce roku øekl s se si své tak také tedy to tom tím u u¾ v ve v¹ak z za ze ¾e/; } elsif ($language eq 'danish'){ @stopList = qw/ af aldrig alle altid bagved De de der du efter eller en endnu et få fjernt for foran fra gennem god han her hos hovfor hun hurtig hvad hvem hvonår hvor hvordan hvorhen I i imod ja jeg langsom lidt mange måske med meget mellem mere mindre når nede nej nok nu og oppe på rask sammen temmelig til uden udenfor under ved vi /; } elsif ($language eq 'dutch'){ @stopList = qw/ aan aangaande aangezien achter achterna afgelopen al aldaar aldus alhoewel alias alle allebei alleen alsnog altijd altoos ander andere anders anderszins behalve behoudens beide beiden ben beneden bent bepaald betreffende bij binnen binnenin boven bovenal bovendien bovengenoemd bovenstaand bovenvermeld buiten daar daarheen daarin daarna daarnet daarom daarop daarvanlangs dan dat de die dikwijls dit door doorgaand dus echter eer eerdat eerder eerlang eerst elk elke en enig enigszins enkel er erdoor even eveneens evenwel gauw gedurende geen gehad gekund geleden gelijk gemoeten gemogen geweest gewoon gewoonweg haar had hadden hare heb hebben hebt heeft hem hen het hierbeneden hierboven hij hoe hoewel hun hunne ik ikzelf in inmiddels inzake is jezelf jij jijzelf jou jouw jouwe juist jullie kan klaar kon konden krachtens kunnen kunt later liever maar mag meer met mezelf mij mijn mijnent mijner mijzelf misschien mocht mochten moest moesten moet moeten mogen na naar nadat net niet noch nog nogal nu of ofschoon om omdat omhoog omlaag omstreeks omtrent omver onder ondertussen ongeveer ons onszelf onze ook op opnieuw opzij over overeind overigens pas precies reeds rond rondom sedert sinds sindsdien slechts sommige spoedig steeds tamelijk tenzij terwijl thans tijdens toch toen toenmaals toenmalig tot totdat tussen uit uitgezonderd vaak van vandaan vanuit vanwege veeleer verder vervolgens vol volgens voor vooraf vooral vooralsnog voorbij voordat voordezen voordien voorheen voorop vooruit vrij vroeg waar waarom wanneer want waren was wat weer weg wegens wel weldra welk welke wie wiens wier wij wijzelf zal ze zelfs zichzelf zij zijn zijne zo zodra zonder zou zouden zowat zulke zullen zult /; } elsif ($language eq 'finnish'){ @stopList = qw/ aina alla ansiosta ehkä ei enemmän ennen etessa haikki hän he hitaasti hoikein hyvin ilman ja jälkeen jos kanssa kaukana kenties keskellä kesken koskaan kuinkan kukka kyllä kylliksi lähellä läpi liian lla lla luona me mikä miksi milloin milloinkan minä missä miten nopeasti nyt oikea oikealla paljon siellä sinä ssa sta suoraan tai takana takia tarpeeksi tässä te ulkopuolella vähän vahemmän vasen vasenmalla vastan vielä vieressä yhdessä ylös /; } elsif ($language eq 'french'){ @stopList = qw/ a à afin ailleurs ainsi alors après attendant au aucun aucune au-dessous au-dessus auprès auquel aussi aussitôt autant autour aux auxquelles auxquels avec beaucoup ça ce ceci cela celle celles celui cependant certain certaine certaines certains ces cet cette ceux chacun chacune chaque chez combien comme comment concernant dans de dedans dehors déjà delà depuis des dès desquelles desquels dessus donc donné dont du duquel durant elle elles en encore entre et étaient était étant etc eux furent grâce hormis hors ici il ils jadis je jusqu jusque la là laquelle le lequel les lesquelles lesquels leur leurs lors lorsque lui ma mais malgré me même mêmes mes mien mienne miennes miens moins moment mon moyennant ne ni non nos notamment notre nôtre notres nôtres nous nulle nulles on ou où par parce parmi plus plusieurs pour pourquoi près puis puisque quand quant que quel quelle quelque quelques-unes quelques-uns quelqu''un quelqu''une quels qui quiconque quoi quoique sa sans sauf se selon ses sien sienne siennes siens soi soi-même soit sont suis sur ta tandis tant te telle telles tes tienne tiennes tiens toi ton toujours tous toute toutes très trop tu un une vos votre vôtre vôtres vous vu y /; } elsif ($language eq 'german'){ @stopList = qw/ ab aber allein als also am an auch auf aus außer bald bei beim bin bis bißchen bist da dabei dadurch dafür dagegen dahinter damit danach daneben dann daran darauf daraus darin darüber darum darunter das daß dasselbe davon davor dazu dazwischen dein deine deinem deinen deiner deines dem demselben den denn der derselben des desselben dessen dich die dies diese dieselbe dieselben diesem diesen dieser dieses dir doch dort du ebenso ehe ein eine einem einen einer eines entlang er es etwa etwas euch euer eure eurem euren eurer eures für fürs ganz gar gegen genau gewesen her herein herum hin hinter hintern ich ihm ihn Ihnen ihnen ihr Ihre ihre Ihrem ihrem Ihren ihren Ihrer ihrer Ihres ihres im in ist ja je jedesmal jedoch jene jenem jenen jener jenes kaum kein keine keinem keinen keiner keines man mehr mein meine meinem meinen meiner meines mich mir mit nach nachdem nämlich neben nein nicht nichts noch nun nur ob ober obgleich oder ohne paar sehr sei sein seine seinem seinen seiner seines seit seitdem selbst sich Sie sie sind so sogar solch solche solchem solchen solcher solches sondern sonst soviel soweit über um und uns unser unsre unsrem unsren unsrer unsres vom von vor während war wäre wären warum was wegen weil weit welche welchem welchen welcher welches wem wen wenn wer weshalb wessen wie wir wo womit zu zum zur zwar zwischen zwischens /; } elsif ($language eq 'italian'){ @stopList = qw/ a affinchè agl'' agli ai al all'' alla alle allo anzichè avere bensì che chi cioè come comunque con contro cosa da dachè dagl'' dagli dai dal dall'' dalla dalle dallo degl'' degli dei del dell'' delle dello di dopo dove dunque durante e egli eppure essere essi finché fino fra giacchè gl'' gli grazie I il in inoltre io l'' la le lo loro ma mentre mio ne neanche negl'' negli nei nel nell'' nella nelle nello nemmeno neppure noi nonchè nondimeno nostro o onde oppure ossia ovvero per perchè perciò però poichè prima purchè quand''anche quando quantunque quasi quindi se sebbene sennonchè senza seppure si siccome sopra sotto su subito sugl'' sugli sui sul sull'' sulla sulle sullo suo talchè tu tuo tuttavia tutti un una uno voi vostr/; } elsif ($language eq 'portuguese'){ @stopList = qw/ a abaixo adiante agora ali antes aqui até atras bastante bem com como contra debaixo demais depois depressa devagar direito e ela elas êle eles em entre eu fora junto longe mais menos muito não ninguem nós nunca onde ou para por porque pouco próximo qual quando quanto que quem se sem sempre sim sob sobre talvez todas todos vagarosamente você vocês /; } elsif ($language eq 'spanish'){ @stopList = qw/ a acá ahí ajena ajenas ajeno ajenos al algo algún alguna algunas alguno algunos allá allí aquel aquella aquellas aquello aquellos aquí cada cierta ciertas cierto ciertos como cómo con conmigo consigo contigo cualquier cualquiera cualquieras cuan cuán cuanta cuánta cuantas cuántas cuanto cuánto cuantos cuántos de dejar del demás demasiada demasiadas demasiado demasiados el él ella ellas ellos esa esas ese esos esta estar estas este estos hacer hasta jamás junto juntos la las lo los mas más me menos mía mientras mío misma mismas mismo mismos mucha muchas muchísima muchísimas muchísimo muchísimos mucho muchos muy nada ni ninguna ningunas ninguno ningunos no nos nosotras nosotros nuestra nuestras nuestro nuestros nunca os otra otras otro otros para parecer poca pocas poco pocos por porque que qué querer quien quién quienes quienesquiera quienquiera ser si sí siempre sín Sr Sra Sres Sta suya suyas suyo suyos tal tales tan tanta tantas tanto tantos te tener ti toda todas todo todos tomar tú tuya tuyo un una unas unos usted ustedes varias varios vosotras vosotros vuestra vuestras vuestro vuestros y yo /; } elsif ($language eq 'swedish'){ @stopList = qw/ ab aldrig all alla alltid än ännu ånyo är att av avser avses bakom bra bredvid dä där de dem den denna deras dess det detta du efter efterät eftersom ej eller emot en ett fastän för fort framför från genom gott hamske han här hellre hon hos hur i in ingen innan inte ja jag långsamt långt lite man med medan mellan mer mera mindre mot myckett när nära nej nere ni nu och oksa om över på så sådan sin skall som till tillräckligt tillsammans trotsatt under uppe ut utan utom vad väl var varför vart varthän vem vems vi vid vilken /; } croak("Error: language $language is not a supported") unless @stopList; my $sl = $class->create_empty($dbh, $TABLE); $sl->add_stop_word(\@stopList); return $sl; } sub create_empty { my ($class, $dbh, $name) = @_; my $table = $name . '_stoplist'; my $SQL = qq{ CREATE TABLE $table (word VARCHAR(255) PRIMARY KEY) }; $dbh->do($SQL) or croak "Can't create table $table: " . $dbh->errstr; my $self = {}; $self->{'dbh'} = $dbh; $self->{'name'} = $name; $self->{'table'} = $table; $self->{'stoplist'} = {}; bless $self, $class; return $self; } sub open { my ($class, $dbh, $name) = @_; my $table = $name . '_stoplist'; my $self = {}; $self->{'dbh'} = $dbh; $self->{'name'} = $name; $self->{'table'} = $table; $self->{'stoplist'} = {}; bless $self, $class; # load stoplist into a hash my $SQL = qq{ SELECT word FROM $table }; my $ary_ref = $dbh->selectcol_arrayref($SQL) or croak "Can't load stoplist from $table: " . $dbh->errstr; for (@$ary_ref){ $self->{'stoplist'}->{$_} = 1; } return $self; } sub drop { my $self = shift; my $dbh = $self->{'dbh'}; my $table = $self->{'table'}; my $SQL = qq{ DROP table $table }; $dbh->do($SQL) or croak "Can't drop table $table: " . $dbh->errstr; $self->{'stoplist'} = {}; } sub empty { my $self = shift; my $dbh = $self->{'dbh'}; my $table = $self->{'table'}; my $SQL = qq{ DELETE FROM $table }; $dbh->do($SQL) or croak "Can't empty table $table: " . $dbh->errstr; $self->{'stoplist'} = {}; } sub add_stop_word { my ($self, $words) = @_; my $dbh = $self->{'dbh'}; $words = [ $words ] unless ref($words) eq 'ARRAY'; my @new_stop_words; for my $word (@$words){ next if $self->is_stop_word($word); push @new_stop_words, $word; $self->{'stoplist'}->{lc($word)} = 1; } my $SQL = "INSERT INTO $self->{'table'} (word) VALUES " . join(',', ('(?)') x @new_stop_words); $dbh->do($SQL,{},@new_stop_words); } sub remove_stop_word { my ($self, $words) = @_; my $dbh = $self->{'dbh'}; $words = [ $words ] unless ref($words) eq 'ARRAY'; my $SQL = qq{ DELETE FROM $self->{'table'} WHERE word=? }; my $sth = $dbh->prepare($SQL); my $stoplist = $self->{'stoplist'}; for my $word (@$words){ next unless $self->is_stop_word($word); $sth->execute($word); delete $stoplist->{lc($word)}; } } sub is_stop_word { exists shift->{'stoplist'}->{lc($_[0])}; } 1; __END__ =head1 NAME DBIx::FullTextSearch::StopList - Stopwords for DBIx::FullTextSearch =head1 SYNOPSIS use DBIx::FullTextSearch::StopList; # connect to database (regular DBI) my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd'); # create a new empty stop word list my $sl1 = DBIx::FullTextSearch::StopList->create_empty($dbh, 'sl_web_1'); # or create a new one with default stop words my $sl2 = DBIx::FullTextSearch::StopList->create_default($dbh, 'sl_web_2', 'english'); # or open an existing one my $sl3 = DBIx::FullTextSearch::StopList->open($dbh, 'sl_web_3'); # add stop words $sl1->add_stop_word(['a','in','on','the']); # remove stop words $sl2->remove_stop_word(['be','because','been','but','by']); # check if word is in stoplist $bool = $sl1->is_stop_word('in'); # empty stop words $sl3->empty; # drop stop word table $sl2->drop; =head1 DESCRIPTION DBIx::FullTextSearch::StopList provides stop lists that can be used -L. StopList objects can be reused accross several FullTextSearch objects. =head1 METHODS =over 4 =head2 CONSTRUCTERS =item create_empty my $sl = DBIx::FullTextSearch::StopList->create_empty($dbh, $sl_name); This class method creates a new StopList object. =item create_default my $sl = DBIx::FullTextSearch::StopList->create_default($dbh, $sl_name, $language); This class method creates a new StopList object, with default words loaded in for the given language. Supported languages include Czech, Danish, Dutch, English, Finnish, French, German, Italian, Portuguese, Spanish, and Swedish. =item open my $sl = DBIx::FullTextSearch::StopList->open($dbh, $sl_name); Opens and returns StopList object =head2 OBJECT METHODS =item add_stop_word $sl->add_stop_word(\@stop_words); Adds stop words to StopList object. Expects array reference as argument. =item remove_stop_word $sl->remove_stop_word(\@stop_words); Remove stop words from StopList object. =item is_stop_word $bool = $sl->is_stop_word($stop_word); Returns true iff stop_word is StopList object =item empty $sl->empty; Removes all stop words in StopList object. =item drop $sl->drop; Removes table associated with the StopList object. =back =head1 AUTHOR T.J. Mather, tjmather@tjmather.com, http://www.tjmather.com/ =head1 COPYRIGHT All rights reserved. This package is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO L DBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/File.pm0100644000076500007650000000076107604065676023047 0ustar tjmathertjmather package DBIx::FullTextSearch::File; use DBIx::FullTextSearch::String; use strict; use vars qw! @ISA !; @ISA = qw! DBIx::FullTextSearch::String !; sub index_document { my ($self, $file, $extra_data) = @_; my $dbh = $self->{'dbh'}; open FILE, $file or do { $self->{'errstr'} = "Reading the file `$file' failed: $!"; return; }; my $data; { local $/ = undef; $data = ; } $data .= " $extra_data" if $extra_data; close FILE; $self->SUPER::index_document($file, $data); } 1; DBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/Blob.pm0100644000076500007650000001673107604065676023052 0ustar tjmathertjmather package DBIx::FullTextSearch::Blob; use strict; # Open in the backend just sets the object sub open { my ($class, $fts) = @_; return bless { 'fts' => $fts }, $class; } # Create creates the table(s) according to the parameters sub _create_tables { my ($class, $fts) = @_; my $CREATE_DATA = <{'data_table'} ( word varchar($fts->{'word_length'}) binary default '' not null, idx longblob default '' not null, primary key (word) ) EOF my $dbh = $fts->{'dbh'}; $dbh->do($CREATE_DATA) or return $dbh->errstr; push @{$fts->{'created_tables'}}, $fts->{'data_table'}; return; } sub add_document { my ($self, $id, $words) = @_; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; my $update_sth = ( defined $self->{'adding_update_sth'} ? $self->{'adding_update_sth'} : $self->{'adding_update_sth'} = $dbh->prepare( "update $data_table set idx = concat(idx, ?) where word = ?") ); my @insert_values; my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; my $num_words = 0; for my $word ( keys %$words ) { ### print STDERR "$word($id) adding\n"; # here we will want to parametrize the bit size of the # data my $value = pack $packstring, $id, $words->{$word}; my $rows = $update_sth->execute($value, $word); push @insert_values, $word, $value if $rows == 0; $num_words += $words->{$word}; } if(@insert_values){ my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2)); $dbh->do($sql_str,{},@insert_values); } return $num_words; } sub delete_document { my $self = shift; for my $id (@_) { $self->update_document($id, {}); } } sub update_document { my ($self, $id, $words) = @_; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; my $insert_sth = ( defined $self->{'insert_sth'} ? $self->{'insert_sth'} : $self->{'insert_sth'} = $dbh->prepare(" insert into $data_table values (?, ?)") ); my $update_sth = ( defined $self->{'update_update_sth'} ? $self->{'update_update_sth'} : $self->{'update_update_sth'} = $dbh->prepare("update $data_table set idx = concat(substring(idx, 1, ?), ?, substring(idx, ?)) where word = ?") ); my @insert_values; $dbh->do("lock tables $data_table write"); my $select_sth = $dbh->prepare("select word from $data_table"); $select_sth->execute; my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; my ($packnulls) = pack $packstring, 0, 0; my $packlength = length $packnulls; my $num_words = 0; while (my ($word) = $select_sth->fetchrow_array) { my $value = (defined $words->{$word} ? pack($packstring, $id, $words->{$word}) : ''); # the method find_position finds the position of the # "record" for document $id with word $word; returned is # the position in bytes and yes/no values specifying if # the record is already present in the blob; if it is, # we need to replace it, otherwise just insert. my ($pos, $shift) = $self->find_position($word, $id); if (not defined $pos) { push @insert_values, $word, $value; } else { my $spos = $pos + 1; # I'm not sure why this $spos += $packlength if $shift; $update_sth->execute($pos, $value, $spos, $word); } delete $words->{$word}; $num_words++ if defined $value; } for my $word ( keys %$words ) { my $value = pack $packstring, $id, $words->{$word}; push @insert_values, $word, $value; # $insert_sth->execute($word, $value); $num_words++; } if(@insert_values){ my $sql_str = "insert into $data_table values ". join(',', ('(?, ?)') x (@insert_values/2)); $dbh->do($sql_str,{},@insert_values); } $dbh->do("unlock tables"); return $num_words; } sub find_position { my ($self, $word, $id) = @_; # here, with the calculation of where in the blob we have the # docid and where the count of words and how long they are, we # should really look at the parameters (num of bits of various # structures and values) given to create my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; # Sth to read the length of the blob holding the document/count info my $get_length_sth = ( defined $self->{'get_length_sth'} ? $self->{'get_length_sth'} : $self->{'get_length_sth'} = $dbh->prepare("select length(idx) from $data_table where word = ?")); my $length = $dbh->selectrow_array($get_length_sth, {}, $word); my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; my ($packnulls) = pack $packstring, 0, 0; my $packlength = length $packnulls; if (not defined $length) { return; } $length = int($length/$packlength); my ($bot, $top, $med, $val) = (0, $length); if (not defined $fts->{'max_doc_id'}) { $med = int(($top - $bot) / 2); } else { $med = int($top * $id / $fts->{'max_doc_id'}); } my $blob_direct_fetch = $fts->{'blob_direct_fetch'}; # we divide the interval while ($bot != $top) { $med = $top - 1 if $med >= $top; $med = $bot if $med < $bot; if ($top - $bot <= $blob_direct_fetch) { my $get_interval_sth = ( defined $self->{'get_interval_sth'} ? $self->{'get_interval_sth'} : $self->{'get_interval_sth'} = $dbh->prepare("select substring(idx,?,?) from $data_table where word = ?")); my $alldata = $dbh->selectrow_array($get_interval_sth, {}, $bot * $packlength + 1, ($top - $bot) * $packlength, $word); return unless defined $alldata; my @docs; my $i = 0; while ($i < length $alldata) { push @docs, unpack $packstring, substr $alldata, $i, $packlength; $i += $packlength; } for (my $i = 0; $i < @docs; $i += 2) { if ($docs[$i] == $id) { return (($bot+($i/2))*$packlength, 1); } if ($docs[$i] > $id) { return (($bot+($i/2))*$packlength, 0); } } return ($top * $packlength, 0); } ($val) = $dbh->selectrow_array( "select substring(idx, ?, 2) from $data_table where word = ?", {}, ($med * $packlength) + 1, $word); ($val) = unpack $packstring, $val; if (not defined $val) { return; } if ($val == $id) { return ($med * $packlength, 1); } elsif ($val < $id) { $bot = $med + 1; } else { $top = $med; } $med = int($med * $id / $val); } return ($bot * $packlength, 0); } sub contains_hashref { my $self = shift; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; my ($packnulls) = pack $packstring, 0, 0; my $packlength = length $packnulls; my $sth = ( defined $self->{'get_idx_sth'} ? $self->{'get_idx_sth'} : $self->{'get_idx_sth'} = $dbh->prepare( "select idx from $data_table where word like ?" )); my $out = {}; for my $word (@_) { $sth->execute($word); while (my ($blob) = $sth->fetchrow_array) { next unless defined $blob; my @data; my $i = 0; while ($i < length $blob) { push @data, unpack $packstring, substr $blob, $i, $packlength; $i += $packlength; } while (@data) { my $doc = shift @data; my $count = shift @data; unless (defined $out->{$doc}) { $out->{$doc} = 0; } $out->{$doc} += $count; } } $sth->finish; } $out; } *parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count; 1; DBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/URL.pm0100644000076500007650000000123407604065677022627 0ustar tjmathertjmather package DBIx::FullTextSearch::URL; use DBIx::FullTextSearch::String; use strict; use vars qw! @ISA !; @ISA = qw! DBIx::FullTextSearch::String !; use LWP::UserAgent; sub index_document { my ($self, $uri, $extra_data) = @_; my $ua = ( defined $self->{'user_agent'} ? $self->{'user_agent'} : $self->{'user_agent'} = new LWP::UserAgent ); my $request = new HTTP::Request('GET', $uri); my $response = $ua->simple_request($request); if ($response->is_success) { my $data = $response->content; $data .= " $extra_data" if $extra_data; return $self->SUPER::index_document($uri, $data); } else { $self->{'errstr'} = $response->message; } return; } 1; DBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/BlobFast.pm0100644000076500007650000000366507604065677023673 0ustar tjmathertjmather package DBIx::FullTextSearch::BlobFast; use DBIx::FullTextSearch::Blob; use vars qw! @ISA !; @ISA = qw! DBIx::FullTextSearch::Blob !; use strict; sub delete_document { my $self = shift; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; my $value = ''; for my $id (@_) { $value .= pack $packstring, $id, 0; } $dbh->do(" update $data_table set idx = concat(idx, ?) ", {}, $value); 1; } sub update_document { my $self = shift; $self->delete_document($_[0]); $self->add_document(@_); } sub contains_hashref { my $self = shift; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; my $packstring = $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'doc_id_bits'}} . $DBIx::FullTextSearch::BITS_TO_PACK{$fts->{'count_bits'}}; my ($packnulls) = pack $packstring, 0, 0; my $packlength = length $packnulls; my $sth = ( defined $self->{'get_idx_sth'} ? $self->{'get_idx_sth'} : $self->{'get_idx_sth'} = $dbh->prepare( "select idx from $data_table where word like ?" )); my $out = {}; for my $word (@_) { $sth->execute($word); while (my ($blob) = $sth->fetchrow_array) { next unless defined $blob; my %docs = (); my @data; my $i = length($blob) - $packlength; while ($i >= 0) { my ($doc_id, $count) = unpack "\@$i$packstring", $blob; ### print STDERR "$doc_id $count\n"; $i -= $packlength; next if exists $docs{$doc_id}; $docs{$doc_id} = 1; next unless $count; push @data, $doc_id, $count; } while (@data) { my $doc = shift @data; my $count = shift @data; unless (defined $out->{$doc}) { $out->{$doc} = 0; } $out->{$doc} += $count; } } $sth->finish; } $out; } *parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count; 1; DBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/String.pm0100644000076500007650000000430407604065677023434 0ustar tjmathertjmather package DBIx::FullTextSearch::String; use DBIx::FullTextSearch; use strict; use vars qw! @ISA !; @ISA = qw! DBIx::FullTextSearch !; # Create creates the conversion table that converts string names of # documents to numbers sub _create_tables { my $fts = shift; $fts->{'doc_id_table'} = $fts->{'table'} . '_docid' unless defined $fts->{'doc_id_table'}; unless($fts->{'name_length'}){ return "The parameter name_length has to be specified."; } my $CREATE_DOCID = <{'doc_id_table'} ( name varchar($fts->{'name_length'}) binary not null, id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'doc_id_bits'}} unsigned not null auto_increment, primary key (id), unique (name) ) EOF my $dbh = $fts->{'dbh'}; $dbh->do($CREATE_DOCID) or return $dbh->errstr; push @{$fts->{'created_tables'}}, $fts->{'doc_id_table'}; return; } sub get_id_for_name { my ($self, $string) = @_; my $dbh = $self->{'dbh'}; my $doc_id_table = $self->{'doc_id_table'}; my $name_to_id_sth = ( defined $self->{'name_to_id_sth'} ? $self->{'name_to_id_sth'} : $self->{'name_to_id_sth'} = $dbh->prepare("select id from $doc_id_table where name = ?") or die $dbh->errstr); my $id = $dbh->selectrow_array($name_to_id_sth, {}, $string); if (not defined $id) { my $new_name_sth = (defined $self->{'new_name_sth'} ? $self->{'new_name_sth'} : $self->{'new_name_sth'} = $dbh->prepare("insert into $doc_id_table values (?, null)") or die $dbh->errstr ); $new_name_sth->execute($string) or die $new_name_sth->errstr; $id = $new_name_sth->{'mysql_insertid'}; } $id; } sub index_document { my ($self, $string, $data) = @_; my $id = $self->get_id_for_name($string); $self->SUPER::index_document($id, $data); } sub delete_document { my ($self, $doc_id) = @_; $self->SUPER::delete_document($self->get_id_for_name($doc_id)); } sub contains_hashref { my $self = shift; my $res = $self->SUPER::contains_hashref(@_); return unless keys %$res; my $doc_id_table = $self->{'doc_id_table'}; my $data = $self->{'dbh'}->selectall_arrayref("select name, id from $doc_id_table where id in (" . join(',', ('?') x keys %$res).")", {}, keys %$res); return { map { ( $_->[0], $res->{$_->[1]} ) } @$data }; } 1; DBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/Table.pm0100644000076500007650000000727007604065677023222 0ustar tjmathertjmather package DBIx::FullTextSearch::TableString; use vars qw! @ISA !; @ISA = qw! DBIx::FullTextSearch::String DBIx::FullTextSearch::Table !; sub index_document { my ($self, $id, $data) = @_; my @data_sets = $self->get_the_data_from_table($id); push @data_sets, $data if $data; $self->SUPER::index_document($id, \@data_sets); } package DBIx::FullTextSearch::TableNum; use vars qw! @ISA !; @ISA = qw! DBIx::FullTextSearch::Table !; sub index_document { my ($self, $id, $extra_data) = @_; my @data_sets = $self->get_the_data_from_table($id); push @data_sets, $extra_data if $extra_data; $self->SUPER::index_document($id, \@data_sets); } package DBIx::FullTextSearch::Table; use DBIx::FullTextSearch; use strict; use vars qw! @ISA !; @ISA = qw! DBIx::FullTextSearch !; sub _open_tables { my $self = shift; if (defined $self->{'doc_id_table'}) { eval 'use DBIx::FullTextSearch::String'; bless $self, 'DBIx::FullTextSearch::TableString'; } else { bless $self, 'DBIx::FullTextSearch::TableNum'; } } # we do not create any new tables, we just check that the parameters are # OK (the table and columns exist, etc.) sub _create_tables { my $fts = shift; my ($table, $column, $id) = @{$fts}{ qw! table_name column_name column_id_name ! }; if (not defined $table and $column =~ /\./) { ($table, $column) = ($column =~ /^(.*)\.(.*)$/s); } my $id_type; if (not defined $table) { return "The parameter table_name has to be specified with the table frontend."; } if (not defined $column) { return "The parameter column_name has to be specified with the table frontend."; } my $dbh = $fts->{'dbh'}; my $sth = $dbh->prepare("show columns from $table"); $sth->{'PrintError'} = 0; $sth->{'RaiseError'} = 0; $sth->execute or return "The table `$table' doesn't exist."; my $info = $dbh->selectall_arrayref($sth, { 'PrintError' => 0, 'RaiseError' => 0 }); if (not defined $info) { return "The table `$table' doesn't exist."; } # use Data::Dumper; print Dumper $info; if (not defined $id) { # search for column with primary key my $pri_num = 0; for my $i (0 .. $#$info) { if ($info->[$i][3] eq 'PRI') { $pri_num++; $id = $info->[$i][0]; $id_type = $info->[$i][1]; } } if ($pri_num > 1) { return 'The primary key has to be one-column.'; } if ($pri_num == 0) { return "No primary key found in the table `$table'."; } } else { # find '$id' column for my $i (0 .. $#$info) { if ($info->[$i][0] eq $id){ $id_type = $info->[$i][1]; last; } } } unless(defined $id_type){ return "No key named '$id' found in the table '$table'"; } my $testcol = $dbh->prepare("select $column from $table where 1 = 0"); $testcol->execute or return "Column `$column' doesn't exist in table `$table'."; $testcol->finish; $fts->{'column_id_name'} = $id; my $errstr; if ($id_type =~ /([a-z]*int)/) { $fts->{'doc_id_bits'} = $DBIx::FullTextSearch::INT_TO_BITS{$1}; bless $fts, 'DBIx::FullTextSearch::TableNum'; } else { my ($length) = ($id_type =~ /^\w+\((\d+)\)$/); $fts->{'name_length'} = $1; eval 'use DBIx::FullTextSearch::String'; bless $fts, 'DBIx::FullTextSearch::TableString'; $errstr = $fts->DBIx::FullTextSearch::String::_create_tables($fts); } ### use Data::Dumper; print Dumper $fts; return $errstr; } sub get_the_data_from_table { my ($self, $id) = @_; my $dbh = $self->{'dbh'}; my $get_data = ( defined $self->{'get_data_sth'} ? $self->{'get_data_sth'} : $self->{'get_data_sth'} = $dbh->prepare(" select $self->{'column_name'} from $self->{'table_name'} where $self->{'column_id_name'} = ? ") ); my @data_ary = $dbh->selectrow_array($get_data, {}, $id); return wantarray ? @data_ary : join(" ", @data_ary); } 1; DBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch/Column.pm0100644000076500007650000001201507604065677023421 0ustar tjmathertjmather package DBIx::FullTextSearch::Column; use strict; # Open in the backend just sets the object sub open { my ($class, $fts) = @_; return bless { 'fts' => $fts }, $class; } sub DESTROY { my ($self) = @_; if (defined $self->{'select_wordid_sth'}) { $self->{'select_wordid_sth'}->finish(); } } # Create creates the table(s) according to the parameters sub _create_tables { my ($class, $fts) = @_; my $COUNT_FIELD = ''; if ($fts->{'count_bits'}) { $COUNT_FIELD = "count $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'count_bits'}} unsigned," } my $CREATE_DATA = <{'data_table'} ( word_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null, doc_id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'doc_id_bits'}} unsigned not null, $COUNT_FIELD index (word_id), index (doc_id) ) EOF $fts->{'word_id_table'} = $fts->{'table'}.'_words' unless defined $fts->{'word_id_table'}; my $CREATE_WORD_ID = <{'word_id_table'} ( word varchar($fts->{'word_length'}) binary default '' not null, id $DBIx::FullTextSearch::BITS_TO_INT{$fts->{'word_id_bits'}} unsigned not null auto_increment, primary key (id), unique (word) ) EOF my $dbh = $fts->{'dbh'}; $dbh->do($CREATE_DATA) or return $dbh->errstr; push @{$fts->{'created_tables'}}, $fts->{'data_table'}; $dbh->do($CREATE_WORD_ID) or return $dbh->errstr; push @{$fts->{'created_tables'}}, $fts->{'word_id_table'}; return; } sub add_document { my ($self, $id, $words) = @_; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $word_id_table = $fts->{'word_id_table'}; if (not defined $self->{'select_wordid_sth'}) { $self->{'select_wordid_sth'} = $dbh->prepare(" select id from $word_id_table where word = ? "); } my $data_table = $fts->{'data_table'}; my $count_bits = $fts->{'count_bits'}; my $num_words = 0; my (@wids,@data,@widshandler,@datahandler); my $wordid; $dbh->do("lock tables $word_id_table write"); my ($maxid) = $dbh->selectrow_array("select max(id) from $word_id_table"); foreach my $word (keys %$words) { if(!defined $self->{'wordids'}->{$word}) { $self->{'select_wordid_sth'}->execute($word); ($wordid) = $self->{'select_wordid_sth'}->fetchrow_array(); unless ($wordid) { $maxid++; push @widshandler, "(?,$maxid)"; push @wids, $word; $wordid = $maxid; } $self->{'wordids'}->{$word} = $wordid; } else { $wordid=$self->{'wordids'}->{$word}; } if ($count_bits) { push @datahandler, "($wordid,$id,?)"; push @data, $words->{$word}; } else { push @datahandler, "($wordid,$id)"; } $num_words++; }; $dbh->do("insert into $word_id_table values " . join (',',@widshandler),undef,@wids) if @wids; $dbh->do("unlock tables"); if ($count_bits) { $dbh->do("insert into $data_table values " . join (',',@datahandler),undef,@data) if @data; } else { $dbh->do("insert into $data_table values " . join (',',@datahandler)) if @datahandler; } return $num_words; } sub delete_document { my $self = shift; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; my $sth = $dbh->prepare("delete from $data_table where doc_id = ?"); for my $id (@_) { $sth->execute($id); } } sub update_document { my ($self, $id, $words) = @_; $self->delete_document($id); $self->add_document($id, $words); } sub contains_hashref { my $self = shift; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $data_table = $fts->{'data_table'}; my $word_id_table = $fts->{'word_id_table'}; my $count_bits = $fts->{'count_bits'}; my $sth = ( defined $self->{'get_data_sth'} ? $self->{'get_data_sth'} : ( $count_bits ? ( $self->{'get_data_sth'} = $dbh->prepare( "select doc_id, count from $data_table, $word_id_table where word like ? and id = word_id" ) ) : ( $self->{'get_data_sth'} = $dbh->prepare( "select doc_id, 1 from $data_table, $word_id_table where word like ? and id = word_id" ) ) ) ); my $out = {}; for my $word (@_) { $sth->execute($word); while (my ($doc, $count) = $sth->fetchrow_array) { $out->{$doc} += $count; } $sth->finish; } $out; } sub common_word { my ($self, $k) = @_; my $fts = $self->{'fts'}; my $dbh = $fts->{'dbh'}; my $num = $fts->document_count; $k /= 100; my $SQL = <{'data_table'} group by word_id having k >= ? EOF my $ary_ref = $dbh->selectcol_arrayref($SQL, {}, $num, $k); return unless @$ary_ref; my $QUESTION_MARKS = join ',', ('?') x scalar(@$ary_ref); $SQL = <{'word_id_table'} where id IN ($QUESTION_MARKS) EOF return $dbh->selectcol_arrayref($SQL, {}, @$ary_ref); } *parse_and_index_data = \&DBIx::FullTextSearch::parse_and_index_data_count; 1; DBIx-FullTextSearch-0.73/lib/DBIx/FullTextSearch.pm0100644000076500007650000011371107630504443022155 0ustar tjmathertjmather# -*- Mode: Perl; indent-tabs-mode: t; tab-width: 2 -*- =head1 NAME DBIx::FullTextSearch - Indexing documents with MySQL as storage =cut package DBIx::FullTextSearch; use strict; use Parse::RecDescent; use vars qw($errstr $VERSION $parse); $errstr = undef; $VERSION = '0.73'; use locale; my %DEFAULT_PARAMS = ( 'num_of_docs' => 0, # statistical value, should be maintained 'word_length' => 30, # max length of words we index 'protocol' => 40, # we only support protocol with the same numbers 'blob_direct_fetch' => 20, # with the blob store, when we stop searching # and fetch everything at once 'data_table' => undef, # table where the actual index is stored 'name_length' => 255, # for filenames or URLs, what's the max length 'word_id_bits' => 16, # num of bits for word_id (column store) 'doc_id_bits' => 16, # num of bits for doc_id 'count_bits' => 8, # num of bits for count value 'position_bits' => 32, # num of bits for word positions 'backend' => 'blob', # what database backend (way the data is # stored) we use 'frontend' => 'none', # what application frontend we use (how # the index behaves externaly) 'filter' => 'map { lc $_ }', 'search_splitter' => '/(\w{2,$word_length}\*?)/g', 'index_splitter' => '/(\w{2,$word_length})/g', # can use the $word_length # variable 'init_env' => '' ); my %backend_types = ( 'blob' => 'DBIx::FullTextSearch::Blob', 'blobfast' => 'DBIx::FullTextSearch::BlobFast', 'column' => 'DBIx::FullTextSearch::Column', 'phrase' => 'DBIx::FullTextSearch::Phrase', ); my %frontend_types = ( 'none' => 'DBIx::FullTextSearch', 'default' => 'DBIx::FullTextSearch', 'file' => 'DBIx::FullTextSearch::File', 'string' => 'DBIx::FullTextSearch::String', 'url' => 'DBIx::FullTextSearch::URL', 'table' => 'DBIx::FullTextSearch::Table', ); use vars qw! %BITS_TO_PACK %BITS_TO_INT %INT_TO_BITS !; %BITS_TO_PACK = qw! 0 A0 8 C 16 S 32 L !; %BITS_TO_INT = qw! 8 tinyint 16 smallint 24 mediumint 32 int 64 bigint !; %INT_TO_BITS = map { ($BITS_TO_INT{$_} => $_ ) }keys %BITS_TO_INT; # Open reads in the information about existing index, creates an object # in memory sub open { my ($class, $dbh, $TABLE) = @_; $errstr = undef; # the $dbh is either a real dbh of a DBI->connect parameters arrayref my $mydbh = 0; if (ref $dbh eq 'ARRAY') { if (not $dbh = DBI->connect(@$dbh)) { $errstr = $DBI::errstr; return; } $mydbh = 1; } # load the parameters to the object my %PARAMS = %DEFAULT_PARAMS; my $sth = $dbh->prepare("select * from $TABLE"); $sth->{'PrintError'} = 0; $sth->{'RaiseError'} = 0; $sth->execute or do { if (not grep { $TABLE eq $_ } DBIx::FullTextSearch->list_fts_indexes($dbh)) { $errstr = "FullTextSearch index $TABLE doesn't exist."; } else { $errstr = $sth->errstr; } return; }; while (my ($param, $value) = $sth->fetchrow_array) { $PARAMS{$param} = $value; } my $self = bless { 'dbh' => $dbh, 'table' => $TABLE, %PARAMS, }, $class; my $data_table = $self->{'data_table'}; # we should disconnect if we've opened the dbh here if ($mydbh) { $self->{'disconnect_on_destroy'} = 1; } # some basic sanity check if (not defined $dbh->selectrow_array("select count(*) from $data_table")) { $errstr = "Table $data_table not found in the database\n"; return; } # load and set the application frontend my $front_module = $frontend_types{$PARAMS{'frontend'}}; if (defined $front_module) { if ($front_module ne $class) { eval "use $front_module"; die $@ if $@; } bless $self, $front_module; $self->_open_tables; } else { $errstr = "Specified frontend type `$PARAMS{'frontend'}' is unknown\n"; return; } # load and set the backend (actual database access) module my $back_module = $backend_types{$PARAMS{'backend'}}; if (defined $back_module) { eval "use $back_module"; die $@ if $@; $self->{'db_backend'} = $back_module->open($self); } else { $errstr = "Specified backend type `$PARAMS{'backend'}' is unknown\n"; return; } # load DBIx::FullTextSearch::StopList object (if specified) if ($PARAMS{'stoplist'}) { eval "use DBIx::FullTextSearch::StopList"; die $@ if $@; $self->{'stoplist'} = DBIx::FullTextSearch::StopList->open($dbh, $PARAMS{'stoplist'}); } # load Lingua::Stem object (if specified) if($PARAMS{'stemmer'}){ eval "use Lingua::Stem"; die $@ if $@; $self->{'stemmer'} = Lingua::Stem->new(-locale => $PARAMS{'stemmer'}); } # finally, return the object $self; } # Create creates tables in the database according to the options, then # calls open to load the object to memory sub create { my ($class, $dbh, $TABLE, %OPTIONS) = @_; $errstr = undef; my $mydbh = 0; if (ref $dbh eq 'ARRAY') { $dbh = DBI->connect(@$dbh) or do { $errstr = $DBI::errstr; return; }; $mydbh = 1; } my $self = bless { 'dbh' => $dbh, 'table' => $TABLE, %DEFAULT_PARAMS, %OPTIONS }, $class; $self->{'data_table'} = $TABLE.'_data' unless defined $self->{'data_table'}; # convert array reference to CSV string $self->{'column_name'} = join(",",@{$self->{'column_name'}}) if ref($self->{'column_name'}) eq 'ARRAY'; my $CREATE_PARAM = <do($CREATE_PARAM) or do { $errstr = $dbh->errstr; return; }; push @{$self->{'created_tables'}}, $TABLE; # load and set the frontend database structures my $front_module = $frontend_types{$self->{'frontend'}}; if (defined $front_module) { eval "use $front_module"; die $@ if $@; bless $self, $front_module; $errstr = $self->_create_tables; if (defined $errstr) { $self->clean_failed_create; warn $errstr; return; } } else { $errstr = "Specified frontend type `$self->{'frontend'}' is unknown\n"; $self->clean_failed_create; return; } # create the backend database structures my $back_module = $backend_types{$self->{'backend'}}; if (defined $back_module) { eval "use $back_module"; die $@ if $@; $errstr = $back_module->_create_tables($self); if (defined $errstr) { $self->clean_failed_create; warn $errstr; return; } } else { $errstr = "Specified backend type `$self->{'backend'}' is unknown\n"; $self->clean_failed_create; return; } for (grep { not ref $self->{$_} } keys %$self) { $dbh->do("insert into $TABLE values (?, ?)", {}, $_, $self->{$_}); } return $class->open($dbh, $TABLE); } sub _create_tables {} sub _open_tables {} sub clean_failed_create { my $self = shift; my $dbh = $self->{'dbh'}; for my $table (@{$self->{'created_tables'}}) { $dbh->do("drop table $table"); } } sub drop { my $self = shift; my $dbh = $self->{'dbh'}; for my $tag (keys %$self) { next unless $tag =~ /(^|_)table$/; $dbh->do("drop table $self->{$tag}"); } 1; } sub empty { my $self = shift; my $dbh = $self->{'dbh'}; for my $tag (keys %$self) { next unless $tag =~ /_table$/; $dbh->do("delete from $self->{$tag}"); } $dbh->do("replace into $self->{'table'} values ('max_doc_id', 0)"); return 1; } sub errstr { my $self = shift; ref $self ? $self->{'errstr'} : $errstr; } sub list_fts_indexes { my ($class, $dbh) = @_; my %tables = map { ( $_->[0] => 1 ) } @{$dbh->selectall_arrayref('show tables')}; my %indexes = (); for my $table (keys %tables) { local $dbh->{'PrintError'} = 0; local $dbh->{'RaiseError'} = 0; if ($dbh->selectrow_array("select param, value from $table where param = 'data_table'")) { $indexes{$table} = 1; } } return sort keys %indexes; } sub index_document { my ($self, $id, $data) = @_; return unless defined $id; my $dbh = $self->{'dbh'}; my $param_table = $self->{'table'}; my $adding_doc = 0; my $adding = 0; if (not defined $self->{'max_doc_id'} or $id > $self->{'max_doc_id'}) { $self->{'max_doc_id'} = $id; my $update_max_doc_id_sth = ( defined $self->{'update_max_doc_id_sth'} ? $self->{'update_max_doc_id_sth'} : $self->{'update_max_doc_id_sth'} = $dbh->prepare("replace into $param_table values (?, ?)")); $update_max_doc_id_sth->execute('max_doc_id', $id); $adding_doc = 1; } my $init_env = $self->{'init_env'}; # use packages, etc. eval $init_env if defined $init_env; print STDERR "Init_env failed with $@\n" if $@; $data = '' unless defined $data; return $self->{'db_backend'}->parse_and_index_data($adding_doc, $id, $data); } # used for backends that need a count for each of the words sub parse_and_index_data_count { my ($backend, $adding_doc, $id, $data) = @_; ## note that this is run with backend object my $self = $backend->{'fts'}; my $word_length = $self->{'word_length'}; # this needs to get parametrized (lc, il2_to_ascii, parsing of # HTML tags, ...) my %words; my @data_sets = ref $data ? @$data : ($data); # We can just join the data sets together, since we don't care about position my $data_string = join(" ", @data_sets); my $filter = $self->{'filter'} . ' $data_string =~ ' . $self->{'index_splitter'}; my $stoplist = $self->{'stoplist'}; my $stemmer = $self->{'stemmer'}; my @words = eval $filter; @words = grep !$stoplist->is_stop_word($_), @words if defined($stoplist); @words = @{$stemmer->stem(@words)} if defined($stemmer); for my $word ( @words ) { $words{$word} = 0 if not defined $words{$word}; $words{$word}++; } my @result; if ($adding_doc) { @result = $backend->add_document($id, \%words); } else { @result = $backend->update_document($id, \%words); } if (wantarray) { return @result; } return $result[0]; } # used for backends where list of occurencies is needed sub parse_and_index_data_list { my ($backend, $adding_doc, $id, $data) = @_; ## note that this is run with backend object my $self = $backend->{'fts'}; my $word_length = $self->{'word_length'}; # this needs to get parametrized (lc, il2_to_ascii, parsing of # HTML tags, ...) my %words; my @data_sets = ref $data ? @$data : ($data); foreach my $data_set (@data_sets) { my $filter = $self->{'filter'}.' $data_set =~ '.$self->{'index_splitter'}; my $i = 0; # $i stores the position(s) of each word in the document. my $stoplist = $self->{'stoplist'}; my $stemmer = $self->{'stemmer'}; my @words = eval $filter; @words = grep !$stoplist->is_stop_word($_), @words if defined($stoplist); @words = @{$stemmer->stem(@words)} if defined($stemmer); for my $word ( @words ) { push @{$words{$word}}, ++$i; } # Make sure the data sets are considered far apart in position, to # avoid phrase searches overlapping between table columns. $i += 100; } my @result; if ($adding_doc) { @result = $backend->add_document($id, \%words); } else { @result = $backend->update_document($id, \%words); } if (wantarray) { return @result; } return $result[0]; } sub delete_document { my $self = shift; $self->{'db_backend'}->delete_document(@_); } sub contains_hashref { my $self = shift; my $word_length = $self->{'word_length'}; my $stemmer = $self->{'stemmer'}; my $filter = $self->{'filter'}; my $stoplist = $self->{'stoplist'}; my @phrases; for (@_){ my $phrase; my $splitter = ' map { ' . $self->{'search_splitter'} . ' } $_'; my @words = eval $splitter; @words = eval $filter.' @words'; @words = grep !$stoplist->is_stop_word($_), @words if defined($stoplist); if (defined($stemmer)){ my @stemmed_words = (); for (@words){ if (m/\*$/){ # wildcard search, make work with stemming my $stem_word = $stemmer->stem($_); for (@$stem_word){ $_ .= "*"; push @stemmed_words, $_; } } else { push @stemmed_words, @{$stemmer->stem($_)}; } } $phrase = join(' ',@stemmed_words); } else { $phrase = join(' ',@words); } # change wildcard to SQL version (* -> %) $phrase =~ s/\*/%/g; push @phrases, $phrase; } $self->{'db_backend'}->contains_hashref(@phrases); } sub contains { my $self = shift; my $res = $self->contains_hashref(@_); if (not $self->{'count_bits'}) { return keys %$res; } return sort { $res->{$b} <=> $res->{$a} } keys %$res; } sub econtains_hashref { my $self = shift; my $docs = {}; my $word_num = 0; my $stoplist = $self->{'stoplist'}; my @plus_words = map { /^\+(.+)$/s } @_; @plus_words = grep !$stoplist->is_stop_word($_), @plus_words if defined($stoplist); # required words for my $word (@plus_words) { $word_num++; my $oneword = $self->contains_hashref($word); if ($word_num == 1) { $docs = $oneword; next; } for my $doc (keys %$oneword) { $docs->{$doc} += $oneword->{$doc} if defined $docs->{$doc}; } for my $doc (keys %$docs) { delete $docs->{$doc} unless defined $oneword->{$doc}; } } # optional words for my $word ( map { /^([^+-].*)$/s } @_) { my $oneword = $self->contains_hashref($word); for my $doc (keys %$oneword) { if (@plus_words) { $docs->{$doc} += $oneword->{$doc} if defined $docs->{$doc}; } else { $docs->{$doc} = 0 unless defined $docs->{$doc}; $docs->{$doc} += $oneword->{$doc}; } } } # prohibited words for my $word ( map { /^-(.+)$/s } @_) { my $oneword = $self->contains_hashref($word); for my $doc (keys %$oneword) { delete $docs->{$doc}; } } $docs; } sub econtains { my $self = shift; my $res = $self->econtains_hashref(@_); if (not $self->{'count_bits'}) { return keys %$res; } return sort { $res->{$b} <=> $res->{$a} } keys %$res; } sub _search_terms { my ($self, $query) = @_; if ($self->{'backend'} eq 'phrase') { # phrase backend, must deal with quotes # handle + and - operations on phrases $query =~ s/([\+\-])"/"$1/g; my $inQuote = 0; my @phrases = (); my @blocks = split(/\"/, $query); # deal with quotes for (@blocks){ if($inQuote == 0){ # we are outside quotes, search for individual words push @phrases, split(' '); } else { # we are inside quote, search for whole phrase push @phrases, $_; } $inQuote = ++$inQuote % 2; } return @phrases; } else { # not phrase backend, don't deal with quotes return split(' ', $query); } } sub _search_boolean { my ($self, $query) = @_; unless ($parse) { $::RD_AUTOACTION = q{ [@item] }; my $grammar = q{ expr : disj disj : conj 'or' disj | conj conj : unary 'and' conj | unary unary : '(' expr ')' | atom atom : /([^\(\)\s]|\s(?!and)(?!or))+/ }; $parse = new Parse::RecDescent ($grammar); } my $tree = $parse->expr($query); return $self->_search_in_tree($tree); } sub _search_in_tree { my ($self, $tree) = @_; if (ref($tree->[1]) && ref($tree->[3])) { if (defined($tree->[2]) && $tree->[2] eq 'and') { my $hash_ref1 = $self->_search_in_tree($tree->[1]); my $hash_ref2 = $self->_search_in_tree($tree->[3]); for my $k (keys %$hash_ref1) { unless ($hash_ref2->{$k}) { delete $hash_ref1->{$k}; } else { $hash_ref1->{$k} += $hash_ref2->{$k}; } } return $hash_ref1; } elsif (defined($tree->[2]) && $tree->[2] eq 'or') { my $hash_ref1 = $self->_search_in_tree($tree->[1]); my $hash_ref2 = $self->_search_in_tree($tree->[3]); for my $k (keys %$hash_ref2) { $hash_ref1->{$k} += $hash_ref2->{$k}; } return $hash_ref1; } return {}; } elsif ($tree->[1] eq '(' && ref($tree->[2]) && $tree->[3] eq ')') { return $self->_search_in_tree($tree->[2]); } elsif (ref($tree->[1])) { return $self->_search_in_tree($tree->[1]); } elsif (defined($tree->[0]) && $tree->[0] eq 'atom') { return $self->econtains_hashref($self->_search_terms($tree->[1])); } else { warn "Unknown tree nodes " . join("\t", @$tree); return {}; } } sub search { my ($self, $query) = @_; if ($query =~ s/\b(and|or|not)\b/lc($1)/eig) { return keys %{$self->_search_boolean($query)}; } return $self->econtains($self->_search_terms($query)); } sub search_hashref { my ($self, $query) = @_; if ($query =~ s/\b(and|or|not)\b/lc($1)/eig) { return $self->_search_boolean($query); } return $self->econtains_hashref($self->_search_terms($query)); } sub document_count { my $self = shift; my $dbh = $self->{'dbh'}; my $SQL = qq{ select distinct doc_id from $self->{'data_table'} }; my $ary_ref = $dbh->selectall_arrayref($SQL); return scalar @$ary_ref; } # find all words that are contained in at least $k % of all documents sub common_word { my $self = shift; my $k = shift || 80; $self->{'db_backend'}->common_word($k); } sub DESTROY { my $self = shift; $self->{'db_backend'}->DESTROY() if (exists $self->{'db_backend'} && $self->{'db_backend'} &&$self->{'db_backend'}->can('DESTROY')); } 1; =head1 SYNOPSIS DBIx::FullTextSearch uses a MySQL database backend to index files, web documents and database fields. Supports must include, can include, and cannot include words and phrases. Support for boolean (AND/OR) queries, stop words and stemming. use DBIx::FullTextSearch; use DBI; # connect to database (regular DBI) my $dbh = DBI->connect('dbi:mysql:database', 'user', 'passwd'); # create a new stoplist my $sl = DBIx::FullTextSearch::StopList->create_default($dbh, 'sl_en', 'English'); # create a new index with default english stoplist and english stemmer my $fts = DBIx::FullTextSearch->create($dbh, 'fts_web_1', frontend => 'string', backend => 'blob', stoplist => 'sl_en', stemmer => 'en-us'); # or open existing one # my $fts = DBIx::FullTextSearch->open($dbh, 'fts_web_1'); # index documents $fts->index_document('krtek', 'krtek leze pod zemi'); $fts->index_document('jezek', 'Jezek ma ostre bodliny.'); # search for matches my @docs = $fts->contains('foo'); my @docs = $fts->econtains('+foo', '-Bar'); my @docs = $fts->search('+foo -Bar'); my @docs = $fts->search('foo AND (bar OR baz)'); =head1 DESCRIPTION DBIx::FullTextSearch is a flexible solution for indexing contents of documents. It uses the MySQL database to store the information about words and documents and provides Perl interface for indexing new documents, making changes and searching for matches. For DBIx::FullTextSearch, a document is nearly anything -- Perl scalar, file, Web document, database field. The basic style of interface is shown above. What you need is a MySQL database and a L with L. Then you create a DBIx::FullTextSearch index -- a set of tables that maintain all necessary information. Once created it can be accessed many times, either for updating the index (adding documents) or searching. DBIx::FullTextSearch uses one basic table to store parameters of the index. Second table is used to store the actual information about documents and words, and depending on the type of the index (specified during index creation) there may be more tables to store additional information (like conversion from external string names (eg. URL's) to internal numeric form). For a user, these internal thingies and internal behaviour of the index are not important. The important part is the API, the methods to index document and ask questions about words in documents. However, certain understanding of how it all works may be usefull when you are deciding if this module is for you and what type of index will best suit your needs. =head2 Frontends From the user, application point of view, the DBIx::FullTextSearch index stores documents that are named in a certain way, allows adding new documents, and provides methods to ask: "give me list of names of documents that contain this list of words". The DBIx::FullTextSearch index doesn't store the documents itself. Instead, it stores information about words in the documents in such a structured way that it makes easy and fast to look up what documents contain certain words and return names of the documents. DBIx::FullTextSearch provides a couple of predefined frontend classes that specify various types of documents (and the way they relate to their names). =over 4 =item default By default, user specifies the integer number of the document and the content (body) of the document. The code would for example read $fts->index_document(53, 'zastavujeme vyplaty vkladu'); and DBIx::FullTextSearch will remember that the document 53 contains three words. When looking for all documents containing word (string) vklad, a call my @docs = $fts->contains('vklad*'); would return numbers of all documents containing words starting with 'vklad', 53 among them. So here it's user's responsibility to maintain a relation between the document numbers and their content, to know that a document 53 is about vklady. Perhaps the documents are already stored somewhere and have unique numeric id. Note that the numeric id must be no larger than 2^C. =item string Frontend B allows the user to specify the names of the documents as strings, instead of numbers. Still the user has to specify both the name of the document and the content: $fts->index_document('foobar', 'the quick brown fox jumped over lazy dog!'); After that, $fts->contains('dog') will return 'foobar' as one of the names of documents with word 'dog' in it. =item file To index files, use the frontend B. Here the content of the document is clearly the content of the file specified by the filename, so in a call to index_document, only the name is needed -- the content of the file is read by the DBIx::FullTextSearch transparently: $fts->index_document('/usr/doc/FAQ/Linux-FAQ'); my @files = $fts->contains('penguin'); =item url Web document can be indexed by the frontend B. DBIx::FullTextSearch uses L to get the document and then parses it normally: $fts->index_document('http://www.perl.com/'); Note that the HTML tags themselves are indexed along with the text. =item table You can have a DBIx::FullTextSearch index that indexes char or blob fields in MySQL table. Since MySQL doesn't support triggers, you have to call the C method of DBIx::FullTextSearch any time something changes in the table. So the sequence probably will be $dbh->do('insert into the_table (id, data, other_fields) values (?, ?, ?)', {}, $name, $data, $date_or_something); $fts->index_document($name); When calling C, the id (name) of the record will be returned. If the id in the_table is numeric, it's directly used as the internal numeric id, otherwise a string's way of converting the id to numeric form is used. When creating this index, you'll have to pass it three additionial options, C, C, and C. You may use the optional column_process option to pre-process data in the specified columns. =back The structure of DBIx::FullTextSearch is very flexible and adding new frontend (what will be indexed) is very easy. =head2 Backends While frontend specifies what is indexed and how the user sees the collection of documents, backend is about low level database way of actually storing the information in the tables. Three types are available: =over 4 =item blob For each word, a blob holding list of all documents containing that word is stored in the table, with the count (number of occurencies) associated with each document number. That makes it for very compact storage. Since the document names (for example URL) are internally converted to numbers, storing and fetching the data is fast. However, updating the information is very slow, since information concerning one document is spread across all table, without any direct database access. Updating a document (or merely reindexing it) requires update of all blobs, which is slow. The list of documents is stored sorted by document name so that fetching an information about a document for one word is relatively easy, still a need to update (or at least scan) all records in the table makes this storage unsuitable for collections of documents that often change. =item column The B backend stores a word/document pair in database fields, indexing both, thus allowing both fast retrieval and updates -- it's easy to delete all records describing one document and insert new ones. However, the database indexes that have to be maintained are large. Both B and B backends only store a count -- number of occurencies of the word in the document (and even this can be switched off, yielding just a yes/no information about the word's presence). This allows questions like all documents containing words 'voda' or 'Mattoni' but not a word 'kyselka' but you cannot ask whether a document contains a phrase 'kyselka Mattoni' because such information is not maintained by these types of backends. =item phrase To allow phrase matching, a B backend is available. For each word and document number it stores a blob of lists of positions of the word in the document. A query $fts->contains('kyselk* Mattoni'); then only returns those documents (document names/numbers) where word kyselka (or kyselky, or so) is just before word Mattoni. =back =head2 Mixing frontends and backends Any frontend can be used with any backend in one DBIx::FullTextSearch index. You can index Web documents with C frontend and C backend to be able to find phrases in the documents. And you can use the default, number based document scheme with C backend to use the disk space as efficiently as possible -- this is usefull for example for mailing-list archives, where we need to index huge number of documents that do not change at all. Finding optimal combination is very important and may require some analysis of the document collection and manipulation, as well as the speed and storage requirements. Benchmarking on actual target platform is very useful during the design phase. =head1 METHODS The following methods are available on the user side as DBIx::FullTextSearch API. =over 4 =item create my $fts = DBIx::FullTextSearch->create($dbh, $index_name, %opts); The class method C creates index of given name (the name of the index is the name of its basic parameter table) and all necessary tables, returns an object -- newly created index. The options that may be specified after the index name define the frontend and backend types, storage parameters (how many bits for what values), etc. See below for list of create options and discussion of their use. =item open my $fts = DBIx::FullTextSearch->open($dbh, $index_name); Opens and returns object, accessing specifies DBIx::FullTextSearch index. Since all the index parameters and information are stored in the C<$index_name> table (including names of all other needed tables), the database handler and the name of the parameter table are the only needed arguments. =item index_document $fts->index_document(45, 'Sleva pri nakupu stribra.'); $fts->index_document('http://www.mozilla.org/'); $fts->index_document('http://www.mozilla.org/','This is the mozilla web site'); For the C and C frontends, two arguments are expected -- the name (number or string) of the document and its content. For C, C, and C frontends the content is optional. Any content that you pass will be appended to the content from the file, URL, or database table. =item delete_document $fts->delete_document('http://www.mozilla.org/'); Removes information about document from the index. Note that for C backend this is very time consuming process. =item contains my @docs = $fts->contains('sleva', 'strib*'); Returns list of names (numbers or strings, depending on the frontend) of documents that contain some of specified words. =item econtains my @docs = $fts->contains('foo', '+bar*', '-koo'); Econtains stands for extended contains and allows words to be prefixed by plus or minus signs to specify that the word must or mustn't be present in the document for it to match. =item search my @docs = $fts->search(qq{+"this is a phrase" -koo +bar foo}); my @docs = $fts->search("(foo OR baz) AND (bar OR caz)"); This is a wrapper to econtains which takes a user input string and parses it into can-include, must-include, and must-not-include words and phrases. It also can handle boolean (AND/OR) queries. =item contains_hashref, econtains_hashref, search_hashref Similar to C, C and C, only instead of list of document names, these methods return a hash reference to a hash where keys are the document names and values are the number of occurencies of the words. =item drop Removes all tables associated with the index, including the base parameter table. Effectivelly destroying the index form the database. $fts->drop; =item empty Emptys the index so you can reindex the data. $fts->empty; =back =head1 INDEX OPTIONS Here we list the options that may be passed to C method. These allow to specify the style and storage parameters in great detail. =over 4 =item backend The backend type, default C, possible values C, C and C (see above for explanation). =item frontend The frontend type. The C frontend requires the user to specify numeric id of the document together with the content of the document, other possible values are C, C and C (see above for more info). =item word_length Maximum length of words that may be indexed, default 30. =item data_table Name of the table where the actual data about word/document relation is stored. By default, the name of the index (of the base table) with _data suffix is used. =item name_length Any frontend that uses strings as names of documents needs to maintain a conversion table from these names to internal integer ids. This value specifies maximum length of these string names (URLs, file names, ...). =item blob_direct_fetch Only for C backend. When looking for information about specific document in the list stored in the blob, the blob backend uses division of interval to find the correct place in the blob. When the interval gets equal or shorter that this value, all values are fetched from the database and the final search is done in Perl code sequentially. =item word_id_bits With C or C backends, DBIx::FullTextSearch maintains a numeric id for each word to optimize the space requirements. The word_id_bits parameter specifies the number of bits to reserve for this conversion and thus effectively limits number of distinct words that may be indexed. The default is 16 bits and possible values are 8, 16, 24 or 32 bits. =item word_id_table Name of the table that holds conversion from words to their numeric id (for C and C backends). By default is the name of the index with _words suffix. =item doc_id_bits A number of bits to hold a numeric id of the document (that is either provided by the user (with C frontend) or generated by the module to accomplish the conversion from the string name of the document). This value limits the maximum number of documents to hold. The default is 16 bits and possible values are 8, 16 and 32 bits for C backend and 8, 16, 24 and 32 bits for C and C backends. =item doc_id_table Name of the table that holds conversion from string names of documents to their numeric id, by default the name of the index with _docid suffix. =item count_bits Number of bits reserved for storing number of occurencies of each word in the document. The default is 8 and possible values are the same as with doc_id_bits. =item position_bits With C, DBIx::FullTextSearch stores positions of each word of the documents. This value specifies how much space should be reserved for this purpose. The default is 32 bits and possible values are 8, 16 or 32 bits. This value limits the maximum number of words of each document that can be stored. =item index_splitter DBIx::FullTextSearch allows the user to provide any Perl code that will be used to split the content of the document to words when indexing documents. The code will be evalled inside of the DBIx::FullTextSearch code. The default is /(\w{2,$word_length})/g and shows that the input is stored in the variable C<$data> and the code may access any other variable available in the perl_and_index_data_* methods (see source), especially C<$word_length> to get the maximum length of words and C<$backend> to get the backend object. The default value also shows that by default, the minimum length of words indexed is 2. =item search_splitter This is similar to the C method, except that it is used in the C method when searching for documents instead of when indexing documents. The default is /(\w{2,$word_length}\*?)/g Which, unlike the default C, allows for the wild card character (*). =item filter The output words of splitter (and also any parameter of (e)contains* methods) are send to filter that may do further processing. Filter is again a Perl code, the default is map { lc $_ } showing that the filter operates on input list and by default does conversion to lowercase (yielding case insensitive index). =item init_env Because user defined splitter or filter may depend on other things that it is reasonable to set before the actual procession of words, you can use yet another Perl hook to set things up. The default is no initialization hook. =item stoplist This is the name of a L object that is used for stop words. =item stemmer If this option is set, then word stemming will be enabled in the indexing and searching. The value is the name of a L recognized locale. Currently, 'en', 'en-us' and 'en-uk' are the only recognized locales. All locale identifiers are converted to lowercase. =item table_name For C
frontend; this is the name of the table that will be indexed. =item column_name For C
frontend; this is a reference to an array of columns in the C that contains the documents -- data to be indexed. It can also have a form table.column that will be used if the C option is not specified. =item column_id_name For C
frontend; this is the name of the field in C that holds names (ids) of the records. If not specified, a field that has primary key on it is used. If this field is numeric, it's values are directly used as identifiers, otherwise a conversion to numeric values is made. =back =head1 NOTES To handle internationalization, it may help to use the following in your code (for example Spanish in Chile): use POSIX; my $loc = POSIX::setlocale( &POSIX::LC_ALL, "es_CL" ); I haven't tested this, so I would be interested in hearing whether this works. =head1 ERROR HANDLING The create and open methods return the DBIx::FullTextSearch object on success, upon failure they return undef and set error message in C<$DBIx::FullTextSearch::errstr> variable. All other methods return reasonable (documented above) value on success, failure is signalized by unreasonable (typically undef or null) return value; the error message may then be retrieved by C<$fts-Eerrstr> method call. =head1 VERSION This documentation describes DBIx::FullTextSearch module version 0.73. =head1 BUGS Error handling needs more polishing. We do not check if the stored values are larger that specified by the *_bits parameters. No CGI administration tool at the moment. No scoring algorithm implemented. =head1 DEVELOPMENT These modules are under active development. If you would like to contribute, please e-mail tjmather@maxmind.com There are two mailing lists for this module, one for users, and another for developers. To subscribe, visit http://sourceforge.net/mail/?group_id=8645 =head1 AUTHOR (Original) Jan Pazdziora, adelton@fi.muni.cz, http://www.fi.muni.cz/~adelton/ at Faculty of Informatics, Masaryk University in Brno, Czech Republic (Current Maintainer) T.J. Mather, tjmather@maxmind.com, http://www.maxmind.com/app/opensourceservices Princeton, NJ USA Paid support is available from directly from the maintainers of this package. Please see L for more details. =head1 CREDITS Fixes, Bug Reports, Docs have been generously provided by: Vladimir Bogdanov Ade Olonoh Kate Pugh Sven Paulus Andrew Turner Tom Bille Joern Reder Tarik Alkasab Dan Collis Puro Tony Bowden Mario Minati Miroslav Suchý Stephen Patterson Joern Reder Hans Poo Of course, big thanks to Jan Pazdziora, the original author of this module. Especially for providing a clean, modular code base! =head1 COPYRIGHT All rights reserved. This package is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO L, L =head1 OTHER PRODUCTS and why I've written this module I'm aware of L and L modules and about UdmSearch utility, and about htdig and glimpse on the non-database side of the world. To me, using a database gives reasonable maintenance benefits. With products that use their own files to store the information (even if the storage algorithms are efficient and well thought of), you always struggle with permissions on files and directories for various users, with files that somebody accidently deleted or mungled, and making the index available remotely is not trivial. That's why I've wanted a module that will use a database as a storage backend. With MySQL, you get remote access and access control for free, and on many web servers MySQL is part of the standard equipment. So using it for text indexes seemed natural. However, existing L and UdmSearch are too narrow-aimed to me. The first only supports indexing of data that is stored in the database, but you may not always want or need to store the documents in the database as well. The UdmSearch on the other hand is only for web documents, making it unsuitable for indexing mailing-list archives or local data. I believe that DBIx::FullTextSearch is reasonably flexible and still very efficient. It doesn't enforce its own idea of what is good for you -- the number of options is big and you can always extend the module with your own backend of frontend if you feel that those provided are not sufficient. Or you can extend existing by adding one or two parameters that will add new features. Of course, patches are always welcome. DBIx::FullTextSearch is a tool that can be deployed in many projects. It's not a complete environment since different people have different needs. On the other hand, the methods that it provides make it easy to build a complete solution on top of this in very short course of time. =cut DBIx-FullTextSearch-0.73/test_data/0040755000076500007650000000000007630504720017333 5ustar tjmathertjmatherDBIx-FullTextSearch-0.73/test_data/driver_characteristics0100644000076500007650000002735607604065677024034 0ustar tjmathertjmather =head1 Database and Driver Characteristics This document is designed to give you a flavour of the functionality and quirks of the different DBI drivers and their databases. The line between the functionality and quirks of a given driver and the functionality and quirks of its corresponding database is blurred. In some cases the database has functionality that the driver can't or doesn't access, in others the driver may emulate functionality that the database doesn't support, like placeholders. So when you see the terms driver and database below, take them with a pinch of salt. We don't attempt to describe the drivers and database in full detail here, we're only interested in the key features that are most commonly used. And for those features we're just providing an outline guide. Consult the database documentation for full details. The primary goals are: - to provide a simple overview of each driver and database. - to help you initially select a suitable DBI driver and database. - to help you quickly identify potential issues if you need to port an application from one driver and database to another. =head2 Driver Name, Version, Author and Contact Details This driver summary is for DBD::XBase version 0.130 The driver author is Jan Pazdziora and he can be contacted via his e-mail adelton@fi.muni.cz or the dbi-(dev|users) mailing lists. =head2 Supported Database Versions and Options The DBD::XBase module supports dBaseIII and IV and Fox* flawors of dbf files, including their dbt and fpt memo files. =head2 Connect Syntax Details of the syntax of the dsn including any optional parts. The DBI->connect Data Source Name (DSN) should include the directory where the dbt files are located as the third part. dbi:XBase:/path/to/directory It defaults to current directory. Details of any driver specific attributes applicable to the connect method. There are no driver specific attributes for the DBI->connect method. =head2 Numeric Data Handling What numeric data types do the database and driver support? (INTEGER, FLOAT, NUMBER(p,s) etc). What is the maximum scale and precision for each type? Generic NUMBER(p,s) and FLOAT(p,s), INTEGER(l). Maximul scale and precision unknown, resp. limited by Perl's handling of numbers. In the dbf files, the numbers are stored as ASCII strings, or binary integers or floats. Numbers are always returned converted to numbers, so numbers outside of Perl's valid range are not possible (even if this restriction might be withdrawn in the future). Does the database and driver support numbers outside the valid range for perl numbers? No. Are numbers returned as strings in this case? N/A =head2 String Data Handling What string data types does the database support? (CHAR, VARCHAR, etc) DBD::XBase knows CHAR(length) and VARCHAR(length), both are stored as fixed length chars however. These can contain any binary values. No charset options are recongnized. What is the maximum size for each type? 65535 characters (even if the older dBase's only allowed 255 characters, so created dbf might not be porteble to other xbase-like software). Are any types blank padded? If so which, e.g., CHAR. Yes. How does the database handle data with the 8th bit set (national language character sets etc)? Data with the 8th bit set are handles transparently, no national language character set conversions are done. Is Unicode supported? No (there is no notion of charsets). =head2 Date Data Handling What date, time and date+time data types are supported and what is their valid range and resolution? Default (and only possible) date format for input and output is 8 char string 'YYYYMMDD'. DBD::XBase doesn't check for validity of this string. The datetime type works internally with the precision up to 1/1000 s. DBD::XBase currently supports this using Un*x standard seconds-since-epoch value (possibly with decimal part). This might change in the future. What date, time and date+time formats are supported? No formats except the defaults are supported. What is the default output format for each? 'YYYYMMDD' and number of seconds since 1970/1/1. What is the default input format for each? Are multiple input format recognised? 'YYYYMMDD' and number of seconds since 1970/1/1. This is the only possibility. If only part of a date is specified, how does the rest default? No partial definitions are allowed. If two digit years can be used, how is the century determined? N/A Can the default format be changed? If so, how (both for a single expression in an sql statement, and as a database connection default)? No. How can I get the current date+time in an SQL expression? There is no way to get the current date/time. How can I input date and date+time values in other formats? N/A How can I output date and date+time values in other formats? N/A What kinds of date and time arithmetic and functions are supported? None. What SQL expression can be used to convert from an integer "seconds since 1-jan-1970 GMT" value to the corresponding database date+time? If not possible, then what perl expression can be used? N/A What SQL expression can be used to convert from a database date+time value into the corresponding "seconds since 1-jan-1970 GMT" value? If not possible, then what perl expression can be used? N/A How does the database deal with time zones, especially where the client inserting a date, the server and a client reading the date are all in different time zones? There is no time zones handling. =head2 LONG/BLOB Data Handling What LONG/BLOB data types does the database support? (LONG, LONG RAW, CLOB, BLOB, BFILE etc) What are their maximum sizes? BLOB/MEMO data types are supported. With dBaseIII dbt files, the memo field cannot contain \x1a byte, with dBaseIV and Fox* dbt/fpt's any value can be stored. Which types, if any, must be passed to and from the database as pairs of hex digits? None. Do the LongReadLen and LongTruncOk attributes work as defined? What is the maximum value, if any, for LongReadLen? The LongReadLen and LongTruncOk attributes are ignored/are broken (will be fixed). Is special handling required for some or all LONG/BLOB data types? No. =head2 Other Data Handling issues Does the driver support type_info method? The DBD::XBase driver supports the type_info method. =head2 Transactions and Transaction Isolation Does the database support transactions? No. If so, what is the default transaction isolation level? N/A What other isolation levels are supported and how can they be enabled per-connection or per-transaction? N/A =head2 Explicit Locks What is the default locking behaviour for reading and writing? N/A What statements can be used to explicitly lock a table with various types/levels of lock? E.g., "lock table ...". N/A How can a select statement be modified to lock the selected rows from being changed by another transaction. E.g., "select ... for update". N/A =head2 No-Table Expression Select Syntax What syntax is used for 'selecting' a constant expression from the database? E.g., "select 42 from dual" (Oracle). N/A =head2 Table Join Syntax If equi-joins are supported but don't use the standard "WHERE a.field = b.field" syntax, then describe the syntax. Are 'outer joins' supported, if so with what syntax? N/A =head2 Table and Column Names What is the max size of table names? And column names? For table names limited by OS's max filename length. For column name 11 characters. What characters are valid without quoting? It has to start with letter and a combination of letters, digits and underscores may follow. Can table and field names be quoted? If so, how? What characters are valid with quoting? N/A Are table/column names stored as uppercase, lowercase or as-entered? Are table/column names case sensitive? Table names are stored and treated as entered (case sensitively), column names as uppoercase. Can national character sets (with the 8th bit set) be used? National character sets can be used. =head2 Case sensivity of like operator Is the LIKE operator case sensitive? If so, how can case insensitive LIKE operations be performed? The LIKE operator is not case sensitive. There is (currently) no case sensitive operator. =head2 Row ID If the database supports a 'row id' pseudocolumn, what is it called? E.g., 'rowid' in Oracle, 'tid' in Ingres. No. Can it be treated as a string when fetching and reusing in later statements? If not, what special handling is required? N/A =head2 Automatic Key or Sequence Generation Does the database offer automatic key generation such as 'auto increment' or 'system generated' keys? No. Does the database support sequence generators? If so, what syntax is used? No. =head2 Automatic Row Numbering and Row Count Limiting Can you select a row-numbering pseudocolumn and if so, what is it called? No. =head2 Parameter binding Is parameter binding supported by the database, emulated by the driver or not supported at all? Parameter binding is emulated by the driver. If parameter binding is supported, is the :1 placeholder style also supported? No (yet). Does the driver support the TYPE attribute do bind_param? If so, which types are supported and how do they affect the bind? TBWS. Do unsupported values of the TYPE attribute generate a warning? TBWS. =head2 Stored procedures What syntax is used to call stored procedures and, where possible, get results? N/A =head2 Table Metadata Does the driver support table_info method? DBD::XBase supports the table_info method. How can detailed information about the columns of a table be fetched? TBWS. How can detailed information about the indexes of a table be fetched? Indexes are not supported. How can detailed information about the keys of a table be fetched? Keys are not supported. =head2 Driver-specific database handle attributes xbase_ignorememo. =head2 Driver-specific statement handle attributes xbase_ignorememo. =head2 Default local row cache size and behaviour Does the driver or database implement a local row cache when fetching rows from a select statement? What is the default size? DBD::XBase doesn't maintain any row cache. =head2 Positioned updates and deletes Does the driver support positioned updates and deletes (also called updatable cursors)? If so, what syntax is used? E.g, "update ... where current of $cursor_name". DBD::XBase does not support positioned updates or deletes. =head2 Differences from the DBI specification List any significant differences in behaviour from the current DBI specification. DBD::XBase has no known significant differences in behaviour from the current DBI specification. =head2 URLs to more database/driver specific information http://www.e-bachmann.dk/docs/xbase.htm =head2 Multiple concurrent database connections Does the driver allow multiple concurrent database connections to the same database? DBD::XBase supports an unlimited number of concurrent database connections to one or more databases. =head2 Concurrent use of multiple statement handles from same $dbh. Does the driver allow a new statement handle to be prepared and used while still fetching data from another statment handle associated with the same database handle? DBD::XBase supports the preparation and execution of a new statement handle while still fetching data from another statment handle associated with the same database handle. =head2 Driver specific methods What generally useful private ($h->func(...)) methods are provided? None. =cut DBIx-FullTextSearch-0.73/test_data/create_table0100644000076500007650000000020207604065677021675 0ustar tjmathertjmather CREATE TABLE __TABLE_NAME__ ( text varchar(__WORD_LENGTH__) binary DEFAULT '' NOT NULL, idx longblob, PRIMARY KEY (text) ); DBIx-FullTextSearch-0.73/test_data/cdx.modul0100644000076500007650000000174607604065677021176 0ustar tjmathertjmather # ################################## # Here starts the XBase::cdx package package XBase::cdx; use strict; use XBase::Base; use vars qw( $VERSION @ISA ); @ISA = qw( XBase::Base ); $VERSION = "0.03"; sub read_header { my $self = shift; my $header; $self->{'fh'}->read($header, 512) == 512 or do { Error "Error reading header of $self->{'filename'}\n"; return; }; my ($root_page1, $root_page2, $free_list, $total_no_pages, $key_len, $index_opts, $index_sign, $reserved1, $sort_order, $total_exp_len, $for_exp_len, $reserved2, $key_exp_len) = unpack "nnNNvCCA486vvvvv", $header; my $root_page = $root_page1 | ($root_page2 << 16); @{$self}{ qw( root_page free_list total_no_pages key_len index_opts index_sign sort_order total_exp_len for_exp_len key_exp_len ) } = ($root_page, $free_list, $total_no_pages, $key_len, $index_opts, $index_sign, $sort_order, $total_exp_len, $for_exp_len, $key_exp_len); 1; } sub dump_records { my $self = shift; } 1; DBIx-FullTextSearch-0.73/test_data/Index.modul0100644000076500007650000003030207604065677021455 0ustar tjmathertjmather =head1 NAME XBase::Index - base class for the index files for dbf =cut package XBase::Index; use strict; use vars qw( @ISA $DEBUG $VERSION ); use XBase::Base; @ISA = qw( XBase::Base ); $VERSION = '0.0942'; $DEBUG = 0; sub new { my ($class, $file) = (shift, shift); if ($file =~ /\.ndx$/i) { return new XBase::ndx $file, @_; } elsif ($file =~ /\.ntx$/i) { return new XBase::ntx $file, @_; } elsif ($file =~ /\.mdx$/i) { return new XBase::mdx $file, @_; } else { __PACKAGE__->Error("Error loading index: unknown extension\n"); } return; } sub get_record { my $self = shift; my $newpage = ref $self; $newpage .= '::Page' unless substr($newpage, -6) eq '::Page'; $newpage .= '::new'; return $self->$newpage(@_); } sub fetch { my $self = shift; my ($level, $page, $row, $key, $val, $left); while (not defined $val) { $level = $self->{'level'}; if (not defined $level) { $level = $self->{'level'} = 0; $page = $self->get_record($self->{'start_page'}); if (not defined $page) { $self->Error("Index corrupt: $self: no root page $self->{'start_page'}\n"); return; } $self->{'pages'} = [ $page ]; $self->{'rows'} = []; } $page = $self->{'pages'}[$level]; if (not defined $page) { $self->Error("Index corrupt: $self: page for level $level lost\n"); return; } my $row = $self->{'rows'}[$level]; if (not defined $row) { $row = $self->{'rows'}[$level] = 0; } else { $self->{'rows'}[$level] = ++$row; } ($key, $val, $left) = $page->get_key_val_left($row); if (defined $left) { $level++; my $oldpage = $page; $page = $oldpage->get_record($left); if (not defined $page) { $self->Error("Index corrupt: $self: no page $left, referenced from $oldpage, for level $level\n"); return; } $self->{'pages'}[$level] = $page; $self->{'rows'}[$level] = undef; $self->{'level'} = $level; $val = undef; next; } if (defined $val) { return ($key, $val); } else { $self->{'level'} = --$level; next if $level < 0; $page = $self->{'pages'}[$level]; next unless defined $page; $row = $self->{'rows'}[$level]; my ($backkey, $backval, $backleft) = $page->get_key_val_left($row); if (defined $backleft and defined $backval) { return ($backkey, $backval); } } } } sub prepare_select_eq { my ($self, $eq) = @_; $self->prepare_select(); my $left = $self->{'start_page'}; my $level = 0; my $parent = $self; my $numdate = ($self->{'key_type'} ? 1 : 0); while (1) { my $page = $parent->get_record($left); if (not defined $page) { $self->Error("Index corrupt: $self: no page $left, for level $level\n"); return; } my $row = 0; my ($key, $val); while (($key, $val, my $newleft) = $page->get_key_val_left($row)) { $left = $newleft; if (not defined $key) { last; } if ($numdate ? $key >= $eq : $key ge $eq) { last; } $row++; } $self->{'pages'}[$level] = $page; $self->{'rows'}[$level] = $row; if (not defined $left) { $self->{'rows'}[$level] = ( $row ? $row - 1: undef); $self->{'level'} = $level; last; } $parent = $page; $level++; } 1; } sub prepare_select { my $self = shift; delete $self->{'level'}; 1; } sub get_key_val_left { my ($self, $num) = @_; { local $^W = 0; my $printkey = $self->{'keys'}[$num]; $printkey =~ s/\s+$//; print "Getkeyval: Page $self->{'num'}, row $num: $printkey, $self->{'values'}[$num], $self->{'lefts'}[$num]\n" if $DEBUG; } return ($self->{'keys'}[$num], $self->{'values'}[$num], $self->{'lefts'}[$num]) if $num <= $#{$self->{'keys'}}; (); } sub num_keys { $#{shift->{'keys'}}; } # # dBase III NDX # package XBase::ndx; use strict; use vars qw( @ISA $DEBUG ); @ISA = qw( XBase::Base XBase::Index ); $DEBUG = 0; sub read_header { my $self = shift; my $header; $self->{'fh'}->read($header, 512) == 512 or do { __PACKAGE__->Error("Error reading header of $self->{'filename'}: $!\n"); return; }; @{$self}{ qw( start_page total_pages key_length keys_per_page key_type key_record_length unique key_string ) } = unpack 'VV @12vvvv @23c a*', $header; $self->{'key_string'} =~ s/[\000 ].*$//s; $self->{'record_len'} = 512; $self->{'header_len'} = 0; $self; } sub last_record { shift->{'total_pages'}; } package XBase::ndx::Page; use strict; use vars qw( $DEBUG @ISA ); @ISA = qw( XBase::ntx ); $DEBUG = 0; sub new { my ($indexfile, $num) = @_; my $parent; if ((ref $indexfile) =~ /::Page$/) ### parent page { $parent = $indexfile; $indexfile = $parent->{'indexfile'}; } my $data = $indexfile->read_record($num) or return; my $noentries = unpack 'v', $data; my $keylength = $indexfile->{'key_length'}; my $keyreclength = $indexfile->{'key_record_length'}; print "page $num, noentries $noentries, keylength $keylength\n" if $DEBUG; my $numdate = $indexfile->{'key_type'}; my $bigend = substr(pack( "d", 1), 0, 2) eq '?ð'; my $offset = 4; my $i =0; my ($keys, $values, $lefts) = ([], [], []); while ($i < $noentries) { my ($left, $recno, $key) = unpack "\@$offset VVa$keylength", $data; if ($numdate) { $key = reverse $key if $bigend; $key = unpack "d", $key; } ### print "$i: \@$offset VVa$keylength -> ($left, $recno, $key)\n" if $DEBUG; push @$keys, $key; push @$values, ($recno ? $recno : undef); $left = ($left ? $left : undef); push @$lefts, $left; if ($i == 0 and defined $left) { $noentries++; } } continue { $i++; $offset += $keyreclength; } { local $^W = 0; print "Page $num:\tkeys: @{[ map { s/\s+$//; $_; } @$keys]} -> values: @$values\n\tlefts: @$lefts\n" if $DEBUG; } my $self = bless { 'keys' => $keys, 'values' => $values, 'num' => $num, 'keylength' => $keylength, 'lefts' => $lefts, 'indexfile' => $indexfile }, __PACKAGE__; $self; } # # Clipper NTX # package XBase::ntx; use strict; use vars qw( @ISA ); @ISA = qw( XBase::Base XBase::Index ); sub read_header { my $self = shift; my $header; $self->{'fh'}->read($header, 1024) == 1024 or do { __PACKAGE__->Error("Error reading header of $self->{'filename'}: $!\n"); return; }; @{$self}{ qw( signature compiler_version start_offset first_unused key_record_length key_length decimals max_item half_page key_string unique ) } = unpack 'vvVVvvvvva256c', $header; $self->{'key_string'} =~ s/[\000 ].*$//s; $self->{'record_len'} = 1024; $self->{'header_len'} = 0; $self->{'start_page'} = int($self->{'start_offset'} / $self->{'record_len'}); $self; } sub last_record { -1; } package XBase::ntx::Page; use strict; use vars qw( $DEBUG @ISA ); @ISA = qw( XBase::ntx ); $DEBUG = 0; sub new { my ($indexfile, $num) = @_; my $parent; if ((ref $indexfile) =~ /::Page$/) ### parent page { $parent = $indexfile; $indexfile = $parent->{'indexfile'}; } my $data = $indexfile->read_record($num) or return; my $maxnumitem = $indexfile->{'max_item'} + 1; my $keylength = $indexfile->{'key_length'}; my $record_len = $indexfile->{'record_len'}; my ($noentries, @pointers) = unpack "vv$maxnumitem", $data; print "page $num, noentries $noentries, keylength $keylength; pointers @pointers\n" if $DEBUG; my ($keys, $values, $lefts) = ([], [], []); for (my $i = 0; $i < $noentries; $i++) { my $offset = $pointers[$i]; my ($left, $recno, $key) = unpack "\@$offset VVa$keylength", $data; push @$keys, $key; push @$values, ($recno ? $recno : undef); $left = ($left ? ($left / $record_len) : undef); push @$lefts, $left; if ($i == 0 and defined $left) ### if ($i == 0 and defined $left and (not defined $parent or $num == $parent->{'lefts'}[-1])) { $noentries++; } } print "Page $num:\tkeys: @{[ map { s/\s+$//; $_; } @$keys]} -> values: @$values\n\tlefts: @$lefts\n" if $DEBUG; my $self = bless { 'num' => $num, 'indexfile' => $indexfile, 'keys' => $keys, 'values' => $values, 'lefts' => $lefts, }, __PACKAGE__; $self; } # # dBase IV MDX # package XBase::mdx; use strict; use vars qw( @ISA ); @ISA = qw( XBase::Base XBase::Index ); sub read_header { my $self = shift; my $expr_name = shift; my $header; $self->{'fh'}->read($header, 544) == 544 or do { __PACKAGE__->Error("Error reading header of $self->{'filename'}: $!\n"); return; }; @{$self}{ qw( version created dbf_filename block_size block_size_adder production noentries tag_length res tags_used res nopages first_free noavail last_update ) } = unpack 'Ca3A16vvccccvvVVVa3', $header; $self->{'record_len'} = 512; $self->{'header_len'} = 0; for my $i (1 .. $self->{'tags_used'}) { my $len = $self->{'tag_length'}; $self->seek_to(544 + ($i - 1) * $len) or do { __PACKAGE__->Error($self->errstr); return; }; $self->{'fh'}->read($header, $len) == $len or do { __PACKAGE__->Error("Error reading tag header $i in $self->{'filename'}: $!\n"); return; }; my $tag; @{$tag}{ qw( header_page tag_name key_format fwd_low fwd_high backward res key_type ) } = unpack 'VA11ccccca1', $header; $self->{'tags'}{$tag->{'tag_name'}} = $tag; $self->seek_to($tag->{'header_page'} * 512) or do { __PACKAGE__->Error($self->errstr); return; }; $self->{'fh'}->read($header, 24) == 24 or do { __PACKAGE__->Error("Error reading tag definition in $self->{'filename'}: $!\n"); return; }; @{$tag}{ qw( start_page file_size key_format_1 key_type_1 res key_length max_no_keys_per_page second_key_type key_record_length res unique) } = unpack 'VVca1vvvvva3c', $header; $self->seek_to($tag->{'root_page_ptr'} * 512) or do { __PACKAGE__->Error($self->errstr); return; }; } ## use Data::Dumper; ## print Dumper $self; if (defined $self->{'tags'}{$expr_name}) { $self->{'active'} = $self->{'tags'}{$expr_name}; $self->{'start_page'} = $self->{'active'}{'start_page'}; } $self; } sub last_record { -1; } package XBase::mdx::Page; use strict; use vars qw( $DEBUG @ISA ); @ISA = qw( XBase::mdx ); $DEBUG = 1; sub new { my ($indexfile, $num) = @_; my $parent; if ((ref $indexfile) =~ /::Page$/) ### parent page { $parent = $indexfile; $indexfile = $parent->{'indexfile'}; } $indexfile->seek_to_record($num) or return; my $data; $indexfile->{'fh'}->read($data, 1024) == 1024 or return; my $keylength = $indexfile->{'active'}{'key_length'}; my $keyreclength = $indexfile->{'active'}{'key_record_length'}; my $offset = 8; my ($noentries, $noleaf) = unpack 'VV', $data; print "page $num, noentries $noentries, keylength $keylength; noleaf: $noleaf\n" if $DEBUG; if ($noleaf == 54 or $noleaf == 20 or $noleaf == 32 or $noleaf == 80) { $noentries++; } my ($keys, $values, $lefts) = ([], [], []); for (my $i = 0; $i < $noentries; $i++) { my ($left, $key) = unpack "\@${offset}Va${keylength}", $data; push @$keys, $key; if ($noleaf == 54 or $noleaf == 20 or $noleaf == 32 or $noleaf == 80) { push @$lefts, $left; } else { push @$values, $left; } $offset += $keyreclength; } print "Page $num:\tkeys: @{[ map { s/\s+$//; $_; } @$keys]} -> values: @$values\n\tlefts: @$lefts\n" if $DEBUG; my $self = bless { 'num' => $num, 'indexfile' => $indexfile, 'keys' => $keys, 'values' => $values, 'lefts' => $lefts, }, __PACKAGE__; $self; } 1; __END__ =head1 SYNOPSIS use XBase; my $table = new XBase "data.dbf"; my $cur = $table->prepare_select_with_index("id.ndx", "ID", "NAME); $cur->find_eq(1097); while (my @data = $cur->fetch()) { last if $data[0] != 1097; print "@data\n"; } This is a snippet of code to print ID and NAME fields from dbf data.dbf where ID equals 1097. Provided you have index on ID in file id.ndx. =head1 DESCRIPTION This is the class that currently supports B index files. The name will change in the furute as we later add other index formats, but for now this is the only index support. The support is read only. If you update your data, you have to reindex using some other tool than XBase::Index currently. Anyway, you have the tool to do that because XBase::Index doesn't support creating the index files either. So, read only. I will stop documenting here for now because the module is not finalized and you might think that if I write something in the man page, it will stay so. Most probably not ;-) Please see eg/use_index in the distribution directory for more information. =head1 VERSION 0.0942 =head1 AUTHOR (c) 1998 Jan Pazdziora, adelton@fi.muni.cz =cut DBIx-FullTextSearch-0.73/test_data/ToDo0100644000076500007650000000135107604065677020136 0ustar tjmathertjmather This is the to-do list for the XBase and DBD::XBase packages: Short term (I will do them as my time allows): Need to fix handling attributes in DBD ... When creating table, have option to specify version. Support named placeholders. Support permissions setup. Add support for MDX, CDX and IDX. Mid term (would appreciate help or comments): Add sorting on more than one field in DBD. Add memo position caching. Polish error reporting from XBase::Index. Long term (it really depends on many things, mainly whether other people will demand the feature, offer help): Add index support to DBD::XBase (once we have MDX, probably). Add write index support. -- Jan Pazdziora adelton@fi.muni.cz http://www.fi.muni.cz/~adelton/ DBIx-FullTextSearch-0.73/test_data/MANIFEST0100644000076500007650000000152407604065677020501 0ustar tjmathertjmatherChanges MANIFEST Makefile.PL README ToDo bin/dbfdump.PL dbit/00base.t dbit/10dsnlist.t dbit/20createdrop.t dbit/30insertfetch.t dbit/40bindparam.t dbit/40blobs.t dbit/40listfields.t dbit/40nulls.t dbit/40numrows.t dbit/50chopblanks.t dbit/50commit.t driver_characteristics eg/copy_table eg/create_table eg/use_index lib/DBD/XBase.pm lib/XBase.pm lib/XBase/Base.pm lib/XBase/Index.pm lib/XBase/Memo.pm lib/XBase/SQL.pm lib/XBase/cdx.pm t/1_header.t t/2_read.t t/2_read_stream.t t/2_write.t t/3_create_drop.t t/3_ndx.t t/3_ntx.t t/4_dbfdump.t t/5_dbd_select.t t/6_dbd_delete.t t/7_dbd_update.t t/8_dbd_insert.t t/9_dbd_create.t t/XBase.dbtest t/XBase.mtest t/afox5.FPT t/afox5.dbf t/lib.pl t/ndx-char.dbf t/ndx-char.ndx t/ndx-date.dbf t/ndx-date.ndx t/ndx-num.dbf t/ndx-num.ndx t/ntx-char.dbf t/ntx-char.ntx t/rooms.cdx t/rooms.dbf t/test.dbf t/test.dbt DBIx-FullTextSearch-0.73/test_data/XBase.modul0100644000076500007650000004030307604065700021375 0ustar tjmathertjmather =head1 NAME DBD::XBase - DBI driver for XBase compatible database files =cut # ################################## # Here starts the DBD::XBase package package DBD::XBase; use strict; use DBI (); use XBase; use XBase::SQL; use vars qw( $VERSION @ISA @EXPORT $err $errstr $drh $sqlstate ); require Exporter; $VERSION = '0.130'; $err = 0; $errstr = ''; $sqlstate = ''; $drh = undef; sub driver { return $drh if $drh; my ($class, $attr) = @_; $class .= '::dr'; $drh = DBI::_new_drh($class, { 'Name' => 'XBase', 'Version' => $VERSION, 'Err' => \$DBD::XBase::err, 'Errstr' => \$DBD::XBase::errstr, 'State' => \$DBD::XBase::sqlstate, 'Attribution' => 'DBD::XBase by Jan Pazdziora', }); } sub data_sources { 'dbi:XBase:.'; } package DBD::XBase::dr; use strict; $DBD::XBase::dr::imp_data_size = 0; sub connect { my ($drh, $dsn) = @_; $dsn = '.' if $dsn eq ''; if (not -d $dsn) { $DBD::XBase::err = 1; $DBD::XBase::errstr = "Directory $dsn doesn't exist"; return; } DBI::_new_dbh($drh, { 'Name' => $dsn } ); } sub disconnect_all { 1; } sub data_sources { 'dbi:XBase:.'; } package DBD::XBase::db; use strict; $DBD::XBase::db::imp_data_size = 0; sub prepare { my ($dbh, $statement)= @_; my $parsed_sql = parse XBase::SQL($statement); ### use Data::Dumper; print Dumper $parsed_sql; if (defined $parsed_sql->{'errstr'}) { DBI::set_err($dbh, 2, 'Error in SQL parse: ' . $parsed_sql->{'errstr'}); return; } my $sth = DBI::_new_sth($dbh, { 'Statement' => $statement, 'xbase_parsed_sql' => $parsed_sql, 'NUM_OF_PARAMS' => ( defined($parsed_sql->{'numofbinds'}) ? $parsed_sql->{'numofbinds'} : 0), }); $sth; } sub STORE { my ($dbh, $attrib, $value) = @_; if ($attrib eq 'AutoCommit') { unless ($value) { die "Can't disable AutoCommit"; } return 1; } elsif ($attrib =~ /^xbase_/) { $dbh->{$attrib} = $value; return 1; } $dbh->DBD::_::db::STORE($attrib, $value); } sub FETCH { my ($dbh, $attrib) = @_; if ($attrib eq 'AutoCommit') { return 1; } elsif ($attrib =~ /^xbase_/) { return $dbh->{$attrib}; } $dbh->DBD::_::db::FETCH($attrib); } sub _ListTables { my $dbh = shift; opendir DIR, $dbh->{'Name'} or return; my @result = (); while (defined(my $item = readdir DIR)) { next unless $item =~ s/\.dbf$//; push @result, $item; } closedir DIR; @result; } sub tables { my $dbh = shift; $dbh->DBD::XBase::db::_ListTables; } sub quote { my $text = $_[1]; return 'NULL' unless defined $text; $text =~ s/\\/\\\\/sg; $text =~ s/\'/\\\'/sg; return "'$text'"; return "'\Q$text\E'"; } sub commit { warn "Commit ineffective while AutoCommit is on" if $_[0]->FETCH('Warn'); 1; } sub rollback { warn "Rollback ineffective while AutoCommit is on" if $_[0]->FETCH('Warn'); 0; } sub disconnect { my $dbh = shift; foreach my $xbase (values %{$dbh->{'xbase_tables'}}) { $xbase->close; delete $dbh->{'xbase_tables'}{$xbase}; } 1; } sub table_info { my $dbh = shift; my @tables = map { [ undef, undef, $_, 'TABLE', undef ] } $dbh->tables(); my $sth = DBI::_new_sth($dbh, { 'xbase_lines' => [ @tables ] } ); $sth->STORE('NUM_OF_FIELDS', 5); $sth->execute and return $sth; return; } my @TYPE_INFO_ALL = ( [ qw( TYPE_NAME DATA_TYPE PRECISION LITERAL_PREFIX LITERAL_SUFFIX CREATE_PARAMS NULLABLE CASE_SENSITIVE SEARCHABLE UNSIGNED_ATTRIBUTE MONEY AUTO_INCREMENT LOCAL_TYPE_NAME MINIMUM_SCALE MAXIMUM_SCALE ) ], [ 'VARCHAR', DBI::SQL_VARCHAR, 65535, "'", "'", 'max length', 0, 1, 2, undef, 0, 0, undef, undef, undef ], [ 'CHAR', DBI::SQL_CHAR, 65535, "'", "'", 'max length', 0, 1, 2, undef, 0, 0, undef, undef, undef ], [ 'INTEGER', DBI::SQL_INTEGER, 0, '', '', 'number of digits', 1, 0, 2, 0, 0, 0, undef, 0, undef ], [ 'FLOAT', DBI::SQL_FLOAT, 0, '', '', 'number of digits', 1, 0, 2, 0, 0, 0, undef, 0, undef ], [ 'NUMERIC', DBI::SQL_FLOAT, 0, '', '', 'number of digits', 1, 0, 2, 0, 0, 0, undef, 0, undef ], [ 'BOOLEAN', DBI::SQL_BINARY, 0, '', '', 'number of digits', 1, 0, 2, 0, 0, 0, undef, 0, undef ], [ 'DATE', DBI::SQL_DATE, 0, '', '', 'number of digits', 1, 0, 2, 0, 0, 0, undef, 0, undef ], [ 'BLOB', DBI::SQL_LONGVARBINARY, 0, '', '', 'number of bytes', 1, 0, 2, 0, 0, 0, undef, 0, undef ], ); my %TYPE_INFO_TYPES = map { ( $TYPE_INFO_ALL[$_][0] => $_ ) } ( 1 .. $#TYPE_INFO_ALL ); my %REVTYPES = qw( C char N numeric F float L boolean D date M blob ); my %REVSQLTYPES = map { ( $_ => $TYPE_INFO_ALL[ $TYPE_INFO_TYPES{ uc $REVTYPES{$_} } ][1] ) } keys %REVTYPES; ### use Data::Dumper; print Dumper \%TYPE_INFO_TYPES, \%REVSQLTYPES; sub type_info_all { my $dbh = shift; my $result = [ @TYPE_INFO_ALL ]; my $i = 0; my $hash = { map { ( $_ => $i++) } @{$result->[0]} }; $result->[0] = $hash; $result; } sub type_info { my ($dbh, $type) = @_; my $result = []; for my $row ( 1 .. $#TYPE_INFO_ALL ) { if ($type == DBI::SQL_ALL_TYPES or $type == $TYPE_INFO_ALL[$row][1]) { push @$result, { map { ( $TYPE_INFO_ALL[0][$_] => $TYPE_INFO_ALL[$row][$_] ) } ( 0 .. $#{$TYPE_INFO_ALL[0]} ) } } } $result; } package DBD::XBase::st; use strict; $DBD::XBase::st::imp_data_size = 0; sub bind_param { my ($sth, $param, $value, $attribs) = @_; $sth->{'param'}[$param - 1] = $value; 1; } sub rows { my $sth = shift; if (defined $sth->{'xbase_rows'}) { return $sth->{'xbase_rows'}; } return -1; } sub execute { my $sth = shift; if (@_) { $sth->{'param'} = [ @_ ]; } my $param = $sth->{'param'}; if (defined $sth->{'xbase_lines'}) { return -1; } delete $sth->{'xbase_rows'} if defined $sth->{'xbase_rows'}; my $parsed_sql = $sth->{'xbase_parsed_sql'}; my $command = $parsed_sql->{'command'}; my $table = $parsed_sql->{'table'}[0]; my $dbh = $sth->{'Database'}; ### if (not defined $sth->FETCH('NUM_OF_FIELDS')) ### { $sth->STORE('NUM_OF_FIELDS', 0); } # Create table first -- we do not need to work with the table anymore if ($command eq 'create') { my $filename = $dbh->{'Name'} . '/' . $table; my %opts; @opts{ qw( name field_names field_types field_lengths field_decimals ) } = ( $filename, @{$parsed_sql}{ qw( createfields createtypes createlengths createdecimals ) } ); my $xbase = XBase->create(%opts) or do { DBI::set_err($sth, 10, XBase->errstr()); return; }; $dbh->{'xbase_tables'}->{$table} = $xbase; return 1; } my $xbase = $dbh->{'xbase_tables'}->{$table}; # If we do not have the table yet, open it if (not defined $xbase) { my $filename = $dbh->{'Name'} . '/' . $table; my %opts = ('name' => $filename); $opts{'ignorememo'} = 1 if $dbh->{'xbase_ignorememo'}; $xbase = new XBase(%opts) or do { DBI::set_err($sth, 3, "Table $table not found: " . XBase->errstr()); return; }; $dbh->{'xbase_tables'}->{$table} = $xbase; } if (defined $parsed_sql->{'ChopBlanks'}) { $xbase->{'ChopBlanks'} = $parsed_sql->{'ChopBlanks'}; } $parsed_sql->{'ChopBlanks'} = \$xbase->{'ChopBlanks'}; my %nonexist; for my $field (@{$parsed_sql->{'usedfields'}}) { $nonexist{$field} = 1 unless defined $xbase->field_type($field); } if (keys %nonexist) { my @f = sort keys %nonexist; DBI::set_err($sth, 4, sprintf 'Unknown field %s found in table %s', join(', ', @f), $table); return; } if ($command eq 'insert') { my $last = $xbase->last_record; my @values = &{$parsed_sql->{'insertfn'}}($xbase, $param, 0); if (defined $parsed_sql->{'fields'}) { my %newval; @newval{ @{$parsed_sql->{'fields'} } } = @values; @values = @newval{ $xbase->field_names }; } $xbase->set_record($last + 1, @values) or do { DBI::set_err($sth, 49, 'Insert failed: ' . $xbase->errstr); return; }; return 1; } if (not defined $parsed_sql->{'fields'} and defined $parsed_sql->{'selectall'}) { $parsed_sql->{'fields'} = [ $xbase->field_names ]; for my $field (@{$parsed_sql->{'fields'}}) { push @{$parsed_sql->{'usedfields'}}, $field unless grep { $_ eq $field } @{$parsed_sql->{'usedfields'}}; } } my $cursor = $xbase->prepare_select( @{$parsed_sql->{'usedfields'}} ); my $wherefn = $parsed_sql->{'wherefn'}; my @fields = @{$parsed_sql->{'fields'}} if defined $parsed_sql->{'fields'}; ### use Data::Dumper; print STDERR Dumper $parsed_sql; my $rows; if ($command eq 'select') { if (defined $parsed_sql->{'orderfield'}) { my $orderfield = ${$parsed_sql->{'orderfield'}}[0]; my $subparsed_sql = { %$parsed_sql }; delete $subparsed_sql->{'orderfield'}; unshift @{$subparsed_sql->{'fields'}}, $orderfield; my $substh = DBI::_new_sth($dbh, { 'Statement' => $sth->{'Statement'}, 'xbase_parsed_sql' => $subparsed_sql, }); $substh->execute(@$param); my $data = $substh->fetchall_arrayref; my $type = $xbase->field_type($orderfield); my $sortfn; if (not defined $parsed_sql->{'orderdesc'}) { if ($type =~ /^[CML]$/) { $sortfn = sub { $_[0] cmp $_[1] } } else { $sortfn = sub { $_[0] <=> $_[1] } } } else { if ($type =~ /^[CML]$/) { $sortfn = sub { $_[1] cmp $_[0] } } else { $sortfn = sub { $_[1] <=> $_[0] } } } $sth->{'xbase_lines'} = [ map { shift @$_; [ @$_ ] } sort { &{$sortfn}($a->[0], $b->[0]) } @$data ]; shift(@{$parsed_sql->{'fields'}}); } else { $sth->{'xbase_cursor'} = $cursor; } if (not $sth->FETCH('NUM_OF_FIELDS') and scalar @fields) { $sth->STORE('NUM_OF_FIELDS', scalar @fields); } } elsif ($command eq 'delete') { if (not defined $wherefn) { my $last = $xbase->last_record; for (my $i = 0; $i <= $last; $i++) { if (not ($xbase->get_record_nf($i, 0))[0]) { $xbase->delete_record($i); $rows = 0 unless defined $rows; $rows++; } } } else { my $values; while (defined($values = $cursor->fetch_hashref)) { next unless &{$wherefn}($xbase, $values, $param, 0); $xbase->delete_record($cursor->last_fetched); $rows = 0 unless defined $rows; $rows++; } } } elsif ($command eq 'update') { my $values; while (defined($values = $cursor->fetch_hashref)) { ### print Dumper $values; next if defined $wherefn and not &{$wherefn}($xbase, $values, $param, $parsed_sql->{'bindsbeforewhere'}); my %newval; @newval{ @fields } = &{$parsed_sql->{'updatefn'}}($xbase, $values, $param, 0); $xbase->update_record_hash($cursor->last_fetched, %newval); $rows = 0 unless defined $rows; $rows++; } } elsif ($command eq 'drop') { $xbase->drop; $rows = -1; } $sth->{'xbase_rows'} = $rows if defined $rows; return defined $rows ? ( $rows ? $rows : '0E0' ) : -1; } sub fetch { my $sth = shift; my $retarray; if (defined $sth->{'xbase_lines'}) { $retarray = shift @{$sth->{'xbase_lines'}}; } elsif (defined $sth->{'xbase_cursor'}) { my $cursor = $sth->{'xbase_cursor'}; my $wherefn = $sth->{'xbase_parsed_sql'}{'wherefn'}; my $xbase = $cursor->table; my $values; while (defined($values = $cursor->fetch_hashref)) { next if defined $wherefn and not &{$wherefn}($xbase, $values, $sth->{'param'}, 0); last; } $retarray = [ @{$values}{ @{$sth->{'xbase_parsed_sql'}{'fields'}}} ] if defined $values; } ### use Data::Dumper; print Dumper $retarray; return unless defined $retarray; ### print STDERR "sth->{'NUM_OF_FIELDS'}: $sth->{'NUM_OF_FIELDS'} sth->{'NUM_OF_PARAMS'}: $sth->{'NUM_OF_PARAMS'}\n"; $sth->_set_fbav($retarray); return $retarray; my $i = 0; for my $ref ( @{$sth->{'xbase_bind_col'}} ) { next unless defined $ref; $$ref = $retarray->[$i]; } continue { $i++; } return $retarray; } *fetchrow_arrayref = \&fetch; sub FETCH { my ($sth, $attrib) = @_; my $parsed_sql = $sth->{'xbase_parsed_sql'}; if ($attrib eq 'NAME') { return [ @{$parsed_sql->{'fields'}} ]; } elsif ($attrib eq 'NULLABLE') { return [ (1) x scalar(@{$parsed_sql->{'fields'}}) ]; } elsif ($attrib eq 'TYPE') { return [ map { $REVSQLTYPES{$_} } map { $sth->{'Database'}->{'xbase_tables'}->{$parsed_sql->{'table'}[0]}->field_type($_) } @{$parsed_sql->{'fields'}} ]; } elsif ($attrib eq 'ChopBlanks') { return $parsed_sql->{'ChopBlanks'}; } else { return $sth->DBD::_::st::FETCH($attrib); } } sub STORE { my ($sth, $attrib, $value) = @_; if ($attrib eq 'ChopBlanks') { $sth->{'xbase_parsed_sql'}->{'ChopBlanks'} = $value; } return $sth->DBD::_::st::STORE($attrib, $value); } sub finish { 1; } sub DESTROY { } 1; __END__ =head1 SYNOPSIS use DBI; my $dbh = DBI->connect("DBI:XBase:/directory/subdir") or die $DBI::errstr; my $sth = $dbh->prepare("select MSG from test where ID != 1") or die $dbh->errstr(); $sth->execute() or die $sth->errstr(); my @data; while (@data = $sth->fetchrow_array()) { ## further processing } $dbh->do('update table set name = ? where id = 45', {}, 'krtek'); =head1 DESCRIPTION DBI compliant driver for module XBase. Please refer to DBI(3) documentation for how to actually use the module. In the B call, specify the directory containing the dbf files (and other, memo, etc.) as the third part of the connect string. It defaults to the current directory. Note that with dbf, there is no database server that the driver would talk to. This DBD::XBase calls methods from XBase.pm module to read and write the files on the disk directly, so any limitations and features of XBase.pm apply to DBD::XBase as well. DBD::XBase basically adds SQL, DBI compliant interface to XBase.pm. The DBD::XBase doesn't make use of index files at the moment. If you really need indexed access, check XBase(3) for notes about ndx support. =head1 SUPPORTED SQL COMMANDS The SQL commands currently supported by DBD::XBase's prepare are: =head2 select select fields from table [ where condition ] [ order by field ] Fields is a comma separated list of fields or a C<*> for all. The C condition specifies which rows will be returned, you can have arbitrary arithmetic and boolean expression here, compare fields and constants and use C and C. Match using C is also supported. Examples: select * from salaries where name = "Smith" select first,last from people where login = "ftp" or uid = 1324 select id,first_name,last_name from employ where last_name like 'Ki%' select id,name from employ where id = ? You can use bind parameters in the where clause, as the last example shows. The actual value has to be supplied via bind_param or in the call to execute or do, see DBI(3) for details. To check for NULL values in the C expression, use C and C, not C. =head2 delete delete from table [ where condition ] The C condition is the same as for B