Search-QueryParser-0.95000755000000000000 014014544035 15463 5ustar00unknownunknown000000000000Search-QueryParser-0.95/Build.PL000444000000000000 123214014543043 17110 0ustar00unknownunknown000000000000use strict; use warnings; use Module::Build; my $builder = Module::Build->new( module_name => 'Search::QueryParser', license => 'perl', dist_author => 'Laurent Dami ', dist_version_from => 'lib/Search/QueryParser.pm', requires => { perl => 5.008, constant => 0, }, build_requires => { 'Test::More' => 0, }, add_to_cleanup => [ 'Search-QueryParser-*' ], meta_merge => { resources => { repository => 'https://github.com/damil/Search-QueryParser', } }, ); $builder->create_build_script(); Search-QueryParser-0.95/Changes000444000000000000 124614014541405 17114 0ustar00unknownunknown000000000000Revision history for Perl extension Search::QueryParser. 0.95 21.02.2021 - fix RT#52814 (report an error when unable to parse the entire query string) - thanks larryl - fix RT#129981 (explicit fields in parenthesis when defField is present) - thanks Christopher Causer 0.94 30.09.2009 - patch from Peter Karman to add 'defField' option - add support for quoting field names (RT 47423) 0.93 04.02.2008 - fix RT#32840 (word boolean connectors) - thanks Frank Wesemann 0.92 04.12.2007 - added '#' operator (for matching against sets of integers) 0.91 25.05.2005 - quoted "exact phrases" are no longer parsed as arrays (was not useful) Search-QueryParser-0.95/MANIFEST000444000000000000 14514014543553 16735 0ustar00unknownunknown000000000000Changes Build.PL MANIFEST README t/Search-QueryParser.t lib/Search/QueryParser.pm META.yml META.json Search-QueryParser-0.95/META.json000444000000000000 230214014544035 17236 0ustar00unknownunknown000000000000{ "abstract" : "parses a query string into a data structure", "author" : [ "Laurent Dami " ], "dynamic_config" : 1, "generated_by" : "Module::Build version 0.4231", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : 2 }, "name" : "Search-QueryParser", "prereqs" : { "build" : { "requires" : { "Test::More" : "0" } }, "configure" : { "requires" : { "Module::Build" : "0.42" } }, "runtime" : { "requires" : { "constant" : "0", "perl" : "5.008" } } }, "provides" : { "Search::QueryParser" : { "file" : "lib/Search/QueryParser.pm", "version" : "0.95" } }, "release_status" : "stable", "resources" : { "license" : [ "http://dev.perl.org/licenses/" ], "repository" : { "url" : "https://github.com/damil/Search-QueryParser" } }, "version" : "0.95", "x_serialization_backend" : "JSON::PP version 4.05" } Search-QueryParser-0.95/META.yml000444000000000000 136714014544035 17100 0ustar00unknownunknown000000000000--- abstract: 'parses a query string into a data structure' author: - 'Laurent Dami ' build_requires: Test::More: '0' configure_requires: Module::Build: '0.42' dynamic_config: 1 generated_by: 'Module::Build version 0.4231, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: Search-QueryParser provides: Search::QueryParser: file: lib/Search/QueryParser.pm version: '0.95' requires: constant: '0' perl: '5.008' resources: license: http://dev.perl.org/licenses/ repository: https://github.com/damil/Search-QueryParser version: '0.95' x_serialization_backend: 'CPAN::Meta::YAML version 0.018' Search-QueryParser-0.95/README000444000000000000 275712014717473 16521 0ustar00unknownunknown000000000000Search-QueryParser version 1.0 ============================== This module parses a query string into a data structure to be handled by external search engines. For examples of such engines, see File::Tabular and Search::Indexer. The query string can contain simple terms, "exact phrases", field names and comparison operators, '+/-' prefixes, parentheses, and boolean connectors. The parser can be parameterized by regular expressions for specific notions of "term", "field name" or "operator" ; see the "new" method. The parser has no support for lemmatization or other term transformations : these should be done externally, before passing the query data structure to the search engine. The data structure resulting from a parsed query is a tree of terms and operators, as described in the "parse" method. The interpretation of the structure is up to the external search engine that will receive the parsed query ; the present module does not make any assumption about what it means to be "equal" or to "contain" a term. INSTALLATION To install this module type the following: perl Makefile.PL make make test make install DEPENDENCIES None RELATED MODULES File::Tabular (management of flat text files containg data organised in rows and columns) and Search::Indexer (full-text indexing) make use of the present module. COPYRIGHT AND LICENCE Copyright (C) 2005 by Laurent Dami. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. Search-QueryParser-0.95/lib000755000000000000 014014544035 16231 5ustar00unknownunknown000000000000Search-QueryParser-0.95/lib/Search000755000000000000 014014544035 17436 5ustar00unknownunknown000000000000Search-QueryParser-0.95/lib/Search/QueryParser.pm000444000000000000 3246514014543447 22453 0ustar00unknownunknown000000000000package Search::QueryParser; use strict; use warnings; use locale; our $VERSION = "0.95"; =head1 NAME Search::QueryParser - parses a query string into a data structure suitable for external search engines =head1 SYNOPSIS my $qp = new Search::QueryParser; my $s = '+mandatoryWord -excludedWord +field:word "exact phrase"'; my $query = $qp->parse($s) or die "Error in query : " . $qp->err; $someIndexer->search($query); # query with comparison operators and implicit plus (second arg is true) $query = $qp->parse("txt~'^foo.*' date>='01.01.2001' date<='02.02.2002'", 1); # boolean operators (example below is equivalent to "+a +(b c) -d") $query = $qp->parse("a AND (b OR c) AND NOT d"); # subset of rows $query = $qp->parse("Id#123,444,555,666 AND (b OR c)"); =head1 DESCRIPTION This module parses a query string into a data structure to be handled by external search engines. For examples of such engines, see L and L. The query string can contain simple terms, "exact phrases", field names and comparison operators, '+/-' prefixes, parentheses, and boolean connectors. The parser can be parameterized by regular expressions for specific notions of "term", "field name" or "operator" ; see the L method. The parser has no support for lemmatization or other term transformations : these should be done externally, before passing the query data structure to the search engine. The data structure resulting from a parsed query is a tree of terms and operators, as described below in the L method. The interpretation of the structure is up to the external search engine that will receive the parsed query ; the present module does not make any assumption about what it means to be "equal" or to "contain" a term. =head1 QUERY STRING The query string is decomposed into "items", where each item has an optional sign prefix, an optional field name and comparison operator, and a mandatory value. =head2 Sign prefix Prefix '+' means that the item is mandatory. Prefix '-' means that the item must be excluded. No prefix means that the item will be searched for, but is not mandatory. As far as the result set is concerned, C<+a +b c> is strictly equivalent to C<+a +b> : the search engine will return documents containing both terms 'a' and 'b', and possibly also term 'c'. However, if the search engine also returns relevance scores, query C<+a +b c> might give a better score to documents containing also term 'c'. See also section L below, which is another way to combine items into a query. =head2 Field name and comparison operator Internally, each query item has a field name and comparison operator; if not written explicitly in the query, these take default values C<''> (empty field name) and C<':'> (colon operator). Operators have a left operand (the field name) and a right operand (the value to be compared with); for example, C means "search documents containing term 'bar' in field 'foo'", whereas C means "search documents where field 'foo' has exact value 'bar'". Here is the list of admitted operators with their intended meaning : =over =item C<:> treat value as a term to be searched within field. This is the default operator. =item C<~> or C<=~> treat value as a regex; match field against the regex. =item C negation of above =item C<==> or C<=>, C=>, C=>, C, C>, C> classical relational operators =item C<#> Inclusion in the set of comma-separated integers supplied on the right-hand side. =back Operators C<:>, C<~>, C<=~>, C and C<#> admit an empty left operand (so the field name will be C<''>). Search engines will usually interpret this as "any field" or "the whole data record". =head2 Value A value (right operand to a comparison operator) can be =over =item * just a term (as recognized by regex C, see L method below) =item * A quoted phrase, i.e. a collection of terms within single or double quotes. Quotes can be used not only for "exact phrases", but also to prevent misinterpretation of some values : for example C<-2> would mean "value '2' with prefix '-'", in other words "exclude term '2'", so if you want to search for value -2, you should write C<"-2"> instead. In the last example of the synopsis, quotes were used to prevent splitting of dates into several search terms. =item * a subquery within parentheses. Field names and operators distribute over parentheses, so for example C is equivalent to C. Nested field names such as C are not allowed. Sign prefixes do not distribute : C<+(foo bar) +bie> is not equivalent to C<+foo +bar +bie>. =back =head2 Boolean connectors Queries can contain boolean connectors 'AND', 'OR', 'NOT' (or their equivalent in some other languages). This is mere syntactic sugar for the '+' and '-' prefixes : C is translated into C<+a +b>; C is translated into C<(a b)>; C is translated into C<-a>. C<+a OR b> does not make sense, but it is translated into C<(a b)>, under the assumption that the user understands "OR" better than a '+' prefix. C<-a OR b> does not make sense either, but has no meaningful approximation, so it is rejected. Combinations of AND/OR clauses must be surrounded by parentheses, i.e. C<(a AND b) OR c> or C are allowed, but C is not. =head1 METHODS =over =cut use constant DEFAULT => { rxTerm => qr/[^\s()]+/, rxField => qr/\w+/, rxOp => qr/==|<=|>=|!=|=~|!~|[:=<>~#]/, # longest ops first ! rxOpNoField => qr/=~|!~|[~:#]/, # ops that admit an empty left operand rxAnd => qr/AND|ET|UND|E/, rxOr => qr/OR|OU|ODER|O/, rxNot => qr/NOT|PAS|NICHT|NON/, defField => "", }; =item new new(rxTerm => qr/.../, rxOp => qr/.../, ...) Creates a new query parser, initialized with (optional) regular expressions : =over =item rxTerm Regular expression for matching a term. Of course it should not match the empty string. Default value is C. A term should not be allowed to include parenthesis, otherwise the parser might get into trouble. =item rxField Regular expression for matching a field name. Default value is C (meaning of C<\w> according to C). =item rxOp Regular expression for matching an operator. Default value is C=|E=|!=|=~|!~|:|=|E|E|~/>. Note that the longest operators come first in the regex, because "alternatives are tried from left to right" (see L) : this is to avoid C=3> being parsed as C '=3'>. =item rxOpNoField Regular expression for a subset of the operators which admit an empty left operand (no field name). Default value is C. Such operators can be meaningful for comparisons with "any field" or with "the whole record" ; the precise interpretation depends on the search engine. =item rxAnd Regular expression for boolean connector AND. Default value is C. =item rxOr Regular expression for boolean connector OR. Default value is C. =item rxNot Regular expression for boolean connector NOT. Default value is C. =item defField If no field is specified in the query, use I. The default is the empty string "". =back =cut sub new { my $class = shift; my $args = ref $_[0] eq 'HASH' ? $_[0] : {@_}; # create object with default values my $self = bless {}, $class; $self->{$_} = $args->{$_} || DEFAULT->{$_} foreach qw(rxTerm rxField rxOp rxOpNoField rxAnd rxOr rxNot defField); return $self; } =item parse $q = $queryParser->parse($queryString, $implicitPlus); Returns a data structure corresponding to the parsed string. The second argument is optional; if true, it adds an implicit '+' in front of each term without prefix, so C is equivalent to C. This is often seen in common WWW search engines as an option "match all words". The return value has following structure : { '+' => [{field=>'f1', op=>':', value=>'v1', quote=>'q1'}, {field=>'f2', op=>':', value=>'v2', quote=>'q2'}, ...], '' => [...], '-' => [...] } In other words, it is a hash ref with 3 keys C<'+'>, C<''> and C<'-'>, corresponding to the 3 sign prefixes (mandatory, ordinary or excluded items). Each key holds either a ref to an array of items, or C (no items with this prefix in the query). An I is a hash ref containing =over =item C scalar, field name (may be the empty string) =item C scalar, operator =item C scalar, character that was used for quoting the value ('"', "'" or undef) =item C Either =over =item * a scalar (simple term), or =item * a recursive ref to another query structure. In that case, C is necessarily C<'()'> ; this corresponds to a subquery in parentheses. =back =back In case of a parsing error, C returns C; method L can be called to get an explanatory message. =cut sub parse { my $self = shift; my $s_orig = $_[0]; my ($parsedQuery, $restOfString) = $self->_parse(@_); if ($restOfString) { $self->{err} ||= "[$s_orig] : parsed into " . $self->unparse($parsedQuery) . ", but unable to parse [$restOfString]"; return undef; } return $parsedQuery; } sub _parse{ # returns ($parsedQuery, $restOfString) my ($self, $s, $implicitPlus, $parentField, $parentOp) = @_; # last 2 args only for recursive calls my $q = {}; my $preBool = ''; my $err = undef; my $s_orig = $s; $s =~ s/^\s+//; # remove leading spaces LOOP : while ($s) { # while query string is not empty for ($s) { # temporary alias to $_ for easier regex application my $sign = $implicitPlus ? "+" : ""; my $explicit_field; my $op = $parentOp || ":"; last LOOP if m/^\)/; # return from recursive call if meeting a ')' # try to parse sign prefix ('+', '-' or 'NOT') if (s/^(\+|-)\s*//) { $sign = $1; } elsif (s/^($self->{rxNot})\b\s*//) { $sign = '-'; } # try to parse field name and operator if (s/^"($self->{rxField})"\s*($self->{rxOp})\s*// # "field name" and op or s/^'($self->{rxField})'\s*($self->{rxOp})\s*// # 'field name' and op or s/^($self->{rxField})\s*($self->{rxOp})\s*// # field name and op or s/^()($self->{rxOpNoField})\s*//) { # no field, just op ($explicit_field, $op) = ($1, $2); $err = "field '$explicit_field' inside '$parentField'", last LOOP if $parentField; } # target field, either explicit or implicit my $field = $explicit_field || $parentField || $self->{defField}; # parse a value (single term or quoted list or parens) my $subQ = undef; if (s/^(")([^"]*?)"\s*// or s/^(')([^']*?)'\s*//) { # parse a quoted string. my ($quote, $val) = ($1, $2); $subQ = {field=>$field, op=>$op, value=>$val, quote=>$quote}; } elsif (s/^\(\s*//) { # parse parentheses my ($r, $s2) = $self->_parse($s, $implicitPlus, $explicit_field, $op); $err = $self->err, last LOOP if not $r; $s = $s2; $s =~ s/^\)\s*// or $err = "no matching ) ", last LOOP; $subQ = {field=>'', op=>'()', value=>$r}; } elsif (s/^($self->{rxTerm})\s*//) { # parse a single term $subQ = {field=>$field, op=>$op, value=>$1}; } # deal with boolean connectors my $postBool = ''; if (s/^($self->{rxAnd})\b\s*//) { $postBool = 'AND' } elsif (s/^($self->{rxOr})\b\s*//) { $postBool = 'OR' } $err = "cannot mix AND/OR in requests; use parentheses", last LOOP if $preBool and $postBool and $preBool ne $postBool; my $bool = $preBool || $postBool; $preBool = $postBool; # for next loop # insert subquery in query structure if ($subQ) { $sign = '' if $sign eq '+' and $bool eq 'OR'; $sign = '+' if $sign eq '' and $bool eq 'AND'; $err = 'operands of "OR" cannot have "-" or "NOT" prefix', last LOOP if $sign eq '-' and $bool eq 'OR'; push @{$q->{$sign}}, $subQ; } else { $err = "unexpected string in query : $_", last LOOP if $_; $err = "missing value after $field $op" , last LOOP if $field; } } } $err ||= "no positive value in query" unless $q->{'+'} or $q->{''}; $self->{err} = $err ? "[$s_orig] : $err" : ""; $q = undef if $err; return ($q, $s); } =item err $msg = $queryParser->err; Message describing the last parse error =cut sub err { my $self = shift; return $self->{err}; } =item unparse $s = $queryParser->unparse($query); Returns a string representation of the C<$query> data structure. =cut sub unparse { my $self = shift; my $q = shift; my @subQ; foreach my $prefix ('+', '', '-') { next if not $q->{$prefix}; push @subQ, $prefix . $self->unparse_subQ($_) foreach @{$q->{$prefix}}; } return join " ", @subQ; } sub unparse_subQ { my $self = shift; my $subQ = shift; return "(" . $self->unparse($subQ->{value}) . ")" if $subQ->{op} eq '()'; my $quote = $subQ->{quote} || ""; return "$subQ->{field}$subQ->{op}$quote$subQ->{value}$quote"; } =back =head1 AUTHOR Laurent Dami, Elaurent.dami AT etat ge chE =head1 COPYRIGHT AND LICENSE Copyright (C) 2005, 2007 by Laurent Dami. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1; Search-QueryParser-0.95/t000755000000000000 014014544035 15726 5ustar00unknownunknown000000000000Search-QueryParser-0.95/t/Search-QueryParser.t000444000000000000 415014014540176 21737 0ustar00unknownunknown000000000000use strict; use warnings; use Test::More; BEGIN { use_ok('Search::QueryParser') }; my $qp = Search::QueryParser->new; isa_ok($qp, 'Search::QueryParser'); my $s = '+mandatoryWord -excludedWord +field:word "exact phrase"'; my $q = $qp->parse($s); isa_ok($q, 'HASH'); is($qp->unparse($q), '+:mandatoryWord +field:word :"exact phrase" -:excludedWord', "mixed features"); # query with comparison operators and implicit plus (second arg is true) $q = $qp->parse("txt~'^foo.*' date>='01.01.2001' date<='02.02.2002'", 1); is($qp->unparse($q), "+txt~'^foo.*' +date>='01.01.2001' +date<='02.02.2002'", "comparison operators and implicit plus"); # boolean operators (example below is equivalent to "+a +(b c) -d") $q = $qp->parse("a AND (b OR c) AND NOT d"); is($qp->unparse($q), '+:a +(:b :c) -:d', "boolean operators"); # '#' operator $q = $qp->parse("+foo#12,34,567,890,1000 +bar#9876 #54321"); is($qp->unparse($q), "+foo#12,34,567,890,1000 +bar#9876 #54321", "'#' operator"); # boolean operators $q = $qp->parse("Prince Edward"); # test bug RT#32840 is($qp->unparse($q), ':Prince :Edward', "RT32840"); $q = $qp->parse("a E(b)"); is($qp->unparse($q), '+:a +(:b)', "a E(b)"); # quoted field $q = $qp->parse(q{"LastEdit">"2009-01-01" 'FirstEdit'<"2008-01-01"}); is($qp->unparse($q), q{LastEdit>"2009-01-01" FirstEdit<"2008-01-01"}, "quoted field"); # default field $qp = Search::QueryParser->new(defField => 'def'); $q = $qp->parse("foo +bar -buz"); is($qp->unparse($q), '+def:bar def:foo -def:buz', "default field"); $q = $qp->parse("foo:foo bar buz:(boo bing)"); is($qp->unparse($q), 'foo:foo def:bar (buz:boo buz:bing)', "parent field"); $q = $qp->parse("foo:(bar:buz)"); ok(!$q, 'parse error'); like($qp->err, qr/'bar' inside 'foo'/, 'ERR parent field'); $q = $qp->parse("(domain:example.org OR domain:example.com)"); is($qp->unparse($q), '(domain:example.org domain:example.com)', "explicit field within parenthesis"); $q = $qp->parse("foo bar )and garbage"); ok(!$q, 'parse error'); like($qp->err, qr/unable to parse/, 'could not parse entire query'); done_testing;