WWW-Search-2.517/0000755000175000017500000000000013101353062012757 5ustar martinmartinWWW-Search-2.517/Makefile.PL0000644000175000017500000000605513101347623014745 0ustar martinmartin# This -*- cperl -*- program writes the Makefile for WWW::Search use lib '.'; use inc::Module::Install; version(2.517); all_from('lib/WWW/Search.pm'); perl_version(5.005); license('perl'); install_script('Programs/AutoSearch'); install_script('Programs/WebSearch'); # These are needed for AutoSearch: requires('Data::Dumper'); requires('Date::Manip'); recommends('Email::Send'); recommends('Email::MIME'); recommends('Email::MIME::Creator'); requires('File::Copy'); requires('Getopt::Long' => 2.24); requires('POSIX'); # These are needed for WebSearch: requires('Getopt::Long' => 2.24); # These are needed for WWW::Search base module and sub-modules: requires('CGI'); requires('File::Basename'); requires('File::Find'); requires('File::Path'); requires('File::Slurp'); requires('File::Spec'); requires('HTML::Parser' => 2.23); requires('HTML::TreeBuilder'); # HTML-Tree-0.51 is good requires('LWP::UserAgent' => 2.0); requires('LWP::MemberMixin'); requires('Net::Domain'); requires('Pod::Usage'); requires('URI'); requires('URI::Escape'); requires('User'); # These are used by WWW::Search::Test: requires('Bit::Vector'); requires('Date::Manip'); requires('File::Path'); requires('File::Slurp'); requires('File::Spec'); requires('Test::More'); # These are needed during `make test`: test_requires('File::Temp'); test_requires('IO::Capture::Stderr'); test_requires('Test::File'); test_requires('Test::More'); recommends('Test::Pod'); recommends('Test::Pod::Coverage'); # I think there's a bug in Module::CoreList or something, because this # comes up missing on some platforms: test_requires('Test::Simple'); # For the defunct test_parsing rule, we need a version that sets # FULLPERLRUNINST: test_requires('ExtUtils::MM_Unix' => 1.41); sub MY::preamble { # This does not work because MakeMaker apparently does not 
have a # preamble method. return <<'PREAMBLE'; AutoSearch :: FORCE /bin/rm -f AutoSearch PREAMBLE } # MY::preamble my $ret = q{}; if (1) { $ret .= <<'PART0'; coverage : - cover -delete $(MAKE) HARNESS_PERL_SWITCHES=-MDevel::Cover test cover PART0 } # if if (0) { $ret .= <<'PART2'; test_parsing : pure_all $(FULLPERLRUNINST) t/test_parsing.pl $(TEST_ARGS) PART2 } # if if (1) { $ret .= <<'PART3'; tags: cat MANIFEST | etags - PART3 } # if postamble $ret; use Env; if ($ENV{MTHURN_PERL_DEV}) { warn "DDD This is author's development environment\n"; use_ptar(); } # if WriteAll; if (0 && $ENV{MTHURN_PERL_DEV}) { print "DDD This is author's development environment\n"; my $sFnameMakefile = q{Makefile}; eval "use File::Slurp"; my $s = read_file($sFnameMakefile); if ($s !~ s/(DIRFILESEP\s*=\s*).+$/$1\//m) { print " EEE did not find DIRFILESEP in Makefile\n"; } # if if ($s !~ s/(pm_to_blib\()\{\{\@ARGV\}\}/$1 . '{@ARGV}'/e) { print " EEE did not find pm_to_blib@ARGV in Makefile\n"; } # if if ($s !~ s/(from_to\s+=>\s+)\{\{\@ARGV\}\}/$1 . '{@ARGV}'/e) { print " EEE did not find from_to@ARGV in Makefile\n"; } # if write_file($sFnameMakefile, $s); } # if __END__ WWW-Search-2.517/README0000644000175000017500000003345613101347451013657 0ustar martinmartin WWW::Search and AutoSearch and WebSearch ======================================== WHAT IS NEW IN WWW::Search 2.507? (2008-11-29) ----------------------------------------------- Overview of changes: * tweaked WWW::Search::Test For details, see the Changes file (updates after 2008-11-27) and/or the pod of each affected module. WHAT IS WWW::Search? -------------------- WWW::Search is a collection of Perl modules which provide an API to search engines on the world-wide web (and similar HTTP-based search engines). Backends for many specific engines can be obtained separately, such as AltaVista, Ebay, HotBot, and Yahoo. 
This distribution includes two applications built from this library: AutoSearch, a program to automate tracking of search results over time; and WebSearch, a small demonstration program to drive the library. By default, WWW::Search does NOT try to emulate the default search that you would get with each search engine's GUI. I.e. WWW::Search does NOT necessarily return the same results you would get by visiting the search engine's web page. WWW::Search performs the search in a way that is efficient and convenient for text processing. This might include using the "advanced search" interface; getting "text-only" pages; making "OR" the default query term operator instead of "AND"; ungrouping same-site results; making sure descriptions are turned on; and increasing the number of hits per page, among other tricks. A few backends implement the method gui_query(), which does attempt to get the same results as searches from the engine's default web page; after installation, see `perldoc WWW::Search::` for details. Because WWW::Search depends on parsing the HTML output of web search engines, it will fail if the search engine operators change their format. This base WWW::Search distribution contains a few backends that can be used for testing. The backend Null::Empty always returns no results; Null::Error always returns an error condition; and Null::Count returns a number of sample results that you can specify. After installation, consult `perldoc WWW::Search::Null::Count` et al. for details. The following real, working backends (and more!) 
are registered at CPAN independently (not included with this WWW::Search distribution): AltaVista http://www.perl.com/CPAN/modules/by-module/WWW/MTHURN/ AP in the WWW::Search::News distribution Ebay http://www.perl.com/CPAN/modules/by-module/WWW/MTHURN/ Ebay::Motors in the Ebay distribution Ebay::Complete in the WWW-Ebay distribution Ebay::Mature in the WWW-Ebay distribution Euroseek http://www.perl.com/CPAN/modules/by-module/WWW/JSMYSER/ Go http://www.perl.com/CPAN/modules/by-module/WWW GoTo http://www.perl.com/CPAN/modules/by-module/WWW/JSMYSER/ HotBot http://www.perl.com/CPAN/modules/by-module/WWW/ LookSmart http://www.perl.com/CPAN/modules/by-module/WWW/JSMYSER Lycos http://www.perl.com/CPAN/modules/by-module/WWW/MTHURN/ Magellan http://www.perl.com/CPAN/modules/by-module/WWW/MTHURN/ Monster http://www.perl.com/CPAN/modules/by-module/WWW Nomade http://www.perl.com/CPAN/modules/by-module/WWW NorthernLight http://www.perl.com/CPAN/modules/by-module/WWW/JSMYSER/ OpenDirectory http://www.perl.com/CPAN/modules/by-module/WWW/JSMYSER/ PRWire http://www.perl.com/CPAN/modules/by-module/WWW Pubmed http://www.perl.com/CPAN/modules/by-module/WWW Yahoo http://www.perl.com/CPAN/modules/by-module/WWW ZDNet http://www.perl.com/CPAN/modules/by-module/WWW/JSMYSER/ WashPost in the WWW::Search::News distribution There are several backends which I don't even know if they work or not; I put them in a distribution called WWW::Search::Backends which you can find at http://www.perl.com/CPAN/modules/by-module/WWW There are even more backends available for manual download and installation at http://www.idexer.com/backends/ (thanks to Jim Smyser). REQUIREMENTS ------------ WWW::Search requires Perl5, the libwww-perl module suite, the URI module, the HTML::Parser module, and several other modules (see Makefile.PL for a complete list). For information on Perl5, see . For modules, see . AVAILABILITY ------------ The latest version of WWW::Search is always available on CPAN. 
Here is a good URL for finding it: http://www.perl.com/CPAN/modules/by-module/WWW INSTALLATION ------------ It is highly recommended that you use CPAN.pm to install WWW::Search. It will automatically install all the prerequisite modules and put everything in the right places. While connected to the internet, just type perl -MCPAN -e 'install WWW::Search' Otherwise, you can install WWW::Search as you would any perl module library, by running the following commands in the WWW-Search-x.xx directory after unpacking the archive (and after installing all the prerequisite modules): perl Makefile.PL make test make install On Win32, use 'nmake' instead of 'make' in the above sequence of commands. If you want to install a private copy of WWW::Search in your home directory, then you should do the installation with something like these commands: perl Makefile.PL INSTALLDIRS=perl PREFIX=/my/perl/lib make test make pure_perl_install UNINST=1 Don't forget to add /my/perl/lib to your PERL5LIB environment variable (or use lib '/my/perl/lib'; or unshift @INC, '/my/perl/lib')! WHAT IS AutoSearch? ------------------- The WWW::Search distribution includes a search client called AutoSearch. AutoSearch performs a web-based search and puts the results set into a series of web pages. It periodically updates this web page, indicating how the search changes over time. Sample output from AutoSearch can be found at . Output format is configurable. See `perldoc AutoSearch` for details, or the DEMONSTRATION section below for quick-start instructions. 
DISCUSSION, BUG REPORTS, AND IMPROVEMENTS ----------------------------------------- When submitting a bug report or request for help, please remember to include: - the operating system name and version - the version of perl (this can be found from `perl -v`) - the version of WWW::Search (this can be found from `WebSearch -V`) - the version of the backend (this can be found from `WebSearch -e Yahoo -V`) - the code you ran to produce the error (PLEASE cut-and-paste, do not just summarize!) - actual output showing the error (PLEASE cut-and-paste, do not just summarize!) There is a mailing list for WWW::Search discussion. To subscribe, send "subscribe info-www-search" as the body of a message to . If you use WWW::Search at all, you should subscribe to the mailing list. Feedback about WWW::Search is encouraged. If you're using it for a neat application, please let us know. If you'd like to (or have already) implement and publish a new backend for WWW::Search, let us know so we don't duplicate work. Backend-related bug reports ("backend ABC doesn't work") should be sent to the author/maintainer of the backend (backend maintainers are identified in the corresponding man page). All other feedback, bug reports, fixes, and new backends (if you are unwilling or unable to publish them on CPAN yourself) should be sent to Martin Thurn . When sending e-mail, please please put [WWW-Search] in the subject line (or risk me losing the message among the spam). DEMONSTRATION ------------- After installing the distribution, connect to the internet and type: # WebSearch '"Your Name Here"' or, if you are on Win32: C:\> WebSearch "\"Your Name Here\"" to see where your name is mentioned on the web. 
Then try: # AutoSearch -n me_on_the_web -s '"Your Name Here"' me # netscape me/index.html & or, if you are on Win32: C:\> AutoSearch -n me_on_the_web -s "\"Your Name Here\"" me C:\> netscape me\index.html If you are on UNIX you can add 0 3 * * 1 /usr/local/bin/AutoSearch /path/to/me to your crontab to update this search every week at 3:00 Monday morning. If you install WWW::Search::Ebay, and add the --mail option to AutoSearch, you'll have your own private replacement for Ebay's personal search service... WITHOUT the three-query limit! DOCUMENTATION ------------- See `perldoc WWW::Search` after installation for an overview of the library. POD-style documentation is also included in all modules and programs, so you can do `perldoc WebSearch` and `perldoc AutoSearch` after installation. FUTURE PLANS ------------ Some things we need, and ideas for new features: - more robust test mechanism (i.e. more than just counting the number of URLs returned) (e.g. look at the various values and make sure they're being parsed correctly) (e.g. change_date() is really a date, URL is really a URL, URL is not double-encoded, results are not duplicated, etc.) Contact - updates to each backend to implement the submit() method. Contact each backend's maintainer. - updates to each backend that will force WWW::Search to perform the same search as the engine's default web GUI (I'm looking for contributions of the precise arguments that will produce such a search for each engine; i.e. the hash that should be passed as the second argument to native_query). Contact - test cases for WebSearch. Contact - test cases for AutoSearch. Contact - use LWP::ParallelUA to speed up multiple backend search requests (I'm trying to decide what the API interface will look like; please send suggestions). Contact - add a "language" parameter to the WWW::Search object? We would need a critical mass of backends/engines that can search multiple languages before this would be useful. 
- more widespread use of result tags such as description, date, size, etc. across all backends. Contact backend maintainers. - a freeze/restore interface to suspend and resume in-progress queries. - more backends! Contributions are always welcome. Send me e-mail if you plan a new backend, or to discuss architectural changes (to avoid duplicating work). Contact SUPPORT AND CREDITS ------------------- For more information about Martin Thurn's Perl Modules, visit http://www.sandcrawler.com/SWB/cpan-modules.html The WWW::Search architecture was originally written by John Heidemann, with feedback from other contributors listed below. NOTE: This list is no longer updated; consult the on-line documentation (i.e. man pages) to find out who is currently maintaining each component. PLATFORM SUPPORT: Unix John Heidemann Windows Jim Smyser (see ) COOKIE & HTTP_REFERER TESTING: Jerry Hermel APPLICATIONS: WebSearch John Heidemann AutoSearch William Scheding BACKENDS: AltaVista John Heidemann Dejanews Cesare Feroldi de Rosa and Martin Thurn Crawler Andreas Borchert Excite GLen Pringle and Martin Thurn ExciteForWebServers Paul Lindner Fireball Andreas Borchert FolioViews Paul Lindner Gopher Paul Lindner HotBot William Scheding and Martin Thurn HotFiles Jim Smyser Infoseek Cesare Feroldi de Rosa and Martin Thurn Livelink Paul Lindner Lycos William Scheding and John Heidemann, Martin Thurn Magellan Martin Thurn MSIndexServer Paul Lindner NorthernLight Jim Smyser Null Paul Lindner OpenDirectory Jim Smyser PLWeb Paul Lindner Profusion Jim Smyser Search97 Paul Lindner SFgate Paul Lindner Simple Paul Lindner Snap Jim Smyser Verity Paul Lindner WebCrawler Martin Thurn Yahoo William Scheding and Martin Thurn ZDNet Jim Smyser AutoSearch is based on an earlier implementation by Kedar Jog with advice from Joe Touch . Bugs and extensions (to the software and documentation) have been identified by William Scheding , T. V. Raman (proxy support), C. 
Feroldi , Larry Virden , Paul Lindner , Guy Decoux , R Chandrasekar (Mickey) , Martin Thurn , Chris Nandor , Martin Valldeby , Jim Smyser , Darren Stalder , Neil Bowers , Ave Wrigley , Andreas Borchert , Jim Smyser . Bugs have reported by Joseph McDonald , Juan Jose Amor , Bowen Dwelle , Vassilis Papadimos , Vidyut Luther , Chris P. Acantilado . COPYRIGHT --------- Copyright (c) 1996 University of Southern California. All rights reserved. Redistribution and use in source and binary forms are permitted provided that the above copyright notice and this paragraph are duplicated in all such forms and that any documentation, advertising materials, and other materials related to such distribution and use acknowledge that the software was developed by the University of Southern California, Information Sciences Institute. The name of the University may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Portions of this README were derived from the README for libwww-perl. 
WWW-Search-2.517/inc/0000755000175000017500000000000013101353062013530 5ustar martinmartinWWW-Search-2.517/inc/Module/0000755000175000017500000000000013101353062014755 5ustar martinmartinWWW-Search-2.517/inc/Module/Install/0000755000175000017500000000000013101353062016363 5ustar martinmartinWWW-Search-2.517/inc/Module/Install/WriteAll.pm0000644000175000017500000000237613101350056020454 0ustar martinmartin#line 1 package Module::Install::WriteAll; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.17'; @ISA = qw{Module::Install::Base}; $ISCORE = 1; } sub WriteAll { my $self = shift; my %args = ( meta => 1, sign => 0, inline => 0, check_nmake => 1, @_, ); $self->sign(1) if $args{sign}; $self->admin->WriteAll(%args) if $self->is_admin; $self->check_nmake if $args{check_nmake}; unless ( $self->makemaker_args->{PL_FILES} ) { # XXX: This still may be a bit over-defensive... unless ($self->makemaker(6.25)) { $self->makemaker_args( PL_FILES => {} ) if -f 'Build.PL'; } } # Until ExtUtils::MakeMaker support MYMETA.yml, make sure # we clean it up properly ourself. $self->realclean_files('MYMETA.yml'); if ( $args{inline} ) { $self->Inline->write; } else { $self->Makefile->write; } # The Makefile write process adds a couple of dependencies, # so write the META.yml files after the Makefile. 
if ( $args{meta} ) { $self->Meta->write; } # Experimental support for MYMETA if ( $ENV{X_MYMETA} ) { if ( $ENV{X_MYMETA} eq 'JSON' ) { $self->Meta->write_mymeta_json; } else { $self->Meta->write_mymeta_yaml; } } return 1; } 1; WWW-Search-2.517/inc/Module/Install/Metadata.pm0000644000175000017500000004330213101350056020443 0ustar martinmartin#line 1 package Module::Install::Metadata; use strict 'vars'; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.17'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } my @boolean_keys = qw{ sign }; my @scalar_keys = qw{ name module_name abstract version distribution_type tests installdirs }; my @tuple_keys = qw{ configure_requires build_requires requires recommends bundles resources }; my @resource_keys = qw{ homepage bugtracker repository }; my @array_keys = qw{ keywords author }; *authors = \&author; sub Meta { shift } sub Meta_BooleanKeys { @boolean_keys } sub Meta_ScalarKeys { @scalar_keys } sub Meta_TupleKeys { @tuple_keys } sub Meta_ResourceKeys { @resource_keys } sub Meta_ArrayKeys { @array_keys } foreach my $key ( @boolean_keys ) { *$key = sub { my $self = shift; if ( defined wantarray and not @_ ) { return $self->{values}->{$key}; } $self->{values}->{$key} = ( @_ ? 
$_[0] : 1 ); return $self; }; } foreach my $key ( @scalar_keys ) { *$key = sub { my $self = shift; return $self->{values}->{$key} if defined wantarray and !@_; $self->{values}->{$key} = shift; return $self; }; } foreach my $key ( @array_keys ) { *$key = sub { my $self = shift; return $self->{values}->{$key} if defined wantarray and !@_; $self->{values}->{$key} ||= []; push @{$self->{values}->{$key}}, @_; return $self; }; } foreach my $key ( @resource_keys ) { *$key = sub { my $self = shift; unless ( @_ ) { return () unless $self->{values}->{resources}; return map { $_->[1] } grep { $_->[0] eq $key } @{ $self->{values}->{resources} }; } return $self->{values}->{resources}->{$key} unless @_; my $uri = shift or die( "Did not provide a value to $key()" ); $self->resources( $key => $uri ); return 1; }; } foreach my $key ( grep { $_ ne "resources" } @tuple_keys) { *$key = sub { my $self = shift; return $self->{values}->{$key} unless @_; my @added; while ( @_ ) { my $module = shift or last; my $version = shift || 0; push @added, [ $module, $version ]; } push @{ $self->{values}->{$key} }, @added; return map {@$_} @added; }; } # Resource handling my %lc_resource = map { $_ => 1 } qw{ homepage license bugtracker repository }; sub resources { my $self = shift; while ( @_ ) { my $name = shift or last; my $value = shift or next; if ( $name eq lc $name and ! $lc_resource{$name} ) { die("Unsupported reserved lowercase resource '$name'"); } $self->{values}->{resources} ||= []; push @{ $self->{values}->{resources} }, [ $name, $value ]; } $self->{values}->{resources}; } # Aliases for build_requires that will have alternative # meanings in some future version of META.yml. 
sub test_requires { shift->build_requires(@_) } sub install_requires { shift->build_requires(@_) } # Aliases for installdirs options sub install_as_core { $_[0]->installdirs('perl') } sub install_as_cpan { $_[0]->installdirs('site') } sub install_as_site { $_[0]->installdirs('site') } sub install_as_vendor { $_[0]->installdirs('vendor') } sub dynamic_config { my $self = shift; my $value = @_ ? shift : 1; if ( $self->{values}->{dynamic_config} ) { # Once dynamic we never change to static, for safety return 0; } $self->{values}->{dynamic_config} = $value ? 1 : 0; return 1; } # Convenience command sub static_config { shift->dynamic_config(0); } sub perl_version { my $self = shift; return $self->{values}->{perl_version} unless @_; my $version = shift or die( "Did not provide a value to perl_version()" ); # Normalize the version $version = $self->_perl_version($version); # We don't support the really old versions unless ( $version >= 5.005 ) { die "Module::Install only supports 5.005 or newer (use ExtUtils::MakeMaker)\n"; } $self->{values}->{perl_version} = $version; } sub all_from { my ( $self, $file ) = @_; unless ( defined($file) ) { my $name = $self->name or die( "all_from called with no args without setting name() first" ); $file = join('/', 'lib', split(/-/, $name)) . '.pm'; $file =~ s{.*/}{} unless -e $file; unless ( -e $file ) { die("all_from cannot find $file from $name"); } } unless ( -f $file ) { die("The path '$file' does not exist, or is not a file"); } $self->{values}{all_from} = $file; # Some methods pull from POD instead of code. 
# If there is a matching .pod, use that instead my $pod = $file; $pod =~ s/\.pm$/.pod/i; $pod = $file unless -e $pod; # Pull the different values $self->name_from($file) unless $self->name; $self->version_from($file) unless $self->version; $self->perl_version_from($file) unless $self->perl_version; $self->author_from($pod) unless @{$self->author || []}; $self->license_from($pod) unless $self->license; $self->abstract_from($pod) unless $self->abstract; return 1; } sub provides { my $self = shift; my $provides = ( $self->{values}->{provides} ||= {} ); %$provides = (%$provides, @_) if @_; return $provides; } sub auto_provides { my $self = shift; return $self unless $self->is_admin; unless (-e 'MANIFEST') { warn "Cannot deduce auto_provides without a MANIFEST, skipping\n"; return $self; } # Avoid spurious warnings as we are not checking manifest here. local $SIG{__WARN__} = sub {1}; require ExtUtils::Manifest; local *ExtUtils::Manifest::manicheck = sub { return }; require Module::Build; my $build = Module::Build->new( dist_name => $self->name, dist_version => $self->version, license => $self->license, ); $self->provides( %{ $build->find_dist_packages || {} } ); } sub feature { my $self = shift; my $name = shift; my $features = ( $self->{values}->{features} ||= [] ); my $mods; if ( @_ == 1 and ref( $_[0] ) ) { # The user used ->feature like ->features by passing in the second # argument as a reference. Accomodate for that. $mods = $_[0]; } else { $mods = \@_; } my $count = 0; push @$features, ( $name => [ map { ref($_) ? ( ref($_) eq 'HASH' ) ? %$_ : @$_ : $_ } @$mods ] ); return @$features; } sub features { my $self = shift; while ( my ( $name, $mods ) = splice( @_, 0, 2 ) ) { $self->feature( $name, @$mods ); } return $self->{values}->{features} ? 
@{ $self->{values}->{features} } : (); } sub no_index { my $self = shift; my $type = shift; push @{ $self->{values}->{no_index}->{$type} }, @_ if $type; return $self->{values}->{no_index}; } sub read { my $self = shift; $self->include_deps( 'YAML::Tiny', 0 ); require YAML::Tiny; my $data = YAML::Tiny::LoadFile('META.yml'); # Call methods explicitly in case user has already set some values. while ( my ( $key, $value ) = each %$data ) { next unless $self->can($key); if ( ref $value eq 'HASH' ) { while ( my ( $module, $version ) = each %$value ) { $self->can($key)->($self, $module => $version ); } } else { $self->can($key)->($self, $value); } } return $self; } sub write { my $self = shift; return $self unless $self->is_admin; $self->admin->write_meta; return $self; } sub version_from { require ExtUtils::MM_Unix; my ( $self, $file ) = @_; $self->version( ExtUtils::MM_Unix->parse_version($file) ); # for version integrity check $self->makemaker_args( VERSION_FROM => $file ); } sub abstract_from { require ExtUtils::MM_Unix; my ( $self, $file ) = @_; $self->abstract( bless( { DISTNAME => $self->name }, 'ExtUtils::MM_Unix' )->parse_abstract($file) ); } # Add both distribution and module name sub name_from { my ($self, $file) = @_; if ( Module::Install::_read($file) =~ m/ ^ \s* package \s* ([\w:]+) [\s|;]* /ixms ) { my ($name, $module_name) = ($1, $1); $name =~ s{::}{-}g; $self->name($name); unless ( $self->module_name ) { $self->module_name($module_name); } } else { die("Cannot determine name from $file\n"); } } sub _extract_perl_version { if ( $_[0] =~ m/ ^\s* (?:use|require) \s* v? 
([\d_\.]+) \s* ; /ixms ) { my $perl_version = $1; $perl_version =~ s{_}{}g; return $perl_version; } else { return; } } sub perl_version_from { my $self = shift; my $perl_version=_extract_perl_version(Module::Install::_read($_[0])); if ($perl_version) { $self->perl_version($perl_version); } else { warn "Cannot determine perl version info from $_[0]\n"; return; } } sub author_from { my $self = shift; my $content = Module::Install::_read($_[0]); if ($content =~ m/ =head \d \s+ (?:authors?)\b \s* ([^\n]*) | =head \d \s+ (?:licen[cs]e|licensing|copyright|legal)\b \s* .*? copyright .*? \d\d\d[\d.]+ \s* (?:\bby\b)? \s* ([^\n]*) /ixms) { my $author = $1 || $2; # XXX: ugly but should work anyway... if (eval "require Pod::Escapes; 1") { # Pod::Escapes has a mapping table. # It's in core of perl >= 5.9.3, and should be installed # as one of the Pod::Simple's prereqs, which is a prereq # of Pod::Text 3.x (see also below). $author =~ s{ E<( (\d+) | ([A-Za-z]+) )> } { defined $2 ? chr($2) : defined $Pod::Escapes::Name2character_number{$1} ? chr($Pod::Escapes::Name2character_number{$1}) : do { warn "Unknown escape: E<$1>"; "E<$1>"; }; }gex; } elsif (eval "require Pod::Text; 1" && $Pod::Text::VERSION < 3) { # Pod::Text < 3.0 has yet another mapping table, # though the table name of 2.x and 1.x are different. # (1.x is in core of Perl < 5.6, 2.x is in core of # Perl < 5.9.3) my $mapping = ($Pod::Text::VERSION < 2) ? \%Pod::Text::HTML_Escapes : \%Pod::Text::ESCAPES; $author =~ s{ E<( (\d+) | ([A-Za-z]+) )> } { defined $2 ? chr($2) : defined $mapping->{$1} ? 
$mapping->{$1} : do { warn "Unknown escape: E<$1>"; "E<$1>"; }; }gex; } else { $author =~ s{E}{<}g; $author =~ s{E}{>}g; } $self->author($author); } else { warn "Cannot determine author info from $_[0]\n"; } } #Stolen from M::B my %license_urls = ( perl => 'http://dev.perl.org/licenses/', apache => 'http://apache.org/licenses/LICENSE-2.0', apache_1_1 => 'http://apache.org/licenses/LICENSE-1.1', artistic => 'http://opensource.org/licenses/artistic-license.php', artistic_2 => 'http://opensource.org/licenses/artistic-license-2.0.php', lgpl => 'http://opensource.org/licenses/lgpl-license.php', lgpl2 => 'http://opensource.org/licenses/lgpl-2.1.php', lgpl3 => 'http://opensource.org/licenses/lgpl-3.0.html', bsd => 'http://opensource.org/licenses/bsd-license.php', gpl => 'http://opensource.org/licenses/gpl-license.php', gpl2 => 'http://opensource.org/licenses/gpl-2.0.php', gpl3 => 'http://opensource.org/licenses/gpl-3.0.html', mit => 'http://opensource.org/licenses/mit-license.php', mozilla => 'http://opensource.org/licenses/mozilla1.1.php', open_source => undef, unrestricted => undef, restrictive => undef, unknown => undef, ); sub license { my $self = shift; return $self->{values}->{license} unless @_; my $license = shift or die( 'Did not provide a value to license()' ); $license = __extract_license($license) || lc $license; $self->{values}->{license} = $license; # Automatically fill in license URLs if ( $license_urls{$license} ) { $self->resources( license => $license_urls{$license} ); } return 1; } sub _extract_license { my $pod = shift; my $matched; return __extract_license( ($matched) = $pod =~ m/ (=head \d \s+ L(?i:ICEN[CS]E|ICENSING)\b.*?) (=head \d.*|=cut.*|)\z /xms ) || __extract_license( ($matched) = $pod =~ m/ (=head \d \s+ (?:C(?i:OPYRIGHTS?)|L(?i:EGAL))\b.*?) 
(=head \d.*|=cut.*|)\z /xms ); } sub __extract_license { my $license_text = shift or return; my @phrases = ( '(?:under )?the same (?:terms|license) as (?:perl|the perl (?:\d )?programming language)' => 'perl', 1, '(?:under )?the terms of (?:perl|the perl programming language) itself' => 'perl', 1, 'Artistic and GPL' => 'perl', 1, 'GNU general public license' => 'gpl', 1, 'GNU public license' => 'gpl', 1, 'GNU lesser general public license' => 'lgpl', 1, 'GNU lesser public license' => 'lgpl', 1, 'GNU library general public license' => 'lgpl', 1, 'GNU library public license' => 'lgpl', 1, 'GNU Free Documentation license' => 'unrestricted', 1, 'GNU Affero General Public License' => 'open_source', 1, '(?:Free)?BSD license' => 'bsd', 1, 'Artistic license 2\.0' => 'artistic_2', 1, 'Artistic license' => 'artistic', 1, 'Apache (?:Software )?license' => 'apache', 1, 'GPL' => 'gpl', 1, 'LGPL' => 'lgpl', 1, 'BSD' => 'bsd', 1, 'Artistic' => 'artistic', 1, 'MIT' => 'mit', 1, 'Mozilla Public License' => 'mozilla', 1, 'Q Public License' => 'open_source', 1, 'OpenSSL License' => 'unrestricted', 1, 'SSLeay License' => 'unrestricted', 1, 'zlib License' => 'open_source', 1, 'proprietary' => 'proprietary', 0, ); while ( my ($pattern, $license, $osi) = splice(@phrases, 0, 3) ) { $pattern =~ s#\s+#\\s+#gs; if ( $license_text =~ /\b$pattern\b/i ) { return $license; } } return ''; } sub license_from { my $self = shift; if (my $license=_extract_license(Module::Install::_read($_[0]))) { $self->license($license); } else { warn "Cannot determine license info from $_[0]\n"; return 'unknown'; } } sub _extract_bugtracker { my @links = $_[0] =~ m#L<( https?\Q://rt.cpan.org/\E[^>]+| https?\Q://github.com/\E[\w_]+/[\w_]+/issues| https?\Q://code.google.com/p/\E[\w_\-]+/issues/list )>#gx; my %links; @links{@links}=(); @links=keys %links; return @links; } sub bugtracker_from { my $self = shift; my $content = Module::Install::_read($_[0]); my @links = _extract_bugtracker($content); unless ( @links ) { 
warn "Cannot determine bugtracker info from $_[0]\n"; return 0; } if ( @links > 1 ) { warn "Found more than one bugtracker link in $_[0]\n"; return 0; } # Set the bugtracker bugtracker( $links[0] ); return 1; } sub requires_from { my $self = shift; my $content = Module::Install::_readperl($_[0]); my @requires = $content =~ m/^use\s+([^\W\d]\w*(?:::\w+)*)\s+(v?[\d\.]+)/mg; while ( @requires ) { my $module = shift @requires; my $version = shift @requires; $self->requires( $module => $version ); } } sub test_requires_from { my $self = shift; my $content = Module::Install::_readperl($_[0]); my @requires = $content =~ m/^use\s+([^\W\d]\w*(?:::\w+)*)\s+([\d\.]+)/mg; while ( @requires ) { my $module = shift @requires; my $version = shift @requires; $self->test_requires( $module => $version ); } } # Convert triple-part versions (eg, 5.6.1 or 5.8.9) to # numbers (eg, 5.006001 or 5.008009). # Also, convert double-part versions (eg, 5.8) sub _perl_version { my $v = $_[-1]; $v =~ s/^([1-9])\.([1-9]\d?\d?)$/sprintf("%d.%03d",$1,$2)/e; $v =~ s/^([1-9])\.([1-9]\d?\d?)\.(0|[1-9]\d?\d?)$/sprintf("%d.%03d%03d",$1,$2,$3 || 0)/e; $v =~ s/(\.\d\d\d)000$/$1/; $v =~ s/_.+$//; if ( ref($v) ) { # Numify $v = $v + 0; } return $v; } sub add_metadata { my $self = shift; my %hash = @_; for my $key (keys %hash) { warn "add_metadata: $key is not prefixed with 'x_'.\n" . 
"Use appopriate function to add non-private metadata.\n" unless $key =~ /^x_/; $self->{values}->{$key} = $hash{$key}; } } ###################################################################### # MYMETA Support sub WriteMyMeta { die "WriteMyMeta has been deprecated"; } sub write_mymeta_yaml { my $self = shift; # We need YAML::Tiny to write the MYMETA.yml file unless ( eval { require YAML::Tiny; 1; } ) { return 1; } # Generate the data my $meta = $self->_write_mymeta_data or return 1; # Save as the MYMETA.yml file print "Writing MYMETA.yml\n"; YAML::Tiny::DumpFile('MYMETA.yml', $meta); } sub write_mymeta_json { my $self = shift; # We need JSON to write the MYMETA.json file unless ( eval { require JSON; 1; } ) { return 1; } # Generate the data my $meta = $self->_write_mymeta_data or return 1; # Save as the MYMETA.yml file print "Writing MYMETA.json\n"; Module::Install::_write( 'MYMETA.json', JSON->new->pretty(1)->canonical->encode($meta), ); } sub _write_mymeta_data { my $self = shift; # If there's no existing META.yml there is nothing we can do return undef unless -f 'META.yml'; # We need Parse::CPAN::Meta to load the file unless ( eval { require Parse::CPAN::Meta; 1; } ) { return undef; } # Merge the perl version into the dependencies my $val = $self->Meta->{values}; my $perl = delete $val->{perl_version}; if ( $perl ) { $val->{requires} ||= []; my $requires = $val->{requires}; # Canonize to three-dot version after Perl 5.6 if ( $perl >= 5.006 ) { $perl =~ s{^(\d+)\.(\d\d\d)(\d*)}{join('.', $1, int($2||0), int($3||0))}e } unshift @$requires, [ perl => $perl ]; } # Load the advisory META.yml file my @yaml = Parse::CPAN::Meta::LoadFile('META.yml'); my $meta = $yaml[0]; # Overwrite the non-configure dependency hashes delete $meta->{requires}; delete $meta->{build_requires}; delete $meta->{recommends}; if ( exists $val->{requires} ) { $meta->{requires} = { map { @$_ } @{ $val->{requires} } }; } if ( exists $val->{build_requires} ) { $meta->{build_requires} = { map { @$_ 
} @{ $val->{build_requires} } }; } return $meta; } 1; WWW-Search-2.517/inc/Module/Install/Scripts.pm0000644000175000017500000000101113101350056020341 0ustar martinmartin#line 1 package Module::Install::Scripts; use strict 'vars'; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.17'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub install_script { my $self = shift; my $args = $self->makemaker_args; my $exe = $args->{EXE_FILES} ||= []; foreach ( @_ ) { if ( -f $_ ) { push @$exe, $_; } elsif ( -d 'script' and -f "script/$_" ) { push @$exe, "script/$_"; } else { die("Cannot find script '$_'"); } } } 1; WWW-Search-2.517/inc/Module/Install/Fetch.pm0000644000175000017500000000462713101350056017763 0ustar martinmartin#line 1 package Module::Install::Fetch; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.17'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub get_file { my ($self, %args) = @_; my ($scheme, $host, $path, $file) = $args{url} =~ m|^(\w+)://([^/]+)(.+)/(.+)| or return; if ( $scheme eq 'http' and ! eval { require LWP::Simple; 1 } ) { $args{url} = $args{ftp_url} or (warn("LWP support unavailable!\n"), return); ($scheme, $host, $path, $file) = $args{url} =~ m|^(\w+)://([^/]+)(.+)/(.+)| or return; } $|++; print "Fetching '$file' from $host... 
"; unless (eval { require Socket; Socket::inet_aton($host) }) { warn "'$host' resolve failed!\n"; return; } return unless $scheme eq 'ftp' or $scheme eq 'http'; require Cwd; my $dir = Cwd::getcwd(); chdir $args{local_dir} or return if exists $args{local_dir}; if (eval { require LWP::Simple; 1 }) { LWP::Simple::mirror($args{url}, $file); } elsif (eval { require Net::FTP; 1 }) { eval { # use Net::FTP to get past firewall my $ftp = Net::FTP->new($host, Passive => 1, Timeout => 600); $ftp->login("anonymous", 'anonymous@example.com'); $ftp->cwd($path); $ftp->binary; $ftp->get($file) or (warn("$!\n"), return); $ftp->quit; } } elsif (my $ftp = $self->can_run('ftp')) { eval { # no Net::FTP, fallback to ftp.exe require FileHandle; my $fh = FileHandle->new; local $SIG{CHLD} = 'IGNORE'; unless ($fh->open("|$ftp -n")) { warn "Couldn't open ftp: $!\n"; chdir $dir; return; } my @dialog = split(/\n/, <<"END_FTP"); open $host user anonymous anonymous\@example.com cd $path binary get $file $file quit END_FTP foreach (@dialog) { $fh->print("$_\n") } $fh->close; } } else { warn "No working 'ftp' program available!\n"; chdir $dir; return; } unless (-f $file) { warn "Fetching failed: $@\n"; chdir $dir; return; } return if exists $args{size} and -s $file != $args{size}; system($args{run}) if exists $args{run}; unlink($file) if $args{remove}; print(((!exists $args{check_for} or -e $args{check_for}) ? "done!" : "failed! 
($!)"), "\n"); chdir $dir; return !$?; } 1; WWW-Search-2.517/inc/Module/Install/Win32.pm0000644000175000017500000000340313101350056017623 0ustar martinmartin#line 1 package Module::Install::Win32; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.17'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } # determine if the user needs nmake, and download it if needed sub check_nmake { my $self = shift; $self->load('can_run'); $self->load('get_file'); require Config; return unless ( $^O eq 'MSWin32' and $Config::Config{make} and $Config::Config{make} =~ /^nmake\b/i and ! $self->can_run('nmake') ); print "The required 'nmake' executable not found, fetching it...\n"; require File::Basename; my $rv = $self->get_file( url => 'http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe', ftp_url => 'ftp://ftp.microsoft.com/Softlib/MSLFILES/Nmake15.exe', local_dir => File::Basename::dirname($^X), size => 51928, run => 'Nmake15.exe /o > nul', check_for => 'Nmake.exe', remove => 1, ); die <<'END_MESSAGE' unless $rv; ------------------------------------------------------------------------------- Since you are using Microsoft Windows, you will need the 'nmake' utility before installation. It's available at: http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe or ftp://ftp.microsoft.com/Softlib/MSLFILES/Nmake15.exe Please download the file manually, save it to a directory in %PATH% (e.g. C:\WINDOWS\COMMAND\), then launch the MS-DOS command line shell, "cd" to that directory, and run "Nmake15.exe" from there; that will create the 'nmake.exe' file needed by this module. You may then resume the installation process described in README. 
------------------------------------------------------------------------------- END_MESSAGE } 1; WWW-Search-2.517/inc/Module/Install/Can.pm0000644000175000017500000000640513101350056017427 0ustar martinmartin#line 1 package Module::Install::Can; use strict; use Config (); use ExtUtils::MakeMaker (); use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.17'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } # check if we can load some module ### Upgrade this to not have to load the module if possible sub can_use { my ($self, $mod, $ver) = @_; $mod =~ s{::|\\}{/}g; $mod .= '.pm' unless $mod =~ /\.pm$/i; my $pkg = $mod; $pkg =~ s{/}{::}g; $pkg =~ s{\.pm$}{}i; local $@; eval { require $mod; $pkg->VERSION($ver || 0); 1 }; } # Check if we can run some command sub can_run { my ($self, $cmd) = @_; my $_cmd = $cmd; return $_cmd if (-x $_cmd or $_cmd = MM->maybe_command($_cmd)); for my $dir ((split /$Config::Config{path_sep}/, $ENV{PATH}), '.') { next if $dir eq ''; require File::Spec; my $abs = File::Spec->catfile($dir, $cmd); return $abs if (-x $abs or $abs = MM->maybe_command($abs)); } return; } # Can our C compiler environment build XS files sub can_xs { my $self = shift; # Ensure we have the CBuilder module $self->configure_requires( 'ExtUtils::CBuilder' => 0.27 ); # Do we have the configure_requires checker? local $@; eval "require ExtUtils::CBuilder;"; if ( $@ ) { # They don't obey configure_requires, so it is # someone old and delicate. Try to avoid hurting # them by falling back to an older simpler test. 
return $self->can_cc(); } # Do we have a working C compiler my $builder = ExtUtils::CBuilder->new( quiet => 1, ); unless ( $builder->have_compiler ) { # No working C compiler return 0; } # Write a C file representative of what XS becomes require File::Temp; my ( $FH, $tmpfile ) = File::Temp::tempfile( "compilexs-XXXXX", SUFFIX => '.c', ); binmode $FH; print $FH <<'END_C'; #include "EXTERN.h" #include "perl.h" #include "XSUB.h" int main(int argc, char **argv) { return 0; } int boot_sanexs() { return 1; } END_C close $FH; # Can the C compiler access the same headers XS does my @libs = (); my $object = undef; eval { local $^W = 0; $object = $builder->compile( source => $tmpfile, ); @libs = $builder->link( objects => $object, module_name => 'sanexs', ); }; my $result = $@ ? 0 : 1; # Clean up all the build files foreach ( $tmpfile, $object, @libs ) { next unless defined $_; 1 while unlink; } return $result; } # Can we locate a (the) C compiler sub can_cc { my $self = shift; if ($^O eq 'VMS') { require ExtUtils::CBuilder; my $builder = ExtUtils::CBuilder->new( quiet => 1, ); return $builder->have_compiler; } my @chunks = split(/ /, $Config::Config{cc}) or return; # $Config{cc} may contain args; try to find out the program part while (@chunks) { return $self->can_run("@chunks") || (pop(@chunks), next); } return; } # Fix Cygwin bug on maybe_command(); if ( $^O eq 'cygwin' ) { require ExtUtils::MM_Cygwin; require ExtUtils::MM_Win32; if ( ! 
defined(&ExtUtils::MM_Cygwin::maybe_command) ) { *ExtUtils::MM_Cygwin::maybe_command = sub { my ($self, $file) = @_; if ($file =~ m{^/cygdrive/}i and ExtUtils::MM_Win32->can('maybe_command')) { ExtUtils::MM_Win32->maybe_command($file); } else { ExtUtils::MM_Unix->maybe_command($file); } } } } 1; __END__ #line 245 WWW-Search-2.517/inc/Module/Install/Base.pm0000644000175000017500000000214713101350056017577 0ustar martinmartin#line 1 package Module::Install::Base; use strict 'vars'; use vars qw{$VERSION}; BEGIN { $VERSION = '1.17'; } # Suspend handler for "redefined" warnings BEGIN { my $w = $SIG{__WARN__}; $SIG{__WARN__} = sub { $w }; } #line 42 sub new { my $class = shift; unless ( defined &{"${class}::call"} ) { *{"${class}::call"} = sub { shift->_top->call(@_) }; } unless ( defined &{"${class}::load"} ) { *{"${class}::load"} = sub { shift->_top->load(@_) }; } bless { @_ }, $class; } #line 61 sub AUTOLOAD { local $@; my $func = eval { shift->_top->autoload } or return; goto &$func; } #line 75 sub _top { $_[0]->{_top}; } #line 90 sub admin { $_[0]->_top->{admin} or Module::Install::Base::FakeAdmin->new; } #line 106 sub is_admin { ! 
$_[0]->admin->isa('Module::Install::Base::FakeAdmin'); } sub DESTROY {} package Module::Install::Base::FakeAdmin; use vars qw{$VERSION}; BEGIN { $VERSION = $Module::Install::Base::VERSION; } my $fake; sub new { $fake ||= bless(\@_, $_[0]); } sub AUTOLOAD {} sub DESTROY {} # Restore warning handler BEGIN { $SIG{__WARN__} = $SIG{__WARN__}->(); } 1; #line 159 WWW-Search-2.517/inc/Module/Install/Makefile.pm0000644000175000017500000002743713101350056020453 0ustar martinmartin#line 1 package Module::Install::Makefile; use strict 'vars'; use ExtUtils::MakeMaker (); use Module::Install::Base (); use Fcntl qw/:flock :seek/; use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.17'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub Makefile { $_[0] } my %seen = (); sub prompt { shift; # Infinite loop protection my @c = caller(); if ( ++$seen{"$c[1]|$c[2]|$_[0]"} > 3 ) { die "Caught an potential prompt infinite loop ($c[1]|$c[2]|$_[0])"; } # In automated testing or non-interactive session, always use defaults if ( ($ENV{AUTOMATED_TESTING} or -! -t STDIN) and ! $ENV{PERL_MM_USE_DEFAULT} ) { local $ENV{PERL_MM_USE_DEFAULT} = 1; goto &ExtUtils::MakeMaker::prompt; } else { goto &ExtUtils::MakeMaker::prompt; } } # Store a cleaned up version of the MakeMaker version, # since we need to behave differently in a variety of # ways based on the MM version. my $makemaker = eval $ExtUtils::MakeMaker::VERSION; # If we are passed a param, do a "newer than" comparison. # Otherwise, just return the MakeMaker version. sub makemaker { ( @_ < 2 or $makemaker >= eval($_[1]) ) ? $makemaker : 0 } # Ripped from ExtUtils::MakeMaker 6.56, and slightly modified # as we only need to know here whether the attribute is an array # or a hash or something else (which may or may not be appendable). 
my %makemaker_argtype = ( C => 'ARRAY', CONFIG => 'ARRAY', # CONFIGURE => 'CODE', # ignore DIR => 'ARRAY', DL_FUNCS => 'HASH', DL_VARS => 'ARRAY', EXCLUDE_EXT => 'ARRAY', EXE_FILES => 'ARRAY', FUNCLIST => 'ARRAY', H => 'ARRAY', IMPORTS => 'HASH', INCLUDE_EXT => 'ARRAY', LIBS => 'ARRAY', # ignore '' MAN1PODS => 'HASH', MAN3PODS => 'HASH', META_ADD => 'HASH', META_MERGE => 'HASH', PL_FILES => 'HASH', PM => 'HASH', PMLIBDIRS => 'ARRAY', PMLIBPARENTDIRS => 'ARRAY', PREREQ_PM => 'HASH', CONFIGURE_REQUIRES => 'HASH', SKIP => 'ARRAY', TYPEMAPS => 'ARRAY', XS => 'HASH', # VERSION => ['version',''], # ignore # _KEEP_AFTER_FLUSH => '', clean => 'HASH', depend => 'HASH', dist => 'HASH', dynamic_lib=> 'HASH', linkext => 'HASH', macro => 'HASH', postamble => 'HASH', realclean => 'HASH', test => 'HASH', tool_autosplit => 'HASH', # special cases where you can use makemaker_append CCFLAGS => 'APPENDABLE', DEFINE => 'APPENDABLE', INC => 'APPENDABLE', LDDLFLAGS => 'APPENDABLE', LDFROM => 'APPENDABLE', ); sub makemaker_args { my ($self, %new_args) = @_; my $args = ( $self->{makemaker_args} ||= {} ); foreach my $key (keys %new_args) { if ($makemaker_argtype{$key}) { if ($makemaker_argtype{$key} eq 'ARRAY') { $args->{$key} = [] unless defined $args->{$key}; unless (ref $args->{$key} eq 'ARRAY') { $args->{$key} = [$args->{$key}] } push @{$args->{$key}}, ref $new_args{$key} eq 'ARRAY' ? 
@{$new_args{$key}} : $new_args{$key}; } elsif ($makemaker_argtype{$key} eq 'HASH') { $args->{$key} = {} unless defined $args->{$key}; foreach my $skey (keys %{ $new_args{$key} }) { $args->{$key}{$skey} = $new_args{$key}{$skey}; } } elsif ($makemaker_argtype{$key} eq 'APPENDABLE') { $self->makemaker_append($key => $new_args{$key}); } } else { if (defined $args->{$key}) { warn qq{MakeMaker attribute "$key" is overriden; use "makemaker_append" to append values\n}; } $args->{$key} = $new_args{$key}; } } return $args; } # For mm args that take multiple space-separated args, # append an argument to the current list. sub makemaker_append { my $self = shift; my $name = shift; my $args = $self->makemaker_args; $args->{$name} = defined $args->{$name} ? join( ' ', $args->{$name}, @_ ) : join( ' ', @_ ); } sub build_subdirs { my $self = shift; my $subdirs = $self->makemaker_args->{DIR} ||= []; for my $subdir (@_) { push @$subdirs, $subdir; } } sub clean_files { my $self = shift; my $clean = $self->makemaker_args->{clean} ||= {}; %$clean = ( %$clean, FILES => join ' ', grep { length $_ } ($clean->{FILES} || (), @_), ); } sub realclean_files { my $self = shift; my $realclean = $self->makemaker_args->{realclean} ||= {}; %$realclean = ( %$realclean, FILES => join ' ', grep { length $_ } ($realclean->{FILES} || (), @_), ); } sub libs { my $self = shift; my $libs = ref $_[0] ? 
shift : [ shift ]; $self->makemaker_args( LIBS => $libs ); } sub inc { my $self = shift; $self->makemaker_args( INC => shift ); } sub _wanted_t { } sub tests_recursive { my $self = shift; my $dir = shift || 't'; unless ( -d $dir ) { die "tests_recursive dir '$dir' does not exist"; } my %tests = map { $_ => 1 } split / /, ($self->tests || ''); require File::Find; File::Find::find( sub { /\.t$/ and -f $_ and $tests{"$File::Find::dir/*.t"} = 1 }, $dir ); $self->tests( join ' ', sort keys %tests ); } sub write { my $self = shift; die "&Makefile->write() takes no arguments\n" if @_; # Check the current Perl version my $perl_version = $self->perl_version; if ( $perl_version ) { eval "use $perl_version; 1" or die "ERROR: perl: Version $] is installed, " . "but we need version >= $perl_version"; } # Make sure we have a new enough MakeMaker require ExtUtils::MakeMaker; if ( $perl_version and $self->_cmp($perl_version, '5.006') >= 0 ) { # This previous attempted to inherit the version of # ExtUtils::MakeMaker in use by the module author, but this # was found to be untenable as some authors build releases # using future dev versions of EU:MM that nobody else has. # Instead, #toolchain suggests we use 6.59 which is the most # stable version on CPAN at time of writing and is, to quote # ribasushi, "not terminally fucked, > and tested enough". # TODO: We will now need to maintain this over time to push # the version up as new versions are released. $self->build_requires( 'ExtUtils::MakeMaker' => 6.59 ); $self->configure_requires( 'ExtUtils::MakeMaker' => 6.59 ); } else { # Allow legacy-compatibility with 5.005 by depending on the # most recent EU:MM that supported 5.005. 
$self->build_requires( 'ExtUtils::MakeMaker' => 6.36 ); $self->configure_requires( 'ExtUtils::MakeMaker' => 6.36 ); } # Generate the MakeMaker params my $args = $self->makemaker_args; $args->{DISTNAME} = $self->name; $args->{NAME} = $self->module_name || $self->name; $args->{NAME} =~ s/-/::/g; $args->{VERSION} = $self->version or die <<'EOT'; ERROR: Can't determine distribution version. Please specify it explicitly via 'version' in Makefile.PL, or set a valid $VERSION in a module, and provide its file path via 'version_from' (or 'all_from' if you prefer) in Makefile.PL. EOT if ( $self->tests ) { my @tests = split ' ', $self->tests; my %seen; $args->{test} = { TESTS => (join ' ', grep {!$seen{$_}++} @tests), }; } elsif ( $Module::Install::ExtraTests::use_extratests ) { # Module::Install::ExtraTests doesn't set $self->tests and does its own tests via harness. # So, just ignore our xt tests here. } elsif ( -d 'xt' and ($Module::Install::AUTHOR or $ENV{RELEASE_TESTING}) ) { $args->{test} = { TESTS => join( ' ', map { "$_/*.t" } grep { -d $_ } qw{ t xt } ), }; } if ( $] >= 5.005 ) { $args->{ABSTRACT} = $self->abstract; $args->{AUTHOR} = join ', ', @{$self->author || []}; } if ( $self->makemaker(6.10) ) { $args->{NO_META} = 1; #$args->{NO_MYMETA} = 1; } if ( $self->makemaker(6.17) and $self->sign ) { $args->{SIGN} = 1; } unless ( $self->is_admin ) { delete $args->{SIGN}; } if ( $self->makemaker(6.31) and $self->license ) { $args->{LICENSE} = $self->license; } my $prereq = ($args->{PREREQ_PM} ||= {}); %$prereq = ( %$prereq, map { @$_ } # flatten [module => version] map { @$_ } grep $_, ($self->requires) ); # Remove any reference to perl, PREREQ_PM doesn't support it delete $args->{PREREQ_PM}->{perl}; # Merge both kinds of requires into BUILD_REQUIRES my $build_prereq = ($args->{BUILD_REQUIRES} ||= {}); %$build_prereq = ( %$build_prereq, map { @$_ } # flatten [module => version] map { @$_ } grep $_, ($self->configure_requires, $self->build_requires) ); # Remove any 
reference to perl, BUILD_REQUIRES doesn't support it delete $args->{BUILD_REQUIRES}->{perl}; # Delete bundled dists from prereq_pm, add it to Makefile DIR my $subdirs = ($args->{DIR} || []); if ($self->bundles) { my %processed; foreach my $bundle (@{ $self->bundles }) { my ($mod_name, $dist_dir) = @$bundle; delete $prereq->{$mod_name}; $dist_dir = File::Basename::basename($dist_dir); # dir for building this module if (not exists $processed{$dist_dir}) { if (-d $dist_dir) { # List as sub-directory to be processed by make push @$subdirs, $dist_dir; } # Else do nothing: the module is already present on the system $processed{$dist_dir} = undef; } } } unless ( $self->makemaker('6.55_03') ) { %$prereq = (%$prereq,%$build_prereq); delete $args->{BUILD_REQUIRES}; } if ( my $perl_version = $self->perl_version ) { eval "use $perl_version; 1" or die "ERROR: perl: Version $] is installed, " . "but we need version >= $perl_version"; if ( $self->makemaker(6.48) ) { $args->{MIN_PERL_VERSION} = $perl_version; } } if ($self->installdirs) { warn qq{old INSTALLDIRS (probably set by makemaker_args) is overriden by installdirs\n} if $args->{INSTALLDIRS}; $args->{INSTALLDIRS} = $self->installdirs; } my %args = map { ( $_ => $args->{$_} ) } grep {defined($args->{$_} ) } keys %$args; my $user_preop = delete $args{dist}->{PREOP}; if ( my $preop = $self->admin->preop($user_preop) ) { foreach my $key ( keys %$preop ) { $args{dist}->{$key} = $preop->{$key}; } } my $mm = ExtUtils::MakeMaker::WriteMakefile(%args); $self->fix_up_makefile($mm->{FIRST_MAKEFILE} || 'Makefile'); } sub fix_up_makefile { my $self = shift; my $makefile_name = shift; my $top_class = ref($self->_top) || ''; my $top_version = $self->_top->VERSION || ''; my $preamble = $self->preamble ? "# Preamble by $top_class $top_version\n" . $self->preamble : ''; my $postamble = "# Postamble by $top_class $top_version\n" . 
($self->postamble || ''); local *MAKEFILE; open MAKEFILE, "+< $makefile_name" or die "fix_up_makefile: Couldn't open $makefile_name: $!"; eval { flock MAKEFILE, LOCK_EX }; my $makefile = do { local $/; }; $makefile =~ s/\b(test_harness\(\$\(TEST_VERBOSE\), )/$1'inc', /; $makefile =~ s/( -I\$\(INST_ARCHLIB\))/ -Iinc$1/g; $makefile =~ s/( "-I\$\(INST_LIB\)")/ "-Iinc"$1/g; $makefile =~ s/^(FULLPERL = .*)/$1 "-Iinc"/m; $makefile =~ s/^(PERL = .*)/$1 "-Iinc"/m; # Module::Install will never be used to build the Core Perl # Sometimes PERL_LIB and PERL_ARCHLIB get written anyway, which breaks # PREFIX/PERL5LIB, and thus, install_share. Blank them if they exist $makefile =~ s/^PERL_LIB = .+/PERL_LIB =/m; #$makefile =~ s/^PERL_ARCHLIB = .+/PERL_ARCHLIB =/m; # Perl 5.005 mentions PERL_LIB explicitly, so we have to remove that as well. $makefile =~ s/(\"?)-I\$\(PERL_LIB\)\1//g; # XXX - This is currently unused; not sure if it breaks other MM-users # $makefile =~ s/^pm_to_blib\s+:\s+/pm_to_blib :: /mg; seek MAKEFILE, 0, SEEK_SET; truncate MAKEFILE, 0; print MAKEFILE "$preamble$makefile$postamble" or die $!; close MAKEFILE or die $!; 1; } sub preamble { my ($self, $text) = @_; $self->{preamble} = $text . $self->{preamble} if defined $text; $self->{preamble}; } sub postamble { my ($self, $text) = @_; $self->{postamble} ||= $self->admin->postamble; $self->{postamble} .= $text if defined $text; $self->{postamble} } 1; __END__ #line 544 WWW-Search-2.517/inc/Module/Install.pm0000644000175000017500000002714513101350056016732 0ustar martinmartin#line 1 package Module::Install; # For any maintainers: # The load order for Module::Install is a bit magic. # It goes something like this... # # IF ( host has Module::Install installed, creating author mode ) { # 1. Makefile.PL calls "use inc::Module::Install" # 2. $INC{inc/Module/Install.pm} set to installed version of inc::Module::Install # 3. The installed version of inc::Module::Install loads # 4. 
inc::Module::Install calls "require Module::Install" # 5. The ./inc/ version of Module::Install loads # } ELSE { # 1. Makefile.PL calls "use inc::Module::Install" # 2. $INC{inc/Module/Install.pm} set to ./inc/ version of Module::Install # 3. The ./inc/ version of Module::Install loads # } use 5.006; use strict 'vars'; use Cwd (); use File::Find (); use File::Path (); use vars qw{$VERSION $MAIN}; BEGIN { # All Module::Install core packages now require synchronised versions. # This will be used to ensure we don't accidentally load old or # different versions of modules. # This is not enforced yet, but will be some time in the next few # releases once we can make sure it won't clash with custom # Module::Install extensions. $VERSION = '1.17'; # Storage for the pseudo-singleton $MAIN = undef; *inc::Module::Install::VERSION = *VERSION; @inc::Module::Install::ISA = __PACKAGE__; } sub import { my $class = shift; my $self = $class->new(@_); my $who = $self->_caller; #------------------------------------------------------------- # all of the following checks should be included in import(), # to allow "eval 'require Module::Install; 1' to test # installation of Module::Install. (RT #51267) #------------------------------------------------------------- # Whether or not inc::Module::Install is actually loaded, the # $INC{inc/Module/Install.pm} is what will still get set as long as # the caller loaded module this in the documented manner. # If not set, the caller may NOT have loaded the bundled version, and thus # they may not have a MI version that works with the Makefile.PL. This would # result in false errors or unexpected behaviour. And we don't want that. my $file = join( '/', 'inc', split /::/, __PACKAGE__ ) . 
'.pm'; unless ( $INC{$file} ) { die <<"END_DIE" } Please invoke ${\__PACKAGE__} with: use inc::${\__PACKAGE__}; not: use ${\__PACKAGE__}; END_DIE # This reportedly fixes a rare Win32 UTC file time issue, but # as this is a non-cross-platform XS module not in the core, # we shouldn't really depend on it. See RT #24194 for detail. # (Also, this module only supports Perl 5.6 and above). eval "use Win32::UTCFileTime" if $^O eq 'MSWin32' && $] >= 5.006; # If the script that is loading Module::Install is from the future, # then make will detect this and cause it to re-run over and over # again. This is bad. Rather than taking action to touch it (which # is unreliable on some platforms and requires write permissions) # for now we should catch this and refuse to run. if ( -f $0 ) { my $s = (stat($0))[9]; # If the modification time is only slightly in the future, # sleep briefly to remove the problem. my $a = $s - time; if ( $a > 0 and $a < 5 ) { sleep 5 } # Too far in the future, throw an error. my $t = time; if ( $s > $t ) { die <<"END_DIE" } Your installer $0 has a modification time in the future ($s > $t). This is known to create infinite loops in make. Please correct this, then run $0 again. END_DIE } # Build.PL was formerly supported, but no longer is due to excessive # difficulty in implementing every single feature twice. if ( $0 =~ /Build.PL$/i ) { die <<"END_DIE" } Module::Install no longer supports Build.PL. It was impossible to maintain duel backends, and has been deprecated. Please remove all Build.PL files and only use the Makefile.PL installer. END_DIE #------------------------------------------------------------- # To save some more typing in Module::Install installers, every... # use inc::Module::Install # ...also acts as an implicit use strict. 
$^H |= strict::bits(qw(refs subs vars)); #------------------------------------------------------------- unless ( -f $self->{file} ) { foreach my $key (keys %INC) { delete $INC{$key} if $key =~ /Module\/Install/; } local $^W; require "$self->{path}/$self->{dispatch}.pm"; File::Path::mkpath("$self->{prefix}/$self->{author}"); $self->{admin} = "$self->{name}::$self->{dispatch}"->new( _top => $self ); $self->{admin}->init; @_ = ($class, _self => $self); goto &{"$self->{name}::import"}; } local $^W; *{"${who}::AUTOLOAD"} = $self->autoload; $self->preload; # Unregister loader and worker packages so subdirs can use them again delete $INC{'inc/Module/Install.pm'}; delete $INC{'Module/Install.pm'}; # Save to the singleton $MAIN = $self; return 1; } sub autoload { my $self = shift; my $who = $self->_caller; my $cwd = Cwd::getcwd(); my $sym = "${who}::AUTOLOAD"; $sym->{$cwd} = sub { my $pwd = Cwd::getcwd(); if ( my $code = $sym->{$pwd} ) { # Delegate back to parent dirs goto &$code unless $cwd eq $pwd; } unless ($$sym =~ s/([^:]+)$//) { # XXX: it looks like we can't retrieve the missing function # via $$sym (usually $main::AUTOLOAD) in this case. # I'm still wondering if we should slurp Makefile.PL to # get some context or not ... my ($package, $file, $line) = caller; die <<"EOT"; Unknown function is found at $file line $line. Execution of $file aborted due to runtime errors. If you're a contributor to a project, you may need to install some Module::Install extensions from CPAN (or other repository). If you're a user of a module, please contact the author. 
EOT } my $method = $1; if ( uc($method) eq $method ) { # Do nothing return; } elsif ( $method =~ /^_/ and $self->can($method) ) { # Dispatch to the root M:I class return $self->$method(@_); } # Dispatch to the appropriate plugin unshift @_, ( $self, $1 ); goto &{$self->can('call')}; }; } sub preload { my $self = shift; unless ( $self->{extensions} ) { $self->load_extensions( "$self->{prefix}/$self->{path}", $self ); } my @exts = @{$self->{extensions}}; unless ( @exts ) { @exts = $self->{admin}->load_all_extensions; } my %seen; foreach my $obj ( @exts ) { while (my ($method, $glob) = each %{ref($obj) . '::'}) { next unless $obj->can($method); next if $method =~ /^_/; next if $method eq uc($method); $seen{$method}++; } } my $who = $self->_caller; foreach my $name ( sort keys %seen ) { local $^W; *{"${who}::$name"} = sub { ${"${who}::AUTOLOAD"} = "${who}::$name"; goto &{"${who}::AUTOLOAD"}; }; } } sub new { my ($class, %args) = @_; delete $INC{'FindBin.pm'}; { # to suppress the redefine warning local $SIG{__WARN__} = sub {}; require FindBin; } # ignore the prefix on extension modules built from top level. my $base_path = Cwd::abs_path($FindBin::Bin); unless ( Cwd::abs_path(Cwd::getcwd()) eq $base_path ) { delete $args{prefix}; } return $args{_self} if $args{_self}; $base_path = VMS::Filespec::unixify($base_path) if $^O eq 'VMS'; $args{dispatch} ||= 'Admin'; $args{prefix} ||= 'inc'; $args{author} ||= ($^O eq 'VMS' ? 
'_author' : '.author'); $args{bundle} ||= 'inc/BUNDLES'; $args{base} ||= $base_path; $class =~ s/^\Q$args{prefix}\E:://; $args{name} ||= $class; $args{version} ||= $class->VERSION; unless ( $args{path} ) { $args{path} = $args{name}; $args{path} =~ s!::!/!g; } $args{file} ||= "$args{base}/$args{prefix}/$args{path}.pm"; $args{wrote} = 0; bless( \%args, $class ); } sub call { my ($self, $method) = @_; my $obj = $self->load($method) or return; splice(@_, 0, 2, $obj); goto &{$obj->can($method)}; } sub load { my ($self, $method) = @_; $self->load_extensions( "$self->{prefix}/$self->{path}", $self ) unless $self->{extensions}; foreach my $obj (@{$self->{extensions}}) { return $obj if $obj->can($method); } my $admin = $self->{admin} or die <<"END_DIE"; The '$method' method does not exist in the '$self->{prefix}' path! Please remove the '$self->{prefix}' directory and run $0 again to load it. END_DIE my $obj = $admin->load($method, 1); push @{$self->{extensions}}, $obj; $obj; } sub load_extensions { my ($self, $path, $top) = @_; my $should_reload = 0; unless ( grep { ! ref $_ and lc $_ eq lc $self->{prefix} } @INC ) { unshift @INC, $self->{prefix}; $should_reload = 1; } foreach my $rv ( $self->find_extensions($path) ) { my ($file, $pkg) = @{$rv}; next if $self->{pathnames}{$pkg}; local $@; my $new = eval { local $^W; require $file; $pkg->can('new') }; unless ( $new ) { warn $@ if $@; next; } $self->{pathnames}{$pkg} = $should_reload ? 
delete $INC{$file} : $INC{$file}; push @{$self->{extensions}}, &{$new}($pkg, _top => $top ); } $self->{extensions} ||= []; } sub find_extensions { my ($self, $path) = @_; my @found; File::Find::find( {no_chdir => 1, wanted => sub { my $file = $File::Find::name; return unless $file =~ m!^\Q$path\E/(.+)\.pm\Z!is; my $subpath = $1; return if lc($subpath) eq lc($self->{dispatch}); $file = "$self->{path}/$subpath.pm"; my $pkg = "$self->{name}::$subpath"; $pkg =~ s!/!::!g; # If we have a mixed-case package name, assume case has been preserved # correctly. Otherwise, root through the file to locate the case-preserved # version of the package name. if ( $subpath eq lc($subpath) || $subpath eq uc($subpath) ) { my $content = Module::Install::_read($File::Find::name); my $in_pod = 0; foreach ( split /\n/, $content ) { $in_pod = 1 if /^=\w/; $in_pod = 0 if /^=cut/; next if ($in_pod || /^=cut/); # skip pod text next if /^\s*#/; # and comments if ( m/^\s*package\s+($pkg)\s*;/i ) { $pkg = $1; last; } } } push @found, [ $file, $pkg ]; }}, $path ) if -d $path; @found; } ##################################################################### # Common Utility Functions sub _caller { my $depth = 0; my $call = caller($depth); while ( $call eq __PACKAGE__ ) { $depth++; $call = caller($depth); } return $call; } sub _read { local *FH; open( FH, '<', $_[0] ) or die "open($_[0]): $!"; binmode FH; my $string = do { local $/; }; close FH or die "close($_[0]): $!"; return $string; } sub _readperl { my $string = Module::Install::_read($_[0]); $string =~ s/(?:\015{1,2}\012|\015|\012)/\n/sg; $string =~ s/(\n)\n*__(?:DATA|END)__\b.*\z/$1/s; $string =~ s/\n\n=\w+.+?\n\n=cut\b.+?\n+/\n\n/sg; return $string; } sub _readpod { my $string = Module::Install::_read($_[0]); $string =~ s/(?:\015{1,2}\012|\015|\012)/\n/sg; return $string if $_[0] =~ /\.pod\z/; $string =~ s/(^|\n=cut\b.+?\n+)[^=\s].+?\n(\n=\w+|\z)/$1$2/sg; $string =~ s/\n*=pod\b[^\n]*\n+/\n\n/sg; $string =~ s/\n*=cut\b[^\n]*\n+/\n\n/sg; $string 
=~ s/^\n+//s; return $string; } sub _write { local *FH; open( FH, '>', $_[0] ) or die "open($_[0]): $!"; binmode FH; foreach ( 1 .. $#_ ) { print FH $_[$_] or die "print($_[0]): $!"; } close FH or die "close($_[0]): $!"; } # _version is for processing module versions (eg, 1.03_05) not # Perl versions (eg, 5.8.1). sub _version { my $s = shift || 0; my $d =()= $s =~ /(\.)/g; if ( $d >= 2 ) { # Normalise multipart versions $s =~ s/(\.)(\d{1,3})/sprintf("$1%03d",$2)/eg; } $s =~ s/^(\d+)\.?//; my $l = $1 || 0; my @v = map { $_ . '0' x (3 - length $_) } $s =~ /(\d{1,3})\D?/g; $l = $l . '.' . join '', @v if @v; return $l + 0; } sub _cmp { _version($_[1]) <=> _version($_[2]); } # Cloned from Params::Util::_CLASS sub _CLASS { ( defined $_[0] and ! ref $_[0] and $_[0] =~ m/^[^\W\d]\w*(?:::\w+)*\z/s ) ? $_[0] : undef; } 1; # Copyright 2008 - 2012 Adam Kennedy. WWW-Search-2.517/Programs/0000755000175000017500000000000013101353062014551 5ustar martinmartinWWW-Search-2.517/Programs/WebSearch0000644000175000017500000002743113101347451016353 0ustar martinmartin#!perl -w # # WebSearch # Copyright (C) 1996-1997 by USC/ISI # $Id: WebSearch,v 1.1 2008/07/14 03:28:13 Martin Exp $ # # Complete copyright notice follows below. sub usage { my $msg = shift; defined($msg) && ($msg ne '') && print STDERR "$0: $msg\n"; print STDERR <<"END"; usage: WebSearch [--engine ] [--gui] [--max ] [--options ]... [--count] [--terse] [--all] [--raw] [--list] [--verbose] [--VERSION] [--help] [--host ] [--port ] [--username bbunny --password c4rr0t5] [--lwpdebug] [--debug] query Make a query to a web search engine, printing to STDOUT the URLs which match (one per line). See `perldoc WebSearch` for details. END exit 1; } # usage =head1 NAME WebSearch - a web-searching application demonstrating WWW::Search =head1 SYNOPSIS B =head1 DESCRIPTION This program is provides a command-line interface to web search engines, listing all URLs found for a given query. 
This program also provides a simple demonstration of the WWW::Search Perl library for web searches. The program supports a number of search engines; use WebSearch --list to see which backends are installed. A more sophisticated client is L which maintains a change list of found objects. For examples and hints about searches, see L. =head1 OPTIONS WebSearch uses Getopt::Long, so you can use double-minus with long option names, or single-minus with one-letter abbreviations. =over 8 =item --engine e_name, -e e_name The string e_name is the name of (the module for) the desired search engine. Capitalization matters. See `perldoc WWW::Search` to find out what the default is (probably Null). Use --list to get a list of installed backends. =item --gui, -g Perform the search to mimic the default browser-based search. Not implemented for all backends, see the documentation for each backend. =item --list Prints to STDERR a \n-separated list of installed backends. =item --max max_count, -m max_count Specify the maximum number of hits to retrieve. =item --option o_string, -o o_string Specify a search-engine option in the form 'key=value' (or just 'key'). Can be repeated for as many options are needed. Keys can be repeated. =item --count, -c As the first line of output, print the approximate hit count. As the last line of output, print the actual hit count. =item --terse, -t Do not print any URLs. Only useful if you also specify --count. If you specify --terse but not --count, there will be no output no matter how many hits are found! =item --all, -a For each hit result, print all the URLs that the search engine indicated were equivalent to it. (Some URLs may refer to the same object.) Can be combined with --verbose; can not be combined with --raw. =item --raw, -r For each hit result, print the raw HTML. Not implemented for all backends. =item --verbose, -v Verbose mode. Enumerate the returned URLs and show the description, score, date, etc. for each. 
=item --VERSION, -V Print version information and exit immediately. =item --debug , -d Display back-end debugging information (with debug level ) =item --host Set the _host option for the WWW::Search object (backend-dependent). =item --port Set the _port option for the WWW::Search object (backend-dependent). =item --username Set the username with which to login to the backend. =item --password Set the password with which to login to the backend. =item --lwpdebug, -l Display low-level libwww-perl debugging information =back =head1 ENVIRONMENT VARIABLES The environment variable F (or F) specifies a proxy, if any. =head1 SEE ALSO For the library, see L. For a more sophisticated client, see L. =head1 AUTHOR C was written by John Heidemann, . C is maintained by Martin Thurn, . =head1 COPYRIGHT Copyright (c) 1996-1997 University of Southern California. All rights reserved. Redistribution and use in source and binary forms are permitted provided that the above copyright notice and this paragraph are duplicated in all such forms and that any documentation, advertising materials, and other materials related to such distribution and use acknowledge that the software was developed by the University of Southern California, Information Sciences Institute. The name of the University may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
=cut use strict; &usage('no arguments given') if ($#ARGV == -1); # &usage if ($#ARGV >= 0 && $ARGV[0] eq '-?'); use Getopt::Long; use WWW::Search; use vars qw( $VERSION ); $VERSION = '2.14'; use vars qw( $sEngine $all $verbose $raw $iMax $print_version $debuglwp $debug ); use vars qw( @options $_host $_port $help $iShowCount $iGui $opt_list ); use vars qw( $opt_username $opt_password ); # Set default values: $iShowCount = 0; $iGui = 0; my $iTerse = 0; $_host = ''; $_port = ''; $sEngine = ''; undef $debug; $debuglwp = 0; $opt_list = 0; # Get the command-line options: &Getopt::Long::config(qw(no_ignore_case no_getopt_compat)); &usage('getopt failed') unless &GetOptions('all', \$all, 'count', \$iShowCount, 'debug:i', \$debug, 'engine=s', \$sEngine, 'gui', \$iGui, 'help', \$help, 'host=s', \$_host, 'list', 'lwpdebug', \$debuglwp, 'max=i', \$iMax, 'options=s@', \@options, 'password=s', 'port=s', \$_port, 'raw', \$raw, 'terse', \$iTerse, 'username=s', 'verbose', \$verbose, 'VERSION', \$print_version, ); &usage('user requested help') if $help; $debug = 1 if (defined($debug) and ($debug < 1)); &print_version() if defined($print_version); &list_engines() if ($opt_list); # there MUST be some argument(s) left, the query: &usage('no query found on command line') if (scalar(@ARGV) <= 0); &main(join(" ", @ARGV)); exit 0; sub print_version { print "$0 version $VERSION\nWWW::Search version $WWW::Search::VERSION\n"; if ($sEngine ne '') { my $mod = "WWW::Search::$sEngine"; if (eval "use $mod; 1;") { my $e = '$'. $mod .'::VERSION'; # print STDERR " + e is ===$e===\n"; my $iVersion = eval $e; $iVersion ||= 'unknown'; print "$mod version $iVersion\n"; } # if } # if exit 0; } # print_version sub list_engines { my @as = WWW::Search::installed_engines(); $, = "\n"; print STDERR (sort @as), ''; exit 0; } # list_engines my $verbose_code; sub print_result { my($result, $count) = @_; return if $iTerse; my($prefix) = ""; if ($verbose) { my(@attribs) = (); $prefix = "$count. 
"; if (!defined($verbose_code)) { $verbose_code = ""; foreach (qw(title description score normalized_score size change_date index_date)) { $verbose_code .= "push(\@attribs, \"$_: \" . \$result->$_())\n" . "\tif (defined(\$result->$_()));\n"; }; }; eval $verbose_code; $prefix .= "(" . join(",\n\t", @attribs) . ")\n\t" if ($#attribs >= 0); }; if (defined($all)) { foreach ($result->urls()) { print "$prefix$_\n"; $prefix = "\t"; }; } else { if (defined($raw)) { print $result->raw(), "\n"; } else { print $prefix, $result->url, "\n"; }; }; } sub print_error { my ($msg, $count) = @_; my $error = $verbose ? sprintf("[%3d] ", $count) : ''; $error .= $msg; print STDERR $error, "\n" if $error ne ''; } sub main { my $query = shift; my $count = 0; my $search = new WWW::Search($sEngine); $search->{_host} = $_host if $_host ne ''; $search->{_port} = $_port if $_port ne ''; my %hsOptions = (); if (0 < $debuglwp) { require LWP::Debug; LWP::Debug::level('+'); } $search->http_proxy($ENV{'HTTP_PROXY'}) if ($ENV{'HTTP_PROXY'}); $search->http_proxy($ENV{'http_proxy'}) if ($ENV{'http_proxy'}); if (0 < scalar(@options)) { foreach my $sPair (@options) { if ($sPair =~ m/^([^=]+)=(.*)$/) { my ($key, $value) = ($1, $2); # This is a bit of a hack. A set of CGI options is not # strictly a hash, because multiple values for the same key # can be specified. To get around this, we rely on the fact # that this hash of options is only used to construct a CGI # parameter list. If we see multiple values for the same key, # we append the multiple values onto the value of the key in # CGI '?key=value' format. 
if (exists($hsOptions{$key}) && $hsOptions{$key} ne '') { # There was already an option of this key given; append # multiple values as CGI arguments: $hsOptions{$key} .= '&'.$key.'='.WWW::Search::escape_query($value); } # if exists else { # This is the only instance of this key; just insert the # hash value: $hsOptions{$key} = WWW::Search::escape_query($value); } } # if option is of the form key=value else { $hsOptions{$sPair} = ''; } } # foreach $sPair } # if @options if (defined($iMax)) { $search->maximum_to_retrieve($iMax); } else { $iMax = 10000; } $hsOptions{'search_debug'} = $debug if (defined($debug) and (0 < $debug)); $iGui ? $search->gui_query(WWW::Search::escape_query($query), \%hsOptions) : $search->native_query(WWW::Search::escape_query($query), \%hsOptions); $search->login($opt_username, $opt_password); my($way) = 0; # 0=piecemeal, 1=all at once my($result); if (($iShowCount) && defined($search->approximate_result_count)) { print "There are approximately " if $verbose; print $search->approximate_result_count; print " results." if $verbose; print "\n"; } # if my $iNumPrinted = 0; if ($way) # return all at once. { foreach $result ($search->results()) { print_result($result, ++$count); $iNumPrinted++; last if ($iMax < $count); }; } else # return page by page { while ($result = $search->next_result()) { print_result($result, ++$count); $iNumPrinted++; last if ($iMax < $count); } # while } # else page by page if ($iShowCount) { print "There are exactly " if $verbose; print $iNumPrinted; print " results (maximum of $iMax were requested)." if $verbose; print "\n"; } # if # handle errors if ($count == 0) { my($response) = $search->response(); my $nothing = $verbose ? "Nothing found." : ''; if ($response->is_success) { print_error($nothing, $count); } else { print_error("Error: " . 
$response->as_string(), $count); }; }; }; WWW-Search-2.517/Programs/AutoSearch0000644000175000017500000020424713101347451016550 0ustar martinmartin#!perl -w # # AutoSearch # Copyright (c) 1996-1997 University of Southern California. # All rights reserved. # $Id: AutoSearch,v 1.2 2010-12-02 23:46:30 Martin Exp $ # # Complete copyright notice follows below. =head1 NAME AutoSearch -- a web-search tracking application =head1 SYNOPSIS AutoSearch [--stats] [--verbose] -n "Query Name" -s "query string" --engine engine [--mail you@where.com] [--options "opt=val"]... [--filter "filter"] [--host host] [--port port] [--userid bbunny --password c4rr0t5] [--ignore_channels KABC,KCBS,KNBC] qid AutoSearch --VERSION AutoSearch --help AutoSearch --man =cut use Data::Dumper; # for debugging use Date::Manip; use File::Copy; use Getopt::Long qw( :config no_ignore_case ); # use LWP::Debug qw(+ ); # -conns); use Pod::Usage; use POSIX qw(strftime); use WWW::Search; use strict; use vars qw( $VERSION ); $VERSION = do { my @r = (q$Revision: 1.2 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; use constant DEBUG_EMAIL => 0; my $TESTONLY = 0; sub print_version { print STDERR "$0 version $VERSION; WWW::Search version $WWW::Search::VERSION\n"; } # print_version my (%opts,@query_list,$query_name,$query_string,$search_engine,$query_options,$url_filter); # Default options: $opts{'m'} = ''; $opts{'cleanup'} = 0; $opts{'cmdline'} = 0; $opts{'emailfrom'} = ''; $opts{'v'} = 0; $opts{'help'} = 0; $opts{'man'} = 0; $opts{'stats'} = 0; $opts{'debug'} = 0; $opts{'listnewurls'} = 0; $opts{'ignore_channels'} = (); &GetOptions(\%opts, qw(n|qn|queryname=s s|qs|querystring=s e|engine=s m|mail=s emailfrom=s host=s p|port=s o|options=s@ f|uf|urlfilter=s listnewurls stats userid=s password=s ignore_channels=s@ cleanup=i cmdline v|verbose help man V|VERSION debug), 'http_proxy=s', 'http_proxy_user=s', 'http_proxy_pwd=s', ) or pod2usage(2); if ($opts{'V'}) { &print_version(); exit 0; } # if &pod2usage(1) if 
($opts{'help'}); &pod2usage( -verbose => 2, ) if ($opts{'man'}); &pod2usage( -verbose => 0, -exitval => 1, -message => 'missing argument "qid"', ) if ($#ARGV == -1); my $s_dbg = $opts{'stats'}; my $v_dbg = $opts{'v'}; my $d_dbg = $opts{'debug'} || $v_dbg; if ($v_dbg) { print STDERR "v option: defined\n"; print STDERR "stats option: ", ! $opts{'stats'} ? 'not ' : '', "defined\n"; print STDERR "debug option: ", ! $opts{'debug'} ? 'not ' : '', "defined\n"; } # if sub module_loaded { my $sModule = shift; eval "use $sModule"; return ($@ eq ''); } # module_loaded if ($opts{'m'} ne '') { MODULE: foreach my $sMod (qw( Email::MIME Email::MIME::Creator Email::Send Email::Send::Sendmail Email::Send::Qmail Email::Send::SMTP )) { if (! &module_loaded($sMod)) { print STDERR " --- can not load $sMod module: ==$@==\n"; $opts{'m'} = ''; last MODULE; } # if else { # print STDERR " + loaded $sMod\n"; } } # foreach MODULE } # if print STDERR "will send email summary to: ", $opts{'m'}, "\n" if ($v_dbg && ($opts{'m'} ne '')); # if we want a list of args: #@query_list = split(/[,\s]+/, $opts{'n'},2) if defined($opts{'n'}); $query_name = $opts{'n'} || ''; $query_string = $opts{'s'} || ''; $search_engine = $opts{'e'} || ''; if (defined($opts{'o'})) { $query_options = {}; foreach my $sPair (@{$opts{'o'}}) { my ($key, $value) = $sPair =~ m/^([^=]+)=(.*)$/; &add_to_hash($key, &WWW::Search::escape_query($value), $query_options); } # foreach } # if $url_filter = $opts{'f'} if defined($opts{'f'}); my($local_filter) = 0; # shall we exclude our own old pages? 
(1=y,0=n) #print STDERR "n = \"$opts{'n'}\"\n" if defined($opts{'n'}); #print STDERR "s = \"$opts{'s'}\"\n" if defined($opts{'s'}); #print STDERR "e = \"$opts{'e'}\"\n" if defined($opts{'e'}); #print STDERR "o = \"$opts{'o'}\"\n" if defined($opts{'o'}); #print STDERR "f = \"$opts{'f'}\"\n" if defined($opts{'f'}); ##print STDERR "query_list = \"$query_list[0]\" \"$query_list[1]\"\n"; #print STDERR "query_name = \"$query_name\"\n"; #print STDERR "query_string = \"$query_string\"\n"; #print STDERR "search_engine = \"$search_engine\"\n"; #print STDERR "url_filter = \"$url_filter\"\n" if defined($opts{'f'}); &main(join(" ", @ARGV)); exit 0; ######## # subs # ######## # # all the subroutine documentation has been disabled -- oh for a ifdef # someday -- put all the subs in another file & tech documentation # from there, man page from main file. wls :) 10/18/96 :( # #=head1 SUBROUTINES # #=cut # # Files expected first_date.html and first_index.html in the # 'home' directory. These are used to build the initial # search files. # # To start a new search: specify a query name (-qn) and # query string (-qs) (and more). If either are not provided, AS will # ask the user (STDIN) for the name and/or the string as necessary. # The string "AutoSearch WEB Searching" is replaced by the query name # and the string "ask user" is replaced by the query string. # # AS 1.x accepts the basic query sub-directory identifier # and looks to see if qid/date.html exists. If this file # file does not exist, AS looks for a file first_date.html # If the default file in the parent directory is missing, AS # creates a default-default file named qid/date.html. # In any case the directory 'qid' is created. # A second file first_index.html is used to create the initial # qid/index.html file in a like fashon. # # (LATER: ADD SEARCH ENGINE(S)) # There are three important items which must be defined for any # search that AS will process. First the query-subdirectory # identifier (qid). 
This is the handle to this query. # Once a query session is established; only the qid need be # specified on the command line. # Second is the 'pretty name' that you will assign to this # Query. This is used at the top of the screens to help # the user identify what the searche topic is. Third # is the actual search string. This is passed directly to # search processor, and the search engine. # # File date.html is used to define the format of the pages which are # created each time (weekly) AutoSearch.pl is run. Major sections of # the file are identified with html comments. The search string is # embedded in this file too. One method of creating a new search is # to create the new directory manually and create a file named # date.html in the sub-directory. However; this requires you to # understand the tags and build a file which is compatible with AS. # # A second method is to manually copy first_date.html to qid/date.html # and edit it. This is the John method to introduce a new search. It # is cumbersome and not-automated-ish. # # A third approach involves using command line arguments to specify # the query name and string. This is the wls method. One variation # on the wls method is the "ask user" method. In this approach, AS # will ask the user for the required items. In both of these methods # the necessary files will be created. # # Once the initial search has been made and the files created, then # the simple command "AutoSearch qid" will update the search files and # pages. # #=head1 NAME # #main - start of actual work. # # #=head1 DESCRIPTION # #Submit a search and build output files. # #=cut # submit a search and optionally build output file(s). # sub main { my ($query_dir) = @_; my $dbg_search = $d_dbg || 0; my $v_dbg = $v_dbg; # shall we be verbose?? my $sEmail = ''; my $url_filter_count = 0; # get the date. 
my $now = &time_now; my $today = &time_today; print STDERR "Now = ", $now if $v_dbg; print STDERR ", Today = ", $today, "\n" if $v_dbg; # Build query directory string: my $qid = $query_dir; die ("qid is a required argument.") unless ($qid); die ("qid is a required argument.") unless ($qid =~ m!\S!); print STDERR "query directory: $qid\n" if $v_dbg; # Do we have the necessary infrastructure? We require two files: 1) # the summary of searches and 2) weekly updates. Look for # index.html, or default first_index.html, or make one from scratch. # index.html contains the previous search results. (aka old summary) &check_index_file($qid); # make qid/index.html $v_dbg && print STDERR " + found/created $qid/index.html\n"; if ($opts{'cmdline'}) { # Show the original cmdline and exit: my %hssOptions = ( Query => '--querystring', QueryName => '--queryname', SearchEngine => '--engine', URLFilter => '--filter', QueryOptions => '--option', # IgnoreChannels => '--ignore_channels', ); my $sFile = qq{$qid/index.html}; open(INDEX, $sFile) or die " --- can not open $sFile for read: $!"; # Slurp entire file: local $/ = undef; my $sHTML = ; close(INDEX) or warn " --- can not close $sFile after reading: $!"; my $sCmdline = qq{$0 $qid }; # Special case: look for QueryName inside

tags: $sHTML =~ s#

(.+?)

##; while (my ($sTag, $sArg) = each %hssOptions) { while ($sHTML =~ m##g) { my $sValue = $1 || ''; $sCmdline .= qq{$sArg "$sValue" } if ($sValue ne ''); } # while } # while print STDERR "\n$sCmdline\n"; return; } # if --cmdline my $iCleanup = $opts{'cleanup'}; if (0 < $iCleanup) { print STDERR " + doing cleanup $iCleanup in ==$qid==\n" if $opts{debug}; # We don't care what the timezone is, we only deal with days: &Date_Init('TZ=US/Eastern'); my $dateCutoff = &DateCalc('today', " - $iCleanup days"); my $sCutoff = &UnixDate($dateCutoff, '%Y%m%d'); print STDERR " + cutoff date is ==$sCutoff==\n" if $opts{debug}; opendir(DIR, $qid) or die(" --- can not read directory $qid: $!"); CLEANUP_FILE: while (my $sFile = readdir(DIR)) { next CLEANUP_FILE if $sFile eq '.'; next CLEANUP_FILE if $sFile eq '..'; print STDERR " + found file ==$sFile==\n" if $opts{debug}; if ($sFile lt $sCutoff) { print STDERR " + unlink ==$sFile==\n" if $opts{debug}; unlink qq{$qid/$sFile} or warn $!; } # if file is old } # while closedir DIR; # Now delete old lines from index.html: my $sFile = qq{$qid/index.html}; my $sFileNew = qq{$sFile.new}; open(INDEX, $sFile) or die " --- can not open $sFile for read: $!"; open(NEW, '>'. $sFileNew) or die " --- can not open $sFileNew for write: $!"; local $/ = "\n"; CLEANUP_LINE: while (my $sLine = ) { if ($sLine =~ m!search on ([A-Za-z]+ \d+, \d+)!) { # This line contains a date. my $date = &ParseDate($1); my $iCmp = &Date_Cmp($date, $dateCutoff); if ($iCmp < 0) { # This line's date is before the cutoff. print STDERR " + delete line $sLine" if $opts{debug}; next CLEANUP_LINE; } # if } # if print NEW $sLine; } # while close INDEX; close NEW; copy($sFileNew, $sFile) or die " --- can not copy $sFileNew to $sFile: $!"; unlink $sFileNew; # No big deal if this fails # All done: return; } # if --cleanup $qid .= '/' unless substr($qid,-1,1) eq '/'; # we MUST have a / # Read index.html and break into fields. 
my ($SummaryTop, $SummaryQuery, $SummarySearchEngine, $SummaryURLFilter, $SummaryHeading, $SummaryTemplate, $Summary, $WeeklyHeading, $WeeklyTemplate, $Weekly, $SummaryBottom, @SummaryQueryOptions) = &get_summary_parts($qid); $v_dbg && print STDERR " + got summary parts...\n"; # Split the old summary into a list. (later sort it) my ($url, $description, $title); my (@old_summary_url,@old_summary_title); my @old_summary = split(/\n/,$Summary); my ($line); my ($i, $n, $j, $m); # Break each hyperlink into its url and title. for my $i (0..$#old_summary) { # $old_summary -> $url & $title $line = $old_summary[$i]; if ($line =~ m#(.*)
#i) { $url = $1; $title = $2; push (@old_summary_url,$url); push (@old_summary_title,$title); } # if } # for # For each item in weekly list: # If it is in old summary list remove it, else leave it. # Hint: note alphabetical order to reduce searching. # Append weekly list to summary list. Sort it. # Use summary list to build a new summary. # Output first half of the NEW index.html page. (summary) # Append either "no new results" (if zero unique left in weekly list) # or the date to weekly results list. # Output second half of NEW index.html page. (weekly results) # # If non-zero, use weekly list to build weekly file. # These are the input params to autosearch: # 1) qid, 2) query name, 3) query string, 4) search engine, # 5) query options, and 6) url filter RE. # Dispose of input params as follows: # Insert the Query Name into 'Top' for index.html & date.html, # Insert the Query String, Search Engine, and URL Filter into # the 'HTML fields' from index.html # Precedence of input: Query Name (pretty name) and Query String (search engine) # Search Engine (AltaVista), Search Options (## need example ##), # URL Filter (to suppress display and tracking of particular URLs) # 1) from existing file or default files (files already in place), or # 2) from user. # (either a) command line or b) "ask user" (or error if we don't dare)) my ($QueryName, $QueryString, $SearchEngine, $URLFilter); $QueryName = $query_name; # did we get a Query Name/String/Engine/Options/Filter from existing files? 
if ($SummaryTop =~ m/AutoSearch WEB Searching/i) { if ($query_name eq '') { # We did not get --queryname from command line $QueryName = &read_query("Please enter a Query Name:"); } # if $SummaryTop =~ s/AutoSearch WEB Searching/$QueryName/i; } # if print STDERR "Query Name is \"$QueryName\"\n" if $v_dbg; if ($SummaryQuery =~ m/ask user/i) { if ($query_string ne '') { # We did get --querystring from command line $QueryString = $query_string; } else { # We did NOT get --querystring, ask the user: $QueryString = &read_query("Please enter a Query String:"); } $SummaryQuery =~ s/ask user/$QueryString/i; print STDERR "Query String is \"$QueryString\"\n" if $v_dbg; } # if my $sTitle = " Search results for $SummaryQuery as of $now \n"; # This is not a required field. # This MUST BE 'ask user' to get AutoSearch to ask. if ($SummarySearchEngine =~ m/ask user/i) { if ($search_engine ne '') { # We DID get --engine on command line: $SearchEngine = $search_engine; } else { # We did NOT get --engine, ask user: $SearchEngine = &read_query("Please enter a Search Engine:"); } $SummarySearchEngine =~ s/ask user/$SearchEngine/i; } else { # don't ask; try command line: if ($search_engine ne '') { # We DID get --engine on command line: $SearchEngine = $search_engine; } else { # No --engine on command line, no Search Engine in index file. # Use whatever was in the first_index.html file $SearchEngine = $SummarySearchEngine; } $SummarySearchEngine = $SearchEngine; } print STDERR "Search Engine is \"$SearchEngine\"\n" if $v_dbg; # This is not a required field. # This MUST BE 'ask user' to get AutoSearch to ask. # # print $SummaryQueryOptions[0], " v. ask user\n"; if ($SummaryQueryOptions[0] =~ m/ask user/i) { # either ask or use command line # print $SummaryQueryOptions[0], " is ask user\n"; if (defined($opts{'o'})) { # from command line ? 
# print STDERR "defined.\n"; # use $query_options } else { # no, ask 'em # print STDERR "not defined.\n"; @SummaryQueryOptions = &read_query_list("Please enter Query Options:"); $query_options = {}; foreach (@SummaryQueryOptions) { next if m/^$/; my($key, $value) = m/^([^=]+)=(.*)$/; # print STDERR "option:$_ is $key=$value\n"; &add_to_hash($key, &WWW::Search::escape_query($value), $query_options); } } } else { # or use what came from index.html file $query_options = {}; foreach (@SummaryQueryOptions) { next if m/^$/; my($key, $value) = m/^([^=]+)=(.*)$/; # print STDERR "option:$_ is $key=$value\n"; &add_to_hash($key, &WWW::Search::escape_query($value), $query_options); } } # this is not a required field. # this MUST BE ask user to get AutoSearch to ask. if ($SummaryURLFilter =~ m/ask user/i) { # no, shall we ask the user if (defined($url_filter)) { # from command line ? $URLFilter = $url_filter; } else { # no, ask 'em $URLFilter = &read_query("Please enter a URL Filter:"); } $SummaryURLFilter =~ s/ask user/$URLFilter/i; } else { # don't ask; try command line if (defined($url_filter)) { # from command line ? $URLFilter = $url_filter; # yes } else { # if no comamnd line, no filter!! $URLFilter = $SummaryURLFilter; # use whatever was in the first_index.html file } $SummaryURLFilter = $URLFilter; } if ($s_dbg) { print STDERR qq{Query is : "$SummaryQuery"}; print STDERR qq{ with "@SummaryQueryOptions"} if ($#SummaryQueryOptions); print STDERR "\n"; } # if print STDERR "URL Filter is \"$URLFilter\"\n" if $v_dbg; # # now locate the weekly format file. # 1) qid/date.html, or 2) first_date.html, or 3) create one. &check_date_file($qid,$QueryName,$QueryString); #make qid/date.html # read date.html and break into fields. # make the search results into a list of urls, title, & descr). # (later sort it) # note: Top & Bottom CAN BE different from index.html. 
my($WeeklyTop, $AppendedHeading,$AppendedTemplate,$Appended, $SuspendedHeading,$SuspendedTemplate,$Suspended, $WeeklyBottom) = &get_weekly_parts($qid); # insert queryname into html Top from date.html # usually this is not set up, because when we created the file we didn't # have the data. Do we have the Query Name? $WeeklyTop =~ s/>AutoSearch WEB Searching{_host} = $opts{'h'} if defined($opts{'h'}); $search->{_port} = $opts{'p'} if defined($opts{'p'}); if (defined($opts{'http_proxy'}) && ($opts{'http_proxy'} ne '')) { print STDERR qq{ + applying http_proxy }, Dumper(\$opts{http_proxy}) if $opts{'debug'}; $search->http_proxy(['http', ] => $opts{'http_proxy'}); if (defined($opts{'http_proxy_user'}) && ($opts{'http_proxy_user'} ne '')) { print STDERR qq{ + applying $opts{http_proxy_user}...\n} if $opts{'debug'}; $search->http_proxy_user($opts{'http_proxy_user'}); $search->http_proxy_pwd($opts{'http_proxy_pwd'}); } # if } # if elsif ($opts{'env_proxy'}) { $search->env_proxy($opts{'env_proxy'}); } elsif (0) { # This is the OLD code: $search->http_proxy($ENV{'HTTP_PROXY'}) if ($ENV{'HTTP_PROXY'}); $search->http_proxy($ENV{'http_proxy'}) if ($ENV{'http_proxy'}); } # if # submit search w/options. $search->native_query(WWW::Search::escape_query($SummaryQuery), $query_options); $search->login($opts{'userid'}, $opts{'password'}); # Process the --ignore_channels argument(s): my @asChannel; foreach my $sChannel (@{$opts{'ignore_channels'}}) { push @asChannel, split(/,/, $sChannel); } # foreach if ($search->can('ignore_channels') && scalar(@asChannel) ) { $search->ignore_channels(@asChannel); } # if # examine search results my($next_result); my(@new_weekly_url,@new_weekly_title,@new_weekly_description); my @aoResult; # Parallel array to new_weekly_url my(@weekly_url,@weekly_title); # care to see the old summary list? # print STDERR "old summary:\n"; # foreach $line (@old_summary_url) { # print STDERR "$line\n"; # } # how many hits? 
# convert latest search results to a list of urls (descriptions & titles) # filtered by $SummaryURLFilter called new_weekly_* NEXT_URL: while ($next_result = $search->next_result()) { # page-by-page $url = $next_result->url; $hits++; # how many were returned? if ($local_filter) { # exclude old pages from prev. version? # let's not display references to our own pages. next if $url =~ m,www\.isi\.edu/div7/ib/(.+)/(\d+)\.html$,o; # let's not display references to our old pages. next if $url =~ m,www\.isi\.edu/div7/ib/jog,o; } # let the user filter out URLs. if ( ($SummaryURLFilter) && ($url =~ m,$SummaryURLFilter,oi) ) { # print STDERR "filter out $url \n with filter: $SummaryURLFilter\n"; $url_filter_count++; next; } $saved++; # how many were saved? push(@weekly_url,$url); # the complete set of hits $title = $next_result->title; push(@weekly_title,$title); # Was it in the old summary? If so, don't save it. # If not, it is a new search results for this week. foreach $line (@old_summary_url) { # See if this url is in the summary; skip this url if it's in # the summary: next NEXT_URL if ($url eq $line); } # foreach print STDERR "url:$url ** new result **\n" if $dbg_search; print "$url\n" if $opts{'listnewurls'}; $description = $next_result->description; print STDERR "description: ", $description, "\n" if $dbg_search; print STDERR "title: ", $title, "\n" if $dbg_search; push(@new_weekly_url,$url); # the newest set of hits (added) push(@new_weekly_description,$description); push(@new_weekly_title,$title); push @aoResult, $next_result; } # while NEXT_URL # Report errors, if any: my $response = $search->response(); if ($response->is_success) { if ($hits == 0) { print STDERR "Warning: Empty results set.\n" if ($v_dbg); } # if no hits } # if HTTP success else { my $sMsg = "Error: " . http_error_as_nice_string($response); print STDERR "$sMsg\n"; $sEmail .= "$sMsg\n"; } # else HTTP error # Only save the ones that don't show up in the current query list. 
# Those we shall call suspended_* my (@suspended_url,@suspended_title); # We must use a for loop, to get the urls to match their descr & # title. OLD_URL: for my $i (0..$#old_summary_url) { $url = $old_summary_url[$i]; for my $j (0..$#weekly_url) { $line = $weekly_url[$j]; # if we match the weekly (active search) hits, skip it. next OLD_URL if $url eq $line; } # for $j printf STDERR "suspend: [%02d]%s\n",$i,$url if $dbg_search; # not found? save it, it's been suspended push(@suspended_url,$url); $title = $old_summary_title[$i]; push(@suspended_title,$title); } # for $i # stats?? (to see 'em use -stats) print STDERR "old summary count: ",$#old_summary_url + 1,"\n" if ($s_dbg); print STDERR "new raw hits : ",$hits,"\n" if ($s_dbg); print STDERR "urls filtered : ",$url_filter_count,", filter \"",$URLFilter,"\"\n" if ($s_dbg); print STDERR "not filtered : ",$saved,"\n" if ($s_dbg); print STDERR "results set count: ",$#new_weekly_url + 1,"\n" if ($s_dbg); print STDERR "suspended count : ",$#suspended_url + 1,"\n" if ($s_dbg); print STDERR "final count : ",$#weekly_url + 1,"\n" if ($s_dbg); my $changes = (($#new_weekly_url != -1) || ($#suspended_url != -1)); # printf STDERR "changes is %d\n",$changes if ($v_dbg); # For every search, AutoSearch (aka AS) will make a 'weekly' file. # Usually AutoSearch is run as a 'cron' job; but can be run manually. # AS will attempt to handle multiple (non-concurrent) runs per day. my $file = (&time_file_of_the_day_numeric).'.html'; my $section; my $junk; # to test this 'diff' code 1) rm -r qid/* 2) run AS -stats qid -qn -qs # 3) edit qid/index.html remove some, add some (new urls) # 4) run AS -stats qid; look at .html # you will see 'deleted' (from above) as 'recently added' & # 'added' (from above) as 'recently suspended' # reason: the changes were forced in 'old summary', # usually (normal, non-testing cases) the changes will appear in new_weekly_* # now if this is a run that adds information (re) write the file. 
# # do we have a reason to care? New Results, Appensions, Suspensions??? my $iNewResultCount = 0; if ($changes) { # print STDERR "test file: $qid$file\n"; if (-e $qid.$file) { # File already exists; modify it. print STDERR "modify existing weekly file.\n" if ($v_dbg); # Copy in previous file from today: open (PARTS,'<'.$qid.$file) || die "Can't open weekly input file.\nReason: $!\n"; my($part) = ; close (PARTS); # print STDERR "Part:\"$part\"\n"; # break into parts to insert modifications my($part1,$part2) = split (//,$part,2); my($part3,$part4) = split (//,$part2,2); # write back out w/ changes. open (HTML,'>'.$qid.$file) || die "Can't open weekly output file.\nReason: $!\n"; print HTML $part1; $n = $#new_weekly_url + 1; if ($n) { print HTML "\n"; print HTML "

Recently Added:

\n"; } # format each unique result. for ($i=0; $i < $n; $i++) { $url = $new_weekly_url[$i]; $title = $new_weekly_title[$i]; $description = $new_weekly_description[$i]; my $sHTMLlink = &make_link($AppendedTemplate, $url, $title, $description); print HTML "$sHTMLlink\n"; # print STDERR " DDD search is $search\n"; $sEmail .= q{

}. $search->result_as_HTML($aoResult[$i], '%Y-%m-%d %H:%M %Z'); } # for print HTML ""; # replace due to split print HTML $part3; $n = $#suspended_url + 1; if ($n) { print HTML "\n"; print HTML "

Recently Suspended:

\n"; } # format each suspended result. for ($i=0; $i < $n; $i++) { $url = $suspended_url[$i]; $title = $suspended_title[$i]; print HTML &make_link($SuspendedTemplate,$url,$title,""),"\n"; } print HTML ""; # replace due to split print HTML $part4; close (HTML); } else { # create the file # print STDERR "make new weekly file.\n" if ($v_dbg); open (HTML,'>'.$qid.$file) || die "Can't open weekly output file.\nReason: $!\n"; print HTML $sTitle; print HTML "\n"; print HTML "\n$WeeklyTop\n"; DEBUG_EMAIL && print STDERR " + start creating email, new_weekly_url is ", Dumper(\@new_weekly_url); # Output weekly search status: appended $iNewResultCount = $#new_weekly_url + 1; # Always do the header: print HTML "\n"; print HTML &format_link($AppendedHeading,"DATE", $now) if $iNewResultCount; print HTML "\n"; $section = "Appended"; # the section of the file/output. print HTML "\n"; # Format each unique result: for my $i (0..$iNewResultCount-1) { $url = $new_weekly_url[$i]; $title = $new_weekly_title[$i]; $description = $new_weekly_description[$i]; my $sHTMLlink = &make_link($AppendedTemplate, $url, $title, $description); print HTML "$sHTMLlink\n"; $sEmail .= q{

}. $search->result_as_HTML($aoResult[$i], '%Y-%m-%d %H:%M %Z'); } # for $i print HTML $Appended; print HTML "\n\n"; # output weekly search status: suspended $n = $#suspended_url + 1; # always do the heading print HTML "\n"; print HTML &format_link($SuspendedHeading,"DATE","$now") if $n; print HTML "\n"; $section = "Suspended"; # the section of the file/output. print HTML "\n"; for ($i=0; $i < $n; $i++) { $url = $suspended_url[$i]; $title = $suspended_title[$i]; print HTML &make_link($SuspendedTemplate,$url,$title,""),"\n"; } # for $i print HTML $Suspended; print HTML "\n\n"; print HTML "\n$WeeklyBottom\n"; close (HTML); } # else daily file doesn't exist } # if any new hits else { print STDERR "no weekly changes required.\n" if ($v_dbg); } $QueryName = $SummaryQuery if ($QueryName eq ''); # Now write the new index file. # Create the index.html output file: open (HTML,'>'.$qid.'index.html') || die "Can't open summary output file.\nReason: $!\n"; print HTML " Summary of Search results for $SummaryQuery\n"; print HTML "\n"; print HTML "\n$SummaryTop\n"; print HTML "\n"; print HTML "\n"; print HTML "\n" if ($SummarySearchEngine); print HTML "\n" if ($SummaryURLFilter); foreach my $key (keys (%{$query_options})) { # print STDERR "option::$key=$query_options->{$key}\n"; print HTML "\n"; } # output summary of updated unique findings print HTML "\n"; print HTML &format_link($SummaryHeading, "DATE", $now); print HTML "\n"; $section = "Summary"; # the section of the file/output. print HTML "\n"; if ($#new_weekly_url < 0) { # it has't changed; just re-cycle it. print HTML $Summary; } else { # format each unique result. # $description = ""; $n = $#weekly_url + 1; for ($i=0; $i < $n; $i++) { $url = $weekly_url[$i]; $title = $weekly_title[$i]; print HTML &make_link($SummaryTemplate,$url,$title,""),"\n"; } } # Output daily results status (none or ptr to new file). 
print HTML "\n\n"; print HTML "\n"; print HTML &format_link($WeeklyHeading,"DATE", $now); print HTML "\n"; $section = "Weekly"; # the section of the file/output. print HTML "\n"; # report errors FIRST. if ($hits == 0) { my($response) = $search->response(); if ($response->is_success) { # dbg message, normally we're quiet # print HTML "AutoSearch Warning: Empty Results Set.
\n"; } else { # SearchEngine error message: my $sMsg = "AutoSearch Error during search on $today: " . http_error_as_nice_string($response); print HTML "$sMsg
\n"; $sEmail .= "$sMsg\n"; } } # if no hits # Let's use reverse chronological order. # Update the 'weekly' status: if ($changes) { # there were changes. # second run today? if ($Weekly =~ m/^No unique results found for(.*)$today/i) { # print STDERR "change 'No' to 'Yes'\n"; # The first line SHOULD be today's, assume so. # Delete first line of $Weekly and write new link: ($junk, $Weekly) = split (/\n/,$Weekly,2); # split off first line print HTML "Web search results for search on ",$today,"
\n"; } elsif ($Weekly =~ m/^AutoSearch Error during search on $today/i) { # print STDERR "change 'Error' to 'Yes'\n"; # The first line SHOULD be today's, assume so. # Delete first line of $Weekly and write new link: ($junk, $Weekly) = split(/\n/, $Weekly, 2); # split off first line print HTML "Web search results for search on ",$today,"
\n"; } elsif ($Weekly =~ m/^Web search results for(.*)$today/) { # yes we already have a link, leave it. # print STDERR "leave 'Yes'\n"; } else { # no link for today of any kind. add one. # print STDERR "insert 'Yes'\n"; print HTML "Web search results for search on ",$today,"
\n"; } } # there were any $changes else { # second run today? if ($Weekly =~ m/^No unique results found for(.*)$today/i) { # print STDERR "leave 'No'\n"; } elsif ($Weekly =~ m/^AutoSearch Error during search on $today/i) { # print STDERR "change 'Error' to 'No'\n"; # The first line SHOULD be today's, assume so. # Delete first line of $Weekly and write new link: ($junk, $Weekly) = split(/\n/, $Weekly, 2); # split off first line print HTML "No unique results found for search on ",$today,"
\n"; } elsif ($Weekly =~ m/^Web search results for(.*)$today/) { # We found results earlier today, leave them. # print STDERR "leave 'Yes'\n"; } else { # print STDERR "insert 'No'\n"; print HTML "No unique results found for search on ",$today,"
\n"; } } # there were no $changes print HTML $Weekly; print HTML "\n\n"; print HTML "\n$SummaryBottom\n"; close (HTML); # DEBUG_EMAIL && print STDERR " + opts{m} =$opts{m}=\n"; # DEBUG_EMAIL && print " + raw sEmail =====$sEmail=====\n"; if (($opts{'m'} ne '') && ($sEmail ne '')) { $sEmail = <<"EMAILEND"; $sTitle

The following $iNewResultCount URLs are new matches for your $SummarySearchEngine query '$SummaryQuery':

$sEmail EMAILEND # DEBUG_EMAIL && print STDERR " + cooked sEmail =====$sEmail=====\n"; # DEBUG_EMAIL && exit 88; my $sSubject = "Results of AutoSearch query '$SummaryQuery'"; my $sFrom = 'AutoSearch@localhost'; $sFrom = $opts{emailfrom} if ($opts{emailfrom} ne ''); my $oMsg = Email::MIME->create( header => [ To => $opts{'m'}, Subject => $sSubject, From => $sFrom, ], attributes => { 'content_type' => 'text/html', }, body => $sEmail, ); DEBUG_EMAIL && print STDERR " + oMsg is ", Dumper($oMsg); if (ref($oMsg)) { DEBUG_EMAIL && print STDERR " + oMsg in sendable format is:\n"; DEBUG_EMAIL && Email::Send::send(IO => $oMsg); my $sRes = &send_email($oMsg); print STDERR "result of send_email() is ==$sRes=\n" if ($v_dbg || ($sRes ne '1')); # print STDERR $sRes if ($v_dbg || ($sRes ne '1')); } else { print STDERR " --- could not create Email::MIME object\n"; } } # if } # main sub send_email { my $oMessage = shift; my $sSMTPserver = $ENV{SMTPSERVER} || shift || ''; if ($sSMTPserver ne '') { my $sUsername = $ENV{SMTPUSERNAME} || shift || ''; my $sPassword = $ENV{SMTPPASSWORD} || shift || ''; if ($TESTONLY) { $sUsername = '' if ($sUsername eq ''); $sPassword = ($sPassword eq '') ? '' : ''; return qq{$0 chose method SMTP::Auth with server=$sSMTPserver, username=$sUsername, password=$sPassword}; } # if return Email::Send::SMTP->send($oMessage, Host => $sSMTPserver, username => $sUsername, password => $sPassword, ); } # if SMTP if ( # User has customized the pointer to the qmail program: ($Email::Send::Qmail::QMAIL ne 'qmail-inject') || # I hate that "used only once" warning: ($Email::Send::Qmail::QMAIL ne 'qmail-inject') # HOW else do we indicate in the environment that we want to use # Qmail? ) { if ($TESTONLY) { return qq{$0 chose method Qmail}; } # if return Email::Send::Qmail::send($oMessage, @_); } # if qmail # For undef warnings: $ENV{SENDMAIL} ||= ''; if ( ($^O =~ m!solaris!i) # A fair assumption? || ($^O =~ m!linux!i) # A fair assumption? 
) { # User has customized the pointer to the sendmail program: if ($ENV{SENDMAIL} ne '') { $Email::Send::Sendmail::SENDMAIL = $Email::Send::Sendmail::SENDMAIL = $ENV{SENDMAIL}; } # if if ($TESTONLY) { return qq{$0 chose method Sendmail}; } # if return Email::Send::Sendmail::send($oMessage, @_); } # if solaris or linux return "Error: can not determine email send method\n"; } # send_email #=head1 NAME # #check_index_file($qid) - insure the index.html file exists. # # #=head1 DESCRIPTION # #If qid/index.html exists just return. #Else create qid/ if necessary and call make_index #to make the F file. # #=cut sub check_index_file { my ($qid) = @_; # do we have the necessary infrastructure? if (open (FIRST,"<$qid/index.html") ) { # yes # OK, close it. close (FIRST); } else { # no, make dir. if (mkdir ($qid, 0755) ) { if ($! =~ m/file exists/i) { # already done die "Can't create directory $qid.\nReason $!"; } } chmod 0755, $qid || die "Can't chmod directory $qid.\nReason $!"; &make_index($qid); } print STDERR "index.html exists, " if $main::v_dbg; } # check_index_file #=head1 NAME # #check_date_file($qid) - insure the date.html file exists. # #=head1 DESCRIPTION # #If qid/date.html exists just return. #Else create qid/ if necessary and call make_date #to make the F file. # #=cut sub check_date_file { my($qid,$qn,$qs) = @_; # do we have the necessary infrastructure? if (open (FIRST,'<'.$qid."date.html") ) { # yes # OK, close it. close (FIRST); } else { # no, make dir. if (mkdir ($qid, 0755) ) { if ($! =~ m/file exists/i) { # already done die "Can't create directory $qid.\nReason $!"; } } chmod 0755, $qid || die "Can't chmod directory $qid.\nReason $!"; &make_date($qid,$qn,$qs); } print STDERR "date exists, " if $main::v_dbg; } #=head1 NAME # #read_query($prompt) - read a string from STDIN. 
#=head1 NAME
#
#read_query($prompt) - read a single line from STDIN.
#
#=head1 DESCRIPTION
#
#Prompt on STDOUT, then read STDIN one byte at a time until a newline
#(0x0A) arrives.  A backspace byte (0x08) deletes the most recently
#typed character, giving crude line editing even on raw terminals.
#
#=cut

sub read_query {
  my ($prompt) = @_;
  # Turn on autoflush for STDOUT so the prompt appears before we block
  # waiting for input:
  my $previous_fh = select(STDOUT);
  $| = 1;
  select($previous_fh);
  print $prompt;
  my $answer = '';
  my $byte = '';
  while (read(STDIN, $byte, 1))
    {
    # A newline terminates the answer:
    last if ($byte eq "\x0a");
    if ($byte eq "\x08")
      {
      # Backspace: discard the last character typed:
      chop($answer);
      }
    else
      {
      $answer .= $byte;
      }
    } # while
  return ($answer);
  } # read_query

#=head1 NAME
#
#read_query_list($prompt) - read a list of lines from STDIN.
#
#=head1 DESCRIPTION
#
#Repeatedly call read_query() with the same prompt, collecting the
#answers, until the user enters an empty (or otherwise false) line.
#
#=cut

sub read_query_list {
  my ($prompt) = @_;
  my @answers;
  while (my $answer = read_query($prompt))
    {
    push(@answers, $answer);
    } # while
  return (@answers);
  } # read_query_list

#=head1 NAME
#
#C<format_link($template,$field,$data)> - fill in one field of a template.
#
#=head1 DESCRIPTION
#
#Replace the first occurrence of $field with $data in the given
#template and return the result; the caller's template string is not
#modified.  For example:
#  &format_link("Hello place.\n","place","world");
#produces
#  Hello world.
#
#Used to replace default strings in the default documents with
#user-supplied information.
#
#=cut

sub format_link {
  my ($template, $placeholder, $replacement) = @_;
  # Work on a copy so the caller's template survives intact:
  my $result = $template;
  $result =~ s/$placeholder/$replacement/;
  return ($result);
  } # format_link

#=head1 NAME
#
#C<make_link($template,$url,$title,$description)> - create a hyper-link.
#
#=head1 DESCRIPTION
#
#Substitute the literal placeholders URL, TITLE, and DESCRIPTION (in
#that order, first occurrence only) in $template with $url, $title, and
#$description respectively.  An undefined argument leaves its
#placeholder untouched.
#
#Used to convert a url template to a hyper-link with title and
#description.
#
#=cut

sub make_link {
  my ($template, @values) = @_;
  my $html = $template;
  # The replacement order (URL, then TITLE, then DESCRIPTION) is part of
  # the historical behavior and must be preserved:
  my @placeholders = qw(URL TITLE DESCRIPTION);
  for my $i (0 .. $#placeholders)
    {
    my $placeholder = $placeholders[$i];
    my $value = $values[$i];
    next unless defined($value);
    $html =~ s/$placeholder/$value/;
    } # for
  return ($html);
  } # make_link

#=head1 NAME
#
#C<make_no_link(...)> - create a plain (non-hyper-linked) url entry.
#=head1 DESCRIPTION
#
#Replace the URL, TITLE, and DESCRIPTION with $url, $title, and
#$description (respectively) in the $template given.
#
#Used to convert a url template to a plain url entry with title and
#description (same substitution as make_link; kept separate so the two
#templates can diverge).
#
#=cut

sub make_no_link {
  my ($template, $url, $title, $description) = @_;
  my ($link) = $template;
  $link =~ s/URL/$url/ if defined($url);
  $link =~ s/TITLE/$title/ if defined($title);
  $link =~ s/DESCRIPTION/$description/ if defined($description);
  return ($link);
  } # make_no_link

#=head1 NAME
#
#C<get_weekly_parts($qid)> - break the date.html template into sub-fields.
#
#=head1 DESCRIPTION
#
#Read qid/date.html and extract the data elements, templates and
#headings (Top, Appended*, Suspended*, Bottom) that determine the
#format of all derived weekly pages.
#
#=cut

sub get_weekly_parts {
  my ($qid) = @_;
  # NOTE(review): this assigns the *global* input record separator.
  # undef is slurp mode, not "paragraph mode" as an old comment claimed
  # (paragraph mode is $/ = '').  Later whole-file reads elsewhere in
  # this program appear to rely on the slurp setting leaking out of this
  # sub, so it is deliberately NOT "local $/".
  $/ = undef();
  my ($Top, $Bottom);
  my ($AppendedHeading, $AppendedTemplate, $Appended);
  my ($SuspendedHeading, $SuspendedTemplate, $Suspended);
  open (PARTS, '<'.$qid.'date.html') || die "Can't open first date input file.\nReason: $!\n";
  # Slurp the whole template in one read ($/ is undef).  The readline
  # operator here was lost to text extraction and has been restored:
  my ($part) = <PARTS>;
  close (PARTS);
  $Top = &get_pair_part($part, "Top");
  $AppendedHeading = &get_part($part, "AppendedHeading");
  $AppendedTemplate = &get_part($part, "AppendedTemplate");
  $Appended = &get_pair_part($part, "Appended");
  $SuspendedHeading = &get_part($part, "SuspendedHeading");
  $SuspendedTemplate = &get_part($part, "SuspendedTemplate");
  $Suspended = &get_pair_part($part, "Suspended");
  $Bottom = &get_pair_part($part, "Bottom");
  return ($Top,
          $AppendedHeading, $AppendedTemplate, $Appended,
          $SuspendedHeading, $SuspendedTemplate, $Suspended,
          $Bottom);
  } # get_weekly_parts

#=head1 NAME
#
#C<get_summary_parts($qid)> - break the index.html file into sub-fields.
#
#=head1 DESCRIPTION
#
#Read qid/index.html and extract the query parameters (Query,
#SearchEngine, QueryOptions, URLFilter) and the formatting sections
#(Top, Summary*, Weekly*, Bottom) used to rebuild the summary page.
#
#=cut

sub get_summary_parts {
  my ($qid) = @_;
  # Global slurp mode on purpose; see the note in get_weekly_parts above.
  $/ = undef();
  my ($Top, $Query, $SearchEngine, @QueryOptions, $URLFilter, $Bottom);
  my ($SummaryHeading, $SummaryTemplate, $Summary);
  my ($WeeklyHeading, $WeeklyTemplate, $Weekly);
  open (PARTS, '<'.$qid.'index.html') || die "Can't open index.html input file.\nReason: $!\n";
  # Slurp the whole file (readline restored; see get_weekly_parts):
  my ($part) = <PARTS>;
  close (PARTS);
  $Top = &get_pair_part($part, "Top");
  $Query = &get_inline_part($part, "Query");
  $SearchEngine = &get_inline_part($part, "SearchEngine");
  # QueryOptions may occur multiple times; collect key=value pairs:
  @QueryOptions = &get_inline_list($part, "QueryOptions");
  $URLFilter = &get_inline_part($part, "URLFilter");
  $SummaryHeading = &get_part($part, "SummaryHeading");
  $SummaryTemplate = &get_part($part, "SummaryTemplate");
  $Summary = &get_pair_part($part, "Summary");
  $WeeklyHeading = &get_part($part, "WeeklyHeading");
  $WeeklyTemplate = &get_part($part, "WeeklyTemplate");
  $Weekly = &get_pair_part($part, "Weekly");
  $Bottom = &get_pair_part($part, "Bottom");
  return ($Top, $Query, $SearchEngine, $URLFilter,
          $SummaryHeading, $SummaryTemplate, $Summary,
          $WeeklyHeading, $WeeklyTemplate, $Weekly,
          $Bottom, @QueryOptions);
  } # get_summary_parts

#=head1 NAME
#
#C<get_pair_part($part,$mark)> - locate and return a visible section.
#
#=head1 DESCRIPTION
#
#Use a regular expression to locate the span between the HTML comments
#marking a section and return everything in between.
#
#=cut
# These objects are surrounded by HTML comments so they can be easily
# recognized, but the contents themselves always display.

sub get_pair_part {
  my ($part, $mark) = @_;
  # Sections are delimited by visible HTML comment markers, e.g.
  #   <!--Top--> ... <!--/Top-->
  # NOTE(review): the marker delimiters in this pattern were lost to
  # text extraction and have been reconstructed from the surviving
  # "\n(.*)" residue and the surrounding comments -- TODO confirm
  # against a pristine copy of AutoSearch.
  if ($part =~ m,<!--$mark-->\n(.*)<!--/$mark-->,s)
    {
    return ($1);
    }
  # print STDERR "Warning: can't find <!--$mark--> ... <!--/$mark-->\n" if ($v_dbg);
  return ("");
  } # get_pair_part

#=head1 NAME
#
#C<get_part($part,$mark)> - locate and return a hidden sub-field.
#
#=head1 DESCRIPTION
#
#Use regular expressions to locate a marked section and return its
#contents.
#
#=cut
# These objects are surrounded by similar matching x /x marks.  They
# live entirely inside HTML comments and won't be seen unless
# modified.
sub get_part { my($part,$mark) = @_; if ($part =~ m,,s) { # print STDERR "$mark: \"$1\"\n"; return ($1); } # print STDERR "Warning: can't find \n" if ($v_dbg); return (""); } #=head1 NAME # #C - locate and return sub-fields. # #=head1 DESCRIPTION # #Use regular expressions to locate and #return everything in between. # #=cut # these objects are surrounded by similiar matching x /x marks. # these objects are actually comments and won't be seen unless # modified. sub get_inline_part { my($part,$mark) = @_; if ($part =~ m,,s) { # print STDERR "inline $mark: \{$1\}\n"; return ($1); } # print STDERR "Warning: can't find \n" if ($v_dbg); return (""); } #=head1 NAME # #C - locate and return sub-fields. # #=head1 DESCRIPTION # #Use regular expressions to locate multiple occurances of # and return them as a list. # #=cut # these objects are surrounded by similiar matching x /x marks. # these objects are actually comments and won't be seen unless # modified. sub get_inline_list { my($part,$mark) = @_; if ($part =~ m,,s) { my(@PARTS) = split(/\n/, $part); my(@LINES) = grep(m,,s, @PARTS); my(@LIST,$is); foreach (@LINES) { next unless ($_ =~ m,,s); # print STDERR "inline $mark: \{$1\}\n"; push (@LIST,$1); } return (@LIST); } # print STDERR "Warning: can't find \n" if ($v_dbg); return (""); } #=head1 NAME # #C - make working copy of F. # #=head1 DESCRIPTION # #Create F from either F #or from 'memory'. # #=cut # check the root directory for default first_index file; # else make one of our own. sub make_index { my ($qid) = @_; open (INDEX, ">$qid/index.html") || die "Can't create index.html in $qid\nReason $!"; # copy user-provided file... if (open (DEFAULT, "<$qid/../first_index.html") ) { # look for a default # copy in default provided by user. 
while () { print INDEX $_; } close (DEFAULT) || die "Can't close file $qid/../first_index.html: $!"; close (INDEX) || die "Can't close file $qid/index.html: $!"; return; } # or OUR provided file my($it) = < index.html empty page to establish summary file format

AutoSearch WEB Searching

Last modified October 17, 1996. EOF print INDEX $it; close(INDEX); } #=head1 NAME # #make_date($qid) - make working copy of F. # #=head1 DESCRIPTION # #Create F from either F #or from 'memory'. # #=cut # check the root directory for default first_date file; # else make one of our own. sub make_date { my($qid,$qn,$qs) = @_; open (FIRST,'>'.$qid."date.html") || die "Can't create date in $qid\nReason $!"; # copy user-provided file... if (open (DEFAULT,'<'.$qid."../first_date.html") ) { # look for a default # copy in default provided by user. while () { s/{ask user}/\{$qs\}/ if (m,Query\{ask user\}/Query,); s/AutoSearch WEB Searching/$qn/ if (m,AutoSearch WEB Searching,); print FIRST $_; } close (DEFAULT) || die "Can't close default first_date.html file.\nReason:$!"; close (FIRST) || die "Can't close date.html file.\nReason:$!"; return; } # or OUR provided file my($it) = < first_date.html empty page to establish weekly file format

AutoSearch WEB Searching

Web searches maintained by AutoSearch. EOF $it =~ s/{ask user}/\{$qs\}/ if ($it =~ m,Query\{ask user\}/Query,); $it =~ s/AutoSearch WEB Searching/$qn/ if ($it =~ m,AutoSearch WEB Searching,); print FIRST $it; close(FIRST); } sub time_now { return strftime("%m/%d/%Y %H:%M:%S", localtime(time)); } sub time_today { return strftime("%b %d, %Y", localtime(time)); } sub time_file_of_the_day_numeric { return strftime("%Y%m%d", localtime(time)); } sub http_error_as_nice_string { my($response) = @_; my($message) = $response->message(); my($code) = $response->code(); chomp($message); return "$message (code $code)"; } sub add_to_hash { # This is a bit of a hack. A set of CGI options is not strictly a # hash, because multiple values for the same key can be specified. # To get around this, we rely on the fact that this hash of options # is only used to construct a CGI parameter list. If we see # multiple values for the same key, we append the multiple values # onto the value of the key in CGI '?key=value' format. my ($key, $value, $hashref) = @_; if (exists($hashref->{$key}) && $hashref->{$key} ne '') { # There was already an option of this key given; append # multiple values as CGI arguments: $hashref->{$key} .= "&$key=$value"; } # if exists else { # This is the only instance of this key; just insert the # hash value: $hashref->{$key} = $value; } } # add_to_hash =head1 DESCRIPTION B performs a web-based search and puts the results set in F. Subsequent searches (i.e., the second form above) B determine what changes (if any) occured to the results sent since the last run. These incremental changes are recorded in F. B is amenable to be run as a B job because all the input parameters are saved in the web pages. B can act as a automated query agent for a particular search. The output files are designed to be a set of web pages to easily display the results set with a web browser. 
Example: AutoSearch -n 'LSAM Replication' -s '"lsam replication"' -e AltaVista replication_query This query (which should be all on one line) creates a directory replication_query and fills it with the fascinating output of the AltaVista query on C<"lsam replication">, with pages titled ``LSAM Replication''. (Note the quoting: the single quotes in C<'"lsam replication"'> are for the shell, the double quotes are for AltaVista to search for the phrase rather than the separate words.) A more complicated example: AutoSearch -n 'External Links to LSAM' -s '(link:www.isi.edu/lsam or link:www.isi.edu/~lsam) -url:isi.edu' -e AltaVista::AdvancedWeb -o coolness=hot This query does an advanced AltaVista search and specifies the (hypothetical) ``coolness'' option to the search engine. =head1 OPTIONS =over =item C The I specifies the directory in which all the files that relate to this query and search results will live. It can be an absolute path, or a relative path from cwd. If the directory does not exist, it will be created and a new search started. =item C<--stats> Show search statistics: the query string, number of hits, number of filtered hits, filter string, number of suspended (deleted) hits, previous set size, current set size, etc. =item C<-v> or C<--verbose> Verbose: output additional messages and warnings. =item C<-n> or C<--qn> or C<--queryname> Specify the query name. The query name is used as a heading in the web pages, therefore it should be a 'nice' looking version of the query string. =item C<-s> or C<--qs> or C<--querystring> Specify the query string. The query string is the character string which will be submitted to the search engine. You may include special characters to group or to qualify the search. =item C<-e> or C<--engine> Specify the search engine. The query string will be submitted to the user specified search engine. In many cases there are specialized versions of search engines. For example, B and B allow more powerful and Usenet searches. 
See L or the man page for your search engine for details about specialized variations. =item C<--listnewurls> In addition to all the normal file maintenance, print all new URLs to STDOUT, one per line. =item C<-o> or C<--options> Specify the query options. The query options will be submitted to the user search engine with the query string. This feature permits modification of the query string for a specific search engine or option. More than one query option may be specified. Example: C<-o what=news> causes AltaVista to search Usenet. Although this works, the preferred mechanism in this case would be C<-e AltaVista::News> or C<-e AltaVista::AdvancedNews>. Options are intended for internal or expert use. =item C<-f> or C<--uf> or C<--urlfilter> This option specifies a regular expression which will be compared against the URLs of any results; if they match the case-insensitive regular expression, they will be removed from the hit set. Example: C<-f '.*\.isi\.edu'> avoids all of ISI's web pages. =item C<--cleanup i> Delete all traces of query results from more than i days ago. If --cleanup is given, all other options other than the qid will be ignored. =item C<--cmdline> Reconstruct the complete command line (AutoSearch and all its arguments) that was used to create the query results. Command line will be shown on STDERR. If --cmdline is given, all other options other than the qid will be ignored. =item C<--mail user@address> or C<-m user@address> After search is complete, send email to that user, listing the NEW results. Email is HTML format. Requires the Email::Send and related modules. If you send email through an SMTP server, you must set environment variable SMTPSERVER to your server name or IP address. If your SMTP server requires password, you must set environment variables SMTPUSERNAME and SMTPPASSWORD. If you send email via sendmail, you should set environment variable SENDMAIL if the sendmail executable is not in the path. 
=item C<--emailfrom user@address> If your outgoing mail server rejects email from certain users, you can use this argument to set the From: header. =item C<--userid bbunny> If the search engine requires a login/password (e.g. Ebay::Completed), use this. =item C<--password Carr0t5> If the search engine requires a login/password (e.g. Ebay::Mature), use this. =back =head1 DESCRIPTION B submits a query to a search engine, produces HTML pages that reflect the set of 'hits' (filtered search results) returned by the search engine, and tracks these results over time. The URL and title are displayed in the F, the URL, the title, and description are displayed in the 'weekly' files. To organize these results, each search result is placed in a query information directory (qid). The directory becomes the search results 'handle', an easy way to track a set of results. Thus a qid of C might locate the results on your web server at C. Inside the qid directory you will find files relating to this query. The primary file is F, which reflects the latest search results. Every not-filtered hit for every search is stored in F. When a hit is no longer found by the search engine it a removed from F. As new results for a search are returned from the search engine they are placed in F. At the bottom of F, there is a heading "Weekly Search Results", which is updated each time the search is submitted (see L). The list of search runs is stored in reverse chronological order. Runs which provide no new information are identified with No Unique Results found for search on Runs which contain changes are identified by Web search results for search on which will be linked a page detailing the changes from that run. Detailed search results are noted in weekly files. These files are named F and are stored in the qid directory. The weekly files include THE URL, title, and a the description (if available). The title is a link to the original web page. 
=head1 AUTOMATED SEARCHING On UNIX-like systems, cron(1) may be used to establish periodic searches and the web pages will be maintained by B. To establish the first search, use the first example under SYNOPSIS. You must specify the qid, query name and query string. If any of the items are missing, you will be interactively prompted for the missing item(s). Once the first search is complete you can re-run the search with the second form under SYNOPSIS. A cron entry like: 0 3 * * 1 /nfs/u1/wls/AutoSearch.pl /www/div7/lsam/autosearch/caching might be used to run the search each Monday at 3:00 AM. The query name and query string may be repeated; but they will not be used. This means that with a cron line like: 0 3 * * 1 /nfs/u1/wls/AutoSearch.pl /www/div7/lsam/autosearch/caching -n caching -s caching a whole new search series can be originated by rm -r /www/div7/lsam/autosearch/caching However, the only reason to start a new search series would be to throw away the old weekly files. We don't recommend running searches more than once per day, but if so the per-run files will be updated in-place. Any changes are added to the page with a comment that "Recently Added:"; and deletions are indicated with "Recently Suspended:." =head1 CHANGING THE LOOK OF THE PAGES The basic format of these two pages is simple and customizable. One requirement is that the basic structure remain unchanged. HTML comments are used to identify sections of the document. Almost everything can be changed except for the strings which identify the section starts and ends. Noteworthy tags and their meaning: =over 16 =item .* The text contained within this tag is placed at the top of the output page. If the text contains I, then the query name will replace it. If the text does not contain this magic string and it is the first ever search, the user will be asked for a query name. =item The text contained between the braces is the query string. This is how B maintains the query string. 
You may edit this string to change the query string; but only in F. The text I is special and will force B to request the search string from the user. =item The text contained between the braces is the search engine. Other engines supported are HotBot and Lycos. You may edit this string to change the engine used; but only in F. The text I is special and will force B to to request the search string from the user. =item The text contained between the braces specifies a query options. Multiple occurrencs of this command are allowed to specify multiple options. =item The text contained between the braces is the URL filter. This is how B maintains the filter. Again you may edit this string to change the query string; but only in F. The text I is special and will force B to ask the user (STDIN) for the query string. When setting up the first search, you must edit F, not F. The URL filter is a standard perl5 regular expression. URLs which do not match will be kept. =item .* The text contained within this tag is placed at the bottom of the output page. This is a good place to put navigation, page owner information, etc. =back The remainder of the tags fall into a triplet of I<~Heading>, I<~Template>, and I<~>, where ~ is Summary, Weekly, Appended, and Suspended. The sub-sections appear in the order given. To produce a section B outputs the heading, the template, the section, n copies of the formatted data, and an /section. The tags and their function are: =over 16 =item ~Heading The heading tag identifies the heading for a section of the output file. The SummaryHeading is for the summary portion, etc. The section may be empty (e.g., Suspended) and thus no heading is output. =item ~Template The template tag identifies how each item is to be formatted. Simple text replacement is used to change the template into the actual output text. The text to be replaced is noted in ALLCAPS. =item ~ This tag is used to locate the section (Summary, Weekly, etc.). 
This section represents the actual n-items of data. =back You can edit these values in the F page of an existing search. The file F (in the directory above F) will be used as a default template for new queries. Examples of these files can be seen in the pages under C, or in the output generated by a new AutoSearch. =head1 FILES =over 20 =item F optional file to determine the default format of the F file of a new query. =item F optional file to determine the default format of the F file for a new query. =item F (automatically created) latest search results, and reverse chronological list of periodic searches. =item F file used as a template for the F files. =item F (automatically created) summary of changes for a particular date (AKA 'Weekly' file). =back Optional files F and F are used for the initial search as a template for F and F, respectively. If either of these files does not exist; a default-default template is stored within the F source. The intention of these two files is to permit a user to establish a framework for a group of search sets which have a common format. By leaving the default query name and query string alone, they will be overridden by command line inputs. =head1 SEE ALSO For the library, see L, for the perl regular expressions, see L. =head1 AUTHORS Wm. L. Scheding B is a re-implementation of an earlier version written by Kedar Jog. =head1 COPYRIGHT Copyright (C) 1996-1997 University of Southern California. All rights reserved. Redistribution and use in source and binary forms are permitted provided that the above copyright notice and this paragraph are duplicated in all such forms and that any documentation, advertising materials, and other materials related to such distribution and use acknowledge that the software was developed by the University of Southern California, Information Sciences Institute. 
The name of the University may not be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. =head1 DESIRED FEATURES These are good ideas that people have suggested. =over 4 =item URL validation. Validate the status of each URL (with HTTP HEAD requests) and indicate this status in the output. =item Multi-search. It should be possible to merge the results of searches from two search-engines. If this merger were done as a new search engine, this operation would be transparent to AutoSearch. =back =head1 BUGS None known at this time; please inform the maintainer mthurn@cpan.org if any crop up. =cut __END__ WWW-Search-2.517/t/0000755000175000017500000000000013101353062013222 5ustar martinmartinWWW-Search-2.517/t/test_parsing.pl0000644000175000017500000003007713101347451016275 0ustar martinmartin#!/usr/local/bin/perl -w # test.pl # Copyright (C) 1997 by USC/ISI # $Id: test_parsing.pl,v 1.1 2007-05-15 12:06:30 Daddy Exp $ # # Copyright (c) 1997 University of Southern California. # All rights reserved. # # Redistribution and use in source and binary forms are permitted # provided that the above copyright notice and this paragraph are # duplicated in all such forms and that any documentation, advertising # materials, and other materials related to such distribution and use # acknowledge that the software was developed by the University of # Southern California, Information Sciences Institute. The name of the # University may not be used to endorse or promote products derived from # this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
sub usage { print STDERR < \$debug, 'e=s' => \$desired_search_engines, 'u' => \$update_saved_files, 'v' => \$verbose, 'i' => \$do_internal, 'x' => \$do_external, ); ($do_internal, $do_external) = (1,1) unless ($do_internal || $do_external); $debug = 1 if (defined($debug) and ($debug < 1)); my $oTest = new WWW::Search::Test($desired_search_engines); $oTest->{debug} = $debug; $oTest->{verbose} = $verbose; &main(); exit 0; sub test_cases { my ($o, $query, $sSE, $sM, $file); my $bogus_query = $WWW::Search::Test::bogus_query; $sSE = 'AltaVista'; $sM = 'John Heidemann '; $file = 'zero_result_no_plus'; $oTest->test($sSE, $sM, $file, $bogus_query, $TEST_EXACTLY); $file = 'zero_result'; $query = '+LSAM +' . $bogus_query; $oTest->test($sSE, $sM, $file, $query, $TEST_EXACTLY); $file = 'one_page_result'; $query = '+LS'.'AM +Aut'.'oSearch'; $oTest->test($sSE, $sM, $file, $query, $TEST_RANGE, 2, 10); $file = 'two_page_result'; $query = '+LS'.'AM +IS'.'I +Heide'.'mann'; $oTest->test($sSE, $sM, $file, $query, $TEST_GREATER_THAN, 10); ###################################################################### $sSE = 'AltaVista::Web'; $sM = 'John Heidemann '; $file = 'zero_result'; $query = '+LSA'.'M +' . $bogus_query; $oTest->test($sSE, $sM, $file, $query, $TEST_EXACTLY); $file = 'one_page_result'; $query = '+LSA'.'M +AutoSea'.'rch'; $oTest->test($sSE, $sM, $file, $query, $TEST_RANGE, 2, 10); $file = 'two_page_result'; $query = '+LSA'.'M +IS'.'I +I'.'B'; $oTest->test($sSE, $sM, $file, $query, $TEST_GREATER_THAN, 10); ###################################################################### $sSE = 'AltaVista::AdvancedWeb'; $sM = 'John Heidemann '; $oTest->not_working($sSE, $sM); # $query = 'LS'.'AM and ' . 
$bogus_query; # $oTest->test($sSE, $sM, 'zero', $query, $TEST_EXACTLY); # $query = 'LSA'.'M and AutoSea'.'rch'; # $oTest->test($sSE, $sM, 'one', $query, $TEST_RANGE, 2, 11); # $query = 'LSA'.'M and IS'.'I and I'.'B'; # $oTest->test($sSE, $sM, 'two', $query, $TEST_GREATER_THAN, 10); ###################################################################### $sSE = 'AltaVista::News'; $sM = 'John Heidemann '; $oTest->not_working($sSE, $sM); # $query = '+pe'.'rl +' . $bogus_query; # $oTest->test($sSE, $sM, 'zero', $query, $TEST_EXACTLY); # $query = '+Pe'.'rl +CP'.'AN'; # $oTest->test($sSE, $sM, 'multi', $query, $TEST_GREATER_THAN, 30); # 30 hits/page ###################################################################### $sSE = 'AltaVista::AdvancedNews'; $sM = 'John Heidemann '; $oTest->not_working($sSE, $sM); # $query = 'per'.'l and ' . $bogus_query; # $oTest->test($sSE, $sM, 'zero', $query, $TEST_EXACTLY); # $query = 'Per'.'l and CP'.'AN'; # $oTest->test($sSE, $sM, 'multi', $query, $TEST_GREATER_THAN, 70); # 30 hits/page ###################################################################### $oTest->eval_test('AltaVista::Intranet'); ###################################################################### $oTest->no_test('Crawler', 'unsupported'); # $oTest->test($sSE, $sM, 'zero', $bogus_query, $TEST_EXACTLY); # $query = 'Bay'.'reuth Bindl'.'acher Be'.'rg Flu'.'gplatz P'.'ilot'; # $oTest->test($sSE, $sM, 'one', $query, $TEST_RANGE, 2, 10); # # 10 hits/page # $query = 'Fra'.'nkfurter Al'.'lgemeine Sonnt'.'agszeitung Rech'.'erche'; # $oTest->test($sSE, $sM, 'two', $query, $TEST_GREATER_THAN, 10); ###################################################################### $oTest->eval_test('Excite::News'); ###################################################################### $sSE = 'ExciteForWebServers'; $sM = 'Paul Lindner '; $oTest->not_working($sSE, $sM); # &test($sSE, $sM, 'zero', $bogus_query, $TEST_EXACTLY); # $query = 'bur'.'undi'; # &test($sSE, $sM, 'one', $query, 
$TEST_RANGE, 2, 99); ###################################################################### $oTest->eval_test('Fireball'); ###################################################################### $sSE = 'FolioViews'; $sM = 'Paul Lindner '; $oTest->test($sSE, $sM, 'zero', $bogus_query, $TEST_EXACTLY); $query = 'bur'.'undi'; $oTest->test($sSE, $sM, 'one', $query, $TEST_RANGE, 2, 400); ###################################################################### $oTest->eval_test('Google'); $oTest->no_test('Gopher', 'Paul Lindner '); $oTest->eval_test('GoTo'); $oTest->eval_test('HotFiles'); $oTest->eval_test('Infoseek::Companies'); $oTest->eval_test('Infoseek::Email'); $oTest->eval_test('Infoseek::News'); $oTest->eval_test('Infoseek::Web'); $oTest->no_test('Livelink', 'Paul Lindner '); $oTest->eval_test('LookSmart'); # use WWW::Search::LookSmart; # $oTest->no_test('LookSmart', $WWW::Search::LookSmart::MAINTAINER); $oTest->eval_test('Lycos'); $oTest->eval_test('Magellan'); $oTest->eval_test('MetaCrawler', 'Jim Smyser '); ###################################################################### $sSE = 'Metapedia'; $sM = 'Jim Smyser '; $file = 'zero'; $oTest->test($sSE, $sM, $file, $bogus_query, $TEST_EXACTLY); ###################################################################### $sSE = 'MSIndexServer'; $sM = 'Paul Lindner '; $oTest->not_working($sSE, $sM); # $oTest->test($sSE, $sM, 'zero', $bogus_query, $TEST_EXACTLY); # $query = 'bur'.'undi'; # $oTest->test($sSE, $sM, 'one', $query, $TEST_RANGE, 2, 99); ###################################################################### $oTest->eval_test('NetFind'); $oTest->eval_test('NorthernLight'); ###################################################################### $sSE = 'Null'; $sM = 'Paul Lindner '; $oTest->test($sSE, $sM, 'zero', $bogus_query, $TEST_EXACTLY); ###################################################################### $oTest->eval_test('OpenDirectory'); ###################################################################### 
$sSE = 'PLweb'; $sM = 'Paul Lindner '; $oTest->not_working($sSE, $sM); # $oTest->test($sSE, $sM, 'zero', $bogus_query, $TEST_EXACTLY); # $query = 'bur'.'undi'; # $oTest->test($sSE, $sM, 'one', $query, $TEST_RANGE, 2, 99); ###################################################################### $sSE = 'Search97'; $sM = 'Paul Lindner '; $oTest->not_working($sSE, $sM); # $file = 'zero'; # $oTest->test($sSE, $sM, $file, $bogus_query, $TEST_EXACTLY); # $file = 'one'; # $query = 'bur'.'undi'; # $oTest->test($sSE, $sM, $file, $query, $TEST_RANGE, 2, 99); ###################################################################### $sSE = 'SFgate'; $sM = 'Paul Lindner '; $oTest->test($sSE, $sM, 'zero', $bogus_query, $TEST_EXACTLY); $query = 'bur'.'undi'; $oTest->test($sSE, $sM, 'one', $query, $TEST_RANGE, 2, 99); ###################################################################### $oTest->no_test('Simple', 'Paul Lindner '); $oTest->eval_test('Snap'); $oTest->no_test('Verity', 'Paul Lindner '); $oTest->eval_test('WebCrawler'); $oTest->eval_test('ZDNet'); } # test_cases sub main { # print "\nVERSION INFO:\n "; # my ($cmd) = &web_search_bin . " --VERSION"; # print `$cmd`; if ($update_saved_files) { print "\nUPDATING.\n\n"; $oTest->mode($MODE_UPDATE); &test_cases(); # Can not do update AND test: return; } # if if ($do_internal) { print "\nTESTING INTERNAL PARSING.\n (Errors here should be reported to the WWW::Search maintainer.)\n\n"; $oTest->reset_error_count; $oTest->mode($MODE_INTERNAL); &test_cases(); print "\n"; if ($oTest->{error_count} <= 0) { print "All ", $oTest->mode, " tests have passed.\n\n"; } else { print "Some ", $oTest->mode, " tests failed. 
Please check the README file before reporting errors (sometimes back-ends have known failures).\n"; } } # if $do_internal if ($do_external) { print "\n\nTESTING EXTERNAL QUERIES.\n (Errors here suggest search-engine reformatting and should be\n reported to the maintainer of the back-end for the search engine.)\n\n"; $oTest->reset_error_count; $oTest->mode($MODE_EXTERNAL); &test_cases(); print "\n"; if ($oTest->{error_count} <= 0) { print "All ", $oTest->mode, " tests have passed.\n\n"; } else { print "Some ", $oTest->mode, " tests failed. Please check the README file before reporting errors (sometimes back-ends have known failures).\n"; } } # if $do_external } # main =head2 TO DO =over =item No identified needs at the moment... =back =head2 HOW IT WORKS At present there is only one function available, namely &test(). It takes at least 5 arguments. These are: 1) the name of the search engine (string); 2) the maintainer's name (and email address) (string); 3) a filename (unique among tests for this backend) (string); 4) the raw query string; 5) the test method (one of the constants $TEST_EXACTLY, $TEST_RANGE, $TEST_GREATER_THAN, $TEST_BY_COUNTING); optional arguments 6 and 7 are integers to be used when counting the results. The query is sent to the engine, and the results are compared to previously stored results as follows: If the method is $TEST_EXACTLY, the two lists of URLs must match exactly. If the method is $TEST_RANGE, the number of URLs must be between arg6 and arg7. If the method is $TEST_GREATER_THAN, the number of URLs must be greater than arg6. If the method is $TEST_BY_COUNTING, the number of URLs must be exactly arg6 (but we don't care what the URLs are). 
=cut WWW-Search-2.517/t/autosearch.t0000644000175000017500000000102013101347451015543 0ustar martinmartin# $Id: autosearch.t,v 1.9 2008-07-16 00:41:37 Martin Exp $ use ExtUtils::testlib; use File::Spec::Functions; use Test::File; use Test::More qw(no_plan); use strict; my $sProg = catfile('blib', 'script', 'AutoSearch'); my $iWIN32 = ($^O =~ m!win32!i); file_exists_ok($sProg, "$sProg exists"); SKIP: { skip 'Can not check "executable" file flag on Win32', 1 if $iWIN32; file_executable_ok($sProg, "$sProg is executable"); } # end of SKIP block pass(); # print STDERR "\n"; diag(`$sProg -V`); pass(); exit 0; __END__ WWW-Search-2.517/t/pod-coverage.t0000644000175000017500000000131713101347451015771 0ustar martinmartin # $Id: pod-coverage.t,v 1.1 2008/04/03 22:12:55 Martin Exp $ use strict; use warnings; use Test::More; use blib; # BEGIN { sub Pod::Coverage::TRACE_ALL () { 1 } } # BEGIN { sub TRACE_ALL () { 1 } } # Ensure a recent version of Test::Pod::Coverage my $min_tpc = 1.08; eval "use Test::Pod::Coverage $min_tpc"; plan skip_all => "Test::Pod::Coverage $min_tpc required for testing POD coverage" if $@; # Test::Pod::Coverage doesn't require a minimum Pod::Coverage version, # but older versions don't recognize some common documentation styles my $min_pc = 0.18; eval "use Pod::Coverage $min_pc"; plan skip_all => "Pod::Coverage $min_pc required for testing POD coverage" if $@; all_pod_coverage_ok(); __END__ WWW-Search-2.517/t/null-empty.t0000644000175000017500000000137313101347451015526 0ustar martinmartin# $Id: null-empty.t,v 1.2 2006-04-21 20:59:08 Daddy Exp $ use ExtUtils::testlib; use Test::More 'no_plan'; my $sMod; BEGIN { $sMod = 'WWW::Search::Null::Empty'; use_ok('WWW::Search'); use_ok($sMod); } # end of BEGIN block ok(my $iCount = 4); ok(my $oSearch = new WWW::Search('Null::Empty')); isa_ok($oSearch, $sMod); $oSearch->native_query('Makes no difference what you search for...'); my @aoResults = $oSearch->results; is(scalar(@aoResults), 0, 'got zero results'); 
is($oSearch->approximate_result_count, 0, 'got the right approx_results'); # ...But you get an HTTP::Response object with a code of 200 my $oResponse = $oSearch->response; is($oResponse->code, 200, 'response code'); ok($oResponse->is_success, 'HTTP::Response is success'); __END__ WWW-Search-2.517/t/use.t0000644000175000017500000002004713101347451014213 0ustar martinmartin# $Id: use.t,v 1.27 2008-07-14 03:11:10 Martin Exp $ use strict; use warnings; use ExtUtils::testlib; use Test::More 'no_plan'; use IO::Capture::Stderr; ok(my $oICE = IO::Capture::Stderr->new); BEGIN { use_ok('WWW::Search'); use_ok('WWW::SearchResult'); use_ok('WWW::Search::Result'); use_ok('WWW::Search::Test', qw(new_engine run_gui_test run_test skip_test count_results)); } # end of BEGIN block my @as; eval { @as = &WWW::Search::installed_engines }; ok(0 < scalar(@as), 'any installed engines'); diag('FYI the following backends are already installed (including ones in this distribution): '. join(', ', sort @as)); my %hs = map { $_ => 1 } @as; my $sBackend = 'AltaVista'; $ENV{HARNESS_PERL_SWITCHES} ||= ''; if (exists $hs{$sBackend} && ($ENV{HARNESS_PERL_SWITCHES} =~ m!Devel::Cover!)) { diag(qq{You are running 'make cover' and I found WWW::Search::$sBackend installed, therefore I will do some live searches to enhance the coverage testing...}); my $o = new WWW::Search($sBackend); $o->native_query('Vader'); $o->maximum_to_retrieve(111); my @ao = $o->results(); } # if # Make sure an undef query does not die; my $o1 = new WWW::Search; # NO BACKEND SPECIFIED ok(ref $o1); my @ao = $o1->results(); ok(ref $o1->response); ok($o1->response->is_error); ok(scalar(@ao) == 0); # Make sure an empty query does not die; my $o2 = new WWW::Search; # NO BACKEND SPECIFIED ok(ref $o2); $o2->native_query(''); # EMPTY STRING my @ao2 = $o2->results(); ok(ref $o2->response); ok($o2->response->is_error); ok(scalar(@ao2) == 0); # Tests for approx_result_count: is($o2->approximate_result_count(3), 3); 
is($o2->approximate_result_count(undef), 3); is($o2->approximate_result_count(''), 3); is($o2->approximate_result_count(0), 0); is($o2->approximate_result_count(2), 2); is($o2->approximate_hit_count(undef), 2); is($o2->approximate_hit_count(-1), 2); # Test for what happens when a backend is not installed: my $o3; eval { $o3 = new WWW::Search('No_Such_Backend') }; like($@, qr{(?i:can not load backend)}); my $iCount = 44; my $o4 = new WWW::Search('Null::Count', '_null_count' => $iCount, ); # Get result_count before submitting query: is($o4->approximate_result_count, $iCount); # Get some results: ok($o4->login, 'login'); $o4->maximum_to_retrieve($iCount/2); my $iCounter = 0; while ($o4->next_result) { $iCounter++; } # while is($iCounter, $iCount/2, 'next_result stops at maximum_to_retrieve'); $o4->maximum_to_return($iCount * 2); while ($o4->next_result) { $iCounter++; } # while is($iCounter, $iCount, 'next_result goes to end of cache'); ok($o4->logout, 'logout'); ok($o4->response, 'response'); ok($o4->submit, 'submit'); is($o4->opaque, undef, 'opaque is undef'); $o4->opaque('hello'); is($o4->opaque, 'hello', 'opaque is hello'); $o4->seek_result(undef); $o4->seek_result(-1); $o4->seek_result(3); is(WWW::Search::escape_query('+hi +mom'), '%2Bhi+%2Bmom', 'escape'); is(WWW::Search::unescape_query('%2Bhi+%2Bmom'), '+hi +mom', 'unescape'); # Use a backend for a second time (just to exercise the code in # Search.pm): my $o5 = new WWW::Search('Null::Count'); # Test the version() function: ok($o4->version, 'version defined in backend'); $o5 = new WWW::Search('Null::NoVersion'); is($o5->version, $WWW::Search::VERSION, 'default version'); # Test the maintainer() function: ok($o4->maintainer, 'maintainer defined in backend'); is($o5->maintainer, $WWW::Search::MAINTAINER, 'default maintainer'); # Exercise / test the cookie_jar() function: $o4->cookie_jar('t/cookies.txt'); my $oCookies = new HTTP::Cookies; $o5->cookie_jar($oCookies); $oICE->start; eval { $o2->cookie_jar($o4) }; 
$oICE->stop; $oCookies = $o4->cookie_jar; # Exercise / test the native_query() function: $o4->{_debug} = 1; $oICE->start; $o4->gui_query('query', {option1 => 1, search_option2 => 2, }); $oICE->stop; $o4->{_debug} = 0; $o4->gui_query('query', {option1 => 1, search_option2 => 2, }); # Exercise other set/get functions: is($o4->date_from, ''); is($o4->date_to, ''); is($o4->env_proxy, 0); is($o4->http_proxy, undef); is($o4->http_proxy_user, undef); is($o4->http_proxy_pwd, undef); $o4->date_from('dummydate'); $o4->date_to ('dummydate'); $o4->env_proxy('dummydate'); $o4->http_proxy('dummydate'); $o4->http_proxy_user('dummydate'); $o4->http_proxy_pwd('dummydate'); is($o4->date_from, 'dummydate'); is($o4->date_to, 'dummydate'); is($o4->env_proxy, 'dummydate'); is_deeply($o4->http_proxy, ['dummydate']); is($o4->http_proxy_user, 'dummydate'); is($o4->http_proxy_pwd, 'dummydate'); # Sanity-tests for proxy stuff: my $o6 = new WWW::Search; ok(! $o6->is_http_proxy); ok(! $o6->is_http_proxy_auth_data); $o6->http_proxy(''); ok(! $o6->is_http_proxy); ok(! $o6->is_http_proxy_auth_data); $o6->http_proxy('some', 'things'); ok($o6->is_http_proxy); ok(! $o6->is_http_proxy_auth_data); $o6->http_proxy_user('something'); ok(! $o6->is_http_proxy_auth_data); $o6->http_proxy_pwd('something'); # use Data::Dumper; # print STDERR Dumper(\$o6); ok($o6->is_http_proxy_auth_data); # Sanity-tests for the timeout() method: is($o6->timeout, 60); $o6->timeout(120); is($o6->timeout, 120); is(WWW::Search::strip_tags('hello world'), 'hello world'); ok($o6->user_agent('non-robot')); ok($o6->agent_name('junk')); is($o6->agent_name, 'junk'); is($o6->agent_name, 'junk'); ok(! $o6->agent_email('junk')); is($o6->agent_email, 'junk'); is($o6->agent_email, 'junk'); ok($o6->user_agent); ok(! 
$o6->http_referer('junk')); is($o6->http_referer, 'junk'); is($o6->http_referer, 'junk'); ok($o6->http_method('junk')); is($o6->http_method, 'junk'); is($o6->http_method, 'junk'); # Tests for WWW::SearchResult: my $oWSR = new WWW::SearchResult; $oWSR->add_url('url1'); $oWSR->thumb_url('url1.thumb'); $oWSR->image_url('url1.png'); $oWSR->title('title1'); $oWSR->description('description1'); $oWSR->change_date("yesterday"); $oWSR->start_date("last Tuesday"); $oWSR->index_date("today"); $oWSR->end_date("tomorrow"); $oWSR->raw(qq{title1}); $oWSR->score(99); $oWSR->normalized_score(990); $oWSR->size(4096); $oWSR->source('WWW::Search'); $oWSR->company('Dub Dub Dub Search, Inc.'); $oWSR->location('Ashburn, VA'); $oWSR->bid_amount(9.99); $oWSR->shipping(4.85); $oWSR->bid_count(9); $oWSR->question_count(3); $oWSR->watcher_count(65); $oWSR->item_number(987654321); $oWSR->category(7654); $oWSR->bidder('Joe'); $oWSR->seller('Jane'); ok($o4->result_as_HTML($oWSR)); is($o4->result_as_HTML(), ''); is($o4->result_as_HTML(undef), ''); is($o4->result_as_HTML(0), ''); is($o4->result_as_HTML(1), ''); is($o4->result_as_HTML([1,2]), ''); my $s = $oWSR->as_HTML; # Other miscellaneous sanity checks and coverage tests: is(&WWW::Search::escape_query, ''); my @a = &WWW::Search::unescape_query(qw(a b c)); $o2->strip_tags('a', undef, 'b'); delete $ENV{WWW_SEARCH_USERAGENT}; $o2->user_agent(1); $o2->user_agent; $ENV{WWW_SEARCH_USERAGENT} = 'No::Such::Module'; $oICE->start; $o2->user_agent(1); $o2->user_agent; $oICE->stop; my $sICE = join("\n", $oICE->read); like($sICE, qr'can not load'); $ENV{WWW_SEARCH_USERAGENT} = 'Carp'; # a module which does not have a new() $oICE->start; $o2->user_agent(1); $o2->user_agent; $oICE->stop; $sICE = join("\n", $oICE->read); like($sICE, qr'can not create'); $s = qq{foo\nbar\nbaz}; $o2->split_lines($s); $o2->split_lines(['a'], $s); $o2->generic_option; $o2->_native_setup_search; $o2->user_agent_delay; $o2->user_agent_delay(1); $o2->absurl; $o2->absurl('foo'); 
$o2->absurl('foo', 'bar'); $o2->need_to_delay; $o2->_parse_tree; $o2->_native_retrieve_some; $o2->preprocess_results_page; $o2->preprocess_results_page('foo'); $o2->test_cases; $o2->hash_to_cgi_string; $o2->hash_to_cgi_string({ foo => 'foo', bar => undef, undef => 'baz', empty => '', }); exit 0; foreach my $sEngine (@as) { my $o; # diag(qq{trying $sEngine}); eval { $o = new WWW::Search($sEngine) }; ok(ref($o), qq{loaded WWW::Search::$sEngine}); # } ) # Emacs bug } # foreach exit 0; __END__ WWW-Search-2.517/t/0_versions.t0000644000175000017500000000217613101347451015511 0ustar martinmartin# $Id: 0_versions.t,v 1.6 2013-11-19 03:33:28 Martin Exp $ use strict; use warnings; use Config; use Test::More tests => 1; # Create a list of modules we're interested in: my @asModule = qw( Date::Manip File::Copy File::Find File::Spec Getopt::Long HTML::Parser HTML::TreeBuilder HTTP::Cookies LWP::UserAgent MIME::Lite Net::Domain Pod::Parser Pod::Tests Pod::Usage Test::Inline URI User ); # Extract the version number from each module: my %hsvVersion; foreach my $sModule (@asModule) { eval " require $sModule; "; unless($@) { no strict 'refs'; $hsvVersion{$sModule} = ${$sModule .'::VERSION'} || "unknown"; } # unless } # foreach # Also look up the version number of perl itself: $hsvVersion{perl} = $Config{version} || $]; # Print on STDERR details of installed modules: diag(''); diag(sprintf("\r# %-30s %s\n", 'Module', 'Version')); foreach my $sModule (sort keys %hsvVersion) { $hsvVersion{$sModule} = 'Not Installed' unless(defined($hsvVersion{$sModule})); diag(sprintf(" %-30s %s\n", $sModule, $hsvVersion{$sModule})); } # foreach # Make sure this file passes at least one test: pass; exit 0; __END__ WWW-Search-2.517/t/null-error.t0000644000175000017500000000136713101347451015524 0ustar martinmartin# $Id: null-error.t,v 1.1 2006-04-21 20:58:01 Daddy Exp $ use ExtUtils::testlib; use Test::More 'no_plan'; my $sMod; BEGIN { $sMod = 'WWW::Search::Null::Error'; use_ok('WWW::Search'); 
use_ok($sMod); } # end of BEGIN block ok(my $iCount = 4); ok(my $oSearch = new WWW::Search('Null::Error')); isa_ok($oSearch, $sMod); $oSearch->native_query('Makes no difference what you search for...'); my @aoResults = $oSearch->results; is(scalar(@aoResults), 0, 'got zero results'); is($oSearch->approximate_result_count, 0, 'got the right approx_results'); # ...But you get an HTTP::Response object with a code of 200 my $oResponse = $oSearch->response; is($oResponse->code, 500, 'response code'); ok(! $oResponse->is_success, 'got an HTTP failure'); __END__ WWW-Search-2.517/t/pod.t0000644000175000017500000000027513101347451014202 0ustar martinmartin# $Id: pod.t,v 1.2 2007-04-09 12:52:30 Daddy Exp $ use Test::More; eval "use Test::Pod 1.00"; plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; all_pod_files_ok(); __END__ WWW-Search-2.517/t/null-count.t0000644000175000017500000000225613101347451015521 0ustar martinmartin# $Id: null-count.t,v 1.2 2008-01-21 03:01:38 Daddy Exp $ use ExtUtils::testlib; use Test::More 'no_plan'; my $sMod; BEGIN { $sMod = 'WWW::Search::Null::Count'; use_ok('WWW::Search'); use_ok($sMod); } # end of BEGIN block ok(my $iCount = 4); ok(my $oSearch = new WWW::Search('Null::Count', '_null_count' => $iCount, ) ); isa_ok($oSearch, $sMod); $oSearch->native_query('Makes no difference what you search for...'); ok(my @aoResults = $oSearch->results); is(scalar(@aoResults), $iCount, 'got the right number of results'); is($oSearch->approximate_result_count, $iCount, 'got the right approx_results'); ok(my $oResult = shift @aoResults); is($oResult->url, "url1", 'url'); is(scalar(@{$oResult->related_urls}), $iCount, 'got N related_urls'); is(scalar(@{$oResult->related_titles}), $iCount, 'got N related_titles'); is(scalar(@{$oResult->urls}), $iCount+1, 'got N+1 urls'); ok(my $raURL = $oResult->urls); # diag("sURL =$sURL="); is(scalar(@{$raURL}), $iCount+1, 'got N+1 urls in arrayref'); # Additional calls for coverage: my $o5 = new 
WWW::Search('Null::Count'); $o5->native_query('fubar'); $o5->results; __END__ WWW-Search-2.517/t/test.t0000644000175000017500000000112413101347451014371 0ustar martinmartin # $Id: test.t,v 1.6 2008-11-27 22:33:40 Martin Exp $ # Tests for the WWW::Search::Test module use File::Spec::Functions; use ExtUtils::testlib; use Test::More qw(no_plan); use strict; BEGIN { use_ok('WWW::Search::Test') }; my $sWebsearch1 = WWW::Search::Test::find_websearch(); ok($sWebsearch1, 'found any WebSearch (first time)'); # Call it again to trigger memoization code: my $sWebsearch2 = WWW::Search::Test::find_websearch(); ok($sWebsearch2, 'found any WebSearch (second time)'); is($sWebsearch1, $sWebsearch2, 'both WebSearch are the same'); diag($sWebsearch1); pass; 1; __END__ WWW-Search-2.517/META.yml0000644000175000017500000000217513101350056014235 0ustar martinmartin--- abstract: 'Virtual base class for WWW searches' author: - 'John Heidemann ' build_requires: ExtUtils::MM_Unix: 1.41 ExtUtils::MakeMaker: 6.36 File::Temp: 0 IO::Capture::Stderr: 0 Test::File: 0 Test::More: 0 Test::Simple: 0 configure_requires: ExtUtils::MakeMaker: 6.36 distribution_type: module dynamic_config: 1 generated_by: 'Module::Install version 1.17' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 module_name: WWW::Search name: WWW-Search no_index: directory: - inc - t recommends: Email::MIME: 0 Email::MIME::Creator: 0 Email::Send: 0 Test::Pod: 0 Test::Pod::Coverage: 0 requires: Bit::Vector: 0 CGI: 0 Data::Dumper: 0 Date::Manip: 0 File::Basename: 0 File::Copy: 0 File::Find: 0 File::Path: 0 File::Slurp: 0 File::Spec: 0 Getopt::Long: 2.24 HTML::Parser: 2.23 HTML::TreeBuilder: 0 LWP::MemberMixin: 0 LWP::UserAgent: 2 Net::Domain: 0 POSIX: 0 Pod::Usage: 0 Test::More: 0 URI: 0 URI::Escape: 0 User: 0 perl: 5.005 resources: license: http://dev.perl.org/licenses/ version: 2.517 WWW-Search-2.517/README.md0000644000175000017500000000004513101347451014242 0ustar martinmartin# 
WWW-Search Perl module WWW::Search WWW-Search-2.517/Changes0000644000175000017500000012314113101350050014246 0ustar martinmartin2017-04-30 Kingpin * Makefile.PL: fixed for Perl 5.26 2015-12-13 Kingpin * removed MYMETA files from distro (again) 2015-06-06 Kingpin * lib/WWW/Search/Test.pm: removed Date::Manip::TZ setting 2013-11-18 Kingpin * lib/WWW/Search/Test.pm (test_most_results): better error messages 2013-10-05 Kingpin * MANIFEST.SKIP: do not ship MYMETA files 2013-08-20 Kingpin * Makefile.PL (MY::preamble): added fix for devel on Windows 7 2013-06-23 Kingpin * lib/WWW/Search/Test.pm (test_most_results): fixed quote problem in eval 2013-03-25 Kingpin * lib/WWW/Search/Test.pm (test_most_results): fixed off-by-one error 2010-05-20 Kingpin * lib/WWW/Search/Test.pm (test_most_results): fixed percentage test! 2008-11-28 Kingpin * lib/WWW/Search/Test.pm (test_most_results): new function 2008-11-27 Kingpin * lib/WWW/Search.pm (user_agent): wrap {new LWP::UserAgent} in an eval 2008-11-10 Kingpin * lib/WWW/Search.pm (_native_retrieve_some): check for empty _next_url 2008-07-20 Kingpin * lib/WWW/SearchResult.pm (sources): new method (add_sources): new method 2008-07-15 Kingpin * lib/WWW/Search/Test.pm (find_websearch): look bin blib/script also 2008-07-13 Kingpin * lib/WWW/Search.pm (_parse_tree): now calls parse_tree() for backward compatibility 2008-06-01 Kingpin * lib/WWW/Search.pm (parse_tree): don't need conditional 2008-04-05 * lib/WWW/Search.pm: renamed parse_tree() to _parse_tree() 2008-03-05 * lib/WWW/Search.pm (native_retrieve_some): turn on UTF-8 mode in HTML parser 2007-11-11 * all: use warnings in all modules 2007-07-26 * t/use.t: do not use the deprecated IO::Capture::ErrorMessages 2007-07-04 * lib/WWW/Search.pm (native_retrieve_some): prevent undef warning 2007-06-03 * various: make sure to use strict 2007-05-19 * lib/WWW/Search/Test.pm (count_results): print result number in front of URL 2007-05-07 * lib/WWW/Search/Test.pm (count_results): added eighth 
argument 2007-03-24 * lib/WWW/Search.pm (strip_tags): fix for possible "modify read-only" error (absurl): fix for possible undef error 2007-03-23 * lib/WWW/Search/Test.pm (count_results): FIX to not waste time getting too many results 2007-01-29 * code/AutoSearch-code.pl (main): added timezone to HTML email 2006-07-30 * lib/WWW/Search/Test.pm (tm_run_test_no_approx): new function (count_results): FIX to allow more than 300 results 2006-04-24 * lib/WWW/Search.pm (result_as_HTML): FIX for backends that use WWW::Search::Result 2006-04-22 * code/AutoSearch-code.pl (main): now uses result_as_HTML() to format email 2006-04-21 * lib/WWW/Search.pm (result_as_HTML): new method 2006-04-19 * lib/WWW/SearchResult.pm (sold): new method 2006-03-19 * lib/WWW/Search.pm: tweak pod (thanks to Brian Sammon) 2006-03-14 * lib/WWW/SearchResult.pm (as_HTML): new method 2006-03-11 * lib/WWW/SearchResult.pm (shipping): new method (as_HTML): new method 2006-03-10 * lib/WWW/SearchResult.pm (question_count): new method (watcher_count): new method 2006-01-12 * lib/WWW/Search.pm (results): add pod about how to detect errors * code/AutoSearch-code.pl (send_email): fix the call syntax of Email::Send 2006-01-07 * lib/WWW/Search.pm (results): add pod about how to check for errors * t/use.t: use IO::Capture::Stderr instead of ::ErrorMessages 2005-12-27 * lib/WWW/Search/Test.pm (count_results): set absolute maximum of 299 hits * code/AutoSearch-code.pl (main): make sure email contains HTML of new hits (send_email): don't need Email::Send::SMTP::Auth any more (Email::Send::SMTP will do) (make_date): fix URL of autosearch webpage 2005-12-26 * lib/WWW/Search/Test.pm (count_results): bugfix for when argument for max is undef 2005-12-25 * lib/WWW/SearchResult.pm: new attribute 'category' 2005-07-15 Kingpin * Makefile.PL (MY::postamble): get pod2test to work without cat or dos2unix 2005-07-09 Kingpin * code/AutoSearch-code.pl (main): report HTTP error even if we got some results 2005-07-07 Kingpin * 
lib/WWW/Search.pm (results): on error, now returns empty list instead of undef 2005-06-19 Kingpin * lib/WWW/SearchResult.pm (image_url): new method (thumb_url): new method 2005-04-18 Kingpin * code/AutoSearch-code.pl (make_date): fix link to AutoSearch webpage 2005-03-12 Kingpin * code/AutoSearch-code.pl (main): add result count to email message 2005-02-02 Kingpin * lib/WWW/Search.pm (http_proxy): BUGFIX for calling with no args setting the value 2004-11-05 Kingpin * lib/WWW/Search/Test.pm (find_websearch): try harder to find a working WebSearch 2004-10-26 Kingpin * lib/WWW/Search/Test.pm (find_websearch): avoid warnings * lib/WWW/Search/Null/Count.pm: delete extra spaces in pod directives 2004-10-10 Kingpin * lib/WWW/Search.pm (new): do NOT put login name into agent_e_mail 2004-10-07 Kingpin * lib/WWW/Search/Test.pm (tm_new_engine): new function (tm_run_test): new function * code/AutoSearch-code.pl (main): add backend name to email body 2004-09-25 Kingpin * lib/WWW/SearchResult.pm (bid_count): new method (bid_amount): new method 2004-08-21 Kingpin * code/AutoSearch-code.pl: added pod 2004-08-19 Kingpin * lib/WWW/Search.pm (agent_name): new method 2004-07-31 Kingpin * code/AutoSearch-code.pl (send_email): use Email::Send * code/AutoSearch-code.pl (send_email): send HTML using proper MIME-type 2004-06-30 Kingpin * lib/WWW/Search.pm (installed_engines): make it run much faster! 
2004-06-01 Kingpin * t/use.t: added a lot of tests for coverage * lib/WWW/Search.pm (_load_env_useragent): new function 2004-05-24 Kingpin * lib/WWW/Search/Null/Count.pm: add $VERSION * lib/WWW/Search/Null/Empty.pm: add $VERSION * lib/WWW/Search/Null/Error.pm: add $VERSION * lib/WWW/Search/Null/Count.pm: add _ in front of function names * lib/WWW/Search/Null/Empty.pm: add _ in front of function names * lib/WWW/Search/Null/Error.pm: add _ in front of function names * lib/WWW/Search/Null/NoVersion.pm: new file (for testing) * lib/WWW/Search/Null/Count.pm (native_retrieve_some): set and test all SearchResult fields 2004-05-01 Kingpin * lib/WWW/Search.pm (need_to_delay): new method 2004-03-05 Kingpin * code/AutoSearch-code.pl (main): new option --emailfrom 2004-03-03 Kingpin * lib/WWW/Search.pm (approximate_result_count): clean up args (http_request): clean up redirect-following code 2004-02-24 Kingpin * t/use.t: add tests for approximate_result_count() args * lib/WWW/Search.pm (approximate_result_count): fix empty-string warning * code/AutoSearch-code.pl (print_version): convert to Pod::Usage (main): reorder --stats messages 2004-02-09 Kingpin * code/AutoSearch-code.pl (main): do not try to send email if MIME::Lite is not available * lib/WWW/Search.pm: convert to 3-digit version number; code cleanups * t/pod.t: new file 2004-01-30 Kingpin * Makefile.PL: converted to use Module::Install 2004-01-20 Kingpin * code/AutoSearch-code.pl: minor clean ups and prevent undef warnings 2004-01-09 Kingpin * lib/WWW/Search.pm (http_request): detect redirect loops 2003-12-29 Kingpin * lib/WWW/Search.pm (strip_tags): avoid warnings if we get undef argument 2003-12-19 Kingpin * lib/WWW/Search.pm (user_agent): fix for undef warnings in LWP::UserAgent::proxy 2003-11-29 Kingpin * lib/WWW/Search.pm (user_agent): fix $sUA creation 2003-11-27 Kingpin * code/AutoSearch-code.pl (main): added date to error messages in index.html (main): new option --cmdline 2003-11-25 Kingpin * 
code/AutoSearch-code.pl: new option --cleanup * lib/WWW/Search.pm (user_agent): new $ENV{WWW_SEARCH_USERAGENT} mechanism 2003-11-24 Kingpin * lib/WWW/Search.pm (strip_tags): reduce embedded spaces 2003-11-14 Kingpin * lib/WWW/Search.pm (http_proxy): now takes same arguments as LWP::UserAgent::proxy() (user_agent): get proxy password from %ENV 2003-10-21 Kingpin * code/AutoSearch-code.pl (usage): new option --ignore_channels * lib/WWW/Search.pm (approximate_result_count): fixed so can be called without explicitly calling retrieve_some() first 2003-09-16 Kingpin * Makefile.PL (MY::postamble): fixed path for pod2text; etc. 2003-08-31 Kingpin * lib/WWW/Search/Test.pm (count_results): new 7th argument = search options hash 2003-08-30 Kingpin * code/AutoSearch-code.pl: new option --ignore_channels 2003-07-28 Kingpin * lib/WWW/Search.pm (preprocess_results_page): BUGFIX bad return value 2003-07-14 Kingpin * lib/WWW/Search/Test.pm (count_results): printResults output is more verbose 2003-07-05 Kingpin * lib/WWW/Search.pm (parse_tree): new NOP stub function 2003-07-04 Kingpin * lib/WWW/Search.pm (login): now returns flag for success/failure * lib/WWW/Search/Test.pm (find_websearch): new function 2003-07-03 Kingpin * lib/WWW/Search/Test.pm (count_results): call login() before getting results 2003-06-24 Kingpin * lib/WWW/Search.pm (cookie_jar): allow any flavor of HTTP::Cookies 2003-06-06 Kingpin * code/AutoSearch-code.pl: new option --listnewurls 2003-05-27 Kingpin * lib/WWW/Search/Test.pm (new_engine_test_more): new function for use with Test::More (run_test_test_more): new function for use with Test::More 2003-05-22 Kingpin * code/WebSearch-code.pl (list_engines): new option --list new options --username --password 2003-05-14 Kingpin * lib/WWW/Search/Test.pm (count_results): when dumping, print title as well as url and description 2003-04-16 Kingpin * lib/WWW/Search/Null/Count.pm: fixed the Test::Inline pod directives (also in Empty.pm and Error.pm) * lib/WWW/Search.pm 
(retrieve_some): do not complain about empty query if $self->{'_allow_empty_query'} is set 2003-03-13 Kingpin * lib/WWW/Search.pm (strip_tags): avoid undef warnings 2003-02-06 Kingpin * lib/WWW/Search.pm: don't croak if query is undef or empty 2002-12-20 Kingpin * lib/WWW/Search.pm (approximate_hit_count): new method (synonym for approximate_result_count) 2002-12-16 Kingpin * code/AutoSearch-code.pl: new option --help 2002-10-22 Kingpin * code/WebSearch-code.pl: new option --terse; --count now prints actual count 2002-07-18 Kingpin * WWW::Search VERSION 2.36 RELEASED! 2002-07-18 Kingpin * lib/WWW/Search/Test.pm (run_our_test): FIX miscalculation of max 2002-07-18 Kingpin * WWW::Search VERSION 2.35 RELEASED! 2002-07-17 Kingpin * lib/WWW/Search/Test.pm (count_results): new function 2002-06-13 Kingpin * lib/WWW/Search.pm (http_request): do not infinite loop when HTTP::Response is_error! 2002-06-04 Kingpin * lib/WWW/Search.pm (login, logout): new methods (new): initialize _debug parameter to 0 2002-06-03 Kingpin * lib/WWW/Search.pm (cookie_jar): return jar if no args 2002-05-31 Kingpin * lib/WWW/Search.pm (hash_to_cgi_string): delete keys whose value is undef (http_method): new get/set method 2002-04-29 Kingpin * lib/WWW/Search/Test.pm: `use WWW::Search` so client doesn't have to 2002-04-22 Kingpin * code/AutoSearch-code.pl: fix for case-sensitive cmd-line args 2002-04-19 Kingpin * lib/WWW/Search.pm (installed_engines): new function * t/use.t: upgrade to use Test::More * lib/WWW/Search.pm: get rid of @ENGINES_WORKING * lib/WWW/Search/Gopher.pm (native_retrieve_some): remove duplicate declaration warnings; added $VERSION * lib/WWW/Search.pm (generic_option): gracefully handle missing argument 2002-03-29 Kingpin * code/WebSearch-code.pl (print_version): show version of --engine, too 2002-03-28 Kingpin * lib/WWW/Search.pm (native_retrieve_some): add debug msgs (native_retrieve_some): better pod (native_retrieve_some): make it able to re-use the TreeBuilder object! 
2002-03-22 Kingpin * lib/WWW/Search.pm (native_retrieve_some): do not try to delete the TreeBuilder 2002-03-15 Kingpin * lib/WWW/Search.pm (native_retrieve_some): allow backends to specify the HTML::TreeBuilder object to be used 2002-03-12 Kingpin * Makefile.PL: require 2 more modules 2002-03-11 Kingpin * lib/WWW/Search.pm (user_agent): do not require users to call user_agent() explicitly; set default email to local user@host; (results): prevent warning when 0 hits 2002-02-05 Kingpin * lib/WWW/Search.pm (http_request): use URI instead of $HTTP::URI_CLASS (absurl): same 2002-01-22 Kingpin * lib/WWW/Search.pm (http_request): do not use URI::URL 2001-12-20 Kingpin * lib/WWW/Search/Test.pm: remove warning about ./WebSearch * lib/WWW/Search.pm (preprocess_results_page): new method 2001-11-23 Kingpin * lib/WWW/Search.pm (http_request): delete bogus argument to http_request_to_file() call 2001-11-02 Kingpin * code/AutoSearch-code.pl: new option --debug 2001-10-25 Kingpin * lib/WWW/Search/Test.pm (skip_test): new method 2001-09-10 Kingpin * lib/WWW/Search.pm (native_retrieve_some): new method 2001-09-07 Kingpin * lib/WWW/Search.pm (strip_tags): convert
to one space * lib/WWW/Search/Test.pm (run_our_test): new argument to print results to STDERR (run_our_test): do not set maximum if user expects an exact number of results 2001-08-31 Kingpin * lib/WWW/Search.pm (env_proxy): new method 2001-08-29 Kingpin * lib/WWW/Search.pm: tweaked debugging messages 2001-07-16 Kingpin * lib/WWW/Search/Test.pm (run_gui_test): new method * lib/WWW/Search.pm (date_from): new method (date_to): new method 2001-07-05 Kingpin * lib/WWW/Search/Test.pm (run_test): new functionality for simple test-running * lib/WWW/Search.pm (http_proxy): added proxy support 2001-06-14 Kingpin * WWW::Search VERSION 2.21 RELEASED! 2001-06-13 Kingpin * removed Scraper.pm and Sherlock.pm 2001-05-08 Kingpin * WWW::Search VERSION 2.19 RELEASED! 2001-05-07 Kingpin * lib/WWW/Search.pm (results): BUGFIX always returned 500! 2001-05-04 Kingpin * lib/WWW/Search/Scraper.pm: updated to 1.20 2001-05-02 Kingpin * lib/WWW/Search/Scraper.pm: updated to 1.10 2001-05-02 Kingpin * WWW::Search VERSION 2.17 RELEASED! 2001-05-02 Kingpin * lib/WWW/Search/Scraper.pm: new module contributed by Glenn Wood 2001-04-10 Kingpin * AutoSearch-code.pl (main): new option --mail for emailing new results * AutoSearch.PL: move real code to separate file (and update Makefile.PL) 2001-03-28 Kingpin * WWW::Search VERSION 2.16 RELEASED! 2001-03-27 Kingpin * lib/WWW/Search.pm (submit): new method 2001-03-20 Kingpin * lib/WWW/Search.pm (reset_search): BUGFIX: reset approx_count (thanks to Alexander Mikhailian) 2001-03-15 Kingpin * lib/WWW/Search/AltaVista/NL.pm: new backend 2000-10-25 Kingpin * lib/WWW/Search.pm: many minor pod updates 2000-10-13 Kingpin * Infoseek.pm et al. removed (use Go.pm instead) 2000-10-06 Kingpin * lib/WWW/Search.pm (http_referer): new method 2000-10-04 Kingpin * lib/WWW/Search.pm (cookie_jar): do not save cookies if user gave us HTTP::Cookie object 2000-10-03 Kingpin * WWW::Search VERSION 2.15 RELEASED! 
2000-10-02 Kingpin * lib/WWW/Search.pm (split_lines): new optional first arg: list of patterns 2000-09-22 Kingpin * lib/WWW/Search.pm (cookie_jar): new method to handle web cookies 2000-08-08 Kingpin * lib/WWW/Search/AltaVista.pm updated to version 2.9 2000-06-15 Kingpin * lib/WWW/Search/Lycos.pm: removed to its own separate registered module 2000-06-12 Kingpin * lib/WWW/Search/LookSmart.pm: removed to its own separate registered module * lib/WWW/Search/NorthernLight.pm: removed to its own separate registered module 2000-05-26 Kingpin * lib/WWW/Search/AltaVista.pm: updated to 2.6 (parse only 1 result) 2000-05-17 Kingpin * removed Lycos/Sites.pm from the distribution (finally) * lib/WWW/Search/Yahoo/Classifieds/Employment.pm: updated to 1.02 2000-05-04 Kingpin * lib/WWW/Search/Test.pm: updated to 2.03 (fix pod typos) 2000-05-02 Kingpin * WWW::Search VERSION 2.14 RELEASED! * lib/WWW/Search/AltaVista.pm (native_retrieve_some): fix next_url 2000-05-01 Kingpin * lib/WWW/Search/Euroseek.pm: removed to its own separate registered module * lib/WWW/Search/GoTo.pm: removed to its own separate registered module * lib/WWW/Search/Google.pm: removed to its own separate registered module * lib/WWW/Search/Snap.pm: removed to its own separate registered module * lib/WWW/Search/ZDNet.pm: removed to its own separate registered module 2000-04-27 Kingpin * lib/WWW/Search.pm (reset_search): do not clear search_base_url 2000-04-12 Kingpin * lib/WWW/Search/Test.pm: now uses File::Spec for all paths and filenames 2000-04-11 Kingpin * lib/WWW/Search.pm (user_agent): now sets agent_name for non-robot UserAgent 2000-04-03 Kingpin * WWW::Search VERSION 2.13 RELEASED! 2000-04-02 Kingpin * lib/WWW/Search.pm (reset_search): correctly and silently handle intermixed calls to gui_query() and native_query() 2000-03-24 Kingpin * lib/WWW/Search/Google.pm: updated to 2.18 (fix rare garbage in URLs) 2000-03-23 Kingpin * WWW::Search VERSION 2.12 RELEASED! 
2000-03-23 Kingpin * t/use.t: added test to make sure we're getting *any* results * lib/WWW/Search.pm (escape_query): fixed BUG in argument parsing 2000-03-22 Kingpin * WWW::Search VERSION 2.11 RELEASED! 2000-03-21 Kingpin * lib/WWW/Search/Excite/News.pm: updated to 2.03 (new output format) * lib/WWW/Search/VoilaFr.pm: new backend for www.voila.fr GUI searches 2000-03-13 Kingpin * lib/WWW/Search/Snap.pm updated to 2.04 * lib/WWW/Search/Google.pm updated to 2.17 2000-03-09 Kingpin * lib/WWW/Search/AltaVista/Intranet.pm: updated to 2.04 (pod only) 2000-02-28 Kingpin * lib/WWW/Search/Magellan.pm: removed to its own separate registered module 2000-02-26 Kingpin * WWW::Search VERSION 2.10 RELEASED! 2000-02-25 Kingpin * lib/WWW/Search.pm (reset_search): new method. New feature: if you call native_query() a second time on the same WWW::Search object, a new search will be started. (Before this change, you would get the results of the first query all over again) 2000-02-24 Kingpin * lib/WWW/Search/Dejanews.pm et al.: removed to their own separate registered module 2000-02-15 Kingpin * lib/WWW/Search/AltaVista/Intranet.pm (native_retrieve_some): parse Rank integer 2000-02-08 Kingpin * WWW::Search VERSION 2.09 RELEASED! 
2000-02-07 Kingpin * WebSearch: new option --gui to mimic query in browser 2000-02-04 Kingpin * lib/WWW/Search/OpenDirectory.pm: updated to 2.02 * lib/WWW/Search/Yahoo.pm: removed to its own separate registered module 2000-02-01 Kingpin * lib/WWW/Search.pm (maintainer): new method 2000-01-31 Kingpin * lib/WWW/Search/Test.pm: new module * lib/WWW/Search/Magellan.pm: updated to 2.04 (new test cases) 2000-01-28 Kingpin * lib/WWW/Search/Excite.pm: removed to its own separate registered module 2000-01-19 Kingpin * lib/WWW/Search.pm (gui_query): new method * lib/WWW/Search/HotBot.pm: removed to its own separate registered module 2000-01-18 Kingpin * lib/WWW/Search/Dejanews.pm: updated to 2.07 (new output format) 2000-01-14 Kingpin * lib/WWW/Search/Euroseek.pm: new backend 2000-01-13 Kingpin * lib/WWW/Search/Snap.pm: updated to 2.03 * lib/WWW/Search/Google.pm: updated to 2.16 2000-01-03 Kingpin * Lycos::Sites retired * Lycos.pm updated to 2.09 (thanks to dbradford@bdctechnologies.com) 1999-12-24 Kingpin * WWW::Search VERSION 2.08 RELEASED! 1999-12-23 Kingpin * Google.pm updated to 2.15 (return more details for each result) 1999-12-22 Kingpin * HotBot.pm updated to 2.10 (new output format) * Lycos::Pages retired * Lycos.pm updated to 2.08 (new URL for advanced search) 1999-12-15 Kingpin * Profusion.pm retired * HotBot.pm updated to 2.09 (handle new result count format) * Google.pm updated to 2.12 (new output format, etc.) 1999-12-13 Kingpin * Yahoo::Classifieds::Employment updated to 1.01 (remove unneeded 'use Socket') * Monster updated to 1.01 (remove unneeded 'use Socket') * HeadHunter updated to 1.01 (remove unneeded 'use Socket') 1999-12-11 Kingpin * WWW::Search VERSION 2.07 RELEASED! 
1999-12-10 Kingpin * Magellan.pm updated to 2.03 (new test cases) * Lycos::Sites.pm updated to 2.05 (fix for missing 'next' link) * HotBot.pm updated to 2.08 (was missing next-page link) * Dejanews.pm updated to 2.06 (ignore more deja-internal links) * SearchResult.pm updated to 2.06 (pod additions & corrections) * Infoseek.pm updated to 2.05 (handles new output format) * Lycos.pm updated to 2.07 (handles new output format) 1999-12-07 Kingpin * Dejanews.pm updated to 2.05 (new test cases, and pod typo) 1999-12-06 Kingpin * Dejanews.pm updated to 2.04 (handles new output format) 1999-12-03 Kingpin * Makefile.PL modified for new test suite call sequence * NetFind.pm updated to 1.8 * Lycos::Sites.pm updated to 2.05 (handles new url and new output format) * Lycos.pm updated to 2.05 (handles new output format for Lycos::Sites) 1999-12-01 Kingpin * WWW::Search VERSION 2.06 RELEASED! 1999-11-30 Kingpin * ZDNet.pm updated to 2.02 (handles new output format) 1999-11-29 Kingpin * AltaVista::Intranet.pm updated to 2.02 (fixed to work with latest AltaVista.pm) 1999-11-22 Kingpin * Yahoo.pm updated to 2.06 (supports Yahoo Korea) 1999-11-12 Kingpin * HotBot.pm updated to 2.07 (clean up and fix regexen) 1999-11-08 Kingpin * Span.pm updated to 2.03 * NorthernLight.pm updated to 2.04 (strip tags, etc.) 
1999-11-05 Kingpin * MetaCrawler.pm fixed, renamed, and updated to 2.06 * GoTo.pm updated to 1.05 1999-10-29 Kingpin * AltaVista.pm updated (handles new output format) 1999-10-22 Kingpin * Lycos.pm updated to 2.04 (use strip_tags, and extract real URL from www.lycos.com's redirection URL) 1999-10-20 Kingpin * Excite.pm updated to 2.03 (fixed parser for new output format, and now strips tags out of title and description) 1999-10-18 Kingpin * GoTo.pm updated to 1.04 (now uses source field of SearchResult) * new backend Dice.pm for searching jobs at www.dice.com * new backend AOL::Classifieds::Employment * new backend AltaVista::Careers * new backend Yahoo::Classifieds::Employment * new backend Monster.pm for searching jobs at jobsearch.monster.com * new backend HeadHunter.pm for searching jobs at www.headhunter.net * Google.pm updated to 2.09 (fixed parser for new output format) * HotBot.pm updated to 2.06 (strip tags, and new output format) 1999-10-14 Kingpin * Yahoo.pm updated to 2.05 (strip tags, ignore newsgroups) * Fireball.pm updated to 2.00 (fixed) 1999-10-13 Kingpin * WWW::Search VERSION 2.05 RELEASED! 1999-10-12 Kingpin * Google.pm updated to 2.06 (fixed parser for new output format) 1999-10-11 Kingpin * Yahoo.pm updated to 2.04 (fixed parser for new output) * NetFind.pm updated to 1.5 (fixed parser for new output format) 1999-10-08 Kingpin * new fields in WWW::SearchResult (company, location, source) * cleaned up output of test.pl (again) * WebSearch now prints error messages to STDERR instead of STDOUT, and only prints "Nothing found." 
in verbose mode 1999-10-05 Kingpin * Yahoo.pm updated to 2.03 (use hash_to_cgi_string) * WebCrawler.pm updated to 2.02 (use hash_to_cgi_string) * Magellan.pm updated to 2.02 (use hash_to_cgi_string) * Lycos.pm updated to 2.03 (use hash_to_cgi_string) * LookSmart.pm updated to 2.02 (fixed) * Infoseek.pm updated to 2.05 (fix parser for Companies and News searches, and use hash_to_cgi_string) * HotBot.pm updated to 2.05 (new test cases and uses hash_to_cgi_string) * Excite::News.pm updated to 2.02 (fix parser) * Excite.pm updated to 2.02 (use hash_to_cgi_string) * Dejanews.pm updated to 2.03 (use hash_to_cgi_string) * new method WWW::Search::hash_to_cgi_string to control construction of parameter lists 1999-10-04 Kingpin * fixed pod2man error in NetFind.pm pod 1999-10-02 Kingpin * AltaVista.pm updated (gets approximate_result_count better now) * Search.pm updated (no longer screws up if approximate_result_count is called before parsing any results) 1999-10-01 Kingpin * WWW::Search VERSION 2.04 RELEASED! 
1999-09-30 Kingpin * cleaned up the output of `make test` 1999-09-29 Kingpin * new module Lycos::Sites.pm for searching Lycos categorized sites * new module Lycos::Pages.pm for searching Lycos indexed web pages * Lycos.pm updated to 2.02 to recognize Sites searches * Yahoo.pm updated to 2.02 (new test cases, and pod update) * Infoseek.pm updated to 2.03 (handle \n in descriptions and ignore "company profile" URLs) 1999-09-28 Kingpin * Infoseek.pm updated to 2.02 (fixed "next" page URL parsing) * WebSearch has a new option --debug to turn on debugging in backends * WebSearch has a new option --lwpdebug to turn on low-level LWP debugging 1999-09-27 Kingpin * Google.pm updated to 2.05 * Metapedia.pm updated to 2.06 1999-09-22 Kingpin * GoTo.pm updated to 1.03 (format change, URL unencryption fixed) * WebSearch pod reflects long option names 1999-09-20 Kingpin * NetFind.pm updated to 0.7 (AOL moved/merged their search services) 1999-09-17 Kingpin * Dejanews.pm updated to version 2.02 (fixed URL parsing) * new backend Deja.pm for searching www.deja.com 1999-09-15 Kingpin * new backend GoTo.pm for searching www.goto.com * WebSearch has a new option --count to show the approximate_result_count 1999-09-13 Kingpin * `make test -u` now deletes existing test files before updating * NorthernLight.pm updated to version 2.03 (fixed URL parsing) 1999-09-09 Kingpin * new backend NetFind.pm for searching netfind2.aol.com 1999-09-07 Kingpin * extraneous warnings removed from AutoSearch 1999-08-19 Kingpin * WWW::Search VERSION 2.03 RELEASED! 1999-08-18 Kingpin * AutoSearch updated to version 2.01, with new command-line arguments -host and -port * AutoSearch now allows multiple -options arguments with the same key 1999-08-16 Kingpin * complete package tested with ActivePerl 519 in WinNT * It finally happened! Our "Bogus" query string is starting to appear in web archives and is being indexed by search engines. 
Therefore, the "Bogus" query string now contains some "random" numbers so that it can never be indexed. 1999-08-09 Kingpin * WebSearch now allows long command-line argument names with double-minus * WebSearch updated to version 2.04, with new command-line arguments --host and --port * WebSearch now allows multiple --options arguments with the same key 1999-08-01 Kingpin * WebSearch now has its own VERSION number 1999-07-23 Jim Smyser * NorthernLight.pm updated to 2.02 (fix description parsing) 1999-07-16 Kingpin * complete package tested with ActivePerl 517 in Win98 1999-07-14 Kingpin * WWW::Search VERSION 2.01 RELEASED! 1999-07-13 Kingpin * various cleanups in test.pl * All backends with their own version numbers have been bumped up to 2.01 (this is to clean up the CPAN database) * Makefile.PL and the test suite now work in Win32 * all test cases updated for all working backends; test cases eliminated for known broken backends * NorthernLight.pm updated * new backend Google.pm for searching www.google.com * new backend OpenDirectory.pm for searching search.dmoz.org 1999-07-09 Kingpin * Infoseek.pm updated to version 1.18; warnings removed * new backend LookSmart.pm for searching looksmart.com 1999-07-06 Kingpin * Dejanews.pm updated to version 1.12; now uses www.deja.com; adheres to new test mechanism * AltaVista/AdvancedWeb.pm updated to version 2.01 1999-07-02 Kingpin * ZDNet.pm FIXED and updated to version 1.03 * NorthernLight.pm FIXED and updated to version 1.05 1999-07-01 Kingpin * WWW::Search VERSION 1.025 RELEASED! 
1999-06-30 Kingpin * WebCrawler.pm updated to version 1.14; adheres to new test mechanism * Infoseek.pm now strips HTML from titles and descriptions * Infoseek/Companies.pm updated to version 1.10; adheres to new test mechanism * Infoseek/News.pm updated to version 1.09; adheres to new test mechanism * Infoseek/Web.pm updated to version 1.10; adheres to new test mechanism * new exported function strip_tags() 1999-06-29 Kingpin * Excite.pm updated to version 1.11; adheres to new test mechanism * new test mechanism puts list of test cases into each module's code (no more duplicate test code!) * Infoseek.pm updated to version 1.14; Companies and News searches fixed 1999-06-28 Kingpin * test/test.pl is more informative about errors * Infoseek.pm updated to version 1.13 1999-06-24 Kingpin * Profusion.pm updated to version 1.05 * OpenDirectory.pm updated 1999-06-22 Kingpin * The version method is now implemented within the WWW::Search module, so backends no longer need to declare it locally * Snap.pm updated 1999-06-21 Kingpin * NorthernLight.pm updated; now returns score and date for each hit * new backend AltaVista::Intranet * HotBot.pm updated; now unescapes URLs before returning them ===================================== RELEASE HISTORY (moved here from the README file): 1.002: (11 October 1996) - First public release. 
1.004: (31 October 1996) - new: AutoSearch, a client application (see below for details) - new: WWW::Search is now in CPAN (see GETTING WWW::Search for details) - bug fix: installation problem (no rule to make CLIENTS/search) fixed 1.005: (12 November 1996) - new: back-ends for HotBot, Lycos, and several AltaVista variants - new: application support for search-engine selection - new: application and library support for search-engine options 1.006: (25 November 1996) - private beta release, see 1.007 for list of new features 1.007: (17 December 1996) - new: back-ends for Dejanews (from Cesare Feroldi de Rosa), Infoseek (also from Cesare Feroldi de Rosa), and Excite (from GLen Pringle) - new: more fields in SearchResult (score, dates, etc., see the man page) (problem found by Cesare Feroldi de Rosa) - new: better error handling on network failures (AutoSearch should report errors on its pages, $search->response() provides an API for error reporting) - new (internal): user_agent handling has changed - new: proxy support added to WWW::Search (still needed in applications) (problem and fix suggested by T. V. Raman) - bug-fix: numerous documentation updates (problems found by Larry Virden) - bug-fix: AltaVista web search was occasionally dropping hits (problem found by Larry Virden, fixed by Bill Scheding) - bug-fix: all non-alphanumeric characters are now escaped (problem found by Larry Virden) 1.008: (8 January 1997) - private alpha release, see 1.009 for list of new features 1.009: (14 January 1997) overview: 1.009 is primarily a maintenance release to accommodate changes to LWP and some search engines. 
- change: search application renamed WebSearch (a more specific name) - bug-fix: the WWW::Search error in formatting is fixed (problem found by Larry Virden, fix by him and johnh) - bug-fix: RobotUA handling updated for new LWP in Search.pm - bug-fix: update for Infoseek (page format changed about 1 Jan 97) (problem found by Joseph McDonald, fix by Cesare Feroldi de Rosa) - bug-fix: update for Excite (page format changed about 9 Jan 97) (problem found by Juan Jose Amor, fix by GLen Pringle) 1.010: (20 August 1997) overview: an interim release to fix AltaVista - new: normalized_score, a back-end independent score (from Paul Lindner) - new: generic options are supported by several back-ends (specify search engine URL, debugging, etc.) - new: AltaVista back-end now sets SearchResult::raw - bug-fix: update for AltaVista (page format changed Jul 97) (some information wrt fix provided by Guy Decoux) 1.011: (8 October 1997) - internal alpha release, see 1.012 for list of new features 1.012: (3 November 1997) - Overview: an alpha release for test-suite testing - new: for testing, HTTP results can be saved to disk and played back - new: test scripts (try "make test") - bug-fix: Lycos works again and is now maintained by John Heidemann - bug-fix: AltaVista advanced and news searches have been repaired - bug-fix: some uninitialized value warnings suppressed (fix suggested by R. 
Chandrasekar (Mickey)) - new: new back-ends PLweb - new: documentation for PLweb (contributed by Paul Linder) - new: new back-ends: Gopher, Simple (contributed by Paul Linder) - new: WWW::Search mailing list: to subscribe, send "subscribe info-www-search" as the body of a message to 1.013, (19 February 1998) overview: this is an alpha release to include Martin's new back-ends - bug fix: HotBot back-end updated by Martin Thurn - new: Yahoo back-end now works, by Martin Thurn - problem: several back-ends don't work (Lycos) - problem: several back-ends don't have test suites and so may or may not work (DejaNews, Excite, HotBot, Infoseek, PLweb, SFgate, Verity, Yahoo) - reminder: WWW::Search mailing list: to subscribe, send "subscribe info-www-search" as the body of a message to 1.014, (24 March 1998) overview: this is an alpha release to fix the AltaVista/Lycos back-ends - bug fix: AltaVista/Lycos back-ends (problem reported by Bilal Siddiqui ) - known problem: some back-end test suites give intermittent results (AltaVista::News) - problem: several back-ends don't have test suites and so may or may not work (DejaNews, Excite, HotBot, Infoseek, PLweb, SFgate, Verity, Yahoo) 1.015, (2-Apr-98) overview: this is an alpha release with several new back-ends - new: back-ends: Magellan, WebCrawler (thanks to Martin Thurn) - bug fix: Yahoo/HotBot/Excite back-ends, with test suites. Many thanks to Martin Thurn. - bug fix: AltaVista news test suites have been relaxed, even though the code worked before, the test suites used to report false negatives. 
- bug fix: AltaVista is now more careful to detect the end of a hit's raw HTML - new: the test suite has been enhanced to be less sensitive to changes in what's indexed - problem: several back-ends don't have test suites and so may or may not work (DejaNews, Infoseek, PLweb, SFgate, Verity) - reminder: WWW::Search mailing list: to subscribe, send "subscribe info-www-search" as the body of a message to 1.016, 21-May-98 overview: this is an alpha to fix HotBot/Infoseek - bug fix: Infoseek/HotBot back ends now work again. (HotBot problem reported by Alan McCoy , both back-ends fixed by Martin Thurn) - addition: Infoseek test suite - addition: test output now includes the version number 1.017, 27-May-98 overview: this is the public release since 1.012 - bug fix: Lycos bug fix 1.018, 31-May-98 overview: back-end updates - bug fix: Excite and WebCrawler (by Martin Thurn), AltaVista (by John Heidemann) updated 30-May-98 - known bugs: WWW::Search doesn't work on MacPerl because of end-of-line differences. A fix for this problem is in progress. (Problem identified and fix suggested by Chris Nandor.) 1.019, 25-Jun-98 overview: back-end updates - bug fix: test suite bugs were causing false negatives on Yahoo, Excite, Magellan, WebCrawler (reported by Martin Thurn, fixed John Heidemann) - new feature: the test suite is now run daily (automatically). Output can be found at . - new feature: verbose mode of WebSearch is more verbose - bug fix: AltaVista was recording the RealName URL on some queries (bug reported by Vassilis Papadimos ) - bug fix: AltaVista wasn't correctly reporting change_time/size (bug and fix from Martin Valldeby ) 1.020, 12-Aug-98 overview: lots of bug fixes and new back-ends - bug fix: maximum_to_retrieve now works for very small values. (Problem identified by Vidyut Luther .) - new back-ends: ExciteForWebServers, FolioViews, Livelink, MSIndexServer, Null, Search97 all from Paul Lindner (thanks!) 
- bug fix: Gopher, PLweb, SFgate, Simple, Verity from Paul Lindner - bug fix: Lycos from John Heidemann - new test suites: PLweb, FolioViews, Null, MSIndexServer, Search97, SFgate, ExciteForWebServers rom Paul Lindner - bug fix: HotBot repair from Martin Thurn 1.021, 27-Aug-98 overview: a general release - new: Windows installation is now supported by Jim Smyser ; please see his web page for details. - new: MacPerl should now be supported. Thanks to Chris Nandor for the problem and a fix. - bug fix: Infoseek, WebCrawler, Dejanews, HotBot by Martin Thurn - bug fix: AltaVista approx_count bug found by Darren Stalder - bug fix: documentation cleanups from Neil Bowers 1.022, 16-Oct-98 overview: An interim release to fix several broken back-ends. - bug fix: documentation cleanups from Ave Wrigley - bug fix: Infoseek updates (from Martin Thurn) - bug fix: AltaVista update (minor format changes Oct. 1998, partial fix from Andreas Borchert) - new: back ends for Crawler, Fireball, NorthernLight from Andreas Borchert 1.024, 11-Dec-98 overview: primarily bug fixes for back ends - new: proxy support added to WebSearch and AutoSearch (based on code from Paul Linder) - new: new back end for Snap.com (from Jim Smyser) - bug fix: Yahoo, HotBot, Excite, Lycos (from Martin Thurn), NorthernLight (from Jim Smyser) WWW-Search-2.517/MANIFEST0000644000175000017500000000140213101353037014107 0ustar martinmartinChanges inc/Module/Install.pm inc/Module/Install/Base.pm inc/Module/Install/Can.pm inc/Module/Install/Fetch.pm inc/Module/Install/Makefile.pm inc/Module/Install/Metadata.pm inc/Module/Install/Scripts.pm inc/Module/Install/Win32.pm inc/Module/Install/WriteAll.pm lib/WWW/Search.pm lib/WWW/Search/Null.pm lib/WWW/Search/Null/Count.pm lib/WWW/Search/Null/Empty.pm lib/WWW/Search/Null/Error.pm lib/WWW/Search/Null/NoVersion.pm lib/WWW/Search/Result.pm lib/WWW/Search/Simple.pm lib/WWW/Search/Test.pm lib/WWW/SearchResult.pm LICENSE Makefile.PL MANIFEST This list of files META.yml 
Programs/AutoSearch Programs/WebSearch README README.md t/0_versions.t t/autosearch.t t/null-count.t t/null-empty.t t/null-error.t t/pod-coverage.t t/pod.t t/test.t t/test_parsing.pl t/use.t WWW-Search-2.517/LICENSE0000644000175000017500000000000013101347451013757 0ustar martinmartinWWW-Search-2.517/lib/0000755000175000017500000000000013101353062013525 5ustar martinmartinWWW-Search-2.517/lib/WWW/0000755000175000017500000000000013101353062014211 5ustar martinmartinWWW-Search-2.517/lib/WWW/Search/0000755000175000017500000000000013101353062015416 5ustar martinmartinWWW-Search-2.517/lib/WWW/Search/Null.pm0000644000175000017500000000262313101347451016676 0ustar martinmartin =head1 NAME WWW::Search::NULL - class for searching any web site =head1 SYNOPSIS require WWW::Search; $search = new WWW::Search('Null'); =head1 DESCRIPTION This class is a specialization of WWW::Search that only returns an error message. This class exports no public interface; all interaction should be done through WWW::Search objects. This modules is really a hack for systems that want to include indices that have no corresponding WWW::Search module (like UNIONS) =head1 AUTHOR C is written by Paul Lindner, =head1 COPYRIGHT Copyright (c) 1998 by the United Nations Administrative Committee on Coordination (ACC) All rights reserved. =cut package WWW::Search::Null; use strict; use warnings; use base 'WWW::Search'; use Carp (); use WWW::SearchResult; our $VERSION = do { my @r = (q$Revision: 1.6 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; sub _native_setup_search { my $self = shift; my ($native_query, $native_opt) = @_; my $native_url; $self->{_next_to_retrieve} = 0; $self->{_base_url} = $self->{_next_url} = $native_url; } # _native_setup_search sub _native_retrieve_some { my $self = shift; # Null search just returns an error.. 
C<WWW::Search::Simple> is written by Paul Lindner,
$self->user_agent(); $self->{_next_to_retrieve} = 0; $self->{_base_url} = $self->{_next_url} = $native_url; } # _native_setup_search sub _native_retrieve_some { my ($self) = @_; my ($hit) = (); my ($hits_found) = 0; # fast exit if already done return undef if (!defined($self->{_next_url})); # get some print "GET " . $self->{_next_url} . "\n" if ($debug); my($response) = $self->http_request($self->{search_method}, $self->{_next_url}); $self->{response} = $response; if (!$response->is_success) { print "Some problem\n" if ($debug); return undef; } my $score = 800; my $results = $response->content(); my($h) = new HTML::TreeBuilder; $h->parse($results); for (@{ $h->extract_links(qw(a)) }) { my($link, $linkelem) = @$_; my($linkobj) = new URI::URL $link, $self->{_next_url}; print "Fixing $link\n" if ($debug); my($hit) = new WWW::SearchResult; $hit->add_url($linkobj->abs->as_string()); $hit->title(join(' ',@{$linkelem->content})); $hit->score($score); $hit->normalized_score($score); if ($hit->title !~ /HASH\(0x/) { $hits_found++; push(@{$self->{cache}}, $hit); } # if $score = int ($score * .95); } # for $self->approximate_result_count($hits_found); $self->{_next_url} = undef; return($hits_found); } # _native_retrieve_some 1; WWW-Search-2.517/lib/WWW/Search/Test.pm0000644000175000017500000005761413101347451016715 0ustar martinmartin# $rcs = ' $Id: Test.pm,v 2.293 2015-06-06 20:55:31 Martin Exp $ ' ; =head1 NAME WWW::Search::Test - utilities to aid in testing WWW::Search backends =head1 SYNOPSIS $oTest = new WWW::Search::Test('HotBot,Yahoo,Excite'); $oTest->test('HotBot', 'Kingpin', 'one', $sQuery, $TEST_RANGE, 1, 10); =head1 DESCRIPTION See file test.pl in the WWW-Search-HotBot distribution for a detailed "real-world" example. 
=head1 METHODS AND FUNCTIONS =cut package WWW::Search::Test; use strict; use warnings; use Bit::Vector; use Carp; use Config; use Cwd; use Data::Dumper; # for debugging only use Date::Manip; use base 'Exporter'; use File::Path; use File::Slurp; use File::Spec::Functions qw( :ALL ); use Test::More; use WWW::Search; use vars qw( $MODE_DUMMY $MODE_INTERNAL $MODE_EXTERNAL $MODE_UPDATE ); use vars qw( $TEST_DUMMY $TEST_EXACTLY $TEST_BY_COUNTING $TEST_GREATER_THAN $TEST_RANGE ); use vars qw( $iTest $oSearch $sEngine ); # If set, will be used as a filename to save HTML when a test fails: use vars qw( $sSaveOnError ); use vars qw( @EXPORT ); @EXPORT = qw( eval_test test no_test not_working not_working_with_tests not_working_and_abandoned $MODE_DUMMY $MODE_INTERNAL $MODE_EXTERNAL $MODE_UPDATE $TEST_DUMMY $TEST_EXACTLY $TEST_BY_COUNTING $TEST_GREATER_THAN $TEST_RANGE new_engine run_test run_gui_test skip_test count_results tm_new_engine tm_run_test tm_run_test_no_approx ); use vars qw( $VERSION $bogus_query $websearch ); $VERSION = do { my @r = (q$Revision: 2.293 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; $bogus_query = "Bogus" . $$ . "NoSuchWord" . time; ($MODE_DUMMY, $MODE_INTERNAL, $MODE_EXTERNAL, $MODE_UPDATE) = qw(dummy internal external update); ($TEST_DUMMY, $TEST_EXACTLY, $TEST_BY_COUNTING, $TEST_GREATER_THAN, $TEST_RANGE) = (1..10); use constant DEBUG => 0; =head2 find_websearch Returns the full path of an executable WebSearch program, or undef if none can be found. =cut sub find_websearch { unless ($websearch) { # Try to find a working WebSearch: my $sProg = 'WebSearch'; my @asTry = ( $sProg ); # Try local directory, in case . 
is not in the path: push @asTry, catfile(curdir, $sProg); push @asTry, catfile(qw( blib script ), $sProg); # See if WebSearch.BAT has been created/installed, and try it with # explicit 'perl' in front: push @asTry, map { ("$_.bat", "$Config{perlpath} $_") } @asTry; DEBUG && print STDERR Dumper(\@asTry); WEBSEARCH_TRY: foreach my $sTry (@asTry) { my $sCmd = "$sTry --VERSION"; DEBUG && print STDERR " + W::S::T::find_websearch() cmd ==$sCmd==\n"; # Turn off warnings: local $^W = 0; # Wrap it in an eval so we don't die if it fails: my @as = split(/\s/, eval{`$sCmd`}); $websearch = shift(@as) || undef; last WEBSEARCH_TRY if $websearch; } # foreach # Prevent undef warning: $websearch ||= ''; undef $websearch unless ($websearch =~ m/WebSearch/); # print STDERR "in WWW::Search::Test, websearch is $websearch\n"; } # unless return $websearch; } # find_websearch =head2 new Create a new WWW::Search::Test object. All arguments are strings, names of backends that this object will be able to test. If no arguments are given, will be able to test all backends. =cut sub new { my $class = shift; my $sEngines = join(',', '', @_, ''); return bless { debug => 0, engines => $sEngines, error_count => 0, mode => $MODE_DUMMY, verbose => 0, # websearch => $websearch, # why do we need this? }, $class; } # new =head2 mode Set / get the test mode of this object. If an argument is given, sets the mode to that value. Returns the current (or newly set) value. There are three test modes available. They are: $MODE_INTERNAL: parse URLs out of saved pages (as a sanity check or regression test); $MODE_EXTERNAL: send the query to the search engine "live", parse the results, and compare them to the previously saved results; and $MODE_UPDATE: send the query to the search engine "live", parse the results, and save them for future testing. 
=cut sub mode { my $self = shift; my $new_mode = shift; if ($new_mode) { $self->{'mode'} = $new_mode; } return $self->{'mode'}; } # mode =head2 relevant_test Given the name of a backend, returns true if this Test object is able to test that backend. =cut sub relevant_test { my $self = shift; return 1 if ($self->{engines} eq ',,'); my $e = ','.shift().','; # print STDERR " + relevant_test($e|", $self->{engines}, ")\n"; return ($self->{engines} =~ m/$e/); } # relevant_test =head2 eval_test Given the name of a backend, grabs the $TEST_CASES variable from that backend and evaluates it. =cut sub eval_test { my $self = shift; my $sSE = shift; return unless $self->relevant_test($sSE); my $o = new WWW::Search($sSE); my $iVersion = $o->version; my $code = $o->test_cases; $code ||= ''; unless ($code ne '') { print " $sSE version $iVersion contains no TEST_CASES\n"; $self->{error_count}++; } # print STDERR " BEFORE SUBST: $code\n"; $code =~ s!&test\(!\$self->test\(!g; $code =~ s/&no_test\(/\$self->no_test\(/g; $code =~ s/¬_working\(/\$self->not_working\(/g; $code =~ s/¬_working_and_abandoned\(/\$self->not_working_and_abandoned\(/g; $code =~ s/¬_working_with_tests\(/\$self->not_working_with_tests\(/g; # print STDERR " AFTER SUBST: $code\n"; print "\n"; # put a little space between each engine's results eval $code; warn $@ if $@; } # eval_test =head2 test Run test(s) for a backend. Arguments are, in order: name of a backend to test (string, required); name of backend maintainer (string, if undef $backend::MAINTAINER will be used); filename for results storage/comparison (string, required); query to be sent to backend (string, required); test method (required, one of the following). 
Several test methods are possible: $TEST_EXACTLY: list of URLs must match exactly (line for line, in order); $TEST_BY_COUNTING: test passes if number of resulting URLs is equal; $TEST_GREATER_THAN: test passes if we get more than N result URLs; and $TEST_RANGE: like $TEST_GREATER_THAN but constrained on both ends. =cut sub test { my $self = shift; my $sSE = shift; my $sM = shift; my $file = shift; my $query = shift; my $test_method = shift; print STDERR " + test($sSE,$sM,$file,$query,$test_method)\n" if $self->{debug}; my ($low_end, $high_end) = @_; $low_end ||= 0; $high_end ||= 0; my $sExpected = $low_end; if ($test_method == $TEST_GREATER_THAN) { $low_end++; $sExpected = "$low_end.."; } if (0 < $high_end) { $sExpected = "$low_end..$high_end"; } return if (!$self->relevant_test($sSE)); print " trial $file (", $self->{'mode'}, ")\n"; if (($self->{'mode'} eq $MODE_INTERNAL) && ($query =~ m/$bogus_query/)) { print " skipping test on this platform.\n"; return; } # if my $pwd = curdir(); my @asSE = split(/::/, $sSE); my $path = catdir($pwd, 'Test-Pages', @asSE); mkpath $path; if ($self->{'mode'} eq $MODE_UPDATE) { # Delete all existing test result files for this Engine: opendir DIR, $path; foreach my $afile (readdir DIR) { unlink catfile($path, $afile) if $afile =~ m/^$file/; } # foreach closedir DIR; } # if MODE_UPDATE # Look at the filename argument we got: my ($v,$d,$f) = splitpath($file); # If it contains no path element (file name only): if ($d eq '') { # Prepend path onto file: $file = catfile($path, $file); } # if my $o = new WWW::Search($sSE); my $version = $o->version; print " ($sSE $version, $sM)\n"; print STDERR " expect to find results in $file\n" if $self->{debug}; my %src = ( $MODE_INTERNAL => "--option search_from_file=$file", $MODE_EXTERNAL => '', $MODE_UPDATE => "--option search_to_file=$file", ); # --max 209 added by Martin Thurn 1999-09-27. We never want to # fetch more than three pages, if we can at all help it (or do we?) 
my $websearch = &find_websearch; $websearch ||= catfile($pwd, 'blib', 'script', 'WebSearch'); my $cmd = $Config{'perlpath'} . " -MExtUtils::testlib $websearch "; $cmd .= $self->{debug} ? '--debug '.$self->{debug} : ''; $cmd .= " --max 209 --engine $sSE ". $src{$self->{'mode'}} ." -- $query"; print " $cmd\n" if ($self->{verbose} || $self->{debug}); open(TRIALSTREAM, "$cmd|") || die "$0: cannot run test ($!)\n"; open(TRIALFILE, ">$file.trial") || die "$0: cannot open $file.trial for writing ($!)\n"; open(OUTFILE, ">$file.out") || die "$0: cannot open $file.out for writing ($!)\n" if ($self->{'mode'} eq $MODE_UPDATE); my $iActual = 0; while () { print TRIALFILE $_; $iActual++; print OUTFILE $_ if ($self->{'mode'} eq $MODE_UPDATE); } close TRIALSTREAM; close TRIALFILE; if ($self->{'mode'} eq $MODE_UPDATE) { close OUTFILE; if (open TS, ">$file.README") { print TS "This set of test-result pages was created on ", scalar(localtime(time)), "\n"; close TS; } # if my $iPageCount = &wc_l($file); my $iURLCount = &wc_l("$file.out"); print " $query --> $iURLCount urls (should be $sExpected) on $iPageCount pages\n"; return; } # if if (-f "$file.out") { my ($e, $sMsg) = (0, ''); if ($test_method == $TEST_GREATER_THAN) { if ($iActual <= $low_end) { $sMsg .= "expected more than $low_end, but got $iActual; "; $e = 1; } } # TEST_GREATER_THAN elsif ($test_method == $TEST_RANGE) { $sMsg .= "INTERNAL ERROR, low_end has no value; " unless defined($low_end); $sMsg .= "INTERNAL ERROR, high_end has no value; " unless defined($high_end); $sMsg .= "INTERNAL ERROR, high_end is zero; " unless 0 < $high_end; if ($iActual < $low_end) { $sMsg .= "expected $low_end..$high_end, but got $iActual; "; $e = 1; } if ($high_end < $iActual) { $sMsg .= "expected $low_end..$high_end, but got $iActual; "; $e = 1; } } # TEST_RANGE elsif ($test_method == $TEST_EXACTLY) { $e = &diff("$file.out", "$file.trial") ? 
1 : 0; } # TEST_EXACTLY elsif ($test_method == $TEST_BY_COUNTING) { my $iExpected = shift; my $iActual = &wc_l("$file.trial"); if ($iActual != $iExpected) { $sMsg .= "expected $iExpected, but got $iActual; "; $e = 1; } } else { $e = 0; $sMsg = "INTERNAL ERROR, unknown test method $test_method; "; } if ($e == 0) { print " ok.\n"; unlink("$file.trial"); # clean up } elsif ($e == 1) { print "DIFFERENCE DETECTED: $query --> $sMsg\n"; $self->{error_count}++; } else { print "INTERNAL ERROR $query --> e is $e.\n"; $self->{error_count}++; } } else { print "NO SAVED OUTPUT, can not evaluate test results.\n"; $self->{error_count}++; } } # test =head2 no_test Prints a message stating that this backend does not have a test suite. Takes two arguments, the backend name and the name of the maintainer. =cut sub no_test { my $self = shift; my ($engine, $maint) = @_; return unless ($self->relevant_test($engine)); print <<"NONE"; trial none ($engine) This search engine does not have any tests, but report problems with it to $maint. NONE } # no_test =head2 not_working Prints a message stating that this backend is known to be broken. Takes two arguments, the backend name and the name of the maintainer. =cut sub not_working { my $self = shift; my ($engine, $maint) = @_; return unless ($self->relevant_test($engine)); print <<"BROKEN"; trial none ($engine) This search engine is known to be non-functional. You are encouraged to investigate the problem and email its maintainer, $maint. BROKEN } # not_working =head2 not_working_with_tests Prints a message stating that this backend is known to be broken even though it has a test suite. Takes two arguments, the backend name and the name of the maintainer. =cut sub not_working_with_tests { my $self = shift; my ($engine, $maint) = @_; return if (!$self->relevant_test($engine)); print <<"KNOWNFAILURE"; trial none ($engine) Test cases for this search engine are known to fail. 
(I.e. simulate running the UNIX command C<wc -l> on a file)
like the UNIX command diff, but just reports true or false) =cut sub diff { open DIFF1, shift or return 91; open DIFF2, shift or return 92; my $iResult = 0; $/ = "\n"; while ((defined(my $s1 = )) && ($iResult ne 1)) { my $s2 = ; unless (defined($s2)) { $iResult = 1; last; } chomp $s1; chomp $s2; if ($s1 ne $s2) { $iResult = 1; last; } } # while close DIFF1; close DIFF2; return $iResult; } # diff =head2 Shortcuts for running backend tests WWW::Search::Test keeps its own count of test numbers, so if you want to mix-and-match these functions with your own tests, use the $WWW::Search::Test::iTest counter. =head2 new_engine One argument: the name of a backend suitable to be passed to WWW::Search::new(). Prints 'ok' or 'not ok' and the test number. Creates a WWW::Search object internally, to be used for all subsequent calls to run_test and run_gui_test (see below). =cut sub new_engine { $iTest++; $sEngine = shift; $oSearch = new WWW::Search($sEngine); print ref($oSearch) ? '' : 'not '; print "ok $iTest\n"; $oSearch->env_proxy('yes'); } # new_engine =head2 tm_new_engine Same as new_engine(), but uses Test::More instead of just printing 'ok'. =cut sub tm_new_engine { my $sEngine = shift; $oSearch = new WWW::Search($sEngine); Test::More::ok(ref($oSearch), "instantiate WWW::Search::$sEngine object"); $oSearch->env_proxy('yes'); } # tm_new_engine =head2 run_test Three arguments: a query string, NOT escaped; a minimum number of expected results; and a maximum number of expected results. Optional fourth argument: integer value to be used as the search_debug. Optional fifth argument: send any true value to dump the search results. Optional sixth argument: reference to hash of search options (see backend documentation). Optional seventh argument: send any true value to NOT escape the query string. If the minimum is undef, assumes zero. If the maximum is undef, does not check. Prints 'ok' or 'not ok' and the test number. 
=cut sub run_test { return &_run_our_test('normal', @_); } # run_test =head2 run_gui_test Same as run_test(), but calls gui_query() instead of native_query(). =cut sub run_gui_test { return &_run_our_test('gui', @_); } # run_gui_test =head2 tm_run_test Same as run_test(), but uses Test::More rather than just printing 'ok'. Note: If you use this function inside a TODO block, you must set global variable $TODO rather than a local $TODO, and you must set the global $TODO back to empty-string (or undef) at the end of your TODO block. For example: TODO: { $TODO = 'I have not fixed this yet'; tm_run_test(...); $TODO = ''; } # end of TODO block =cut sub tm_run_test { _tm_run_test(@_, 1); } # tm_run_test sub _tm_run_test { # Last argument is boolean, whether to check approx_result_count: my $iApprox = pop(@_) || 0; # Remaining args, same as count_results(): my ($sType, $sQuery, $iMin, $iMax) = @_; my $iCount = count_results(@_); my $iAnyFailure = 0; $iAnyFailure++ unless Test::More::is($oSearch->response->code, 200, 'got valid HTTP response'); if (defined $iMin) { $iAnyFailure++ unless Test::More::cmp_ok($iMin, '<=', $iCount, qq{lower-bound num-hits for query=$sQuery}); if ($iApprox) { $iAnyFailure++ unless Test::More::cmp_ok($iMin, '<=', $oSearch->approximate_result_count, qq{lower-bound approximate_result_count}); } # if } # if if (defined $iMax) { $iAnyFailure++ unless Test::More::cmp_ok($iCount, '<=', $iMax, qq{upper-bound num-hits for query=$sQuery}); if ($iApprox) { $iAnyFailure++ unless Test::More::cmp_ok($oSearch->approximate_result_count, '<=', $iMax, qq{upper-bound approximate_result_count}); } # if } # if $sSaveOnError ||= q''; if ($iAnyFailure && ($sSaveOnError ne q'')) { write_file($sSaveOnError, { err_mode => 'quiet'}, $oSearch->response->content); Test::More::diag(qq'HTML was saved in $sSaveOnError'); } # if } # _tm_run_test =head2 tm_run_test_no_approx Same as tm_run_test, but does NOT check the approximate_result_count. 
# User said upper limit is 'undef'; just make sure we get the # minimum:
', $oResult->url, "\n"; foreach my $sField (qw( title description score change_date index_date size company location source )) { print " $sField==", $oResult->$sField, "==\n" if defined($oResult->$sField); } # foreach } # foreach } # if return scalar(@aoResults); } # count_results sub _run_our_test { my ($sType, $sQuery, $iMin, $iMax, $iDebug, $iPrintResults) = @_; my $iResults = &count_results(@_); my $sExpect; if (! defined($iMax)) { $sExpect = "more than $iMin"; } elsif (! defined($iMin)) { $sExpect = "fewer than $iMax"; } else { $sExpect = "$iMin..$iMax"; } $iMax = 999999 unless defined ($iMax); if (($iResults < $iMin) || ($iMax < $iResults)) { print STDERR " --- got $iResults results for $sType $sEngine query '$sQuery', but expected $sExpect\n"; print STDOUT 'not '; } # if print STDOUT "ok $iTest\n"; } # _run_our_test =head2 skip_test You can call this function instead of run_test() or run_gui_test() if the current test must be skipped for any reason. =cut sub skip_test { $iTest++; print STDOUT "skip $iTest\n"; } # skip_test =head2 test_most_results Given an arrayref of things to test, runs all those things against all the results of the most-recently executed test search. 
=cut sub test_most_results { my $rara = shift; my $fPct = shift || 0.80; my $iCount = scalar(@$rara); my $iAnyFailed = my $iResult = 0; my %hioExemplar; my %hiiFailed; # Create a bit vector large enough to hold one bit for each test: my $oV = new Bit::Vector($iCount); # Turn on all the bits (we will turn off bits when tests fail): $oV->Fill; my $iVall = $oV->to_Dec; my $sCodeAll = q{}; my $iTest = 0; TEST: foreach my $ra (@$rara) { # print STDERR " DDD ra is ", Dumper($ra); my ($sField, $sCmp, $sValue, $sDesc) = @$ra; $sDesc ||= qq{test #$iTest}; my $sCode; if ($sCmp eq 'like') { $sCode = "(\$oResult->$sField =~ m!$sValue!)"; } # if elsif ($sCmp eq 'unlike') { $sCode = "(\$oResult->$sField !~ m!$sValue!)"; } # if elsif ($sCmp eq 'date') { $sCode = "((ParseDate(\$oResult->$sField) || '') ne q{})"; } # if else { $sCode = "(\$oResult->$sField $sCmp $sValue)"; } $sCode = <<"ENDCODE"; if (! $sCode) { \$oV->Bit_Off($iTest); \$hiiFailed{'$sDesc'}++; } # if ENDCODE $sCodeAll .= $sCode; $iTest++; } # foreach TEST $sCodeAll .= "1;\n"; # print STDERR " DDD the test is ===$sCodeAll===\n"; RESULT: foreach my $oResult ($oSearch->results()) { $iResult++; # Turn on all the bits (we will turn off bits when tests fail): $oV->Fill; # print STDERR " DDD eval the test...\n"; if (! eval $sCodeAll) { print STDERR $@; } # if # Now look at the value of the Bit::Vector after running the tests: my $iV = $oV->to_Dec; if ($iV < $iVall) { # At least one of the bits got turned off (i.e. a test failed): $hioExemplar{$iV} = $oResult; $iAnyFailed++; # For debugging: # print STDERR Dumper($oResult); # last RESULT; } # if } # foreach RESULT ok($iResult, qq{got more than zero results ($iResult, to be exact)}); # Now make sure all the sub-tests passed at least N% of the time. 
# We only need to look at sub-tests that had any failures (sub-tests # with no failures are 100% correct, so there's no need to check # them 8-) while (my ($sItem, $iFailed) = each %hiiFailed) { my $fPctFailed = ($iFailed / $iResult); ok($fPctFailed < (1 - $fPct), sprintf(qq{%0.1f%% of '%s' tests failed}, $fPctFailed * 100, $sItem)); } # while if ($iAnyFailed) { Test::More::diag(" Here are result(s) that exemplify test failure(s):"); foreach my $oResult (values %hioExemplar) { Test::More::diag(Dumper($oResult)); } # while } # if } # test_most_results 1; __END__ WWW-Search-2.517/lib/WWW/Search/Null/0000755000175000017500000000000013101353062016330 5ustar martinmartinWWW-Search-2.517/lib/WWW/Search/Null/Empty.pm0000644000175000017500000000253713101347451020000 0ustar martinmartin# $Id: Empty.pm,v 1.10 2010-12-02 23:45:57 Martin Exp $ =head1 NAME WWW::Search::Null::Empty - class for testing WWW::Search clients =head1 SYNOPSIS use WWW::Search; my $oSearch = new WWW::Search('Null::Empty'); $oSearch->native_query('Makes no difference what you search for...'); my @aoResults = $oSearch->results; # You get no results... my $oResponse = $oSearch->response; # ...But you get an HTTP::Response object with a code of 200 =head1 DESCRIPTION This class is a specialization of WWW::Search that returns no hits, but no error message. This module might be useful for testing a client program without actually being connected to any particular search engine. =head1 AUTHOR Martin 'Kingpin' Thurn, C, L. 
=cut package WWW::Search::Null::Empty; use strict; use warnings; use base 'WWW::Search'; our $VERSION = do { my @r = (q$Revision: 1.10 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; our $MAINTAINER = q{Martin Thurn }; sub _native_setup_search { my($self, $native_query, $native_opt) = @_; } # native_setup_search sub _native_retrieve_some { my $self = shift; my $response = new HTTP::Response(200, "This is a test of WWW::Search"); $self->{response} = $response; return 0; } # native_retrieve_some 1; __END__ WWW-Search-2.517/lib/WWW/Search/Null/NoVersion.pm0000644000175000017500000000115413101347451020616 0ustar martinmartin# $Id: NoVersion.pm,v 1.5 2010-12-02 23:45:57 Martin Exp $ =head1 NAME WWW::Search::Null::NoVersion - class for testing WWW::Search =head1 SYNOPSIS use WWW::Search; my $oSearch = new WWW::Search('Null::NoVersion'); =head1 DESCRIPTION This class is a specialization of WWW::Search that has no $VERSION. This module is for testing the WWW::Search module. =head1 AUTHOR Martin 'Kingpin' Thurn, C, L. =cut package WWW::Search::Null::NoVersion; use strict; use warnings; use base 'WWW::Search'; our $MAINTAINER = q{Martin Thurn }; 1; __END__ WWW-Search-2.517/lib/WWW/Search/Null/Error.pm0000644000175000017500000000246713101347451017775 0ustar martinmartin # $Id: Error.pm,v 1.12 2010-12-02 23:45:57 Martin Exp $ =head1 NAME WWW::Search::Null::Error - class for testing WWW::Search clients =head1 SYNOPSIS require WWW::Search; my $oSearch = new WWW::Search('Null::Error'); $oSearch->native_query('Makes no difference what you search for...'); $oSearch->retrieve_some(); my $oResponse = $oSearch->response; # You get an HTTP::Response object with a code of 500 =head1 DESCRIPTION This class is a specialization of WWW::Search that only returns an error message. This module might be useful for testing a client program without actually being connected to any particular search engine. =head1 AUTHOR Martin 'Kingpin' Thurn, C, L. 
=cut package WWW::Search::Null::Error; use strict; use warnings; use base 'WWW::Search'; our $VERSION = do { my @r = (q$Revision: 1.12 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; our $MAINTAINER = q{Martin Thurn }; sub _native_setup_search { my($self, $native_query, $native_opt) = @_; } # native_setup_search sub _native_retrieve_some { my $self = shift; my $response = new HTTP::Response(500, "This is a test of WWW::Search"); $self->{response} = $response; return undef; } # native_retrieve_some 1; __END__ WWW-Search-2.517/lib/WWW/Search/Null/Count.pm0000644000175000017500000000616713101347451017775 0ustar martinmartin # $Id: Count.pm,v 1.17 2010-12-02 23:45:57 Martin Exp $ =head1 NAME WWW::Search::Null::Count - class for testing WWW::Search clients =head1 SYNOPSIS use WWW::Search; my $iCount = 4; my $oSearch = new WWW::Search('Null::Count', '_null_count' => $iCount, ); $oSearch->native_query('Makes no difference what you search for...'); my @aoResults = $oSearch->results; # ...You get $iCount results. =head1 DESCRIPTION This class is a specialization of WWW::Search that returns some hits, but no error message. The number of hits returned can be controlled by adding a '_null_count' hash entry onto the call to WWW::Search::new(). The default is 5. This module might be useful for testing a client program without actually being connected to any particular search engine. =head1 AUTHOR Martin 'Kingpin' Thurn, C, L. =cut package WWW::Search::Null::Count; use strict; use warnings; use WWW::Search; use WWW::Search::Result; use base 'WWW::Search'; our $VERSION = do { my @r = (q$Revision: 1.17 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; our $MAINTAINER = q{Martin Thurn }; use constant DEBUG_FUNC => 0; sub _native_setup_search { my ($self, $native_query, $native_opt) = @_; # print STDERR " FFF ::Null::Count::_native_setup_search()\n" if (DEBUG_FUNC || $self->{_debug}); if (! 
defined $self->{_null_count}) { # print STDERR " + setting default _null_count to 5\n"; $self->{_null_count} = 5; } # if $self->{_allow_empty_query} = 1; } # _native_setup_search sub _native_retrieve_some { my $self = shift; # print STDERR " FFF ::Null::Count::_n_r_s()\n" if (DEBUG_FUNC || $self->{_debug}); my $response = new HTTP::Response(200, "This is a test of WWW::Search"); $self->{response} = $response; my $iCount = $self->{_null_count}; # print STDERR " + iCount is $iCount\n"; $self->_elem('approx_count', $iCount); for my $i (1..$iCount) { my $oResult = new WWW::Search::Result; $oResult->url(qq{url$i}); $oResult->title(qq{title$i}); $oResult->description("description$i"); $oResult->change_date("yesterday"); $oResult->index_date("today"); $oResult->raw(qq{}); $oResult->score(100-$i*2); $oResult->normalized_score(1000-$i*20); $oResult->size($i*2*1024); $oResult->source('WWW::Search::Null::Count'); $oResult->company('Dub Dub Dub Search, Inc.'); $oResult->location('Ashburn, VA'); if ($i % 2) { $oResult->urls("url$i", map { "url$i.$_" } (1..$iCount)); $oResult->related_urls(map { "url-r$i.$_" } (1..$iCount)); my @aoTitles = map { "title-r$i.$_" } (1..$iCount); $oResult->related_titles(\@aoTitles); } else { for my $j (1..$iCount) { $oResult->add_url(qq{url$i.$j}); $oResult->add_related_url(qq{url-r$j}); $oResult->add_related_title(qq{title-r$i}); } # for $j } # else push(@{$self->{cache}}, $oResult); } # for $i return 0; } # _native_retrieve_some 1; __END__ WWW-Search-2.517/lib/WWW/Search/Result.pm0000644000175000017500000000067313101347451017245 0ustar martinmartin # $Id: Result.pm,v 1.5 2007-11-12 01:13:49 Daddy Exp $ =head1 NAME WWW::Search::Result - class for results returned from WWW::Search =head1 DESCRIPTION This module is just a synonym for L =head1 AUTHOR Martin Thurn =cut package WWW::Search::Result; use strict; use warnings; use base 'WWW::SearchResult'; our $VERSION = do { my @r = (q$Revision: 1.5 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; 1; 
__END__ WWW-Search-2.517/lib/WWW/SearchResult.pm0000644000175000017500000002211213101347451017156 0ustar martinmartin# SearchResult.pm # by John Heidemann # Copyright (C) 1996 by USC/ISI # $Id: SearchResult.pm,v 2.78 2008-07-21 01:20:30 Martin Exp $ # # Copyright (c) 1996 University of Southern California. # All rights reserved. # # Redistribution and use in source and binary forms are permitted # provided that the above copyright notice and this paragraph are # duplicated in all such forms and that any documentation, advertising # materials, and other materials related to such distribution and use # acknowledge that the software was developed by the University of # Southern California, Information Sciences Institute. The name of the # University may not be used to endorse or promote products derived from # this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED # WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. =head1 NAME WWW::SearchResult - class for results returned from WWW::Search =head1 SYNOPSIS require WWW::Search; require WWW::SearchResult; $search = new WWW::Search; $search->native_query(WWW::Search::escape_query($query)); # Get first result: $result = $search->next_result(); =head1 DESCRIPTION A framework for returning the results of C. =head1 SEE ALSO L =head1 REQUIRED RESULTS The particular fields returned in a result are backend- (search engine-) dependent. However, all search engines are required to return a url and title. (This list may grow in the future.) 
=head1 METHODS AND FUNCTIONS

=cut

#####################################################################

package WWW::SearchResult;

use strict;
use warnings;

use CGI;
use base 'LWP::MemberMixin';

our $VERSION = do{ my @r = (q$Revision: 2.78 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r};

=head2 new

To create a new WWW::SearchResult, call

    $result = new WWW::SearchResult();

=cut

sub new
  {
  my $class = shift;
  my $self = bless { }, $class;
  # FIX: was "$self->{urls} = ();" -- a list assignment to a scalar
  # hash slot, which just stored undef and made urls() die ("Can't use
  # an undefined value as an ARRAY reference") when called in list
  # context on a fresh result.  Initialize with a real empty arrayref:
  $self->{urls} = [];
  return $self;
  } # new

=head2 url

Returns the primary URL.  Note that there may be a list of urls, see
also methods C<urls> and C<add_url>.  Nothing special is guaranteed
about the primary URL other than that it is the first one returned by
the back end.  Every result is required to have at least one URL.

=cut

sub url
  {
  my $self = shift;
  if (@_ < 1)
    {
    # No arguments, just return the current value:
    return ${$self->{urls}}[0];
    } # if no args
  # An argument was given: it becomes the new primary URL (the
  # previous primary URL, if any, stays in the list after it):
  unshift @{$self->{urls}}, $_[0];
  return $self->{urls}->[0];
  } # url

# _elem_array: generic get/set for list-valued attributes.
# With no args, returns the list (list context) or the arrayref
# (scalar context).  With an arrayref arg, installs it as-is; with one
# or more scalars, replaces the list with those values.
sub _elem_array
  {
  my $self = shift;
  my $elem = shift;
  if (@_ < 1)
    {
    # No arguments
    return wantarray ? @{$self->{$elem}} : $self->{$elem};
    } # if
  if (ref($_[0]))
    {
    # Trying to assign an arrayref:
    $self->{$elem} = $_[0];
    }
  else
    {
    # Trying to set to a scalar (or list of scalars); make sure it's
    # an array even if they give one element:
    $self->{$elem} = undef;
    push @{$self->{$elem}}, @_;
    }
  # Always return array reference:
  return $self->{$elem};
  } # _elem_array

# _add_elem_array: append one or more values to a list-valued attribute.
sub _add_elem_array
  {
  my $self = shift;
  my $elem = shift;
  # No matter how many they're adding:
  push(@{$self->{$elem}}, @_);
  } # _add_elem_array

=head2 add_url

Add a URL to the list.

=cut

sub add_url { return shift->_add_elem_array('urls', @_); }

=head2 urls

Return a reference to the list of urls.
There is also a primary URL (C<url>).

=cut

sub urls { return shift->_elem_array('urls', @_); }

=head2 add_related_url

Add a URL to the related_url list.
=cut sub add_related_url { return shift->_add_elem_array('related_urls', @_); } =head2 related_urls Return a reference to the list of related urls. =cut sub related_urls { return shift->_elem_array('related_urls', @_); } =head2 add_related_title Add a title to the list or related titles. =cut sub add_related_title { return shift->_add_elem_array('related_titles', @_); } =head2 related_titles Return a reference to the list of related titles. =cut sub related_titles { return shift->_elem_array('related_titles', @_); } =head2 title, description, score, change_date, index_date, size, raw Set or get attributes of the result. None of these attributes is guaranteed to be provided by a given backend. If an attribute is not provided its method will return C. Typical contents of these attributes: =over 8 =item title The title of the hit result (typically that provided by the 'TITLE' HTML tag). =cut sub title { return shift->_elem('title', @_); } =item description A brief description of the result, as provided (or not) by the search engine. Often the first few sentences of the document. =cut sub description { return shift->_elem('description', @_); } =item source Source is either the base url for this result (as listed on the search engine's results page) or another copy of the full url path of the result. It might also indicate the source site name or address whence the result came, for example, 'CNN' or 'http://www.cnn.com' if the search result page said "found at CNN.com". This value is backend-specific; in fact very few backends set this value. =cut sub source { return shift->_elem('source', @_); } =item add_sources Same meaning as source above, for adding sources in case there are potentially multiple sources. =cut sub add_sources { return shift->_add_elem_array('sources', @_); } =item sources Returns a reference to the list of sources. =cut sub sources { return shift->_elem_array('sources', @_); } =item score A backend specific, numeric score of the search result. 
The exact range of scores is search-engine specific.  Usually larger
scores are better, but this is no longer required.  See
normalized_score for a backend-independent score.

=cut

sub score { return shift->_elem('score', @_); }

=item normalized_score

This is intended to be a backend-independent score of the search
result.  The range of this score is between 0 and 1000.  Higher values
indicate better quality results.

This is not really implemented since no one has created a
backend-independent ranking algorithm.

=cut

sub normalized_score { return shift->_elem('normalized_score', @_); }

=item change_date

When the result was last changed.  Typically this is the modification
time of the destination web page.

=cut

sub change_date { return shift->_elem('change_date', @_); }

=item index_date

When the search engine indexed the result.

=cut

sub index_date { return shift->_elem('index_date', @_); }

=item size

The approximate size of the result, in bytes.  This is only an
approximation because search backends often report the size as
"18.4K"; the best we can do with that number is return it as the value
of 18.4 * 1024.

=cut

sub size { return shift->_elem('size', @_); }

=item raw

The raw HTML for the entire result.  Raw should be exactly the raw
HTML for one entry.  It should not include list or table setup
commands (like ul or table tags), but it may include list item or
table data commands (like li, tr, or td).  Whether raw contains a list
entry, table row, br-separated lines, or plain text is search-engine
dependent.  In fact, many backends do not even return it at all.

=cut

sub raw { return shift->_elem('raw', @_); }

=item as_HTML

Convert the search result to a human-readable form, decorated with
HTML for pretty-printing.
=cut

sub as_HTML
  {
  my $self = shift;
  # Direct method call instead of indirect-object "new CGI" syntax:
  my $cgi = CGI->new;
  # Hyperlinked title, then <br>, then the description:
  my $s = $cgi->a({
                   href => $self->url,
                  },
                  $self->title || 'title unknown',
                 );
  $s .= $cgi->br;
  $s .= $self->description || 'no description available';
  return $s;
  } # as_HTML

=back

=head2 Others

More attributes of the result.
Backend-specific.
Refer to the documentation of each backend for details.

=over

=item bid_amount

=cut

sub bid_amount { return shift->_elem('bid', @_); }

=item bid_count

=cut

sub bid_count { return shift->_elem('bids', @_); }

=item bidder

=cut

sub bidder { return shift->_elem('bidder', @_); }

=item category

=cut

sub category { return shift->_elem('category', @_); }

=item company

=cut

sub company { return shift->_elem('company', @_); }

=item end_date

=cut

sub end_date { return shift->_elem('end_date', @_); }

=item image_url

=cut

sub image_url { return shift->_elem('image_url', @_); }

=item item_number

=cut

sub item_number { return shift->_elem('item_number', @_); }

=item location

=cut

sub location { return shift->_elem('location', @_); }

=item question_count

=cut

sub question_count { return shift->_elem('question_count', @_); }

=item seller

=cut

sub seller { return shift->_elem('seller', @_); }

=item shipping

=cut

sub shipping { return shift->_elem('shipping', @_); }

=item sold

=cut

sub sold { return shift->_elem('sold', @_); }

=item start_date

=cut

sub start_date { return shift->_elem('start_date', @_); }

=item thumb_url

=cut

sub thumb_url { return shift->_elem('thumb_url', @_); }

=item watcher_count

=cut

# BUGFIX: this accessor previously stored under the 'seller' hash key
# (copy-paste error from seller() above), so setting watcher_count
# silently clobbered the seller attribute.  Store under its own key:
sub watcher_count { return shift->_elem('watcher_count', @_); }

=back

=head1 AUTHOR

WWW::SearchResult was written by John Heidemann.

WWW::SearchResult is maintained by Martin Thurn.

=cut

1;

__END__
WWW-Search-2.517/lib/WWW/Search.pm0000644000175000017500000015347313101347451015776 0ustar martinmartin# Search.pm
# by John Heidemann
# Copyright (C) 1996 by USC/ISI
# $Id: Search.pm,v 2.565 2008-11-29 05:52:08 Martin Exp $
#
# A complete copyright notice appears at the end of this file.
=head1 NAME WWW::Search - Virtual base class for WWW searches =head1 SYNOPSIS use WWW::Search; my $sEngine = 'AltaVista'; my $oSearch = new WWW::Search($sEngine); =head1 DESCRIPTION This class is the parent for all access methods supported by the C library. This library implements a Perl API to web-based search engines. See README for a list of search engines currently supported, and for a lot of interesting high-level information about this distribution. Search results can be limited, and there is a pause between each request to avoid overloading either the client or the server. =head2 Sample program Here is a sample program: my $sQuery = 'Columbus Ohio sushi restaurant'; my $oSearch = new WWW::Search('AltaVista'); $oSearch->native_query(WWW::Search::escape_query($sQuery)); $oSearch->login($sUser, $sPassword); while (my $oResult = $oSearch->next_result()) { print $oResult->url, "\n"; } # while $oSearch->logout; Results are objects of type C (see L for details). Note that different backends support different result fields. All backends are required to support title and url. =head1 SEE ALSO For specific search engines, see L (replacing TheEngineName with a particular search engine). For details about the results of a search, see L. 
=head1 METHODS AND FUNCTIONS FOR SEARCHERS =over =cut ##################################################################### package WWW::Search; use strict qw( vars ); use warnings; use Carp (); use CGI; # use Data::Dumper; # for debugging only use Exporter; use File::Find; use File::Spec::Functions; use HTML::TreeBuilder; use HTTP::Cookies; use HTTP::Request; use HTTP::Response; use HTTP::Status; use LWP::MemberMixin; use LWP::RobotUA; use LWP::UserAgent; # use Net::Domain qw( hostfqdn ); use URI; use URI::Escape; # use User; # Internal states: use constant SEARCH_BEFORE => 1; use constant SEARCH_UNDERWAY => 2; use constant SEARCH_DONE => 3; use constant SEARCH_RETRIEVING => 4; use vars qw( @ISA @EXPORT @EXPORT_OK ); @EXPORT = qw(); @EXPORT_OK = qw( escape_query unescape_query generic_option strip_tags ); @ISA = qw(Exporter LWP::MemberMixin); our $MAINTAINER = 'Martin Thurn '; our $VERSION = do { my @r = (q$Revision: 2.565 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; =item new To create a new WWW::Search, call $oSearch = new WWW::Search('SearchEngineName'); where SearchEngineName is replaced with a particular search engine. For example: $oSearch = new WWW::Search('Yahoo'); If no search engine is specified, a default (currently 'Null::Empty') will be chosen for you. =cut sub new { my $class = shift; my $engine = shift; # Remaining arguments will become hash args # The default backend (not currently more configurable :-< ) my $default_engine = 'Null::Empty'; my $default_agent_name = "$class/$VERSION"; my $default_agent_email = ''; $engine = $default_engine if (!defined($engine)); # Load the engine, if necessary. 
my $subclass = "${class}::$engine"; my $sEval = "use $subclass;"; eval $sEval; Carp::croak("can not load backend $engine ($@)") if ($@); my $self = bless { engine => $engine, maximum_to_retrieve => 500, # both pages and hits interrequest_delay => 0.25, # in seconds agent_name => $default_agent_name, agent_email => $default_agent_email, env_proxy => 0, http_method => 'GET', http_proxy => undef, http_proxy_user => undef, http_proxy_pwd => undef, timeout => 60, _debug => 0, _parse_debug => 0, search_from_file => undef, search_to_file => undef, search_to_file_index => 0, @_, # variable initialization goes here }, $subclass; $self->reset_search(); return $self; } # new =item version Returns the value of the $VERSION variable of the backend engine, or $WWW::Search::VERSION if the backend does not contain $VERSION. =cut sub version { my $self = shift; my $iVersion = eval '$'.ref($self).'::VERSION'; # print STDERR " + iVersion = >>>$iVersion<<<\n"; $iVersion ||= $VERSION; return $iVersion; } # version =item maintainer Returns the value of the $MAINTAINER variable of the backend engine, or $WWW::Search::MAINTAINER if the backend does not contain $MAINTAINER. =cut sub maintainer { my $self = shift; my $sMaintainer = eval '$'.ref($self).'::MAINTAINER'; # print STDERR " + sMaintainer = >>>$sMaintainer<<<\n"; $sMaintainer ||= $MAINTAINER; return $sMaintainer; } # maintainer =item installed_engines Returns a list of the names of all installed backends. We can not tell if they are up-to-date or working, though. 
use WWW::Search; my @asEngines = sort &WWW::Search::installed_engines(); local $" = ', '; print (" + These WWW::Search backends are installed: @asEngines\n"); # Choose a backend at random (yes, this is rather silly): my $oSearch = WWW::Search->new($asEngines[rand(scalar(@asEngines))]); =cut use constant DEBUG_ARC => 0; use constant DEBUG_COOKIES => 0; use constant DEBUG_FIND => 0; use constant DEBUG_FUNC => 0; use constant DEBUG_RETR => 0; sub _wanted { # Code adapted from the following netnews post (Thank you, Tom!): # From: Tom Christiansen (tchrist@mox.perl.com) # Subject: SRC: pminst - find modules whose names match this pattern # Newsgroups: comp.lang.perl.misc # Date: 1999/02/15 my $startdir = shift; my $sFullPath = $File::Find::name; print STDERR " + _wanted($startdir, $sFullPath)\n" if DEBUG_FIND; if (-d && /^[a-z]/) { # This is so we don't go down site_perl etc too early (whatever # that means): $File::Find::prune = 1; DEBUG_FIND && print STDERR " + prune\n"; return; } # if unless ($sFullPath =~ s!\.pm\Z!!) 
{ DEBUG_FIND && print STDERR " + not .pm\n"; return; } # unless # Delete absolute path off front of file path: $sFullPath =~ s{^\Q$startdir\E[\\/]}{}; unless (1 || ($sFullPath =~ s!\AWWW/Search!!)) { print STDERR " + not WWW/Search\n" if DEBUG_FIND; return; } # unless print STDERR " + found $sFullPath\n" if DEBUG_FIND; $sFullPath =~ s{/}{::}g; $sFullPath =~ s!\A::!!; return $sFullPath; } # _wanted sub installed_engines { # Does NOT need a WWW::Search object to operate my %hsi; local $" = '|'; DEBUG_FIND && print STDERR " + installed_engines() start, INC is @INC...\n"; INC_DIR: foreach my $sDir (map catdir($_, 'WWW', 'Search'), @INC) { DEBUG_FIND && print STDERR " + foreach ==$sDir==\n"; next INC_DIR unless -d $sDir; File::Find::find(sub { $hsi{&_wanted($sDir) || 'JUNKJUNK'}++; }, $sDir); } # foreach INC_DIR delete $hsi{'JUNKJUNK'}; delete $hsi{'Test'}; delete $hsi{'Result'}; return keys %hsi; } # installed_engines =item native_query Specify a query (and optional options) to the current search object. Previous query (if any) and its cached results (if any) will be thrown away. The option values and the query must be escaped; call L to escape a string. The search process is not actually begun until C or C is called (lazy!), so native_query does not return anything. Example: $oSearch->native_query('search-engine-specific+escaped+query+string', { option1 => 'able', option2 => 'baker' } ); The hash of options following the query string is optional. The query string is backend-specific. There are two kinds of options: options specific to the backend, and generic options applicable to multiple backends. Generic options all begin with 'search_'. Currently a few are supported: =over 4 =item search_url Specifies the base URL for the search engine. =item search_debug Enables backend debugging. The default is 0 (no debugging). =item search_parse_debug Enables backend parser debugging. The default is 0 (no debugging). 
=item search_to_file FILE Causes the search results to be saved in a set of files prefixed by FILE. (Used internally by the test-suite, not intended for general use.) =item search_from_file FILE Reads a search from a set of files prefixed by FILE. (Used internally by the test-suite, not intended for general use.) =back Some backends may not implement these generic options, but any which do implement them must provide these semantics. Backend-specific options are described in the documentation for each backend. In most cases the options and their values are packed together to create the query portion of the final URL. Details about how the search string and option hash are interpreted might be found in the search-engine-specific manual pages (WWW::Search::SearchEngineName). =cut sub native_query { my $self = shift; print STDERR " FFF native_query($_[0])\n" if (DEBUG_FUNC || $self->{_debug}); $self->reset_search(); $self->{'native_query'} = $_[0]; $self->{'native_options'} = $_[1]; # promote generic options my $opts_ref = $_[1]; foreach my $sKey (keys %$opts_ref) { if (generic_option($sKey)) { print STDERR " + promoting $sKey to $self\n" if $self->{_debug}; $self->{$sKey} = $opts_ref->{$sKey}; # delete $opts_ref->{$sKey}; } # if } # foreach $self->setup_search() if ($self->{state} == SEARCH_BEFORE); } # native_query =item gui_query Specify a query to the current search object; the query will be performed with the engine's default options, as if it were typed by a user in a browser window. Same arguments as C above. Currently, this feature is supported by only a few backends; consult the documentation for each backend to see if it is implemented. =cut sub gui_query { # This function is a stub to prevent runtime errors. This function # should be defined in each backend as appropriate. See Yahoo.pm in # the WWW-Search-Yahoo distribution for an example of how to # implement it. 
my $self = shift; return $self->native_query(@_); } # gui_query =item cookie_jar Call this method (anytime before asking for results) if you want to communicate cookie data with the search engine. Takes one argument, either a filename or an HTTP::Cookies object. If you give a filename, WWW::Search will attempt to read/store cookies there (by passing the filename to HTTP::Cookies::new). $oSearch->cookie_jar('/tmp/my_cookies'); If you give an HTTP::Cookies object, it is up to you to save the cookies if/when you wish. use HTTP::Cookies; my $oJar = HTTP::Cookies->new(...); $oSearch->cookie_jar($oJar); If you pass in no arguments, the cookie jar (if any) is returned. my $oJar = $oSearch->cookie_jar; unless (ref $oJar) { print "No jar" }; =cut sub cookie_jar { my $self = shift; if (! @_) { return $self->{'_cookie_jar'}; } # if no arguments my $arg = shift; my $sRef = ref($arg); if ($sRef =~ m!\AHTTP::Cookies!) { print STDERR " + WWW::Search using caller's $sRef object\n" if DEBUG_COOKIES; $self->{'_cookie_jar'} = $arg; $self->{'_cookie_jar_we_save'} = 0; } # if elsif (! ref($arg)) { # Assume that $arg is a file name: print STDERR " + WWW::Search using Cookies from file $arg\n" if DEBUG_COOKIES; $self->{'_cookie_jar'} = HTTP::Cookies->new( 'file' => $arg, 'autosave' => 1, 'ignore_discard' => 1, ); $self->{'_cookie_jar'}->load; $self->{'_cookie_jar_we_save'} = 1; # print STDERR " + WWW::Search just loaded cookies from $arg\n"; } else { Carp::carp "argument to WWW::Search::cookie_jar() must be a scalar or a flavor of HTTP::Cookies"; } } # cookie_jar =item date_from Set/get the start date for limiting the query by a date range. See the documentation for each backend to find out if date ranges are supported. =item date_to Set/get the end date for limiting the query by a date range. See the documentation for each backend to find out if date ranges are supported. 
=cut sub date_from { return shift->_elem('date_from', @_) || ''; } # date_from sub date_to { return shift->_elem('date_to', @_) || ''; } # date_to =item env_proxy Enable loading proxy settings from environment variables. The proxy URL will be read from $ENV{http_proxy}. The username for authentication will be read from $ENV{http_proxy_user}. The password for authentication will be read from $ENV{http_proxy_pwd}. If you don't want to put passwords in the environment, one solution would be to subclass LWP::UserAgent and use $ENV{WWW_SEARCH_USERAGENT} instead (see user_agent below). env_proxy() must be called before the first retrieval is attempted. Example: $ENV{http_proxy } = 'http://my.proxy.com:80'; $ENV{http_proxy_user} = 'bugsbun'; $ENV{http_proxy_pwd } = 'c4rr0t5'; $oSearch->env_proxy('yes'); # Turn on with any true value ... $oSearch->env_proxy(0); # Turn off with zero ... if ($oSearch->env_proxy) # Test =cut # contributed by Klaus Johannes Rusch sub env_proxy { return shift->_elem('env_proxy', @_); } # env_proxy =item http_proxy Set up an HTTP proxy (for connections from behind a firewall). Takes the same arguments as LWP::UserAgent::proxy(). This routine should be called before calling any of the result functions (any method with "result" in its name). Example: # Turn on and set address: $oSearch->http_proxy(['http','ftp'] => 'http://proxy:8080'); # Turn off: $oSearch->http_proxy(''); =cut sub http_proxy { my $self = shift; # Make a copy of our arguments: if (@_) { my @a = @_; return $self->_elem('http_proxy', \@a); } else { return $self->_elem('http_proxy'); } } # http_proxy =item http_proxy_user, http_proxy_pwd Set/get HTTP proxy authentication data. These routines set/get username and password used in proxy authentication. Authentication is attempted only if all three items (proxy URL, username and password) have been set. 
Example: $oSearch->http_proxy_user("myuser"); $oSearch->http_proxy_pwd("mypassword"); $oSearch->http_proxy_user(undef); # Example for no authentication $username = $oSearch->http_proxy_user(); =cut sub http_proxy_user { return shift->_elem('http_proxy_user', @_); } sub http_proxy_pwd { return shift->_elem('http_proxy_pwd', @_); } =item maximum_to_retrieve Set the maximum number of hits to return. Queries resulting in more than this many hits will return the first hits, up to this limit. Although this specifies a maximum limit, search engines might return less than this number. Defaults to 500. Example: $max = $oSearch->maximum_to_retrieve(100); You can also spell this method "maximum_to_return". =cut sub maximum_to_retrieve { return shift->_elem('maximum_to_retrieve', @_); } =item maximum_to_return Synonym for maximum_to_retrieve =cut sub maximum_to_return { return shift->_elem('maximum_to_retrieve', @_); } =item timeout The maximum length of time any portion of the query should take, in seconds. Defaults to 60. Example: $oSearch->timeout(120); =cut sub timeout { return shift->_elem('timeout', @_); } =item login Backends which need to login to the search engine should implement this function. Takes two arguments, user and password. Return nonzero if login was successful. Return undef or 0 if login failed. =cut sub login { my $self = shift; # Here is just a stub. return 1; # These are the arguments: my ($sUser, $sPassword) = @_; } # login =item logout Backends which need to logout from the search engine should implement this function. =cut sub logout { my $self = shift; # no other args # Here is just a stub. } # logout =item approximate_result_count Some backends indicate how many results they have found. Typically this is an approximate value. =cut sub approximate_result_count { my $self = shift; # Optional arg1 = new value for this option. 
my $iArg = shift; $iArg = undef if (defined($iArg) && ($iArg eq '')); DEBUG_FUNC && print STDERR " FFF a_r_c(state=$self->{state},iArg=$iArg)\n"; DEBUG_ARC && print STDERR " + a_r_c(state=$self->{state},iArg=$iArg)\n"; if (defined($iArg) && (0 <= $iArg)) { # Caller is trying to SET the value: DEBUG_ARC && print STDERR " + a_r_cSET(state=$self->{state},iArg=$iArg)\n"; $self->{'approx_count'} = $iArg; return $iArg; } # if if ( # This prevents infinite recursion, for when retrieve_some() # calls this function in order to SET the value: ($self->{state} ne SEARCH_RETRIEVING) && # This prevents useless repeat calls to retrieve_some() after # the search has been completed: ($self->{state} ne SEARCH_DONE)) { DEBUG_ARC && print STDERR " DDD a_r_c prime the pump\n"; # Prime the pump, if necessary: $self->retrieve_some(); } # if $iArg = $self->{'approx_count'} || 0; # print STDERR " + a_r_cGET(state=$self->{state},answer=$iArg)\n"; return $iArg; } # approximate_result_count =item approximate_hit_count This is an alias for approximate_result_count(). =cut sub approximate_hit_count { shift->approximate_result_count(@_); } # approximate_hit_count =item results Return all the results of a query as an array of WWW::SearchResult objects. Note: This might take a while, because a web backend will keep asking the search engine for "next page of results" over and over until there are no more next pages, and THEN return from this function. If an error occurs at any time during query processing, it will be indicated in the response(). Example: @results = $oSearch->results(); # Go have a cup of coffee while the previous line executes... 
foreach $oResult (@results) { print $oResult->url(), "\n"; } # foreach =cut sub results { my $self = shift; print STDERR " FFF results(",$self->{'native_query'},")\n" if (DEBUG_FUNC || $self->{_debug}); # Put all the search results into the cache: 1 while ($self->retrieve_some()); $self->{cache} ||= []; my $iMax = scalar(@{$self->{cache}}); # print STDERR " + mtr is ", $self->{maximum_to_retrieve}, "\n" if $self->{_debug}; # print STDERR " + cache contains $iMax results\n" if $self->{_debug}; $iMax = $self->{maximum_to_retrieve} if ($self->{maximum_to_retrieve} < $iMax); # print STDERR " + returning $iMax results\n" if $self->{_debug}; return @{$self->{cache}}[0..$iMax-1]; } # results =item next_result Call this method repeatedly to return each result of a query as a WWW::SearchResult object. Example: while ($oResult = $oSearch->next_result()) { print $oResult->url(), "\n"; } # while When there are no more results, or if an error occurs, next_result() will return undef. If an error occurs at any time during query processing, it will be indicated in the response(). =cut sub next_result { my $self = shift; # Carp::croak "search not yet specified" if (!defined($self->{'native_query'})); return undef if ($self->{next_to_return} >= $self->{maximum_to_retrieve}); while (1) { if ($self->{next_to_return} <= $#{$self->{cache}}) { # The cache already contains the desired element; return it: my $i = ($self->{next_to_return})++; return $self->{cache}->[$i]; } # if # If we get here, then the desired element is beyond the end of # the cache. if ($self->{state} == SEARCH_DONE) { # There are no more results to be gotten; fail & bail: return undef; } # if # Get some more results into the cache: $self->retrieve_some(); # Go back and try again: } # while infinite } # next_result =item seek_result($offset) Set which result should be returned next time C is called. Results are zero-indexed. The only guaranteed valid offset is 0, which will replay the results from the beginning. 
In particular, seeking past the end of the current cached results probably will not do what you might think it should. Results are cached, so this does not re-issue the query or cause IO (unless you go off the end of the results). To re-do the query, create a new search object. Example: $oSearch->seek_result(0); =cut sub seek_result { my ($self, $desired) = @_; my $old = $self->{next_to_return}; $self->{next_to_return} = $desired if (defined($desired) and (0 <= $desired)); return $old; } # seek_result =item response Returns an L object which resulted from the most-recently-sent query. Errors can be detected like this: if (! $oSearch->response->is_success) { print STDERR "Error: " . $oSearch->response->as_string() . "\n"; } # if Note to backend authors: even if the backend does not involve the web, it should return an L object. =cut sub response { my $self = shift; $self->{response} ||= new HTTP::Response(RC_OK); return $self->{response}; } # response =item submit This method can be used to submit URLs to the search engines for indexing. Consult the documentation for each backend to find out if it is implemented there, and if so what the arguments are. Returns an HTTP::Response object describing the result of the submission request. Consult the documentation for each backend to find out the meaning of the response. =cut sub submit { return new HTTP::Response(788, 'Sorry, this backend does not support the submit() method.'); } # submit =item opaque This function provides an application a place to store one opaque data element (or many, via a Perl reference). This facility is useful to (for example), maintain client-specific information in each active query when you have multiple concurrent queries. =cut sub opaque { return shift->_elem('opaque', @_); } =item escape_query Escape a query. Before queries are sent to the internet, special characters must be escaped so that a proper URL can be formed. 
This is like escaping a URL, but all non-alphanumeric characters are escaped and and spaces are converted to "+"s. Example: $escaped = WWW::Search::escape_query('+hi +mom'); # $escaped is now '%2Bhi+%2Bmom' See also C. NOTE that this is not a method, it is a plain function. =cut sub escape_query { my $text = join(' ', @_); $text ||= ''; # print STDERR " + escape_query($text)\n"; $text =~ s/([^ A-Za-z0-9])/$URI::Escape::escapes{$1}/g; #" # print STDERR " + escape_query($text)\n"; $text =~ s/ /+/g; # print STDERR " + escape_query($text)\n"; return $text; } # escape_query =item unescape_query Unescape a query. See C for details. Example: $unescaped = WWW::Search::unescape_query('%22hi+mom%22'); # $unescaped eq q{"hi mom"} NOTE that this is not a method, it is a plain function. =cut sub unescape_query { # code stolen from URI::Escape.pm. my @copy = @_; for (@copy) { s!\+! !g; s!\%([\dA-Fa-f]{2})!chr(hex($1))!eg; } # for return wantarray ? @copy : $copy[0]; } # unescape_query =item strip_tags Given a string, returns a copy of that string with HTML tags removed. This should be used by each backend as they insert the title and description values into the search results objects. NOTE that this is not a method, it is a plain function. =cut sub strip_tags { # Prevent undef warnings if we get passed any undefined values: my @args = @_; @args = map { $_ ||= '' } @args; my $s = join('', @args); # Special case: change BR to space: $s =~ s!
! !gi; # We assume for now that we will not be encountering tags with # embedded '>' characters! $s =~ s/\074.+?\076//g; $s =~ s/ / /g; $s =~ s/"/\042/g; $s =~ s/&/\046/g; $s =~ s/</\074/g; $s =~ s/>/\076/g; # Coalesce multiple spaces: $s =~ tr!\040\t\r\n! !s; # Delete leading & trailing spaces: $s =~ s!\A\s+!!; $s =~ s!\s+\Z!!; return $s; } # strip_tags =item is_http_proxy Returns true if proxy information is available. =cut sub is_http_proxy { my $self = shift; my $ra = $self->http_proxy; my $ret = ( ('ARRAY' eq ref($ra)) && defined($ra->[0]) && ($ra->[0] ne '') ); # print STDERR " DDD is_http_proxy() return =$ret=\n"; return $ret; } # is_http_proxy =back =head1 METHODS AND FUNCTIONS FOR BACKEND PROGRAMMERS =over =item reset_search Resets internal data structures to start over with a new search (on the same engine). =cut sub reset_search { my $self = shift; print STDERR " FFF reset_search(",$self->{'native_query'},")\n" if (DEBUG_FUNC || $self->{_debug}); $self->{'cache'} = []; $self->{'native_query'} = ''; $self->{'next_to_retrieve'} = 1; $self->{'next_to_return'} = 0; $self->{'number_retrieved'} = 0; $self->{'requests_made'} = 0; $self->{'state'} = SEARCH_BEFORE; $self->{'_next_url'} = ''; $self->{'approx_count'} = 0; # This method is called by native_query(). native_query() is called # either by gui_query() or by the user. In the case that # gui_query() was called, we do NOT want to clear out the _options # hash. For now, I implement a pretty ugly hack to make this work: if (caller(2)) { my @as = caller(2); if (1 < scalar(@as)) { # print STDERR " in reset_search(), as is (", join(',', @as), ")\n"; return if $as[3] =~ m/gui_query/; } # if } # if $self->{_options} = (); } # reset_search =item is_http_proxy_auth_data Returns true if all authentication data (proxy URL, username, and password) are available. 
=cut

sub is_http_proxy_auth_data
  {
  my $self = shift;
  # print STDERR (" DDD http_proxy is ", Dumper(\$self));
  # All three pieces (proxy URL via is_http_proxy, user, password)
  # must be present and non-empty:
  my $ret = (
             $self->is_http_proxy
             &&
             defined($self->http_proxy_user)
             &&
             ($self->http_proxy_user ne '')
             &&
             defined($self->http_proxy_pwd)
             &&
             ($self->http_proxy_pwd ne '')
            );
  # print STDERR " DDD   is_http_proxy_auth_data() return =$ret=\n";
  return $ret;
  } # is_http_proxy_auth_data

=item agent_name($sName)

If your search engine rejects certain browser,
you can trick it into thinking you're any browser type you want.
See below under user_agent().

=cut

sub agent_name { return shift->_elem('agent_name', @_); }

=item agent_email($sName)

=cut

sub agent_email { return shift->_elem('agent_email', @_); }

=item user_agent($NON_ROBOT)

This internal routine creates a user-agent for derived classes that
query the web.  If any non-false argument is given, a normal
LWP::UserAgent (rather than a LWP::RobotUA) is used.

Returns the user-agent object.

If a backend needs the low-level LWP::UserAgent or LWP::RobotUA to
have a particular name, $oSearch->agent_name() and possibly
$oSearch->agent_email() should be called to set the desired values
*before* calling $oSearch->user_agent().

If the environment variable WWW_SEARCH_USERAGENT has a value, it will
be used as the class for a new user agent object.  This class should
be a subclass of LWP::UserAgent.  For example,

  $ENV{WWW_SEARCH_USERAGENT} = 'My::Own::UserAgent';
  # If this env.var. has no value,
  # LWP::UserAgent or LWP::RobotUA will be used.
  $oSearch = new WWW::Search('MyBackend');
  $oSearch->agent_name('MySpider');
  if ($iBackendWebsiteRequiresNonRobot)
    {
    $oSearch->user_agent('non-robot');
    }
  else
    {
    $oSearch->agent_email('me@here.com');
    $oSearch->user_agent();
    }

Backends should use robot-style user-agents whenever possible.

=cut

# Try to load and instantiate the user-agent class named in
# $ENV{WWW_SEARCH_USERAGENT}.  Returns the new object on success;
# returns nothing (after warning) on any failure.
sub _load_env_useragent
  {
  my $self = shift;
  my $sUA = $ENV{'WWW_SEARCH_USERAGENT'} || '';
  my $ua;
  if ($sUA ne '')
    {
    # String-eval because the class name is only known at run-time:
    eval "use $sUA";
    # print STDERR " DDD   this is after eval use $sUA\n";
    if (! $@)
      {
      # Successfully loaded module.
      eval { $ua = $sUA->new };
      # print STDERR " DDD   this is after eval new $sUA\n";
      if (ref($ua) && ! $@)
        {
        # Successfully created object.
        return $ua;
        }
      else
        {
        warn " --- WWW::Search::user_agent can not create $sUA object: $@\n";
        # Fall through and try the other methods:
        }
      }
    else
      {
      warn " --- WWW::Search::user_agent can not load $sUA: $@\n";
      # Fall through and try the other methods:
      }
    } # if found WWW_SEARCH_USERAGENT in environment
  } # _load_env_useragent

sub user_agent
  {
  my $self = shift;
  unless (@_)
    {
    # If NO ARGS, return the previously-created agent (if any):
    return $self->{'user_agent'} if ref($self->{'user_agent'});
    } # unless
  my $non_robot = shift || 0;
  # NOTE(review): called as a plain function, so no $self is passed;
  # harmless because _load_env_useragent only shifts (and ignores) it.
  my $ua = _load_env_useragent();
  # If we loaded a UserAgent, don't do any of this stuff:
  if (! ref $ua)
    {
    if ($non_robot)
      {
      eval
        {
        $ua = new LWP::UserAgent;
        $ua->agent($self->agent_name);
        $ua->from($self->agent_email);
        }; # end of eval block
      }
    else
      {
      $ua = LWP::RobotUA->new($self->agent_name, $self->agent_email);
      $ua->delay($self->{'interrequest_delay'});
      }
    $ua->timeout($self->{'timeout'});
    # Proxy setup may die on bogus http_proxy data; ignore errors:
    eval { $ua->proxy(@{$self->{'http_proxy'}}) } if $self->is_http_proxy;
    if ($self->env_proxy)
      {
      $ua->env_proxy($self->env_proxy);
      # Read password from ENV:
      $self->http_proxy_user($ENV{http_proxy_user});
      $self->http_proxy_pwd ($ENV{http_proxy_pwd});
      } # if
    } # if ! ref $ua
  # Cache the agent for subsequent no-arg calls:
  $self->{'user_agent'} = $ua;
  return $ua;
  } # user_agent

=item http_referer

Get / set the value of the HTTP_REFERER variable for this search
object.  Some search engines might only accept requests that
originated at some specific previous page.  This method lets backend
authors "fake" the previous page.  Call this method before calling
http_request.

  $oSearch->http_referer('http://prev.engine.com/wherever/setup.html');
  $oResponse = $oSearch->http_request('GET', $url);

=cut

sub http_referer { return shift->_elem('_http_referer', @_); }

=item http_method

Get / set the method to be used for the HTTP request.
Must be either 'GET' or 'POST'.
Call this method before calling http_request.
(Normally you would set this during _native_setup_search().)
The default is 'GET'.

  $oSearch->http_method('POST');

=cut

sub http_method { shift->_elem('http_method', @_); }

=item http_request($method, $url)

Submit the HTTP request to the world, and return the response.
Similar to LWP::UserAgent::request.
Handles cookies, follows redirects, etc.
Requires that http_referer already be set up, if needed.

=cut

sub http_request
  {
  my $self = shift;
  my ($method, $url) = @_;
  my $response;
  if (50 <= $self->{_debug})
    {
    eval q{ use LWP::Debug qw(+) }; # } # emacs bug
    } # if
  if ($self->{search_from_file})
    {
    # Replay a previously-saved response instead of going online:
    $response = $self->_http_request_from_file($url);
    }
  else
    {
    # fetch it
    my $request;
    if ($method eq 'POST')
      {
      # For POST, strip the query string off the URL and send it as
      # the form-encoded request body instead:
      my $uri_url = URI->new($url);
      my $equery = $uri_url->query;
      $uri_url->query(undef);   # we will handle the query ourselves
      $request = new HTTP::Request($method, $uri_url);
      $request->header('Content-Type', 'application/x-www-form-urlencoded');
      $request->header('Content-Length', length $equery);
      $request->content($equery);
      }
    else
      {
      $request = new HTTP::Request($method, $url);
      }
    $request->header('Accept-Charset' => 'iso-8859-1,*,utf-8');
    if ($self->is_http_proxy_auth_data)
      {
      $request->proxy_authorization_basic($self->http_proxy_user,
                                          $self->http_proxy_pwd);
      } # if
    $self->{'_cookie_jar'}->add_cookie_header($request) if ref($self->{'_cookie_jar'});
    if ($self->{'_http_referer'} && ($self->{'_http_referer'} ne ''))
      {
      my $s = uri_escape($self->{'_http_referer'});
      # print STDERR " +   referer($s), ref(s) = ", ref($s), "\n";
      $s = $s->as_string if ref($s) =~ m!URI!;
      $request->referer($s);
      } # if referer
    print STDERR " DDD raw HTTP::Request is:\n", $request->as_string if (3 <= $self->{_debug});
    my $ua = $self->user_agent();
 TRY_GET:
    while (1)
      {
      $response = $ua->request($request);
      printf(STDERR " +   got HTTP::Response (code=%d):\n%s",
             $response->code,
             $response->headers->as_string) if (3 <= $self->{_debug});
      if (ref($self->{'_cookie_jar'}))
        {
        $self->{'_cookie_jar'}->extract_cookies($response);
        $self->{'_cookie_jar'}->save if $self->{'_cookie_jar_we_save'};
        print STDERR " + WWW::Search just extracted cookies\n" if DEBUG_COOKIES;
        print STDERR $self->{'_cookie_jar'}->as_string if DEBUG_COOKIES;
        # print STDERR Dumper($self->{'_cookie_jar'}) if DEBUG_COOKIES;
        } # if
      if ($self->{'search_to_file'} && $response->is_success)
        {
        $self->_http_request_to_file($url, $response);
        } # if
      last TRY_GET if ($response->is_success);
      last TRY_GET if ($response->is_error);
      last TRY_GET if ($response->headers->header('Client-Warning') =~ m!redirect loop detected!i);
      if ($response->is_redirect
          ||
          # Some engines spoof us with a false 302 code, so look at
          # the message rather than the code:
          ($response->message =~ m!Object moved!i)
         )
        {
        my $sURL = $response->request->uri->as_string;
        my $sURLredir = $response->headers->header('Location');
        # Low-level loop detection:
        last TRY_GET if ($sURLredir eq $sURL);
        print STDERR " +   'Object moved' from $sURL to $sURLredir\n" if (2 <= $self->{_debug});
        # Follow the redirect:
        $request = new HTTP::Request('GET',
                                     URI->new_abs($sURLredir, $sURL),
                                    );
        $request->referer($sURL);
        $self->{'_cookie_jar'}->add_cookie_header($request) if ref($self->{'_cookie_jar'});
        print STDERR " +   'Object moved', new HTTP::Request is:\n", $request->as_string if (3 <= $self->{_debug});
        # Go back and try again
        } # if
      } # while infinite
    } # if not from_file
  return $response;
  } # http_request

# Compute (and cache in search_filename) the local filename used by
# the saved-search modes (search_from_file / search_to_file).
sub _http_request_get_filename
  {
  my $self = shift;
  my $fn;
  # filename?
  if (!defined($self->{search_filename}))
    {
    $fn = $self->{search_from_file};
    $fn = $self->{search_to_file} if (!defined($fn));
    $self->{search_filename} = WWW::Search::unescape_query($fn);
    }
  $fn = $self->{search_filename};
  die "$0: bogus filename.\n" if (!defined($fn));
  return $fn;
  } # _http_request_get_filename

# Replay a previously-saved HTTP response for $url from local files.
# Returns an HTTP::Response (RC_NOT_FOUND if the URL was never saved).
sub _http_request_from_file
  {
  my $self = shift;
  my ($url) = @_;
  my $fn = $self->_http_request_get_filename();
  # read index?
  if (!defined($self->{search_from_file_hash}))
    {
    open(TABLE, "<$fn") || die "$0: open $fn failed.\n";
    my $i = 0;
    # BUGFIX: the readline filehandle was stripped from this copy of
    # the source ("while ()" is not valid Perl); read the index file
    # line-by-line from the TABLE handle opened just above:
    while (<TABLE>)
      {
      chomp;
      # Map each saved URL to its ordinal (the data-file suffix):
      $self->{search_from_file_hash}{$_} = $i;
      # print STDERR "$0: file index: $i <$_>\n";
      $i++;
      };
    close TABLE;
    }; # read file
  my $i = $self->{search_from_file_hash}{$url};
  if (defined($i))
    {
    # print STDERR "$0: saved request <$url> found in $fn.$i\n";
    # read the data
    open(FILE, "<$fn.$i") || die "$0: open $fn.$i\n";
    my $d = '';
    # BUGFIX: restore the stripped FILE handle here as well:
    while (<FILE>)
      {
      $d .= $_;
      };
    close FILE;
    # make up the response
    my $r = new HTTP::Response(RC_OK);
    $r->content($d);
    return $r;
    }
  else
    {
    print STDERR "$0: saved request <$url> not found.\n";
    my $r = new HTTP::Response(RC_NOT_FOUND);
    return $r;
    };
  } # _http_request_from_file

# Append $url to the index file and save the response content to the
# next numbered data file (see search_to_file).
sub _http_request_to_file
  {
  my $self = shift;
  # The LAST arg is a LWP::Response object:
  my $response = pop;
  # The only other arg we care about is the FIRST arg, a url:
  my ($url, ) = @_;
  my $fn = $self->_http_request_get_filename();
  # Start a fresh index on the first save of this session:
  unlink($fn) if ($self->{search_to_file_index} == 0);
  open(TABLE, ">>$fn") || die "$0: open $fn\n";
  print TABLE "$url\n";
  close TABLE;
  my $i = ($self->{search_to_file_index})++;
  open (FILE, ">$fn.$i") || die "$0: open $fn.$i\n";
  print FILE $response->content();
  close FILE;
  } # _http_request_to_file

=item next_url

Get or set the URL for the next backend request.  This can be used to
save the WWW::Search state between sessions (e.g. if you are showing
pages of results to the user in a web browser).  Before closing down a
session, save the value of next_url:

  ...
  $oSearch->maximum_to_return(10);
  while ($oSearch->next_result) { ... }
  my $urlSave = $oSearch->next_url;

Then, when you start up the next session (e.g. after the user clicks
your "next" button), restore this value before calling for the
results:

  $oSearch->native_query(...);
  $oSearch->next_url($urlSave);
  $oSearch->maximum_to_return(20);
  while ($oSearch->next_result) { ...
  }

WARNING: It is entirely up to you to keep your interface in sync with
the number of hits per page being returned from the backend.  And, we
make no guarantees whether this method will work for any given
backend.  (Their caching scheme might not enable you to jump into the
middle of a list of search results, for example.)

=cut

sub next_url { return shift->_elem('_next_url', @_); }

=item split_lines

This internal routine splits data (typically the result of the web
page retrieval) into lines in a way that is OS independent.  If the
first argument is a reference to an array, that array is taken to be a
list of possible delimiters for this split.  For example, Yahoo.pm
uses HTML tags such as E<lt>pE<gt> and E<lt>brE<gt> as "line"
delimiters for convenience.

=cut

sub split_lines
  {
  # This probably fails on an EBCDIC box where input is in text mode.
  # Too bad Macs do not just use binmode like Windows boxen.
  my $self = shift;
  my $s = shift;
  # Default delimiter: a network line ending (optional CR, then LF):
  my $patt = '\015?\012';
  if (ref($s))
    {
    # First arg was an arrayref of extra delimiter patterns; fold them
    # into one alternation, then shift again for the actual data:
    $patt = '('. $patt;
    foreach (@$s)
      {
      $patt .= "|$_";
      } # foreach
    $patt .= ')';
    # print STDERR " +   patt is >>>$patt<<<\n";
    $s = shift;
    } # if
  return split(/$patt/i, $s);
  # If we require perl 5.005, this can be done by:
  # use Socket qw(:crlf :DEFAULT);
  # split(/$CR?$LF/,$_[0])
  } # split_lines

=item generic_option

This internal routine checks if an option is generic or backend
specific.  Currently all generic options begin with 'search_'.  This
routine is not a method.

=cut

sub generic_option
  {
  my $option = shift || '';
  return ($option =~ /^search_/);
  } # generic_option

=item _native_setup_search

Do some backend-specific initialization.  It will be called with the
same arguments as native_query().

=cut

sub _native_setup_search
  {
  my $self = shift;
  print STDERR " FFF _n_s_s\n" if (DEBUG_FUNC || $self->{_debug});
  # Backward-compatibility for backends that define the old
  # native_setup_search(), but not the new _native_setup_search()
  if ($self->can('native_setup_search'))
    {
    return $self->native_setup_search(@_);
    } # if
  } # _native_setup_search

=item setup_search

This internal routine does generic Search setup.  It calls
C<_native_setup_search()> to do backend-specific setup.

=cut

sub setup_search
  {
  my ($self) = @_;
  print STDERR " FFF setup_search(",$self->{'native_query'},")\n" if (DEBUG_FUNC || $self->{_debug});
  $self->{cache} = [];
  $self->{next_to_retrieve} = 1;
  $self->{number_retrieved} = 0;
  $self->{state} = SEARCH_UNDERWAY;
  # $self->{_options} = ();
  $self->_native_setup_search($self->{'native_query'}, $self->{'native_options'});
  } # setup_search

=item need_to_delay

A backend should override this method in order to dictate whether
user_agent_delay() needs to be called before the next HTTP request is
sent.
Return any perlish true or zero value.

=cut

sub need_to_delay
  {
  my $self = shift;
  # This is a NOP stub.  Unless the subclass overrides this method,
  # there is no reason to delay.
  return 0;
  } # need_to_delay

=item user_agent_delay

According to what need_to_delay() returns, user_agent_delay() will be
called between requests to remote servers to avoid overloading them
with many back-to-back requests.

=cut

sub user_agent_delay
  {
  my $self = shift;
  # Sleep for some number of seconds (4-arg select sleeps for a
  # possibly-fractional number of seconds, unlike sleep()):
  select(undef, undef, undef, $self->{interrequest_delay});
  } # user_agent_delay

=item absurl

An internal routine to convert a relative URL into a absolute URL.  It
takes two arguments, the 'base' url (usually the search engine CGI
URL) and the URL to be converted.  Returns a URI object.

=cut

sub absurl
  {
  my ($self, $base, $url) = @_;
  $base ||= '';
  $url ||= '';
  # print STDERR " +   this is WWW::Search::absurl($base,$url)\n" if 1 < $self->{_debug};
  # Fall back to the URL of the most recently fetched page:
  $base = $self->{_prev_url} if ($base eq '');
  return '' unless (($url ne '') && ($base ne ''));
  my $link = URI->new_abs($url, $base);
  return $link;
  } # absurl

=item retrieve_some

An internal routine to interface with C<_native_retrieve_some()>.
Checks for overflow.

=cut

sub retrieve_some
  {
  my $self = shift;
  print STDERR " FFF retrieve_some(", $self->{'native_query'}, ")\n" if (DEBUG_FUNC || $self->{_debug});
  return undef if ($self->{state} == SEARCH_DONE);
  # First time through (or after reset_search), do the generic and
  # backend-specific setup:
  $self->setup_search() if ($self->{state} == SEARCH_BEFORE);
  $self->{state} = SEARCH_RETRIEVING;
  if (! $self->{'_allow_empty_query'})
    {
    if (! defined($self->{'native_query'}))
      {
      $self->{response} = new HTTP::Response(500, "query string is not defined");
      $self->{state} = SEARCH_DONE;
      return;
      } # if
    if ($self->{'native_query'} eq '')
      {
      $self->{response} = new HTTP::Response(500, "query string is empty");
      $self->{state} = SEARCH_DONE;
      return;
      } # if
    } # if
  # Got enough already?
  if ($self->{number_retrieved} >= $self->{'maximum_to_retrieve'})
    {
    print STDERR " DDD retrieve_some() got enough already\n" if (DEBUG_RETR || $self->{_debug});
    $self->{state} = SEARCH_DONE;
    return;
    } # if
  # Spinning our wheels?
  if ($self->{requests_made} > $self->{'maximum_to_retrieve'})
    {
    print STDERR " DDD retrieve_some() too many requests\n" if (DEBUG_RETR || $self->{_debug});
    $self->{state} = SEARCH_DONE;
    return;
    } # if
  # Need more results:
  my $res = $self->_native_retrieve_some() || 0;
  print STDERR " +   _native_retrieve_some() returned $res\n" if (DEBUG_RETR || $self->{_debug});
  $self->{requests_made}++;
  $self->{number_retrieved} += $res;
  # A page with zero results means the search is finished:
  $self->{state} = SEARCH_DONE if ($res == 0);
  return $res;
  } # retrieve_some

# Monkey-patch: clear out a TreeBuilder so it can be reused for the
# next page of results.
sub HTML::TreeBuilder::www_search_reset
  {
  # If a reset() method becomes available in HTML::TreeBuilder, we
  # won't need this any more.
  my $self = shift;
  $self->delete;
  # These 4 lines copied from HTML::TreeBuilder::new
  $self->{'_head'} = $self->insert_element('head',1);
  $self->{'_pos'} = undef; # pull it back up
  $self->{'_body'} = $self->insert_element('body',1);
  $self->{'_pos'} = undef; # pull it back up again
  } # HTML::TreeBuilder::www_search_reset

=item _native_retrieve_some

Fetch the next page of results from the web engine, parse the results,
and prepare for the next page of results.

If a backend defines this method, it is in total control of the WWW
fetch, parsing, and preparing for the next page of results.  See the
WWW::Search::AltaVista module for example usage of the
_native_retrieve_some method.

An easier way to achieve this in a backend is to inherit
_native_retrieve_some from WWW::Search, and do only the HTML parsing.
Simply define a method _parse_tree which takes one argument, an
HTML::TreeBuilder object, and returns an integer, the number of
results found on this page.  See the WWW::Search::Yahoo module for
example usage of the _parse_tree method.
A backend should, in general, define either _parse_tree() or
_native_retrieve_some(), but not both.

Additional features of the default _native_retrieve_some method:

Sets $self->{_prev_url} to the URL of the page just retrieved.

Calls $self->preprocess_results_page() on the raw HTML of the page.

Then, parses the page with an HTML::TreeBuilder object and passes that
populated object to $self->_parse_tree().

Additional notes on using the _parse_tree method:

The built-in HTML::TreeBuilder object used to parse the page has
store_comments turned ON.  If a backend needs to use a subclassed or
modified HTML::TreeBuilder object, the backend should set
$self->{'_treebuilder'} to that object before any results are
retrieved.  The best place to do this is at the end of
_native_setup_search.

  my $oTree = new myTreeBuilder;
  $oTree->store_pis(1);  # for example
  $self->{'_treebuilder'} = $oTree;

When _parse_tree() is called, the $self->next_url is cleared.  During
parsing, the backend should set $self->next_url to the appropriate URL
for the next page of results.  (If _parse_tree() does not set the
value, the search will end after parsing this page of results.)

When _parse_tree() is called, the URL for the page being parsed can be
found in $self->{_prev_url}.

=cut

sub _parse_tree
  {
  my $self = shift;
  # This is a NOP stub.  Backend MUST define their own parse function!
  print STDERR " FFF stub _parse_tree\n" if (DEBUG_FUNC || $self->{_debug});
  # This is for backward-compatibility, for backends that define the
  # old parse_tree(), but not the new _parse_tree():
  return $self->parse_tree(@_) if $self->can('parse_tree');
  return 0;
  } # _parse_tree

sub _native_retrieve_some
  {
  my $self = shift;
  if ($self->can('native_retrieve_some'))
    {
    # This is for backward-compatibility, for backends that define the
    # old native_retrieve_some(), but not the new
    # _native_retrieve_some():
    return $self->native_retrieve_some(@_);
    } # if
  print STDERR " FFF _n_r_s\n" if (DEBUG_FUNC || $self->{_debug});
  # Fast exit if already done:
  return if (!defined($self->{_next_url}));
  return if ($self->{_next_url} eq q{});
  # If this is not the first page of results, sleep so as to not
  # overload the server:
  $self->{_next_to_retrieve} ||= 1;
  $self->user_agent_delay if (
                              (1 < $self->{_next_to_retrieve})
                              ||
                              $self->need_to_delay
                             );
  # Get one page of results:
  print STDERR " +   submitting URL (", $self->{'_next_url'}, ")\n" if $self->{_debug};
  my $response = $self->http_request($self->http_method, $self->{'_next_url'});
  print STDERR " +   got response\n", $response->headers->as_string, "\n" if 2 <= $self->{_debug};
  $self->{_prev_url} = $self->{_next_url};
  # Assume there are no more results, unless we find out otherwise
  # when we parse the html:
  $self->{_next_url} = undef;
  $self->{response} = $response;
  # print STDERR " --- HTTP response is:\n", $response->as_string if 4 < $self->{_debug};
  if (! $response->is_success)
    {
    if ($self->{_debug})
      {
      print STDERR " --- HTTP request failed, response is:\n", $response->as_string;
      } # if
    return;
    } # if
  # Pre-process the output:
  my $sPage = $self->preprocess_results_page($response->content);
  # Parse the output:
  my $tree;
  if (ref $self->{'_treebuilder'})
    {
    # print STDERR " +   using existing _treebuilder\n" if 1 < $self->{_debug};
    # Assume that the backend has installed their own TreeBuilder
    $tree = $self->{'_treebuilder'};
    }
  else
    {
    # print STDERR " +   creating new _treebuilder\n" if 1 < $self->{_debug};
    $tree = HTML::TreeBuilder->new(
                                   # use all default options
                                  );
    $tree->store_comments('yes');
    # Cache the TreeBuilder so later pages reuse it:
    $self->{'_treebuilder'} = $tree;
    }
  # If a reset() method becomes available in HTML::TreeBuilder, we can
  # change this:
  $tree->www_search_reset;
  # print STDERR " +   parsing content, tree is ", Dumper(\$tree) if 1 < $self->{_debug};
  # use Encode;
  # my $sPageOctets = Encode::encode_utf8($sPage);
  $tree->utf8_mode('true');
  $tree->parse($sPage);
  # print STDERR " +   done parsing content.\n" if 1 < $self->{_debug};
  $tree->eof();
  print STDERR " +   calling _parse_tree...\n" if (1 < $self->{_debug});
  return $self->_parse_tree($tree);
  } # _native_retrieve_some

=item result_as_HTML

Given a WWW::SearchResult object, formats it human-readable with
HTML.

=cut

sub result_as_HTML
  {
  # print STDERR " DDD r_as_H(@_)\n";
  my $self = shift;
  my $oSR = shift or return '';
  # Accept both WWW::Search::Result and WWW::SearchResult class names:
  return '' unless (ref($oSR) =~ m'WWW::Search(::)?Result');
  my $o = new CGI;
  return join('',
              $o->a(
                    { href => $oSR->url, },
                    $oSR->title,
                   ),
              $o->br,
              $oSR->description,
             );
  } # result_as_HTML

=item preprocess_results_page

A filter on the raw HTML of the results page.  This allows the backend
to alter the HTML before it is parsed, such as to correct for known
problems, HTML that can not be parsed correctly, etc.

Takes one argument, a string (the HTML webpage); returns one string
(the same HTML, modified).
This method is called from within _native_retrieve_some (above) before
the HTML of the page is parsed.

See the WWW::Search::Ebay distribution 2.07 or higher for example
usage.

=cut

sub preprocess_results_page
  {
  # Here is just a stub.  Return our argument without modification.
  my $self = shift;
  return shift;
  } # preprocess_results_page

=item test_cases (DEPRECATED)

Deprecated.

Returns the value of the $TEST_CASES variable of the backend engine.

=cut

sub test_cases
  {
  my $self = shift;
  # String-eval to read the package variable of whatever backend class
  # this object happens to belong to:
  return eval '$'.ref($self).'::TEST_CASES';
  } # test_cases

=item hash_to_cgi_string (DEPRECATED)

Given a reference to a hash of string => string, constructs a CGI
parameter string that looks like 'key1=value1&key2=value2'.

If the value is undef, the key will not be added to the string.

At one time, for testing purposes, we asked backends to use this
function rather than piecing the URL together by hand, to ensure that
URLs are identical across platforms and software versions.  But this
is no longer necessary.

Example:

    $self->{_options} = {
                         'opt3' => 'val3',
                         'search_url' => 'http://www.deja.com/dnquery.xp',
                         'opt1' => 'val1',
                         'QRY' => $native_query,
                         'opt2' => 'val2',
                        };
    $self->{_next_url} = $self->{_options}{'search_url'} .'?'.
                         $self->hash_to_cgi_string($self->{_options});

=cut

sub hash_to_cgi_string
  {
  my $self = shift;
  # Because of the design of our test suite, we need our generated
  # URLs to be identical on all systems, all versions of perl.  Ergo
  # we must explicitly control the order in which our CGI parameter
  # strings are cobbled together.  For now, I assume sorting the hash
  # keys will suffice.
  my $rh = shift;
  my $ret = '';
  foreach my $key (sort keys %$rh)
    {
    # printf STDERR "option: $key is " . $rh->{$key} . "\n";
    next if generic_option($key);
    # Throw out keys with undef values.
    next unless defined($rh->{$key});
    # If we want to let the user delete options, uncomment the next
    # line.  (They can still blank them out, which may or may not have
    # the same effect):
    # next unless $rh->{$key} ne '';
    $ret .= $key .'='. $rh->{$key} .'&';
    } # foreach $key
  # Remove the trailing '&':
  chop $ret;
  return $ret;
  } # hash_to_cgi_string

=back

=head1 IMPLEMENTING NEW BACKENDS

C<WWW::Search> supports backends to separate search engines.  Each
backend is implemented as a subclass of C<WWW::Search>.
L<WWW::Search::Yahoo> provides a good sample backend.

A backend must have the routine C<_native_setup_search()>.  A backend
must have the routine C<_native_retrieve_some()> or C<_parse_tree()>.

C<_native_setup_search()> is invoked before the search.  It is passed
a single argument: the escaped, native version of the query.

C<_native_retrieve_some()> is the core of a backend.  It will be
called periodically to fetch URLs.  It should retrieve several hits
from the search service and add them to the cache.  It should return
the number of hits found, or undef when there are no more hits.

Internally, C<_native_retrieve_some()> typically sends an HTTP request
to the search service, parses the HTML, extracts the links and
descriptions, then saves the URL for the next page of results.  See
the code for the C<WWW::Search::AltaVista> module for an example.

Alternatively, a backend can define the method C<_parse_tree()>
instead of C<_native_retrieve_some()>.  See the C<WWW::Search::Yahoo>
module for a good example.

If you implement a new backend, please let the authors know.

=head1 BUGS AND DESIRED FEATURES

The bugs are there for you to find (some people call them Easter
Eggs).

Desired features:

=over 4

=item A portable query language.

A portable language would easily allow you to move queries easily
between different search engines.  A query abstraction is non-trivial
and unfortunately will not be done any time soon by the current
maintainer.  If you want to take a shot at it, please let me know.

=back

=head1 AUTHOR

John Heidemann
Maintained by Martin Thurn, C<mthurn@cpan.org>.

=head1 COPYRIGHT

Copyright (c) 1996 University of Southern California.
All rights reserved.
Redistribution and use in source and binary forms are permitted
provided that the above copyright notice and this paragraph are
duplicated in all such forms and that any documentation, advertising
materials, and other materials related to such distribution and use
acknowledge that the software was developed by the University of
Southern California, Information Sciences Institute.  The name of the
University may not be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.

=cut

1;

__END__