WWW-Search-Pagesjaunes-0.13/0000755000175000017500000000000010207316754015756 5ustar briacbriac00000000000000WWW-Search-Pagesjaunes-0.13/t/0000755000175000017500000000000010207316754016221 5ustar briacbriac00000000000000WWW-Search-Pagesjaunes-0.13/t/01interface.t0000644000175000017500000000032410130737177020507 0ustar briacbriac00000000000000use Test; BEGIN { plan tests => 3 } use WWW::Search::Pagesjaunes; my $pj = WWW::Search::Pagesjaunes->new(); ok(ref($pj), 'WWW::Search::Pagesjaunes'); ok($pj->limit, 50); $pj->limit(100); ok($pj->limit, 100); WWW-Search-Pagesjaunes-0.13/t/00basic.t0000644000175000017500000000015010130737177017624 0ustar briacbriac00000000000000use Test; BEGIN { plan tests => 1 } END { ok($loaded) } use WWW::Search::Pagesjaunes; $loaded++; WWW-Search-Pagesjaunes-0.13/t/02search.t0000644000175000017500000000100610130737177020013 0ustar briacbriac00000000000000use Test; BEGIN { plan tests => 5 } use WWW::Search::Pagesjaunes; my $pj = WWW::Search::Pagesjaunes->new(); $pj->{ua}->agent("test".time); $pj->find( nom => "palais de l'elysée", localite => "paris"); my $r = $pj->results; ok($r->name, "Présidence de la République Palais de l'Elysée"); ok($r->address, "55 e Fbg St Honoré 75008 Paris"); ok($r->phone->[0], "01 42 92 81 00"); ok($pj->has_more, 1); $pj->find( activite => "plombier", localite => "marseille", limit => 4); my @r = $pj->results; ok($#r, 3); WWW-Search-Pagesjaunes-0.13/lib/0000755000175000017500000000000010207316754016524 5ustar briacbriac00000000000000WWW-Search-Pagesjaunes-0.13/lib/WWW/0000755000175000017500000000000010207316754017210 5ustar briacbriac00000000000000WWW-Search-Pagesjaunes-0.13/lib/WWW/Search/0000755000175000017500000000000010207316754020415 5ustar briacbriac00000000000000WWW-Search-Pagesjaunes-0.13/lib/WWW/Search/Pagesjaunes.pm0000644000175000017500000002504310207315501023211 0ustar briacbriac00000000000000package WWW::Search::Pagesjaunes; use strict; use Carp qw(carp croak); use HTML::Form; use WWW::Mechanize; use HTML::TokeParser; use HTTP::Request::Common; use LWP::UserAgent; $WWW::Search::Pagesjaunes::VERSION = '0.13'; sub ROOT_URL() { 'http://www.pagesjaunes.fr' } sub new { my $class = shift; my $self = {}; my $ua = shift() || WWW::Mechanize->new( env_proxy => 1, keep_alive => 1, timeout => 30, agent => "WWW::Search::Pagesjaunes/$WWW::Search::Pagesjaunes::VERSION", ); $self->{ua} = $ua; $self->{limit} = 50; $self->{fast} = 0; $self->{error} = 1; $self->{lang} = 'FR'; bless( $self, $class ); } sub agent { my $self = shift; if ( $_[0] ) { my $old = $self->{ua}; $self->{ua} = $_[0]; return $old; } else { return $self->{ua}; } } sub find { my $self = shift; my %opt = @_; my $p = $opt{activite} ? 'j' : 'b'; # Make the first request to pagesjaunes.fr $self->{URL} = ROOT_URL . "/p$p.cgi"; if ( $self->{fast} ) { $self->{req} = POST( $self->{URL}, [ faire => 'decode_input_image', DEFAULT_ACTION => $p . 'f_inscriptions_req', lang => $self->{lang}, pays => 'FR', srv => uc("p$p"), TYPE_RECHERCHE => 'ZZZ', input_image => '', FRM_ACTIVITE => $p eq 'j' ? $opt{activite} : undef, FRM_NOM => $opt{nom}, FRM_PRENOM => $p eq 'b' ? $opt{prenom} : undef, FRM_ADRESSE => $opt{adresse}, FRM_LOCALITE => $opt{localite}, FRM_DEPARTEMENT => $opt{departement}, #'${p}F_INSCRIPTIONS_REQ.x' => 1, #'${p}F_INSCRIPTIONS_REQ.y' => 1, ]); } else { my $req = $self->{ua}->get($self->{URL}); if ( !$req->content || !$req->is_success ) { croak('Error while retrieving the HTML page'); } my @forms = HTML::Form->parse( $req->content, $self->{URL} ); # BooK finds the form by grepping thru all of them, instead # of limiting ourselves to the first and second form. my ($form) = grep { $_->find_input('lang') } @forms; eval { # HTML::Form complains when you change hidden fields values. local $^W; $form->value( 'lang', $self->{lang} ); $form->value( 'FRM_ACTIVITE', $opt{activite} ) if $opt{activite}; $form->value( 'FRM_NOM', $opt{nom} ); $form->value( 'FRM_PRENOM', $opt{prenom} ) if !$opt{activite}; $form->value( 'FRM_ADRESSE', $opt{adresse} ); $form->value( 'FRM_LOCALITE', $opt{localite} ); $form->value( 'FRM_DEPARTEMENT', $opt{departement} ); }; croak "Cannot fill the pagesjaunes request form. try with the 'fast' option\n" if $@; $self->{limit} = $opt{limit} || $self->{limit}; $self->{req} = $form->click; } return $self; } sub results { my $self = shift; my $result_page = $self->{ua}->request( $self->{req} )->content; my $parser = HTML::TokeParser->new( \$result_page ); # All the
tags are transformed to '§¤§', to separate # multiple phone numbers $parser->{textify} = { 'br' => sub() { '§¤§' } }; my @results; if ( $self->{limit} == 0 ) { $self->{has_more} = 0; return @results; } # XXX This is a really crude parsing of the data, but it seems to # get the job done. # # # # # #
# # # # # # # # # #
Name 
Address(télécopie)? Phone
#
# $self->{has_more} = 0; while ( my $token = $parser->get_tag("table") ) { next unless $token->[1] && $token->[1]{class} && $token->[1]{class} eq 'fdinscr'; { # We're inside an entry table $parser->get_tag("td"); # The first is the name my $name = _trim( $parser->get_trimmed_text('/td') ); $parser->get_tag("td"); # The second is ignored $parser->get_tag("td"); # The third is the address my $address = _trim( $parser->get_trimmed_text('/td') ); $address =~ s/\W*\|.*$//g; $parser->get_tag("td"); # The fourth is the phone number my $phone = _trim( $parser->get_trimmed_text('/td') ); my @phones = map { _trim($_); s/\.(\s*\d)/$1/; $_ } split(/§¤§/, $phone); # The fifth tag is either the mail or the descr, depending # on the class my @emails = (''); my $tag = $parser->get_tag("td"); if ( $tag->[1]{class} && $tag->[1]{class} eq 'txtinscr'){ my $email = _trim( $parser->get_trimmed_text('/td') ); @emails = map { _trim($_); s/Mail\s*:\s*//; $_ } split(/§¤§/, $email); } push( @results, WWW::Search::Pagesjaunes::Entry->new( $name, $address, [ @phones ], [ @emails ] ) ); return @results if --$self->{limit} == 0; } } foreach my $form ( HTML::Form->parse( $result_page, $self->{URL} ) ) { if ( $form->find_input('faire') && $form->value('faire') eq 'inscriptions_suivant' ) { $self->{has_more} = 1; $self->{req} = $form->click(); } } # If there was no result, we look for an error message in the HTML page if ( !@results && $self->{error} ) { $parser = HTML::TokeParser->new( \$result_page ); while ( my $token = $parser->get_tag("font") ) { next unless $token->[1] && $token->[1]{color} && $token->[1]{color} eq '#ff0000'; $parser->{textify} = { 'br' => sub() { " " } }; carp _trim( $parser->get_trimmed_text('/font') ) . "\n"; } } wantarray ? @results : $results[0]; } sub _trim { $_[0] =~ s/\xa0/ /g; # Transform the   into whitespace $_[0] =~ s/^\s*|\s*$//g; $_[0] =~ s/\s+/ /g; $_[0]; } sub limit { my $self = shift; $self->{limit} = $_[0] || $self->{limit}; } sub has_more { $_[0]->{has_more} } package WWW::Search::Pagesjaunes::Entry; # The entry object is a blessed array with the following indices: # 0 - Name # 1 - Address # 2 - Arrayref of phone numbers # 3 - E-mail (pj) # 4 - Notes (pj) sub new { my $class = shift; bless [ @_ ], $class } sub name { $_[0]->[0] } sub address { $_[0]->[1] } sub phone { $_[0]->[2] } sub email { $_[0]->[3] } sub entry { # Name Address First email Phones $_[0]->[0], $_[0]->[1], $_[0]->[3]->[0], @{ @{ $_[0] }[2] }, } 1; __END__ =pod =head1 NAME WWW::Search::Pagesjaunes - Lookup phones numbers from www.pagesjaunes.fr =head1 SYNOPSIS use WWW::Search::Pagesjaunes; my $pj = new WWW::Search::Pagesjaunes; $pj->find( activite => "Plombier", localite => "Paris" ); do { print $_->entry . "\n" foreach ($pj->results); } while $pj->has_more; =head1 DESCRIPTION The WWW::Search::Pagesjaunes provides name, phone number and addresses of French telephone subscribers by using the L directory. =head1 METHODS Two classes are used in this module, a first one (WWW::Search::Pagesjaunes) to do the fetching and parsing, and the second one and a second one (WWW::Search::Pagesjaunes::Entry) holding the entry infos. Here are the methods for the main WWW::Search::Pagesjaunes module: =over 4 =item new() The constructor accept an optional LWP::UserAgent as argument, if you want to provide your own. =item find( %request ) Here are the values for the %request hash that are understood. They each have two name, the first is the french one and the second is the english one: =over 4 =item nom / name Name of the person you're looking for. =item activite / business Business type of the company you're looking for. Note that if this field is filled, the module searches in the yellow pages. =item localite / town Name of the town you're searching in. =item prenom / firstname First name of the person you're looking for. It is not set if you set the 'activite' field. =item departement / district Name or number of the Département or Région you're searching in. =back =item results() Returns an array of WWW::Search::Pagesjaunes::Entry containing the first matches of the query. =item limit($max_number_of_entries) Set the maximum number of entries returned. Default to 50. =item has_more() If the query leads to more than a few results, the field has_more is set. You can then call the results() method again to fetch the datas. =back The WWW::Search::Pagesjaunes::Entry class has six methods: =over 4 =item new($name, $address, $phone, $fax) Returns a new WWW::Search::Pagesjaunes::Entry. =item name Returns the name of the entry. =item address Returns the address of the entry. =item phone Returns the phone number of the entry. =item is_fax Returns true if the phone number is a fax one, false otherwise. Note that currently, this method always returns 0. =item entry($separator) Returns the concatenation of the name and the phone number, separated by " - ". You can specify your own separator as first argument. =back =head1 BUGS The phone numbers are sometimes not correctly parsed, esp. when one entry has several phone numbers. If you found a bug and want to report it or send a patch, you are encouraged to use the CPAN Request Tracker interface: L =head1 COPYRIGHT Please read the Publisher information of L available at the following URL: L WWW::Search::Pagesjaunes is Copyright (C) 2002, Briac Pilpré This module is free software; you can redistribute it or modify it under the same terms as Perl itself. =head1 AUTHOR Briac Pilpré =cut WWW-Search-Pagesjaunes-0.13/META.yml0000644000175000017500000000073610207316754017235 0ustar briacbriac00000000000000# http://module-build.sourceforge.net/META-spec.html #XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: WWW-Search-Pagesjaunes version: 0.13 version_from: lib/WWW/Search/Pagesjaunes.pm installdirs: site requires: HTML::Form: 0 HTML::TokeParser: 0 LWP::UserAgent: 1.8 Pod::Usage: 0 distribution_type: module generated_by: ExtUtils::MakeMaker version 6.17 WWW-Search-Pagesjaunes-0.13/pagesjaunes0000755000175000017500000001122210207315542020201 0ustar briacbriac00000000000000#!/usr/bin/perl -w use strict; use WWW::Search::Pagesjaunes; use Getopt::Long; use Pod::Usage; my %opt; GetOptions( \%opt, 'activite|business|activity=s', 'nom|name=s', 'prenom|firstname:s', 'adresse|address:s', 'localite|ville|city|town=s', 'departement|district=s', 'limit:i', 'english', 'quiet', 'help|aide', 'man', 'separator|separateur=s', 'useragent=s', 'version', 'fast' ) or pod2usage(2); # # $form->value('input_image', $x) où $x est: # # CD_PDR_SUP_REG => Région entière # CD_PDR_SUP_DEP_REQ => Département entier # CD_PDR_VOISIN_REQ => Localités voisines # CD_PDR_INSCRIPTIONS_REQ => Sans nom ni adresse # BT_INSCRIPTIONS_REQ_PROF => Afficher seulement les professionnels # die "$WWW::Search::Pagesjaunes::VERSION\n" if $opt{version}; pod2usage(1) if $opt{help}; pod2usage( -verbose => 2 ) if $opt{man}; pod2usage("$0: No town or district given.") unless ($opt{localite} || $opt{departement}); pod2usage("$0: No business or name given.") unless ( $opt{activite} || $opt{nom} ); my $pj = WWW::Search::Pagesjaunes->new(); $pj->{lang} = 'EN' if $opt{english}; $pj->{error} = 0 if $opt{quiet}; $pj->{fast} = 1 if $opt{fast}; $pj->{ua}->agent($opt{useragent}) if $opt{useragent}; $pj->find(%opt); do { local $\ = "\n"; print join($opt{separator} || "\t", $_->entry) foreach ( $pj->results ); } while $pj->has_more; __END__ =head1 NAME pagesjaunes - Lookup phones numbers from www.pagesjaunes.fr =head1 SYNOPSIS pagesjaunes [options ...] Options: -activite -business : Business type -nom -name : Name -prenom -firstname : First name -localite -town : Town -departement -district : Dept district or Region -useragent : String to be passed as User-Agent header (this may be needed to bypass user-agent detection) -fast : Submit the request directly without querying the form -separator : Character used to separate the fields (default '\t') -limit : Maximum number of results returned -english : Use the pagesjaunes.fr english interface -quiet : Turn off error messages display -help : Brief help message -man : Full documentation -version : Display version number You must provide the localite/town option, and either activite/business or nom/name option. The prenom/firstname option is ignored if the localite/town option is set. =head1 OPTIONS =over 8 =item B<-activite> or B<-business> Activity or business type you're looking for. This is a mandatory switch if you don't specify the B<-name> or B<-nom> switches. =item B<-nom> or B<-name> Name of the person or company you're looking for. Note that the search is done with a fuzzy match. =item B<-prenom> or B<-firstname> First name of the person you're looking for. This option is ignored if the B<-activite> or B<-business> are set. =item B<-adresse> or B<-address> Address of the person you're searching for. =item B<-localite> or B<-town> Name of the town. =item B<-department> or B<-district> Department district or Region you're searching in. =item B<-separator> Character used to separate fields in the result set. Default is a tabulation character. =item B<-useragent> The default user-agent string passed to the www.pagesjaunes.fr site is ""WWW::Search::Pagesjaunes/x.xx" where x.xx is the version of the module. Sometimes, it is needed to change it to other user-agent strings. =item B<-limit> Maximum number of entries returned. Default is 50. If you set it to 0 or a negative number, it will return all the entries found. =item B<-english> Use the english interface of pagesjaunes.fr. This means that you can enter business types in english (i.e. 'plumber' instead of 'plombier'), and error messages will be displayed in english. =item B<-quiet> No error messages are printed when this switch is on =item B<-help> Print a brief help message and exits. =item B<-man> Prints the manual page and exits. =item B<-version> Prints the version of the script and exits. =back =head1 DESCRIPTION This script provides name, phone number and addresses of French telephone subscribers by using the http://www.pagesjaunes.fr directory and the WWW::Search::Pagesjaunes module. =head1 COPYRIGHT Please read the Publisher information of L available at the following URL: L This script is Copyright (C) 2002, Briac Pilpré This script is free software; you can redistribute it or modify it under the same terms as Perl itself. =head1 AUTHOR Briac Pilpré =cut WWW-Search-Pagesjaunes-0.13/Changes0000644000175000017500000000510210207316747017251 0ustar briacbriac000000000000000.13 Thu Feb 24 10:23:28 CET 2005 - Fixed the HTML parsing to reflect the change in the layout of the result table (patch thanks to Éric(glb)). 0.12 Wed Jun 30 14:13:10 CEST 2004 - fixed a bug when the main form was not found. - the main form is now filled within an eval() to avoid HTML::Forms errors when trying to fill it. 0.11 Thu Jun 17 12:00:27 CEST 2004 - Added a '--fast' parameter to create the request from scratch (and not fetching the form from pagesjaunes.fr) - Support multiple phone numbers - Support e-mail addresses 0.10 Sat Jun 12 21:41:51 CEST 2004 - Added a '--useragent' parameter to let the user specify the UserAgent HTTP header. - Added some docs 0.8 Wed Jan 7 16:29:09 CET 2004 - Fixed the HTML parsing again. - Modified the user-agent string as pagejaunes.fr seems to be blocking user agent matching /WWW::Search::Pagesjaunes/ (see https://rt.cpan.org/Ticket/Display.html?id=4833 ) 0.7 Tue Dec 9 10:46:28 CET 2003 - Fixed the HTML parsing to reflect the changes in the pagesjaunes.fr website. (Thanks to David Landgren) - The form of pagesjaunes (not pagesblanches) is the second on on the frame, not the first. (Thanks to Jon Orwant) - Added a couple more tests 0.5 Sat Feb 8 13:03:05 CET 2003 - Fixed the HTML parsing to reflect the changes in the pagesjaunes.fr website. (Thanks to Nicolas Chuche) - Added documentation for the -address and -adresse switch 0.4 Thu Oct 24 23:26:04 CEST 2002 - No check on missing parameter is done in the .pm, it's up to the script to check that. - a Town *or* District is required (used to be only a Town) - pagesjaunes.fr errors are reported if the entry set is empty - Added a --quiet switch to avoid error reporting - Added a --english switch to use the pagesjaunes.fr interface in english (you can then use the english name of business types 'plombier' => 'plumber') - The Entry::entry method now returns an array. It is up to the user to set the proper separator ($, and $\) - Fixed the name truncation bug. 0.3 Tue Oct 15 13:11:09 CEST 2002 - Fixed a bug that prevented limit() to work - Added a --limit switch on the pagesjaunes script - pagesjaunes is better documented, using Pod::Usage - Changed the english switches to aliases, much cleaner 0.2 Tue Oct 15 10:29:35 CEST 2002 - Added usage for the pagesjaunes script - Added support for english options - Documented the limit() method 0.1 Mon Oct 14 22:58:57 CEST 2002 - first released version WWW-Search-Pagesjaunes-0.13/MANIFEST0000644000175000017500000000033310207315562017102 0ustar briacbriac00000000000000Changes MANIFEST This list of files Makefile.PL README lib/WWW/Search/Pagesjaunes.pm pagesjaunes t/00basic.t t/01interface.t t/02search.t META.yml Module meta-data (added by MakeMaker) WWW-Search-Pagesjaunes-0.13/Makefile.PL0000644000175000017500000000065410130737177017736 0ustar briacbriac00000000000000use ExtUtils::MakeMaker; WriteMakefile( NAME => 'WWW::Search::Pagesjaunes', VERSION_FROM => 'lib/WWW/Search/Pagesjaunes.pm', PREREQ_PM => { 'HTML::Form' => 0, 'LWP::UserAgent' => 1.80, 'HTML::TokeParser' => 0, 'Pod::Usage' => 0, }, EXE_FILES => [ 'pagesjaunes' ], ABSTRACT_FROM => 'lib/WWW/Search/Pagesjaunes.pm', AUTHOR => 'Briac Pilpré ', ); WWW-Search-Pagesjaunes-0.13/README0000644000175000017500000000662510130737177016650 0ustar briacbriac00000000000000NAME WWW::Search::Pagesjaunes - Lookup phones numbers from www.pagesjaunes.fr SYNOPSIS use WWW::Search::Pagesjaunes; my $pj = new WWW::Search::Pagesjaunes; $pj->find( activite => "Plombier", localite => "Paris" ); { print $_->entry . "\n" foreach ($pj->results); redo if $pj->has_more; } DESCRIPTION The WWW::Search::Pagesjaunes provides name, phone number and addresses of French telephone subscribers by using the directory. METHODS Two classes are used in this module, a first one (WWW::Search::Pagesjaunes) to do the fetching and parsing, and the second one and a second one (WWW::Search::Pagesjaunes::Entry) holding the entry infos. Here are the methods for the main WWW::Search::Pagesjaunes module: new() The constructor accept an optional LWP::UserAgent as argument, if you want to provide your own. find( %request ) Here are the values for the %request hash that are understood. They each have two name, the first is the french one and the second is the english one: nom / name Name of the person you're looking for. activite / business Business type of the company you're looking for. Note that if this field is filled, the module searches in the yellow pages. localite / town Name of the town you're searching in. prenom / firstname First name of the person you're looking for. It is not set if you set the 'activite' field. departement / district Name or number of the Département or Région you're searching in. results() Returns an array of WWW::Search::Pagesjaunes::Entry containing the first matches of the query. limit($max_number_of_entries) Set the maximum number of entries returned. Default to 50. has_more() If the query leads to more than a few results, the field has_more is set. You can then call the results() method again to fetch the datas. The WWW::Search::Pagesjaunes::Entry class has six methods: new($name, $address, $phone, $fax) Returns a new WWW::Search::Pagesjaunes::Entry. name Returns the name of the entry. address Returns the address of the entry. phone Returns the phone number of the entry. is_fax Returns true if the phone number is a fax one, false otherwise. Note that currently, this method always returns 0. entry Returns the concatenation of the name and the phone number, separated by " - ". BUGS The phone numbers are sometimes not correctly parsed, esp. when one entry has several phone numbers. Names are sometimes truncated. If you found a bug and want to report it or send a patch, you are encouraged to use the CPAN Request Tracker interface: COPYRIGHT Please read the Publisher information of available at the following URL: WWW::Search::Pagesjaunes is Copyright (C) 2002, Briac Pilpré This module is free software; you can redistribute it or modify it under the same terms as Perl itself. AUTHOR Briac Pilpré briac@cpan.org