WWW-Search-Pagesjaunes-0.14/ 0000755 0001750 0001750 00000000000 10244704716 015757 5 ustar briac briac 0000000 0000000 WWW-Search-Pagesjaunes-0.14/t/ 0000755 0001750 0001750 00000000000 10244704716 016222 5 ustar briac briac 0000000 0000000 WWW-Search-Pagesjaunes-0.14/t/01interface.t 0000644 0001750 0001750 00000000324 10130737177 020510 0 ustar briac briac 0000000 0000000 use Test;
BEGIN { plan tests => 3 }
use WWW::Search::Pagesjaunes;
my $pj = WWW::Search::Pagesjaunes->new();
ok(ref($pj), 'WWW::Search::Pagesjaunes');
ok($pj->limit, 50);
$pj->limit(100);
ok($pj->limit, 100);
WWW-Search-Pagesjaunes-0.14/t/00basic.t 0000644 0001750 0001750 00000000150 10130737177 017625 0 ustar briac briac 0000000 0000000 use Test;
BEGIN { plan tests => 1 }
END { ok($loaded) }
use WWW::Search::Pagesjaunes;
$loaded++;
WWW-Search-Pagesjaunes-0.14/t/02search.t 0000644 0001750 0001750 00000001006 10244704527 020013 0 ustar briac briac 0000000 0000000 use Test;
BEGIN { plan tests => 5 }
use WWW::Search::Pagesjaunes;
my $pj = WWW::Search::Pagesjaunes->new();
$pj->{ua}->agent("test".time);
$pj->find( nom => "palais de l'elysée", localite => "paris");
my $r = $pj->results;
ok($r->name, "Présidence de la République Palais de l'Elysée");
ok($r->address, "55 r Fbg St Honoré 75008 PARIS");
ok($r->phone->[0], "01 42 92 81 00");
ok($pj->has_more, 1);
$pj->find( activite => "plombier", localite => "marseille", limit => 4);
my @r = $pj->results;
ok($#r, 3);
WWW-Search-Pagesjaunes-0.14/lib/ 0000755 0001750 0001750 00000000000 10244704716 016525 5 ustar briac briac 0000000 0000000 WWW-Search-Pagesjaunes-0.14/lib/WWW/ 0000755 0001750 0001750 00000000000 10244704716 017211 5 ustar briac briac 0000000 0000000 WWW-Search-Pagesjaunes-0.14/lib/WWW/Search/ 0000755 0001750 0001750 00000000000 10244704716 020416 5 ustar briac briac 0000000 0000000 WWW-Search-Pagesjaunes-0.14/lib/WWW/Search/Pagesjaunes.pm 0000644 0001750 0001750 00000025042 10244567623 023230 0 ustar briac briac 0000000 0000000 package WWW::Search::Pagesjaunes;
use strict;
use Carp qw(carp croak);
use HTML::Form;
use WWW::Mechanize;
use HTML::TokeParser;
use HTTP::Request::Common;
use LWP::UserAgent;
$WWW::Search::Pagesjaunes::VERSION = '0.14';
sub ROOT_URL() { 'http://www.pagesjaunes.fr' }
sub new {
my $class = shift;
my $self = {};
my $ua = shift() || WWW::Mechanize->new(
env_proxy => 1,
keep_alive => 1,
timeout => 30,
agent => "WWW::Search::Pagesjaunes/$WWW::Search::Pagesjaunes::VERSION",
);
$self->{ua} = $ua;
$self->{limit} = 50;
$self->{fast} = 0;
$self->{error} = 1;
$self->{lang} = 'FR';
bless( $self, $class );
}
sub agent {
my $self = shift;
if ( $_[0] ) {
my $old = $self->{ua};
$self->{ua} = $_[0];
return $old;
}
else {
return $self->{ua};
}
}
sub find {
my $self = shift;
my %opt = @_;
my $p = $opt{activite} ? 'j' : 'b';
# Make the first request to pagesjaunes.fr
$self->{URL} = ROOT_URL . "/p$p.cgi";
if ( $self->{fast} ) {
$self->{req} = POST(
$self->{URL},
[
faire => 'decode_input_image',
DEFAULT_ACTION => $p . 'f_inscriptions_req',
lang => $self->{lang},
pays => 'FR',
srv => uc("p$p"),
TYPE_RECHERCHE => 'ZZZ',
input_image => '',
FRM_ACTIVITE => $p eq 'j' ? $opt{activite} : undef,
FRM_NOM => $opt{nom},
FRM_PRENOM => $p eq 'b' ? $opt{prenom} : undef,
FRM_ADRESSE => $opt{adresse},
FRM_LOCALITE => $opt{localite},
FRM_DEPARTEMENT => $opt{departement},
#'${p}F_INSCRIPTIONS_REQ.x' => 1,
#'${p}F_INSCRIPTIONS_REQ.y' => 1,
]);
}
else {
my $req = $self->{ua}->get($self->{URL});
if ( !$req->content || !$req->is_success ) {
croak('Error while retrieving the HTML page');
}
my @forms = HTML::Form->parse( $req->content, $self->{URL} );
# BooK finds the form by grepping thru all of them, instead
# of limiting ourselves to the first and second form.
my ($form) = grep { $_->find_input('lang') } @forms;
eval {
# HTML::Form complains when you change hidden fields values.
local $^W;
$form->value( 'lang', $self->{lang} );
$form->value( 'FRM_ACTIVITE', $opt{activite} ) if $opt{activite};
$form->value( 'FRM_NOM', $opt{nom} );
$form->value( 'FRM_PRENOM', $opt{prenom} ) if !$opt{activite};
$form->value( 'FRM_ADRESSE', $opt{adresse} );
$form->value( 'FRM_LOCALITE', $opt{localite} );
$form->value( 'FRM_DEPARTEMENT', $opt{departement} );
};
croak "Cannot fill the pagesjaunes request form. try with the 'fast' option\n" if $@;
$self->{limit} = $opt{limit} || $self->{limit};
$self->{req} = $form->click;
}
return $self;
}
sub results {
my $self = shift;
my $result_page = $self->{ua}->request( $self->{req} )->content;
my $parser = HTML::TokeParser->new( \$result_page );
# All the
tags are transformed to '§¤§', to separate
# multiple phone numbers
$parser->{textify} = {
'br' => sub() { '§¤§' }
};
my @results;
if ( $self->{limit} == 0 ) {
$self->{has_more} = 0;
return @results;
}
# XXX This is a really crude parsing of the data, but it seems to
# get the job done.
#
#
#
#
#
#
# Name |
# |
#
#
# Address |
# (télécopie)? Phone |
#
#
# |
#
#
#
$self->{has_more} = 0;
while ( my $token = $parser->get_tag("table") ) {
next
unless $token->[1]
&& $token->[1]{class}
&& $token->[1]{class} eq 'fdinscr';
{ # We're inside an entry table
$parser->get_tag("td"); # The first is the name
my $name = _trim( $parser->get_trimmed_text('/td') );
$parser->get_tag("td"); # The second | is ignored
$parser->get_tag("td"); # The third | is the address
my $address = _trim( $parser->get_trimmed_text('/td') );
$address =~ s/\W*\|.*$//g;
$parser->get_tag("td"); # The fourth | is the phone number
my $phone = _trim( $parser->get_trimmed_text('/td') );
my @phones = map { _trim($_); s/\.(\s*\d)/$1/; $_ } split(/§¤§/, $phone);
# The fifth | tag is either the mail or the descr, depending
# on the class
my @emails = ('');
my $tag = $parser->get_tag("td");
if ( $tag->[1]{class} && $tag->[1]{class} eq 'txtinscr'){
my $email = _trim( $parser->get_trimmed_text('/td') );
@emails = map { _trim($_); s/Mail\s*:\s*//; $_ } split(/§¤§/, $email);
}
push(
@results,
WWW::Search::Pagesjaunes::Entry->new(
$name, $address, [ @phones ], [ @emails ]
)
);
return @results if --$self->{limit} == 0;
}
}
foreach my $form ( HTML::Form->parse( $result_page, $self->{URL} ) ) {
if ( $form->find_input('faire') &&
$form->value('faire') eq 'decode_input_image' )
{
$self->{has_more} = 1;
$self->{req} = $form->click();
}
}
# If there was no result, we look for an error message in the HTML page
if ( !@results && $self->{error} ) {
$parser = HTML::TokeParser->new( \$result_page );
while ( my $token = $parser->get_tag("font") ) {
next
unless $token->[1]
&& $token->[1]{color}
&& $token->[1]{color} eq '#ff0000';
$parser->{textify} = {
'br' => sub() { " " }
};
carp _trim( $parser->get_trimmed_text('/font') ) . "\n";
}
}
wantarray ? @results : $results[0];
}
sub _trim {
$_[0] =~ s/\xa0/ /g; # Transform the into whitespace
$_[0] =~ s/^\s*|\s*$//g;
$_[0] =~ s/\s+/ /g;
$_[0];
}
sub limit {
my $self = shift;
$self->{limit} = $_[0] || $self->{limit};
}
sub has_more { $_[0]->{has_more} }
package WWW::Search::Pagesjaunes::Entry;
# The entry object is a blessed array with the following indices:
# 0 - Name
# 1 - Address
# 2 - Arrayref of phone numbers
# 3 - E-mail (pj)
# 4 - Notes (pj)
sub new {
my $class = shift;
bless [ @_ ], $class
}
sub name { $_[0]->[0] }
sub address { $_[0]->[1] }
sub phone { $_[0]->[2] }
sub email { $_[0]->[3] }
sub entry {
# Name Address First email Phones
$_[0]->[0], $_[0]->[1], $_[0]->[3]->[0], @{ @{ $_[0] }[2] },
}
1;
__END__
=pod
=head1 NAME
WWW::Search::Pagesjaunes - Lookup phones numbers from www.pagesjaunes.fr
=head1 SYNOPSIS
use WWW::Search::Pagesjaunes;
my $pj = new WWW::Search::Pagesjaunes;
$pj->find( activite => "Plombier", localite => "Paris" );
do {
print $_->entry . "\n" foreach ($pj->results);
} while $pj->has_more;
=head1 DESCRIPTION
The WWW::Search::Pagesjaunes provides name, phone number and addresses of French
telephone subscribers by using the L
directory.
=head1 METHODS
Two classes are used in this module, a first one (WWW::Search::Pagesjaunes) to do the
fetching and parsing, and the second one and a second one
(WWW::Search::Pagesjaunes::Entry) holding the entry infos.
Here are the methods for the main WWW::Search::Pagesjaunes module:
=over 4
=item new()
The constructor accept an optional LWP::UserAgent as argument, if you want to
provide your own.
=item find( %request )
Here are the values for the %request hash that are understood. They
each have two name, the first is the french one and the second is the
english one:
=over 4
=item nom / name
Name of the person you're looking for.
=item activite / business
Business type of the company you're looking for. Note that if this
field is filled, the module searches in the yellow pages.
=item localite / town
Name of the town you're searching in.
=item prenom / firstname
First name of the person you're looking for. It is not set if you set the
'activite' field.
=item departement / district
Name or number of the Département or Région you're searching in.
=back
=item results()
Returns an array of WWW::Search::Pagesjaunes::Entry containing the first matches of the
query.
=item limit($max_number_of_entries)
Set the maximum number of entries returned. Default to 50.
=item has_more()
If the query leads to more than a few results, the field has_more is set. You
can then call the results() method again to fetch the datas.
=back
The WWW::Search::Pagesjaunes::Entry class has six methods:
=over 4
=item new($name, $address, $phone, $fax)
Returns a new WWW::Search::Pagesjaunes::Entry.
=item name
Returns the name of the entry.
=item address
Returns the address of the entry.
=item phone
Returns the phone number of the entry.
=item is_fax
Returns true if the phone number is a fax one, false otherwise. Note
that currently, this method always returns 0.
=item entry($separator)
Returns the concatenation of the name and the phone number, separated by
" - ". You can specify your own separator as first argument.
=back
=head1 BUGS
The phone numbers are sometimes not correctly parsed, esp. when one
entry has several phone numbers.
If you found a bug and want to report it or send a patch, you are
encouraged to use the CPAN Request Tracker interface:
L
=head1 COPYRIGHT
Please read the Publisher information of L available at the following URL:
L
WWW::Search::Pagesjaunes is Copyright (C) 2002, Briac Pilpré
This module is free software; you can redistribute it or modify it under the
same terms as Perl itself.
=head1 AUTHOR
Briac Pilpré
=cut
WWW-Search-Pagesjaunes-0.14/META.yml 0000644 0001750 0001750 00000000736 10244704716 017236 0 ustar briac briac 0000000 0000000 # http://module-build.sourceforge.net/META-spec.html
#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX#
name: WWW-Search-Pagesjaunes
version: 0.14
version_from: lib/WWW/Search/Pagesjaunes.pm
installdirs: site
requires:
HTML::Form: 0
HTML::TokeParser: 0
LWP::UserAgent: 1.8
Pod::Usage: 0
distribution_type: module
generated_by: ExtUtils::MakeMaker version 6.17
WWW-Search-Pagesjaunes-0.14/pagesjaunes 0000755 0001750 0001750 00000011222 10207315542 020202 0 ustar briac briac 0000000 0000000 #!/usr/bin/perl -w
use strict;
use WWW::Search::Pagesjaunes;
use Getopt::Long;
use Pod::Usage;
my %opt;
GetOptions(
\%opt, 'activite|business|activity=s',
'nom|name=s', 'prenom|firstname:s',
'adresse|address:s', 'localite|ville|city|town=s',
'departement|district=s', 'limit:i', 'english', 'quiet',
'help|aide', 'man', 'separator|separateur=s',
'useragent=s', 'version', 'fast'
)
or pod2usage(2);
#
# $form->value('input_image', $x) où $x est:
#
# CD_PDR_SUP_REG => Région entière
# CD_PDR_SUP_DEP_REQ => Département entier
# CD_PDR_VOISIN_REQ => Localités voisines
# CD_PDR_INSCRIPTIONS_REQ => Sans nom ni adresse
# BT_INSCRIPTIONS_REQ_PROF => Afficher seulement les professionnels
#
die "$WWW::Search::Pagesjaunes::VERSION\n" if $opt{version};
pod2usage(1) if $opt{help};
pod2usage( -verbose => 2 ) if $opt{man};
pod2usage("$0: No town or district given.") unless ($opt{localite} || $opt{departement});
pod2usage("$0: No business or name given.")
unless ( $opt{activite} || $opt{nom} );
my $pj = WWW::Search::Pagesjaunes->new();
$pj->{lang} = 'EN' if $opt{english};
$pj->{error} = 0 if $opt{quiet};
$pj->{fast} = 1 if $opt{fast};
$pj->{ua}->agent($opt{useragent}) if $opt{useragent};
$pj->find(%opt);
do {
local $\ = "\n";
print join($opt{separator} || "\t", $_->entry) foreach ( $pj->results );
} while $pj->has_more;
__END__
=head1 NAME
pagesjaunes - Lookup phones numbers from www.pagesjaunes.fr
=head1 SYNOPSIS
pagesjaunes [options ...]
Options:
-activite -business : Business type
-nom -name : Name
-prenom -firstname : First name
-localite -town : Town
-departement -district : Dept district or Region
-useragent : String to be passed as User-Agent header
(this may be needed to bypass user-agent detection)
-fast : Submit the request directly without querying the form
-separator : Character used to separate the fields (default '\t')
-limit : Maximum number of results returned
-english : Use the pagesjaunes.fr english interface
-quiet : Turn off error messages display
-help : Brief help message
-man : Full documentation
-version : Display version number
You must provide the localite/town option, and either activite/business
or nom/name option. The prenom/firstname option is ignored if the
localite/town option is set.
=head1 OPTIONS
=over 8
=item B<-activite> or B<-business>
Activity or business type you're looking for. This is a mandatory switch
if you don't specify the B<-name> or B<-nom> switches.
=item B<-nom> or B<-name>
Name of the person or company you're looking for. Note that the search
is done with a fuzzy match.
=item B<-prenom> or B<-firstname>
First name of the person you're looking for. This option is ignored if
the B<-activite> or B<-business> are set.
=item B<-adresse> or B<-address>
Address of the person you're searching for.
=item B<-localite> or B<-town>
Name of the town.
=item B<-department> or B<-district>
Department district or Region you're searching in.
=item B<-separator>
Character used to separate fields in the result set.
Default is a tabulation character.
=item B<-useragent>
The default user-agent string passed to the www.pagesjaunes.fr site
is ""WWW::Search::Pagesjaunes/x.xx" where x.xx is the version of
the module.
Sometimes, it is needed to change it to other user-agent strings.
=item B<-limit>
Maximum number of entries returned. Default is 50. If you set it to 0 or
a negative number, it will return all the entries found.
=item B<-english>
Use the english interface of pagesjaunes.fr. This means that you can
enter business types in english (i.e. 'plumber' instead of 'plombier'),
and error messages will be displayed in english.
=item B<-quiet>
No error messages are printed when this switch is on
=item B<-help>
Print a brief help message and exits.
=item B<-man>
Prints the manual page and exits.
=item B<-version>
Prints the version of the script and exits.
=back
=head1 DESCRIPTION
This script provides name, phone number and addresses of French
telephone subscribers by using the http://www.pagesjaunes.fr directory
and the WWW::Search::Pagesjaunes module.
=head1 COPYRIGHT
Please read the Publisher information of L
available at the following URL:
L
This script is Copyright (C) 2002, Briac Pilpré
This script is free software; you can redistribute it or modify it under
the same terms as Perl itself.
=head1 AUTHOR
Briac Pilpré
=cut
WWW-Search-Pagesjaunes-0.14/Changes 0000644 0001750 0001750 00000005371 10244704707 017260 0 ustar briac briac 0000000 0000000 0.14 Tue May 24 22:08:13 CEST 2005
- Fixed the HTML parsing to reflect the change in the layout of the
result table (patch thanks to Yann Gauguet).
- Updated the tests
0.13 Thu Feb 24 10:23:28 CET 2005
- Fixed the HTML parsing to reflect the change in the layout of the
result table (patch thanks to Éric(glb)).
0.12 Wed Jun 30 14:13:10 CEST 2004
- fixed a bug when the main form was not found.
- the main form is now filled within an eval() to avoid
HTML::Forms errors when trying to fill it.
0.11 Thu Jun 17 12:00:27 CEST 2004
- Added a '--fast' parameter to create the request from scratch (and
not fetching the form from pagesjaunes.fr)
- Support multiple phone numbers
- Support e-mail addresses
0.10 Sat Jun 12 21:41:51 CEST 2004
- Added a '--useragent' parameter to let the user specify the
UserAgent HTTP header.
- Added some docs
0.8 Wed Jan 7 16:29:09 CET 2004
- Fixed the HTML parsing again.
- Modified the user-agent string as pagejaunes.fr seems to be blocking
user agent matching /WWW::Search::Pagesjaunes/
(see https://rt.cpan.org/Ticket/Display.html?id=4833 )
0.7 Tue Dec 9 10:46:28 CET 2003
- Fixed the HTML parsing to reflect the changes in the
pagesjaunes.fr website. (Thanks to David Landgren)
- The form of pagesjaunes (not pagesblanches) is the second on on
the frame, not the first. (Thanks to Jon Orwant)
- Added a couple more tests
0.5 Sat Feb 8 13:03:05 CET 2003
- Fixed the HTML parsing to reflect the changes in the
pagesjaunes.fr website. (Thanks to Nicolas Chuche)
- Added documentation for the -address and -adresse switch
0.4 Thu Oct 24 23:26:04 CEST 2002
- No check on missing parameter is done in the .pm, it's up to the
script to check that.
- a Town *or* District is required (used to be only a Town)
- pagesjaunes.fr errors are reported if the entry set is empty
- Added a --quiet switch to avoid error reporting
- Added a --english switch to use the pagesjaunes.fr interface in
english (you can then use the english name of business types
'plombier' => 'plumber')
- The Entry::entry method now returns an array. It is up to the user
to set the proper separator ($, and $\)
- Fixed the name truncation bug.
0.3 Tue Oct 15 13:11:09 CEST 2002
- Fixed a bug that prevented limit() to work
- Added a --limit switch on the pagesjaunes script
- pagesjaunes is better documented, using Pod::Usage
- Changed the english switches to aliases, much cleaner
0.2 Tue Oct 15 10:29:35 CEST 2002
- Added usage for the pagesjaunes script
- Added support for english options
- Documented the limit() method
0.1 Mon Oct 14 22:58:57 CEST 2002
- first released version
WWW-Search-Pagesjaunes-0.14/MANIFEST 0000644 0001750 0001750 00000000333 10207315562 017103 0 ustar briac briac 0000000 0000000 Changes
MANIFEST This list of files
Makefile.PL
README
lib/WWW/Search/Pagesjaunes.pm
pagesjaunes
t/00basic.t
t/01interface.t
t/02search.t
META.yml Module meta-data (added by MakeMaker)
WWW-Search-Pagesjaunes-0.14/Makefile.PL 0000644 0001750 0001750 00000000654 10130737177 017737 0 ustar briac briac 0000000 0000000 use ExtUtils::MakeMaker;
WriteMakefile(
NAME => 'WWW::Search::Pagesjaunes',
VERSION_FROM => 'lib/WWW/Search/Pagesjaunes.pm',
PREREQ_PM => {
'HTML::Form' => 0,
'LWP::UserAgent' => 1.80,
'HTML::TokeParser' => 0,
'Pod::Usage' => 0,
},
EXE_FILES => [ 'pagesjaunes' ],
ABSTRACT_FROM => 'lib/WWW/Search/Pagesjaunes.pm',
AUTHOR => 'Briac Pilpré ',
);
WWW-Search-Pagesjaunes-0.14/README 0000644 0001750 0001750 00000006625 10130737177 016651 0 ustar briac briac 0000000 0000000 NAME
WWW::Search::Pagesjaunes - Lookup phones numbers from www.pagesjaunes.fr
SYNOPSIS
use WWW::Search::Pagesjaunes;
my $pj = new WWW::Search::Pagesjaunes;
$pj->find( activite => "Plombier", localite => "Paris" );
{
print $_->entry . "\n" foreach ($pj->results);
redo if $pj->has_more;
}
DESCRIPTION
The WWW::Search::Pagesjaunes provides name, phone number and addresses
of French telephone subscribers by using the
directory.
METHODS
Two classes are used in this module, a first one
(WWW::Search::Pagesjaunes) to do the fetching and parsing, and the
second one and a second one (WWW::Search::Pagesjaunes::Entry) holding
the entry infos.
Here are the methods for the main WWW::Search::Pagesjaunes module:
new()
The constructor accept an optional LWP::UserAgent as argument, if
you want to provide your own.
find( %request )
Here are the values for the %request hash that are understood. They
each have two name, the first is the french one and the second is
the english one:
nom / name
Name of the person you're looking for.
activite / business
Business type of the company you're looking for. Note that if
this field is filled, the module searches in the yellow pages.
localite / town
Name of the town you're searching in.
prenom / firstname
First name of the person you're looking for. It is not set if
you set the 'activite' field.
departement / district
Name or number of the Département or Région you're searching in.
results()
Returns an array of WWW::Search::Pagesjaunes::Entry containing the
first matches of the query.
limit($max_number_of_entries)
Set the maximum number of entries returned. Default to 50.
has_more()
If the query leads to more than a few results, the field has_more is
set. You can then call the results() method again to fetch the
datas.
The WWW::Search::Pagesjaunes::Entry class has six methods:
new($name, $address, $phone, $fax)
Returns a new WWW::Search::Pagesjaunes::Entry.
name
Returns the name of the entry.
address
Returns the address of the entry.
phone
Returns the phone number of the entry.
is_fax
Returns true if the phone number is a fax one, false otherwise. Note
that currently, this method always returns 0.
entry
Returns the concatenation of the name and the phone number,
separated by " - ".
BUGS
The phone numbers are sometimes not correctly parsed, esp. when one
entry has several phone numbers.
Names are sometimes truncated.
If you found a bug and want to report it or send a patch, you are
encouraged to use the CPAN Request Tracker interface:
COPYRIGHT
Please read the Publisher information of
available at the following URL:
WWW::Search::Pagesjaunes is Copyright (C) 2002, Briac Pilpré
This module is free software; you can redistribute it or modify it under
the same terms as Perl itself.
AUTHOR
Briac Pilpré briac@cpan.org
|