==== Web-Scraper-0.38/Build.PL ====
# This Build.PL for Web-Scraper was generated by Dist::Zilla::Plugin::ModuleBuildTiny 0.007.
use strict;
use warnings;
use 5.008001;
use Module::Build::Tiny 0.039;
Build_PL();
Web-Scraper-0.38/Changes 000644 000765 000024 00000016171 12421053223 016013 0 ustar 00miyagawa staff 000000 000000 Revision history for Perl extension Web::Scraper
0.38 2014-10-19 17:25:53 PDT
- Improved documentation #8 (vti)
- Add regexp filter #10 (creaktive)
- Fix documentation error #16
0.37 Fri Oct 19 15:09:17 PDT 2012
- Repack with the latest Module::Install
0.36 Sat Nov 19 12:12:54 PST 2011
- Support HTML5 tags by not ignoring unknown tags (leedo)
0.35 Mon Sep 26 18:40:06 PDT 2011
- Added support for comments() XPath #3 (Perlover)
0.34 Thu Feb 24 09:35:12 PST 2011
- Skip xml_simple.t if LibXML is not there (omega)
0.33 Thu Feb 17 09:12:55 PST 2011
- Remove failing invalid XPath tests
0.32 Wed Feb 3 22:13:01 PST 2010
- Removes poking around charset and LWP's decoded_content
(Thanks to flatwhatson)
- More docs (jshirley)
0.31 Sun Jul 19 00:43:54 PDT 2009
- Use new LWP's content_charset method instead of HTTP::Response::Encoding
(Thanks to hanekomu)
0.30 Wed Jul 8 15:47:21 PDT 2009
- No warnings when use()d multiple times in the same package
0.29 Wed Jul 8 13:40:14 PDT 2009
- Adds Web::Scraper::LibXML which uses HTML::TreeBuilder::LibXML
(without the replace_original hack)
0.28 Sat Mar 28 14:31:45 PDT 2009
- Call ->eof when parsing with HTML::TreeBuilder
(Thanks to Tokuhiro Matsuno)
0.27 Tue Mar 24 12:09:04 PDT 2009
- Added tests to use HTML::TreeBuilder::LibXML
(Thanks to Tokuhiro Matsuno)
0.26 Thu Jan 15 11:37:56 PST 2009
- Fixed an error message when GET request fails
0.25 Sun Jan 11 13:36:44 PST 2009
- scrape() now accepts HTTP::Response as well for Remedie/Plagger
- repository moved to github http://github.com/miyagawa/web-scraper/tree/master
0.24 Sun Nov 25 15:58:38 PST 2007
- Support duck typing in filter args to take object that has 'filter' method
This could give Web::Scraper::Filter::Pipe a better interface
(Thanks to hanekomu and tokuhirom)
0.23 Sat Nov 24 17:21:14 PST 2007
- Upped Web::Scraper dependency
- Skip & test until HTML::TreeBuilder::XPath fixes it
- removed eg/search-cpan.pl
0.22 Wed Oct 17 17:51:54 PDT 2007
- 's' on scraper shell now prints to pager (e.g. less) if PAGER is set
0.21_01 Thu Oct 4 01:05:00 PDT 2007
- Added an experimental filter support
(Thanks to hirose31, tokuhirom and Yappo for brainstorming)
0.21 Wed Oct 3 10:37:13 PDT 2007
- Bumped up HTML::TreeBuilder dependency to fix 12_html.t issues
[rt.cpan.org #29733]
0.20 Wed Oct 3 00:28:13 PDT 2007
- Fixed a bug where URI is not absolutized with a hash reference value
- Added eg/jp-playstation-store.pl
0.19 Thu Sep 20 22:42:30 PDT 2007
- Try to get HTML encoding from META tags as well, when there's
no charset value in HTTP response header.
0.18 Thu Sep 20 19:49:11 PDT 2007
- Fixed a bug where URI is not absolutized when scraper is nested
- Use as_XML not as_HTML in 'RAW'
0.17 Wed Sep 19 19:12:25 PDT 2007
- Reverted Term::Encoding support since it causes segfaults
(double utf-8 encoding) in some environment
0.16 Tue Sep 18 04:48:47 PDT 2007
- Support 'RAW' and 'TEXT' for TextNode object
- Call Term::Encoding from scraper shell if installed
0.15 Sat Sep 15 21:28:10 PDT 2007
- Call env_proxy in scraper CLI
- Added $Web::Scraper::UserAgent and $scraper->user_agent accessor to deal
with UserAgent object
- Don't escape non-ASCII characters into &#xXXXX; in scraper shell 's' and WARN
0.14 Fri Sep 14 16:06:20 PDT 2007
- Fix bin/scraper to work with older Term::ReadLine.
(Thanks to Tina Müller [RT:29079])
- Now link elements like img@src and a@href are automatically
converted to absolute URI using the current URI as a base.
Only effective when you do $s->scrape(URI) or $s->scrape(\$html, URI)
- Added 'HTML' and its alias 'RAW' to get the HTML chunk inside the tag
process "script", "code" => 'RAW';
Handy if you want the raw HTML code inside
--- selector
script
--- expected
function foo() {
return bar;
}
=== a
--- html
foo bar
--- selector
a
--- expected
foo bar
=== div
--- html
--- selector
#foo
--- expected
foo bar
bar
=== non-ascii
--- html
テスト
--- selector
#foo
--- expected
テスト
=== textarea
--- html
--- selector
textarea
--- expected
\n
foo
bar
\n
baz

==== Web-Scraper-0.38/t/13_textnode.t ====
use strict;
use Test::Base;
use Web::Scraper;
plan tests => 1 * blocks;
filters {
selector => 'chomp',
want => 'chomp',
expected => 'chomp',
};
run {
my $block = shift;
my $s = scraper {
process $block->selector, want => $block->want;
result 'want';
};
my $want = $s->scrape($block->html);
is $want, $block->expected, $block->name;
};
__DATA__
=== TEXT
--- html
foo bar
--- selector
//p/node()[2]
--- want
TEXT
--- expected
bar
=== TEXT
--- html
foo bar
--- selector
//p/node()[2]
--- want
TEXT
--- expected
bar
=== TEXT
--- html
foo bar & baz
--- selector
//p/node()[2]
--- want
TEXT
--- expected
bar & baz
=== RAW HTML
--- SKIP
--- html
foo bar & baz
--- selector
//p/node()[2]
--- want
RAW
--- expected
bar & baz

==== Web-Scraper-0.38/t/14_absolute_nested.t ====
use strict;
use Test::Base;
use Web::Scraper;
plan tests => 1 * blocks;
filters {
selector => 'chomp',
expected => 'chomp',
};
run {
my $block = shift;
my $s = scraper {
process $block->selector, want => scraper {
process "img", image => '@src';
result "image";
};
result 'want';
};
my $want = $s->scrape($block->html, $block->url);
is $want, $block->expected, $block->name;
};
__DATA__
===
--- url: http://example.com/
--- html
<a id="foo"><img src="/foo.jpg" /></a>
--- selector
a#foo
--- expected
http://example.com/foo.jpg

==== Web-Scraper-0.38/t/15_absolute_hash.t ====
use strict;
use Test::Base;
use Web::Scraper;
plan tests => 1 * blocks;
filters {
expected => 'chomp',
};
run {
my $block = shift;
my $s = scraper {
process $block->selector, 'want[]' => { link => '@href' };
result 'want';
};
my $want = $s->scrape($block->html, $block->url);
is $want->[0]->{link}, $block->expected, $block->name;
};
__DATA__
===
--- url: http://example.com/
--- html
<a id="foo" href="/foo.html">foo</a>
--- selector
a#foo
--- expected
http://example.com/foo.html

==== Web-Scraper-0.38/t/16_filter.t ====
use strict;
use Test::Base;
use Web::Scraper;
plan tests => 1 * blocks;
filters {
expected => 'chomp',
want => 'eval',
};
run {
my $block = shift;
my $s = scraper {
process 'a', want => $block->want;
result 'want';
};
my $want = $s->scrape('<a>foo</a>');
my $expected = $block->expected eq 'undef' ? undef : $block->expected;
is $want, $expected, $block->name;
};
BEGIN {
package Web::Scraper::Filter::foo;
use base qw( Web::Scraper::Filter );
sub filter { tr/a-z/b-za/ }
package Web::Scraper::Filter::bar;
use base qw( Web::Scraper::Filter );
sub filter { $_[1] . 'bar' }
}
package main;
__DATA__
=== tr
--- want
['TEXT', 'foo']
--- expected
gpp
=== shift + return
--- want
['TEXT', 'bar']
--- expected
foobar
=== inline callback
--- want
['TEXT', sub { return "baz" } ]
--- expected
baz
=== inline callback + s///
--- want
['TEXT', sub { s/foo/bax/ } ]
--- expected
bax
=== stack
--- want
['TEXT', 'bar', 'foo' ]
--- expected
gppcbs
=== stack
--- want
['TEXT', 'bar', sub { s/foo/bar/ } ]
--- expected
barbar
=== no match
--- want
['TEXT', sub { s/xxx/yyy/g }]
--- expected
foo
=== undef
--- want
['TEXT', sub { return }]
--- expected
undef
=== number
--- want
['TEXT', sub { return 3 }]
--- expected
3
=== object
--- want
['TEXT', Web::Scraper::Filter::foo->new]
--- expected
gpp

==== Web-Scraper-0.38/t/17_filter_loop.t ====
use strict;
use Test::Base;
use Web::Scraper;
plan tests => 1 * blocks;
filters {
expected => 'yaml',
want => 'eval',
};
run {
my $block = shift;
my $s = scraper {
process 'a', 'want[]' => $block->want;
result 'want';
};
my $want = $s->scrape('<a>foo</a><a>bar</a>');
is_deeply $want, $block->expected, $block->name;
};
__DATA__
=== tr
--- want
['TEXT', sub { tr/a-z/b-za/ }]
--- expected
- gpp
- cbs

==== Web-Scraper-0.38/t/18_http_response.t ====
use strict;
use warnings;
use URI;
use LWP::UserAgent;
use Web::Scraper;
use Test::More;
plan skip_all => "LIVE_TEST not enabled"
unless $ENV{LIVE_TEST} || $ENV{TEST_ALL};
plan tests => 1;
my $ua = LWP::UserAgent->new;
{
my $res = $ua->get("http://www.yahoo.co.jp/");
my $result = scraper {
process 'title', title => 'TEXT';
}->scrape($res);
is $result->{title}, 'Yahoo! JAPAN';
}

==== Web-Scraper-0.38/t/19_decode_content.t ====
use strict;
use warnings;
use URI;
use LWP::UserAgent;
use Web::Scraper;
use Test::More;
plan skip_all => "LIVE_TEST not enabled"
unless $ENV{LIVE_TEST} || $ENV{TEST_ALL};
plan tests => 1;
my $ua = LWP::UserAgent->new;
$ua->default_header('Accept-Encoding' => 'gzip');
{
my $res = $ua->get("http://www.yahoo.co.jp/");
my $result = scraper {
process 'title', title => 'TEXT';
}->scrape($res);
is $result->{title}, 'Yahoo! JAPAN';
}

==== Web-Scraper-0.38/t/20_comment_nodes.t ====
use strict;
use Test::Base;
use utf8;
use Web::Scraper;
plan skip_all => "Please upgrade the HTML::TreeBuilder::XPath and HTML::TreeBuilder::LibXML modules for comment node support"
unless
eval "use HTML::TreeBuilder::XPath 0.14; 1" &&
eval "use HTML::TreeBuilder::LibXML 0.13; 1";
plan tests => 1 * blocks;
filters {
selector => 'chomp',
expected => [ 'chomp', 'newline' ],
html => 'newline',
};
sub newline {
s/\\n\n/\n/g;
}
# Turn off "Wide character" warnings when a test fails
my $builder = Test::More->builder;
binmode $builder->output, ":utf8";
binmode $builder->failure_output, ":utf8";
binmode $builder->todo_output, ":utf8";
run {
my $block = shift;
my $s = scraper {
process $block->selector, want => 'TEXT';
result 'want';
};
my $want = $s->scrape($block->html);
is $want, $block->expected, $block->name;
};
__DATA__
=== comment in p
--- html
<p>This is a paragraph <!--This is the comment--> bla bla bla</p>
--- selector
//p/comment()
--- expected
This is the comment
=== non-ascii comment
--- html
<p>Bla bla bla <!--テスト--></p>
--- selector
//p/comment()
--- expected
テスト

==== Web-Scraper-0.38/t/21_html5.t ====
use strict;
use Test::Base;
use utf8;
use Web::Scraper;
plan tests => 1 * blocks;
filters {
selector => 'chomp',
expected => [ 'chomp', 'newline' ],
html => 'newline',
};
sub newline {
s/\\n\n/\n/g;
}
run {
my $block = shift;
my $s = scraper {
process $block->selector, want => 'HTML';
result 'want';
};
my $want = $s->scrape($block->html);
is $want, $block->expected, $block->name;
};
__DATA__
=== header
--- html
<header>hello</header>
--- selector
header
--- expected
hello
=== section
--- html
<section><header>hello</header></section>
--- selector
header
--- expected
hello

==== Web-Scraper-0.38/t/22_filter_regex.t ====
use strict;
use Test::Base;
use Web::Scraper;
plan tests => 1 * blocks;
filters {
expected => 'yaml',
want => 'eval',
};
run {
my $block = shift;
return pass("no named grouping in Perl $]")
if $] < 5.010 and $block->name eq 'named';
my $s = scraper {
process 'a', 'want[]' => $block->want;
result 'want';
};
my $want = $s->scrape('<a>foo=123</a><a>bar=456</a>');
is_deeply $want, $block->expected, $block->name;
};
__DATA__
=== unnamed
--- want
[ TEXT => qr/(\d+)/ ]
--- expected
- 123
- 456
=== named
--- want
[ TEXT => qr/^(?<name>\w+)=(?<value>\d+)$/ ]
--- expected
- name: foo
value: 123
- name: bar
value: 456
=== boolean
--- want
[ TEXT => qr/BAR/i ]
--- expected
-
- 1
=== stack
--- want
[ TEXT => qr/(\w+)/ => sub { ucfirst } ]
--- expected
- Foo
- Bar

==== Web-Scraper-0.38/t/redefine.t ====
BEGIN {
use Test::More 'no_plan';
$SIG{__WARN__} = sub { fail shift };
}
use Web::Scraper;
use Web::Scraper;
ok 1;

==== Web-Scraper-0.38/t/release-pod-syntax.t ====
#!perl
BEGIN {
unless ($ENV{RELEASE_TESTING}) {
require Test::More;
Test::More::plan(skip_all => 'these tests are for release candidate testing');
}
}
# This file was automatically generated by Dist::Zilla::Plugin::PodSyntaxTests.
use Test::More;
use Test::Pod 1.41;
all_pod_files_ok();

==== Web-Scraper-0.38/t/xml-simple.t ====
use strict;
use Test::Requires qw(HTML::TreeBuilder::LibXML);
use Test::Base;
use Web::Scraper::LibXML;
filters { expected => [ 'lines', 'chomp' ] };
plan tests => 1 * blocks;
run {
my $block = shift;
my $s = scraper {
process $block->selector, "value[]", $block->get;
};
my $r = $s->scrape($block->input);
is_deeply $r->{value}, [ $block->expected ];
};
__END__
===
--- input
<foo>bar</foo>
--- selector: foo
--- get: TEXT
--- expected
bar
===
--- input
<foo>
<bar>baz</bar>
<bar>bax</bar>
</foo>
--- selector: foo>bar
--- get: TEXT
--- expected
baz
bax
===
--- input
<foo>
<bar attr="test bar" />
<bar attr="Hello &amp; World" />
</foo>
--- selector: bar
--- get: @attr
--- expected
test bar
Hello & World

==== Web-Scraper-0.38/lib/Web/Scraper.pm ====
package Web::Scraper;
use strict;
use warnings;
use 5.008001;
use Carp;
use Scalar::Util qw(blessed);
use List::Util qw(first);
use HTML::Entities;
use HTML::Tagset;
use HTML::TreeBuilder::XPath;
use HTML::Selector::XPath;
use UNIVERSAL::require;
our $VERSION = '0.38';
sub import {
my $class = shift;
my $pkg = caller;
no strict 'refs';
no warnings 'redefine';
*{"$pkg\::scraper"} = _build_scraper($class);
*{"$pkg\::process"} = sub { goto &process };
*{"$pkg\::process_first"} = sub { goto &process_first };
*{"$pkg\::result"} = sub { goto &result };
}
our $UserAgent;
sub __ua {
require LWP::UserAgent;
$UserAgent ||= LWP::UserAgent->new(agent => __PACKAGE__ . "/" . $VERSION);
$UserAgent;
}
sub user_agent {
my $self = shift;
$self->{user_agent} = shift if @_;
$self->{user_agent} || __ua;
}
sub define {
my($class, $coderef) = @_;
bless { code => $coderef }, $class;
}
sub _build_scraper {
my $class = shift;
return sub(&) {
my($coderef) = @_;
bless { code => $coderef }, $class;
};
}
sub scrape {
my $self = shift;
my($stuff, $current) = @_;
my($html, $tree);
if (blessed($stuff) && $stuff->isa('URI')) {
my $ua = $self->user_agent;
my $res = $ua->get($stuff);
return $self->scrape($res, $stuff->as_string);
} elsif (blessed($stuff) && $stuff->isa('HTTP::Response')) {
if ($stuff->is_success) {
$html = $stuff->decoded_content;
} else {
croak "GET " . $stuff->request->uri . " failed: ", $stuff->status_line;
}
$current ||= $stuff->request->uri;
} elsif (blessed($stuff) && $stuff->isa('HTML::Element')) {
$tree = $stuff->clone;
} elsif (ref($stuff) && ref($stuff) eq 'SCALAR') {
$html = $$stuff;
} else {
$html = $stuff;
}
$tree ||= $self->build_tree($html);
my $stash = {};
no warnings 'redefine';
local *process = create_process(0, $tree, $stash, $current);
local *process_first = create_process(1, $tree, $stash, $current);
my $retval;
local *result = sub {
$retval++;
my @keys = @_;
if (@keys == 1) {
return $stash->{$keys[0]};
} elsif (@keys) {
my %res;
@res{@keys} = @{$stash}{@keys};
return \%res;
} else {
return $stash;
}
};
my $ret = $self->{code}->($tree);
$tree->delete;
# check user specified return value
return $ret if $retval;
return $stash;
}
sub build_tree {
my($self, $html) = @_;
my $t = HTML::TreeBuilder::XPath->new;
$t->store_comments(1) if ($t->can('store_comments'));
$t->ignore_unknown(0);
$t->parse($html);
$t->eof;
$t;
}
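# create_process returns the closure installed as process()/process_first()
# inside a scraper block: it converts a CSS selector to XPath (unless the
# expression already starts with "/" or "id("), finds the matching nodes in
# $tree, and stores values extracted by __get_value into the shared $stash.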
sub create_process {
my($first, $tree, $stash, $uri) = @_;
sub {
my($exp, @attr) = @_;
my $xpath = $exp =~ m!^(?:/|id\()! ? $exp : HTML::Selector::XPath::selector_to_xpath($exp);
my @nodes = eval {
local $SIG{__WARN__} = sub { };
$tree->findnodes($xpath);
};
if ($@) {
die "'$xpath' doesn't look like a valid XPath expression: $@";
}
@nodes or return;
@nodes = ($nodes[0]) if $first;
while (my($key, $val) = splice(@attr, 0, 2)) {
if (!defined $val) {
if (ref($key) && ref($key) eq 'CODE') {
for my $node (@nodes) {
local $_ = $node;
$key->($node);
}
} else {
die "Don't know what to do with $key => undef";
}
} elsif ($key =~ s!\[\]$!!) {
$stash->{$key} = [ map __get_value($_, $val, $uri), @nodes ];
} else {
$stash->{$key} = __get_value($nodes[0], $val, $uri);
}
}
return;
};
}
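# __get_value extracts a single value from a node: a coderef is called with
# the node in $_, a nested Web::Scraper is applied to the node, '@attr' reads
# an attribute (absolutized against $uri for link elements), 'TEXT'/'content'
# returns the text content, 'RAW'/'HTML' the inner markup; a hashref maps keys
# to sub-extractions, and an arrayref is an extraction followed by filters.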
sub __get_value {
my($node, $val, $uri) = @_;
if (ref($val) && ref($val) eq 'CODE') {
local $_ = $node;
return $val->($node);
} elsif (blessed($val) && $val->isa('Web::Scraper')) {
return $val->scrape($node, $uri);
} elsif ($val =~ s!^@!!) {
my $value = $node->attr($val);
if ($uri && is_link_element($node, $val)) {
require URI;
$value = URI->new_abs($value, $uri);
}
return $value;
} elsif (lc($val) eq 'content' || lc($val) eq 'text') {
# getValue method is used for getting a content of comment nodes
# from HTML::TreeBuilder::XPath (version >= 0.14)
# or HTML::TreeBuilder::LibXML (version >= 0.13)
# getValue method works like as_text in both modules
# for other node types
return $node->isTextNode
? $node->string_value
: ($node->can('getValue')
? $node->getValue
: $node->as_text);
} elsif (lc($val) eq 'raw' || lc($val) eq 'html') {
if ($node->isTextNode) {
if ($HTML::TreeBuilder::XPath::VERSION < 0.09) {
return HTML::Entities::encode($node->as_XML, q("'<>&));
} else {
return $node->as_XML;
}
}
my $html = $node->as_XML;
$html =~ s!^<.*?>!!;
$html =~ s!\s*</\w+>\n*$!!;
return $html;
} elsif (ref($val) eq 'HASH') {
my $values;
for my $key (keys %$val) {
$values->{$key} = __get_value($node, $val->{$key}, $uri);
}
return $values;
} elsif (ref($val) eq 'ARRAY') {
my $how = $val->[0];
my $value = __get_value($node, $how, $uri);
for my $filter (@$val[1..$#$val]) {
$value = run_filter($value, $filter);
}
return $value;
} else {
Carp::croak "Unknown value type $val";
}
}
sub run_filter {
my($value, $filter) = @_;
## sub { s/foo/bar/g } is a valid filter
## sub { DateTime::Format::Foo->parse_string(shift) } is valid too
my $callback;
my $module;
if (ref($filter) eq 'CODE') {
$callback = $filter;
$module = "$filter";
} elsif (ref($filter) eq 'Regexp') {
$callback = sub {
my @unnamed = shift =~ /$filter/x;
if (%+) {
return { %+ };
} elsif (@unnamed) {
return shift @unnamed;
} else {
return;
}
};
$module = "$filter";
} elsif (!ref($filter)) {
$module = $filter =~ s/^\+// ? $filter : "Web::Scraper::Filter::$filter";
unless ($module->isa('Web::Scraper::Filter')) {
$module->require or Carp::croak("Loading $module: $@");
}
$callback = sub { $module->new->filter(shift) };
} elsif (blessed($filter) && $filter->can('filter')) {
$callback = sub { $filter->filter(shift) };
} else {
Carp::croak("Don't know filter type $filter");
}
local $_ = $value;
my $retval = eval { $callback->($value) };
if ($@) {
Carp::croak("Filter $module had an error: $@");
}
no warnings 'uninitialized';
# sub { s/foo/bar/ } returns number or PL_sv_no which is stringified to ''
if (($retval =~ /^\d+$/ and $_ ne $value) or (defined($retval) and $retval eq '')) {
$value = $_;
} else {
$value = $retval;
}
return $value;
}
sub is_link_element {
my($node, $attr) = @_;
my $link_elements = $HTML::Tagset::linkElements{$node->tag} || [];
for my $elem (@$link_elements) {
return 1 if $attr eq $elem;
}
return;
}
sub __stub {
my $func = shift;
return sub {
croak "Can't call $func() outside scraper block";
};
}
*process = __stub 'process';
*process_first = __stub 'process_first';
*result = __stub 'result';
1;
__END__
=for stopwords API SCRAPI Scrapi
=head1 NAME
Web::Scraper - Web Scraping Toolkit using HTML and CSS Selectors or XPath expressions
=head1 SYNOPSIS
use URI;
use Web::Scraper;
use Encode;
# First, create your scraper block
my $authors = scraper {
# Parse all TDs inside 'table[width="100%]"', store them into
# an array 'authors'. We embed other scrapers for each TD.
process 'table[width="100%"] td', "authors[]" => scraper {
# And, in each TD,
# get the URI of "a" element
process "a", uri => '@href';
# get text inside "small" element
process "small", fullname => 'TEXT';
};
};
my $res = $authors->scrape( URI->new("http://search.cpan.org/author/?A") );
# iterate the array 'authors'
for my $author (@{$res->{authors}}) {
# output is like:
# Andy Adler http://search.cpan.org/~aadler/
# Aaron K Dancygier http://search.cpan.org/~aakd/
# Aamer Akhter http://search.cpan.org/~aakhter/
print Encode::encode("utf8", "$author->{fullname}\t$author->{uri}\n");
}
The structure would resemble this (visually):
{
authors => [
{ fullname => $fullname, uri => $uri },
{ fullname => $fullname, uri => $uri },
]
}
=head1 DESCRIPTION
Web::Scraper is a web scraper toolkit, inspired by Ruby's equivalent
Scrapi. It provides a DSL-ish interface for traversing HTML documents and
returning a neatly arranged Perl data structure.
The I<scraper> and I<process> blocks provide a method to define what segments
of a document to extract. It understands HTML and CSS Selectors as well as
XPath expressions.
=head1 METHODS
=head2 scraper
$scraper = scraper { ... };
Creates a new Web::Scraper object by wrapping the DSL code that will be fired when the I<scrape> method is called.
=head2 scrape
$res = $scraper->scrape(URI->new($uri));
$res = $scraper->scrape($html_content);
$res = $scraper->scrape(\$html_content);
$res = $scraper->scrape($http_response);
$res = $scraper->scrape($html_element);
Retrieves the HTML from URI, HTTP::Response, HTML::Tree or text
strings and creates a DOM object, then fires the callback scraper code
to retrieve the data structure.
If you pass a URI or HTTP::Response object, Web::Scraper will
automatically guess the encoding of the content by looking at
Content-Type headers and META tags. Otherwise you need to decode the
HTML to Unicode before passing it to the I<scrape> method.
You can optionally pass the base URL when you pass the HTML content as
a string instead of URI or HTTP::Response.
$res = $scraper->scrape($html_content, "http://example.com/foo");
This way Web::Scraper can resolve the relative links found in the document.
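For instance, a minimal sketch (the markup and base URL here are invented
for illustration):
    my $s = scraper {
        process "a", link => '@href';
    };
    my $res = $s->scrape(\'<a href="bar.html">bar</a>', "http://example.com/foo/");
    # $res->{link} is URI->new("http://example.com/foo/bar.html")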
=head2 process
scraper {
process "tag.class", key => 'TEXT';
process '//tag[contains(@foo, "bar")]', key2 => '@attr';
process '//comment()', 'comments[]' => 'TEXT';
};
I<process> is the method to find matching elements from HTML with a CSS
selector or XPath expression, then extract text or attributes into the
result stash.
If the first argument begins with "//" or "id(" it is treated as an
XPath expression, and otherwise as a CSS selector.
# <span class="date">2008/12/21</span>
# date => "2008/12/21"
process ".date", date => 'TEXT';
# <div class="body"><a href="http://example.com/">foo</a></div>
# link => URI->new("http://example.com/")
process ".body > a", link => '@href';
# <div class="body"><!-- HTML Comment here --><a href="http://example.com/">foo</a></div>
# comment => " HTML Comment here "
#
# NOTES: Comment nodes are accessible only when
# HTML::TreeBuilder::XPath (version >= 0.14) and/or
# HTML::TreeBuilder::LibXML (version >= 0.13) is installed
process "//div[contains(@class, 'body')]/comment()", comment => 'TEXT';
# <div class="body"><a href="http://example.com/">foo</a></div>
# link => URI->new("http://example.com/"), text => "foo"
process ".body > a", link => '@href', text => 'TEXT';
# <ul><li>foo</li><li>bar</li></ul>
# list => [ "foo", "bar" ]
process "li", "list[]" => "TEXT";
# <ul><li id="1">foo</li><li id="2">bar</li></ul>
# list => [ { id => "1", text => "foo" }, { id => "2", text => "bar" } ];
process "li", "list[]" => { id => '@id', text => "TEXT" };
=head2 process_first
C<process_first> is the same as C<process> but stops when the first matching
result is found.
# <span class="date">2008/12/21</span>
# <span class="date">2008/12/22</span>
# date => "2008/12/21"
process_first ".date", date => 'TEXT';
=head2 result
C<result> lets you return, instead of the default result stash, a single
value specified by a key, or a hash reference built from several keys.
process 'a', 'want[]' => 'TEXT';
result 'want';
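Given several keys, C<result> returns a hash reference restricted to
those keys. A minimal sketch (selectors invented for illustration):
    my $s = scraper {
        process 'title', title => 'TEXT';
        process 'a', 'links[]' => '@href';
        result 'title', 'links';
    };
    # $s->scrape(...) returns { title => "...", links => [ ... ] }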
=head1 EXAMPLES
There are many examples in the C<eg> dir packaged in this distribution.
It is recommended to look through these.
=head1 NESTED SCRAPERS
Scrapers can be nested, allowing you to scrape data inside already captured elements.
# <ul><li><a href="foo1">...</a></li><li><a href="foo2">...</a></li></ul>
# friends => [ {href => 'foo1'}, {href => 'foo2'} ];
process 'li', 'friends[]' => scraper {
process 'a', href => '@href',
};
=head1 FILTERS
Filters are applied to the result after processing. They can be declared as
anonymous subroutines or as class names.
process $exp, $key => [ 'TEXT', sub { s/foo/bar/ } ];
process $exp, $key => [ 'TEXT', 'Something' ];
process $exp, $key => [ 'TEXT', '+MyApp::Filter::Foo' ];
Filters can be stacked:
process $exp, $key => [ '@href', 'Foo', '+MyApp::Filter::Bar', \&baz ];
You can find more about filters in the L<Web::Scraper::Filter> documentation.
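As a concrete sketch (mirroring t/22_filter_regex.t in this distribution;
the markup and keys here are illustrative), a regexp filter keeps the
captured group, and later filters see the captured value:
    # <a>foo=123</a><a>bar=456</a>
    process 'a', 'nums[]'  => [ 'TEXT', qr/(\d+)/ ];                   # [ "123", "456" ]
    process 'a', 'words[]' => [ 'TEXT', qr/(\w+)/, sub { ucfirst } ];  # [ "Foo", "Bar" ]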
=head1 XML backends
By default L<HTML::TreeBuilder::XPath> is used; this can be replaced by
a L<HTML::TreeBuilder::LibXML> backend using the L<Web::Scraper::LibXML> module.
use Web::Scraper::LibXML;
# same as Web::Scraper
my $scraper = scraper { ... };
=head1 AUTHOR
Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
=head1 LICENSE
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 SEE ALSO
L<http://blog.labnotes.org/category/scrapi/>
L<HTML::TreeBuilder::XPath>
=cut

==== Web-Scraper-0.38/lib/Web/Scraper/Filter.pm ====
package Web::Scraper::Filter;
use strict;
use warnings;
sub new {
my $class = shift;
bless {}, $class;
}
1;
__END__
=for stopwords namespace inline callback
=head1 NAME
Web::Scraper::Filter - Base class for Web::Scraper filters
=head1 SYNOPSIS
package Web::Scraper::Filter::YAML;
use base qw( Web::Scraper::Filter );
use YAML ();
sub filter {
my($self, $value) = @_;
YAML::Load($value);
}
1;
use Web::Scraper;
my $scraper = scraper {
process ".yaml-code", data => [ 'TEXT', 'YAML' ];
};
=head1 DESCRIPTION
Web::Scraper::Filter is a base class for text filters in
Web::Scraper. You can create your own text filter by subclassing this
module.
There are two ways to create and use your custom filter. If you name
your filter Web::Scraper::Filter::Something, you just call:
process $exp, $key => [ 'TEXT', 'Something' ];
If you declare your filter under your own namespace, like
'MyApp::Filter::Foo',
process $exp, $key => [ 'TEXT', '+MyApp::Filter::Foo' ];
You can also inline your filter function or regexp without creating a
filter class:
process $exp, $key => [ 'TEXT', sub { s/foo/bar/ } ];
process $exp, $key => [ 'TEXT', qr/Price: (\d+)/ ];
process $exp, $key => [ 'TEXT', qr/(?<name>\w+): (?<value>\w+)/ ];
Note that a substitution filter like C<sub { s/foo/bar/ }> munges C<$_> and
returns the number of replacements. The filter code special-cases this: if the
callback's return value is a number and C<$_> was updated, the new C<$_> is used.
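For example, these two inline filters are equivalent sketches: the first
munges C<$_> and returns the replacement count, the second returns the
new value explicitly:
    process $exp, $key => [ 'TEXT', sub { s/\s+/ /g } ];
    process $exp, $key => [ 'TEXT', sub { my $v = shift; $v =~ s/\s+/ /g; $v } ];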
You can, of course, stack filters like:
process $exp, $key => [ '@href', 'Foo', '+MyApp::Filter::Bar', \&baz ];
=head1 AUTHOR
Tatsuhiko Miyagawa
=cut

==== Web-Scraper-0.38/lib/Web/Scraper/LibXML.pm ====
package Web::Scraper::LibXML;
use strict;
use base qw( Web::Scraper );
use HTML::TreeBuilder::LibXML;
sub build_tree {
my($self, $html) = @_;
my $t = HTML::TreeBuilder::LibXML->new;
$t->parse($html);
$t->eof;
$t;
}
1;
__END__
=head1 NAME
Web::Scraper::LibXML - Drop-in replacement for Web::Scraper to use LibXML
=head1 SYNOPSIS
use Web::Scraper::LibXML;
# same as Web::Scraper
my $scraper = scraper { ... };
=head1 DESCRIPTION
Web::Scraper::LibXML is a drop-in replacement for Web::Scraper to use
the fast libxml-based HTML tree builder, HTML::TreeBuilder::LibXML.
This is almost identical to HTML::TreeBuilder::LibXML's
I<replace_original> installer, like:
use HTML::TreeBuilder::LibXML;
HTML::TreeBuilder::LibXML->replace_original();
use Web::Scraper;
my $scraper = scraper { ... };
# this code uses LibXML parser
which overrides HTML::TreeBuilder::XPath's new() constructor so that all
of your code using HTML::TreeBuilder::XPath is switched to the
libxml based parser.
This module, instead, gives you more control over which TreeBuilder to
use, depending on the site etc.
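A minimal sketch of that per-package choice (the package name is invented
for illustration):
    package My::FastScraper;
    use Web::Scraper::LibXML;
    # scraper {} blocks compiled in this package parse with
    # HTML::TreeBuilder::LibXML, via the build_tree override above
    my $s = scraper { process "title", title => 'TEXT' };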
=head1 SEE ALSO
L<Web::Scraper> L<HTML::TreeBuilder::LibXML>
=cut

==== Web-Scraper-0.38/eg/dave-trailer-HD.pl ====
#!/usr/bin/perl
use strict;
use warnings;
use lib "lib";
use Web::Scraper;
use URI;
use YAML;
# extract HD trailers from Dave's trailer page
my $uri = URI->new("http://www.drfoster.f2s.com/");
my $s = scraper {
process "td>ul>li", "trailers[]" => scraper {
process_first "li>b", title => "TEXT";
process_first "ul>li>a[href]", url => '@href';
process "ul>li>ul>li>a", "movies[]" => sub {
my $elem = shift;
return {
text => $elem->as_text,
href => $elem->attr('href'),
};
};
};
result "trailers";
};
warn Dump $s->scrape($uri);

==== Web-Scraper-0.38/eg/ebay-auction.pl ====
#!/usr/bin/perl
use strict;
use warnings;
use URI;
use lib "lib";
use Web::Scraper;
my $ebay_auction = scraper {
process "h3.ens>a",
description => 'TEXT',
url => '@href';
process "td.ebcPr>span", price => "TEXT";
process "div.ebPicture >a>img", image => '@src';
result 'description', 'url', 'price', 'image';
};
my $ebay = scraper {
process "table.ebItemlist tr.single",
"auctions[]" => $ebay_auction;
result 'auctions';
};
my $auctions = $ebay->scrape( URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS") );
use YAML;
warn Dump $auctions;

==== Web-Scraper-0.38/eg/extract-links.pl ====
#!/usr/bin/perl
use strict;
use warnings;
use URI;
use lib "lib";
use Web::Scraper;
my $uri = shift @ARGV or die "URI needed";
my $scraper = scraper {
process "a[href]", "urls[]" => '@href';
result 'urls';
};
my $links = $scraper->scrape(URI->new($uri));
use YAML;
warn Dump $links;

==== Web-Scraper-0.38/eg/hatena-keyword.pl ====
#!/usr/bin/perl
use strict;
use warnings;
use lib "lib";
use URI;
use Web::Scraper;
# same as http://d.hatena.ne.jp/secondlife/20060922/1158923779
my $keyword = scraper {
process 'span.title > a:first-child', title => 'TEXT', url => '@href';
process 'span.furigana', furigana => 'TEXT';
process 'ul.list-circle > li:first-child > a', category => 'TEXT';
};
my $res = $keyword->scrape(URI->new("http://d.hatena.ne.jp/keyword/%BA%B0%CC%EE%A4%A2%A4%B5%C8%FE"));
use YAML;
warn Dump $res;
__END__
---
category: アイドル
furigana: こんのあさみ
title: 紺野あさ美
url: /keyword/%ba%b0%cc%ee%a4%a2%a4%b5%c8%fe?kid=800

==== Web-Scraper-0.38/eg/jp-playstation-store.pl ====
#!/usr/bin/perl
use strict;
use Web::Scraper;
use URI;
use YAML;
my $stuff = URI->new("http://www.jp.playstation.com/store/");
my $scraper = scraper {
process "#Sinfo p a", 'news[]' => { link => '@href', title => 'TEXT' };
};
my $result = $scraper->scrape($stuff);
print YAML::Dump $result;

==== Web-Scraper-0.38/eg/rel-tag.pl ====
#!/usr/bin/perl
# Extract tags from web pages that have rel-tag microformat
use strict;
use warnings;
use URI;
use URI::Escape;
use Web::Scraper;
use YAML;
my $uri = shift or die "Usage: rel-tag.pl URL\n";
my $scraper = scraper {
process 'a[rel~="tag"]', 'tags[]' => sub {
my $uri = URI->new($_->attr('href'));
my $label = (grep length, split '/', $uri->path)[-1];
$label =~ s/\+/%20/g;
uri_unescape($label);
};
};
warn Dump $scraper->scrape(URI->new($uri));

==== Web-Scraper-0.38/eg/twitter-friends.pl ====
#!/usr/bin/perl
use strict;
use warnings;
use lib "lib";
use URI;
use Web::Scraper;
my $nick = shift || "miyagawa";
my $uri = URI->new("http://twitter.com/$nick");
my $twitter = scraper {
process 'a[rel~="contact"]',
'friends[]' => scraper {
process 'a', url => '@href', name => '@title';
process 'img', src => '@src';
};
result 'friends';
};
my $friends = $twitter->scrape($uri);
use YAML;
warn Dump $friends;

==== Web-Scraper-0.38/bin/scraper ====
#!/usr/bin/perl
use strict;
use warnings;
use Config;
use Term::ReadLine;
use Data::Dumper;
use HTML::Entities;
use URI;
use Web::Scraper;
use YAML;
sub WARN() {
return sub {
warn $_->isTextNode
? HTML::Entities::encode($_->as_XML, q("'<>&))
: $_->as_HTML(q('"&<>), "", {});
};
}
my $print = sub {
if ($ENV{PAGER}) {
open my $pager, "|$ENV{PAGER}";
print $pager @_;
} else {
print @_;
}
};
my(@stack, $source);
my $stuff = process_args($ARGV[0])
or die "Usage: scraper [URI-or-filename]\n";
my $term = Term::ReadLine->new("Web::Scraper");
my $scraper = scraper { run_loop($_[0], $term) };
$scraper->user_agent->env_proxy;
my $result = $scraper->scrape($stuff);
sub process_args {
my $uri = shift;
if (!-t STDIN and my $content = join "", <STDIN>) {
$source = [ 'stdin' ];
return \$content;
} elsif ($uri && $uri =~ m!^https?://!) {
$source = [ "URI", $uri ];
return URI->new($uri);
} elsif ($uri && -e $uri) {
$source = [ 'file', $uri ];
open my $fh, "<", $uri or die "$uri: $!";
return join "", <$fh>;
}
return;
}
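# Interactive loop for the scraper shell. Commands at the "scraper> " prompt:
# 'd' dumps the current result with Data::Dumper, 'y' dumps it as YAML, 's'
# prints the page source (through $PAGER if set), 'c' prints Perl code
# generated from the last entered expression, 'c all' from all of them, and
# 'q' quits; anything else is eval()ed as Perl and, on success, pushed onto
# the code-generation stack.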
sub run_loop {
my($tree, $term) = @_;
while (defined(my $in = $term->readline("scraper> "))) {
if ($in eq 'd') {
$Data::Dumper::Indent = 1;
warn Dumper result;
} elsif ($in eq 'y') {
warn Dump result;
} elsif ($in eq 's') {
$print->($tree->as_HTML(q('"&<>), " ", {}));
} elsif ($in eq 'q') {
return;
} elsif ($in eq 'c') {
print generate_code($source, $stack[-1]);
} elsif ($in =~ /^c\s+all\s*$/) {
print generate_code($source, @stack);
} else {
my $res = eval $in;
warn $@ if $@;
push @stack, $in unless $@;
}
}
}
sub generate_code {
my($source, @stack) = @_;
my $code_stack = join "\n", map { " $_" . (/;$/ ? "" : ";") } @stack;
my($var, $stuff) =
$source->[0] eq 'stdin' ? ('$input', '\join "", <STDIN>') :
$source->[0] eq 'URI' ? ('$uri', qq(URI->new("$source->[1]"))) :
$source->[0] eq 'file' ? ('$file', qq(\\do { my \$file = "$source->[1]"; open my \$fh, \$file or die "\$file: \$!"; join '', <\$fh> })) :
'...';
return <<CODE;
#!/usr/bin/perl
use strict;
use URI;
use Web::Scraper;
my $var = $stuff;
my \$scraper = scraper {
$code_stack
};
my \$result = \$scraper->scrape($var);
CODE
}