Web-Scraper-0.38/000755 000765 000024 00000000000 12421053223 014512 5ustar00miyagawastaff000000 000000 Web-Scraper-0.38/bin/000755 000765 000024 00000000000 12421053223 015262 5ustar00miyagawastaff000000 000000 Web-Scraper-0.38/Build.PL000644 000765 000024 00000000261 12421053223 016005 0ustar00miyagawastaff000000 000000 # This Build.PL for Web-Scraper was generated by Dist::Zilla::Plugin::ModuleBuildTiny 0.007. use strict; use warnings; use 5.008001; use Module::Build::Tiny 0.039; Build_PL(); Web-Scraper-0.38/Changes000644 000765 000024 00000016171 12421053223 016013 0ustar00miyagawastaff000000 000000 Revision history for Perl extension Web::Scraper 0.38 2014-10-19 17:25:53 PDT - Improved documentation #8 (vti) - Add regexp filter #10 (creaktive) - Fix documentation error #16 0.37 Fri Oct 19 15:09:17 PDT 2012 - Repack with the latest Module::Install 0.36 Sat Nov 19 12:12:54 PST 2011 - Support HTML5 tags by not ignoring unknown tags (leedo) 0.35 Mon Sep 26 18:40:06 PDT 2011 - Added support for comments() XPath #3 (Perlover) 0.34 Thu Feb 24 09:35:12 PST 2011 - Skip xml_simple.t if LibXML is not there (omega) 0.33 Thu Feb 17 09:12:55 PST 2011 - Remove failing invalid XPath tests 0.32 Wed Feb 3 22:13:01 PST 2010 - Removes poking around charset and LWP's decoded_content (Thanks to flatwhatson) - More docs (jshirley) 0.31 Sun Jul 19 00:43:54 PDT 2009 - Use new LWP's content_charset method instead of HTTP::Response::Encoding (Thanks to hanekomu) 0.30 Wed Jul 8 15:47:21 PDT 2009 - No warnings when use()d multiple times in the same package 0.29 Wed Jul 8 13:40:14 PDT 2009 - Adds Web::Scraper::LibXML which uses HTML::TreeBuilder::LibXML (without the replace_original hack) 0.28 Sat Mar 28 14:31:45 PDT 2009 - Call ->eof when parsing with HTML::TreeBuilder (Thanks to Tokuhiro Matsuno) 0.27 Tue Mar 24 12:09:04 PDT 2009 - Added tests to use HTML::TreeBuilder::LibXML (Thanks to Tokuhiro Matsuno) 0.26 Thu Jan 15 11:37:56 PST 2009 - Fixed an error message when GET request fails 0.25 Sun Jan 11 13:36:44 PST 2009 - scrape() now accepts HTTP::Response as well for Remedie/Plagger - repository moved to github http://github.com/miyagawa/web-scraper/tree/master 0.24 Sun Nov 25 15:58:38 PST 2007 - Support duck typing in filter args to take object that has 'filter' method This could give Web::Scraper::Filter::Pipe a better interface (Thanks to hanekomu and tokuhirom) 0.23 Sat Nov 24 17:21:14 PST 2007 - Upped Web::Scraper dependency - Skip & test until HTML::TreeBuilder::XPath fixes it - removed eg/search-cpan.pl 0.22 Wed Oct 17 17:51:54 PDT 2007 - 's' on scraper shell now prints to pager (e.g. less) if PAGER is set 0.21_01 Thu Oct 4 01:05:00 PDT 2007 - Added an experimental filter support (Thanks to hirose31, tokuhirom and Yappo for brainstorming) 0.21 Wed Oct 3 10:37:13 PDT 2007 - Bumped up HTML::TreeBuilder dependency to fix 12_html.t issues [rt.cpan.org #29733] 0.20 Wed Oct 3 00:28:13 PDT 2007 - Fixed a bug where URI is not absolutized with a hash reference value - Added eg/jp-playstation-store.pl 0.19 Thu Sep 20 22:42:30 PDT 2007 - Try to get HTML encoding from META tags as well, when there's no charset value in HTTP response header. 
0.18 Thu Sep 20 19:49:11 PDT 2007 - Fixed a bug where URI is not absolutized when scraper is nested - Use as_XML not as_HTML in 'RAW' 0.17 Wed Sep 19 19:12:25 PDT 2007 - Reverted Term::Encoding support since it causes segfaults (double utf-8 encoding) in some environment 0.16 Tue Sep 18 04:48:47 PDT 2007 - Support 'RAW' and 'TEXT' for TextNode object - Call Term::Encoding from scraper shell if installed 0.15 Sat Sep 15 21:28:10 PDT 2007 - Call env_proxy in scraper CLI - Added $Web::Scraper::UserAgent and $scraper->user_agent accessor to deal with UserAgent object - Don't escape non-ASCII characters into &#xXXXX; in scraper shell 's' and WARN 0.14 Fri Sep 14 16:06:20 PDT 2007 - Fix bin/scraper to work with older Term::ReadLine. (Thanks to Tina Müller [RT:29079]) - Now link elements like img@src and a@href are automatically converted to absolute URI using the current URI as a base. Only effective when you do $s->scrape(URI) or $s->scrape(\$html, URI) - Added 'HTML' and its alias 'RAW' to get the HTML chunk inside the tag process "script", "code" => 'RAW'; Handy if you want the raw HTML code inside --- selector script --- expected function foo() { return bar; } === a --- html foo bar --- selector a --- expected foo bar === div --- html

foo bar

bar

--- selector #foo --- expected

foo bar

bar

=== non-ascii --- html

テスト

--- selector #foo --- expected テスト === textarea --- html --- selector textarea --- expected \n foo bar \n baz Web-Scraper-0.38/t/13_textnode.t000644 000765 000024 00000001520 12421053223 017275 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { selector => 'chomp', want => 'chomp', expected => 'chomp', }; run { my $block = shift; my $s = scraper { process $block->selector, want => $block->want; result 'want'; }; my $want = $s->scrape($block->html); is $want, $block->expected, $block->name; }; __DATA__ === TEXT --- html

foo bar

--- selector //p/node()[2] --- want TEXT --- expected bar === TEXT --- html

foo bar

--- selector //p/node()[2] --- want TEXT --- expected bar === TEXT --- html

foo bar & baz

--- selector //p/node()[2] --- want TEXT --- expected bar & baz === RAW HTML --- SKIP --- html

foo bar & baz

--- selector //p/node()[2] --- want RAW --- expected bar & baz Web-Scraper-0.38/t/14_absolute_nested.t000644 000765 000024 00000001157 12421053223 020632 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { selector => 'chomp', expected => 'chomp', }; run { my $block = shift; my $s = scraper { process $block->selector, want => scraper { process "img", image => '@src'; result "image"; }; result 'want'; }; my $want = $s->scrape($block->html, $block->url); is $want, $block->expected, $block->name; }; __DATA__ === --- url: http://example.com/ --- html <a id="foo"><img src="/foo.jpg" /></a> --- selector a#foo --- expected http://example.com/foo.jpg Web-Scraper-0.38/t/15_absolute_hash.t000644 000765 000024 00000001041 12421053223 020264 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { expected => 'chomp', }; run { my $block = shift; my $s = scraper { process $block->selector, 'want[]' => { link => '@href' }; result 'want'; }; my $want = $s->scrape($block->html, $block->url); is $want->[0]->{link}, $block->expected, $block->name; }; __DATA__ === --- url: http://example.com/ --- html <a id="foo" href="/foo.html">foo</a> --- selector a#foo --- expected http://example.com/foo.html Web-Scraper-0.38/t/16_filter.t000644 000765 000024 00000002535 12421053223 016742 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { expected => 'chomp', want => 'eval', }; run { my $block = shift; my $s = scraper { process 'a', want => $block->want; result 'want'; }; my $want = $s->scrape('<a>foo</a>'); my $expected = $block->expected eq 'undef' ? undef : $block->expected; is $want, $expected, $block->name; }; BEGIN { package Web::Scraper::Filter::foo; use base qw( Web::Scraper::Filter ); sub filter { tr/a-z/b-za/ } package Web::Scraper::Filter::bar; use base qw( Web::Scraper::Filter ); sub filter { $_[1] . 
'bar' } } package main; __DATA__ === tr --- want ['TEXT', 'foo'] --- expected gpp === shift + return --- want ['TEXT', 'bar'] --- expected foobar === inline callback --- want ['TEXT', sub { return "baz" } ] --- expected baz === inline callback + s/// --- want ['TEXT', sub { s/foo/bax/ } ] --- expected bax === stack --- want ['TEXT', 'bar', 'foo' ] --- expected gppcbs === stack --- want ['TEXT', 'bar', sub { s/foo/bar/ } ] --- expected barbar === no match --- want ['TEXT', sub { s/xxx/yyy/g }] --- expected foo === undef --- want ['TEXT', sub { return }] --- expected undef === number --- want ['TEXT', sub { return 3 }] --- expected 3 === object --- want ['TEXT', Web::Scraper::Filter::foo->new] --- expected gpp Web-Scraper-0.38/t/17_filter_loop.t000644 000765 000024 00000000702 12421053223 017766 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { expected => 'yaml', want => 'eval', }; run { my $block = shift; my $s = scraper { process 'a', 'want[]' => $block->want; result 'want'; }; my $want = $s->scrape('<a>foo</a><a>bar</a>'); is_deeply $want, $block->expected, $block->name; }; __DATA__ === tr --- want ['TEXT', sub { tr/a-z/b-za/ }] --- expected - gpp - cbs Web-Scraper-0.38/t/18_http_response.t000644 000765 000024 00000000636 12421053223 020354 0ustar00miyagawastaff000000 000000 use strict; use warnings; use URI; use LWP::UserAgent; use Web::Scraper; use Test::More; plan skip_all => "LIVE_TEST not enabled" unless $ENV{LIVE_TEST} || $ENV{TEST_ALL}; plan tests => 1; my $ua = LWP::UserAgent->new; { my $res = $ua->get("http://www.yahoo.co.jp/"); my $result = scraper { process 'title', title => 'TEXT'; }->scrape($res); is $result->{title}, 'Yahoo! JAPAN'; } Web-Scraper-0.38/t/19_decode_content.t000644 000765 000024 00000000720 12421053223 020427 0ustar00miyagawastaff000000 000000 use strict; use warnings; use URI; use LWP::UserAgent; use Web::Scraper; use Test::More; plan skip_all => "LIVE_TEST not enabled" unless $ENV{LIVE_TEST} || $ENV{TEST_ALL}; plan tests => 1; my $ua = LWP::UserAgent->new; $ua->default_header('Accept-Encoding' => 'gzip'); { my $res = $ua->get("http://www.yahoo.co.jp/"); my $result = scraper { process 'title', title => 'TEXT'; }->scrape($res); is $result->{title}, 'Yahoo! JAPAN'; } Web-Scraper-0.38/t/20_comment_nodes.t000644 000765 000024 00000002272 12421053223 020300 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use utf8; use Web::Scraper; plan skip_all => "Please upgrade HTML::TreeBuilder::XPath and HTML::TreeBuilder::LibXML for comment node support" unless eval "use HTML::TreeBuilder::XPath 0.14; 1" && eval "use HTML::TreeBuilder::LibXML 0.13; 1"; plan tests => 1 * blocks; filters { selector => 'chomp', expected => [ 'chomp', 'newline' ], html => 'newline', }; sub newline { s/\\n\n/\n/g; } # Turn off "Wide character" warnings when a test fails my $builder = Test::More->builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; run { my $block = shift; my $s = scraper { process $block->selector, want => 'TEXT'; result 'want'; }; my $want = $s->scrape($block->html); is $want, $block->expected, $block->name; }; __DATA__ === comment in p --- html

<p>This is a paragraph <!--This is the comment--> bla bla bla</p>

--- selector //p/comment() --- expected This is the comment === non-ascii comment --- html

<p>Bla bla bla <!--テスト--></p>

--- selector //p/comment() --- expected テスト Web-Scraper-0.38/t/21_html5.t000644 000765 000024 00000001157 12421053223 016501 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use utf8; use Web::Scraper; plan tests => 1 * blocks; filters { selector => 'chomp', expected => [ 'chomp', 'newline' ], html => 'newline', }; sub newline { s/\\n\n/\n/g; } run { my $block = shift; my $s = scraper { process $block->selector, want => 'HTML'; result 'want'; }; my $want = $s->scrape($block->html); is $want, $block->expected, $block->name; }; __DATA__ === header --- html
<header>hello</header>
--- selector header --- expected hello === section --- html
<section><header>hello</header></section>
--- selector header --- expected hello Web-Scraper-0.38/t/22_filter_regex.t000644 000765 000024 00000001475 12421053223 020133 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { expected => 'yaml', want => 'eval', }; run { my $block = shift; return pass("no named grouping in Perl $]") if $] < 5.010 and $block->name eq 'named'; my $s = scraper { process 'a', 'want[]' => $block->want; result 'want'; }; my $want = $s->scrape('<a>foo=123</a><a>bar=456</a>'); is_deeply $want, $block->expected, $block->name; }; __DATA__ === unnamed --- want [ TEXT => qr/(\d+)/ ] --- expected - 123 - 456 === named --- want [ TEXT => qr/^(?<name>\w+)=(?<value>\d+)$/ ] --- expected - name: foo value: 123 - name: bar value: 456 === boolean --- want [ TEXT => qr/BAR/i ] --- expected - - 1 === stack --- want [ TEXT => qr/(\w+)/ => sub { ucfirst } ] --- expected - Foo - Bar Web-Scraper-0.38/t/redefine.t000644 000765 000024 00000000177 12421053223 016730 0ustar00miyagawastaff000000 000000 BEGIN { use Test::More 'no_plan'; $SIG{__WARN__} = sub { fail shift }; } use Web::Scraper; use Web::Scraper; ok 1; Web-Scraper-0.38/t/release-pod-syntax.t000644 000765 000024 00000000456 12421053223 020673 0ustar00miyagawastaff000000 000000 #!perl BEGIN { unless ($ENV{RELEASE_TESTING}) { require Test::More; Test::More::plan(skip_all => 'these tests are for release candidate testing'); } } # This file was automatically generated by Dist::Zilla::Plugin::PodSyntaxTests. use Test::More; use Test::Pod 1.41; all_pod_files_ok(); Web-Scraper-0.38/t/xml-simple.t000644 000765 000024 00000001547 12421053223 017240 0ustar00miyagawastaff000000 000000 use strict; use Test::Requires qw(HTML::TreeBuilder::LibXML); use Test::Base; use Web::Scraper::LibXML; filters { expected => [ 'lines', 'chomp' ] }; plan tests => 1 * blocks; run { my $block = shift; my $s = scraper { process $block->selector, "value[]", $block->get; }; my $r = $s->scrape($block->input); is_deeply $r->{value}, [ $block->expected ]; }; __END__ === --- input <foo>bar</foo> --- selector: foo --- get: TEXT --- expected bar === --- input <foo><bar>baz</bar><bar>bax</bar></foo> --- selector: foo>bar --- get: TEXT --- expected baz bax === --- input --- selector: bar --- get: @attr --- expected test bar Hello & World Web-Scraper-0.38/lib/Web/000755 000765 000024 00000000000 12421053223 015775 5ustar00miyagawastaff000000 000000 Web-Scraper-0.38/lib/Web/Scraper/000755 000765 000024 00000000000 12421053223 017374 5ustar00miyagawastaff000000 000000 Web-Scraper-0.38/lib/Web/Scraper.pm000644 000765 000024 00000033610 12421053223 017735 0ustar00miyagawastaff000000 000000 package Web::Scraper; use strict; use warnings; use 5.008001; use Carp; use Scalar::Util qw(blessed); use List::Util qw(first); use HTML::Entities; use HTML::Tagset; use HTML::TreeBuilder::XPath; use HTML::Selector::XPath; use UNIVERSAL::require; our $VERSION = '0.38'; sub import { my $class = shift; my $pkg = caller; no strict 'refs'; no warnings 'redefine'; *{"$pkg\::scraper"} = _build_scraper($class); *{"$pkg\::process"} = sub { goto &process }; *{"$pkg\::process_first"} = sub { goto &process_first }; *{"$pkg\::result"} = sub { goto &result }; } our $UserAgent; sub __ua { require LWP::UserAgent; $UserAgent ||= LWP::UserAgent->new(agent => __PACKAGE__ . "/" . 
$VERSION); $UserAgent; } sub user_agent { my $self = shift; $self->{user_agent} = shift if @_; $self->{user_agent} || __ua; } sub define { my($class, $coderef) = @_; bless { code => $coderef }, $class; } sub _build_scraper { my $class = shift; return sub(&) { my($coderef) = @_; bless { code => $coderef }, $class; }; } sub scrape { my $self = shift; my($stuff, $current) = @_; my($html, $tree); if (blessed($stuff) && $stuff->isa('URI')) { my $ua = $self->user_agent; my $res = $ua->get($stuff); return $self->scrape($res, $stuff->as_string); } elsif (blessed($stuff) && $stuff->isa('HTTP::Response')) { if ($stuff->is_success) { $html = $stuff->decoded_content; } else { croak "GET " . $stuff->request->uri . " failed: ", $stuff->status_line; } $current ||= $stuff->request->uri; } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) { $tree = $stuff->clone; } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') { $html = $$stuff; } else { $html = $stuff; } $tree ||= $self->build_tree($html); my $stash = {}; no warnings 'redefine'; local *process = create_process(0, $tree, $stash, $current); local *process_first = create_process(1, $tree, $stash, $current); my $retval; local *result = sub { $retval++; my @keys = @_; if (@keys == 1) { return $stash->{$keys[0]}; } elsif (@keys) { my %res; @res{@keys} = @{$stash}{@keys}; return \%res; } else { return $stash; } }; my $ret = $self->{code}->($tree); $tree->delete; # check user specified return value return $ret if $retval; return $stash; } sub build_tree { my($self, $html) = @_; my $t = HTML::TreeBuilder::XPath->new; $t->store_comments(1) if ($t->can('store_comments')); $t->ignore_unknown(0); $t->parse($html); $t->eof; $t; } sub create_process { my($first, $tree, $stash, $uri) = @_; sub { my($exp, @attr) = @_; my $xpath = $exp =~ m!^(?:/|id\()! ? $exp : HTML::Selector::XPath::selector_to_xpath($exp); my @nodes = eval { local $SIG{__WARN__} = sub { }; $tree->findnodes($xpath); }; if ($@) { die "'$xpath' doesn't look like a valid XPath expression: $@"; } @nodes or return; @nodes = ($nodes[0]) if $first; while (my($key, $val) = splice(@attr, 0, 2)) { if (!defined $val) { if (ref($key) && ref($key) eq 'CODE') { for my $node (@nodes) { local $_ = $node; $key->($node); } } else { die "Don't know what to do with $key => undef"; } } elsif ($key =~ s!\[\]$!!) { $stash->{$key} = [ map __get_value($_, $val, $uri), @nodes ]; } else { $stash->{$key} = __get_value($nodes[0], $val, $uri); } } return; }; } sub __get_value { my($node, $val, $uri) = @_; if (ref($val) && ref($val) eq 'CODE') { local $_ = $node; return $val->($node); } elsif (blessed($val) && $val->isa('Web::Scraper')) { return $val->scrape($node, $uri); } elsif ($val =~ s!^@!!) { my $value = $node->attr($val); if ($uri && is_link_element($node, $val)) { require URI; $value = URI->new_abs($value, $uri); } return $value; } elsif (lc($val) eq 'content' || lc($val) eq 'text') { # getValue method is used for getting a content of comment nodes # from HTML::TreeBuilder::XPath (version >= 0.14) # or HTML::TreeBuilder::LibXML (version >= 0.13) # getValue method works like as_text in both modules # for other node types return $node->isTextNode ? $node->string_value : ($node->can('getValue') ? 
$node->getValue : $node->as_text); } elsif (lc($val) eq 'raw' || lc($val) eq 'html') { if ($node->isTextNode) { if ($HTML::TreeBuilder::XPath::VERSION < 0.09) { return HTML::Entities::encode($node->as_XML, q("'<>&)); } else { return $node->as_XML; } } my $html = $node->as_XML; $html =~ s!^<.*?>!!; $html =~ s!\s*</\w+>\n*$!!; return $html; } elsif (ref($val) eq 'HASH') { my $values; for my $key (keys %$val) { $values->{$key} = __get_value($node, $val->{$key}, $uri); } return $values; } elsif (ref($val) eq 'ARRAY') { my $how = $val->[0]; my $value = __get_value($node, $how, $uri); for my $filter (@$val[1..$#$val]) { $value = run_filter($value, $filter); } return $value; } else { Carp::croak "Unknown value type $val"; } } sub run_filter { my($value, $filter) = @_; ## sub { s/foo/bar/g } is a valid filter ## sub { DateTime::Format::Foo->parse_string(shift) } is valid too my $callback; my $module; if (ref($filter) eq 'CODE') { $callback = $filter; $module = "$filter"; } elsif (ref($filter) eq 'Regexp') { $callback = sub { my @unnamed = shift =~ /$filter/x; if (%+) { return { %+ }; } elsif (@unnamed) { return shift @unnamed; } else { return; } }; $module = "$filter"; } elsif (!ref($filter)) { $module = $filter =~ s/^\+// ? $filter : "Web::Scraper::Filter::$filter"; unless ($module->isa('Web::Scraper::Filter')) { $module->require or Carp::croak("Loading $module: $@"); } $callback = sub { $module->new->filter(shift) }; } elsif (blessed($filter) && $filter->can('filter')) { $callback = sub { $filter->filter(shift) }; } else { Carp::croak("Don't know filter type $filter"); } local $_ = $value; my $retval = eval { $callback->($value) }; if ($@) { Carp::croak("Filter $module had an error: $@"); } no warnings 'uninitialized'; # sub { s/foo/bar/ } returns number or PL_sv_no which is stringified to '' if (($retval =~ /^\d+$/ and $_ ne $value) or (defined($retval) and $retval eq '')) { $value = $_; } else { $value = $retval; } return $value; } sub is_link_element { my($node, $attr) = @_; my $link_elements = $HTML::Tagset::linkElements{$node->tag} || []; for my $elem (@$link_elements) { return 1 if $attr eq $elem; } return; } sub __stub { my $func = shift; return sub { croak "Can't call $func() outside scraper block"; }; } *process = __stub 'process'; *process_first = __stub 'process_first'; *result = __stub 'result'; 1; __END__ =for stopwords API SCRAPI Scrapi =head1 NAME Web::Scraper - Web Scraping Toolkit using HTML and CSS Selectors or XPath expressions =head1 SYNOPSIS use URI; use Web::Scraper; use Encode; # First, create your scraper block my $authors = scraper { # Parse all TDs inside 'table[width="100%"]', store them into # an array 'authors'. We embed other scrapers for each TD. process 'table[width="100%"] td', "authors[]" => scraper { # And, in each TD, # get the URI of "a" element process "a", uri => '@href'; # get text inside "small" element process "small", fullname => 'TEXT'; }; }; my $res = $authors->scrape( URI->new("http://search.cpan.org/author/?A") ); # iterate the array 'authors' for my $author (@{$res->{authors}}) { # output is like: # Andy Adler http://search.cpan.org/~aadler/ # Aaron K Dancygier http://search.cpan.org/~aakd/ # Aamer Akhter http://search.cpan.org/~aakhter/ print Encode::encode("utf8", "$author->{fullname}\t$author->{uri}\n"); } The structure would resemble this (visually) { authors => [ { fullname => $fullname, uri => $uri }, { fullname => $fullname, uri => $uri }, ] } =head1 DESCRIPTION Web::Scraper is a web scraper toolkit, inspired by Ruby's equivalent Scrapi. 
It provides a DSL-ish interface for traversing HTML documents and returning a neatly arranged Perl data structure. The I<scraper> and I<process> blocks provide a method to define what segments of a document to extract. It understands HTML and CSS Selectors as well as XPath expressions. =head1 METHODS =head2 scraper $scraper = scraper { ... }; Creates a new Web::Scraper object by wrapping the DSL code that will be fired when the I<scrape> method is called. =head2 scrape $res = $scraper->scrape(URI->new($uri)); $res = $scraper->scrape($html_content); $res = $scraper->scrape(\$html_content); $res = $scraper->scrape($http_response); $res = $scraper->scrape($html_element); Retrieves the HTML from URI, HTTP::Response, HTML::Tree or text strings and creates a DOM object, then fires the callback scraper code to retrieve the data structure. If you pass a URI or HTTP::Response object, Web::Scraper will automatically guess the encoding of the content by looking at Content-Type headers and META tags. Otherwise you need to decode the HTML to Unicode before passing it to the I<scrape> method. You can optionally pass the base URL when you pass the HTML content as a string instead of URI or HTTP::Response. $res = $scraper->scrape($html_content, "http://example.com/foo"); This way Web::Scraper can resolve the relative links found in the document. =head2 process scraper { process "tag.class", key => 'TEXT'; process '//tag[contains(@foo, "bar")]', key2 => '@attr'; process '//comment()', 'comments[]' => 'TEXT'; }; I<process> is the method to find matching elements from HTML with a CSS selector or XPath expression, then extract text or attributes into the result stash. If the first argument begins with "//" or "id(" it is treated as an XPath expression, otherwise as a CSS selector. # <span class="date">2008/12/21</span> # date => "2008/12/21" process ".date", date => 'TEXT'; # <div class="body"><a href="http://example.com/">foo</a></div> # link => URI->new("http://example.com/") process ".body > a", link => '@href'; # <div class="body"><!-- HTML Comment here --><a href="http://example.com/">foo</a></div> # comment => " HTML Comment here " # # NOTE: comment nodes are accessible only when # HTML::TreeBuilder::XPath (version >= 0.14) and/or # HTML::TreeBuilder::LibXML (version >= 0.13) is installed process "//div[contains(@class, 'body')]/comment()", comment => 'TEXT'; # <div class="body"><a href="http://example.com/">foo</a></div> # link => URI->new("http://example.com/"), text => "foo" process ".body > a", link => '@href', text => 'TEXT'; # <ul> #   <li>foo</li> #   <li>bar</li> # </ul> # list => [ "foo", "bar" ] process "li", "list[]" => "TEXT"; # <ul> #   <li id="1">foo</li> #   <li id="2">bar</li> # </ul> # list => [ { id => "1", text => "foo" }, { id => "2", text => "bar" } ]; process "li", "list[]" => { id => '@id', text => "TEXT" }; =head2 process_first C<process_first> is the same as C<process> but stops after the first matching result is found. # <span class="date">2008/12/21</span> # <span class="date">2008/12/22</span> # date => "2008/12/21" process_first ".date", date => 'TEXT'; =head2 result C<result> allows you to return, instead of the default result stash, a single value specified by a key, or a hash reference built from several keys. process 'a', 'want[]' => 'TEXT'; result 'want'; =head1 EXAMPLES There are many examples in the C<eg/> dir packaged in this distribution. It is recommended to look through these. =head1 NESTED SCRAPERS Scrapers can be nested, thus allowing you to scrape already-captured data. # <ul> #   <li><a href="foo1">bar1</a></li> #   <li><a href="foo2">bar2</a></li> # </ul> # friends => [ {href => 'foo1'}, {href => 'foo2'} ]; process 'li', 'friends[]' => scraper { process 'a', href => '@href', }; =head1 FILTERS Filters are applied to the result after processing. They can be declared as anonymous subroutines or as class names. process $exp, $key => [ 'TEXT', sub { s/foo/bar/ } ]; process $exp, $key => [ 'TEXT', 'Something' ]; process $exp, $key => [ 'TEXT', '+MyApp::Filter::Foo' ]; Filters can be stacked: process $exp, $key => [ '@href', 'Foo', '+MyApp::Filter::Bar', \&baz ]; More about filters can be found in the L<Web::Scraper::Filter> documentation. =head1 XML backends By default L<HTML::TreeBuilder::XPath> is used; this can be replaced by a L<HTML::TreeBuilder::LibXML> backend using the L<Web::Scraper::LibXML> module. use Web::Scraper::LibXML; # same as Web::Scraper my $scraper = scraper { ... }; =head1 AUTHOR Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt> =head1 LICENSE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO L<http://blog.labnotes.org/category/scrapi/> L<HTML::TreeBuilder::XPath> =cut Web-Scraper-0.38/lib/Web/Scraper/Filter.pm000644 000765 000024 00000003123 12421053223 021156 0ustar00miyagawastaff000000 000000 package Web::Scraper::Filter; use strict; use warnings; sub new { my $class = shift; bless {}, $class; } 1; __END__ =for stopwords namespace inline callback =head1 NAME Web::Scraper::Filter - Base class for Web::Scraper filters =head1 SYNOPSIS package Web::Scraper::Filter::YAML; use base qw( Web::Scraper::Filter ); use YAML (); sub filter { my($self, $value) = @_; YAML::Load($value); } 1; use Web::Scraper; my $scraper = scraper { process ".yaml-code", data => [ 'TEXT', 'YAML' ]; }; =head1 DESCRIPTION Web::Scraper::Filter is a base class for text filters in Web::Scraper. You can create your own text filter by subclassing this module. There are two ways to create and use your custom filter. If you name your filter Web::Scraper::Filter::Something, you just call: process $exp, $key => [ 'TEXT', 'Something' ]; If you declare your filter under your own namespace, like 'MyApp::Filter::Foo', use the C<+> prefix: process $exp, $key => [ 'TEXT', '+MyApp::Filter::Foo' ]; You can also inline your filter function or regexp without creating a filter class: process $exp, $key => [ 'TEXT', sub { s/foo/bar/ } ]; process $exp, $key => [ 'TEXT', qr/Price: (\d+)/ ]; process $exp, $key => [ 'TEXT', qr/(?<name>\w+): (?<value>\w+)/ ]; Note that an inline substitution like C<sub { s/foo/bar/ }> munges C<$_> and returns the number of replacements. The filter code special-cases this: when the callback returns a number and C<$_> has been updated, the updated C<$_> is used as the filtered value. 
You can, of course, stack filters like: process $exp, $key => [ '@href', 'Foo', '+MyApp::Filter::Bar', \&baz ]; =head1 AUTHOR Tatsuhiko Miyagawa =cut Web-Scraper-0.38/lib/Web/Scraper/LibXML.pm000644 000765 000024 00000002263 12421053223 021024 0ustar00miyagawastaff000000 000000 package Web::Scraper::LibXML; use strict; use base qw( Web::Scraper ); use HTML::TreeBuilder::LibXML; sub build_tree { my($self, $html) = @_; my $t = HTML::TreeBuilder::LibXML->new; $t->parse($html); $t->eof; $t; } 1; __END__ =head1 NAME Web::Scraper::LibXML - Drop-in replacement for Web::Scraper to use LibXML =head1 SYNOPSIS use Web::Scraper::LibXML; # same as Web::Scraper my $scraper = scraper { ... }; =head1 DESCRIPTION Web::Scraper::LibXML is a drop-in replacement for Web::Scraper to use the fast libxml-based HTML tree builder, HTML::TreeBuilder::LibXML. This is almost identical to using HTML::TreeBuilder::LibXML's I<replace_original> installer, like: use HTML::TreeBuilder::LibXML; HTML::TreeBuilder::LibXML->replace_original(); use Web::Scraper; my $scraper = scraper { ... }; # this code uses the LibXML parser which overrides HTML::TreeBuilder::XPath's new() constructor so that all of your code using HTML::TreeBuilder::XPath is switched to the libxml-based parser. This module, instead, gives you more control over which TreeBuilder to use, depending on the site etc. =head1 SEE ALSO L<Web::Scraper> L<HTML::TreeBuilder::LibXML> =cut Web-Scraper-0.38/eg/dave-trailer-HD.pl000755 000765 000024 00000001204 12421053223 020312 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use lib "lib"; use Web::Scraper; use URI; use YAML; # extract HD trailers from Dave's trailer page my $uri = URI->new("http://www.drfoster.f2s.com/"); my $s = scraper { process "td>ul>li", "trailers[]" => scraper { process_first "li>b", title => "TEXT"; process_first "ul>li>a[href]", url => '@href'; process "ul>li>ul>li>a", "movies[]" => sub { my $elem = shift; return { text => $elem->as_text, href => $elem->attr('href'), }; }; }; result "trailers"; }; warn Dump $s->scrape($uri); Web-Scraper-0.38/eg/ebay-auction.pl000755 000765 000024 00000001145 12421053223 020026 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use URI; use lib "lib"; use Web::Scraper; my $ebay_auction = scraper { process "h3.ens>a", description => 'TEXT', url => '@href'; process "td.ebcPr>span", price => "TEXT"; process "div.ebPicture >a>img", image => '@src'; result 'description', 'url', 'price', 'image'; }; my $ebay = scraper { process "table.ebItemlist tr.single", "auctions[]" => $ebay_auction; result 'auctions'; }; my $auctions = $ebay->scrape( URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS") ); use YAML; warn Dump $auctions; Web-Scraper-0.38/eg/extract-links.pl000755 000765 000024 00000000447 12421053223 020242 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use URI; use lib "lib"; use Web::Scraper; my $uri = shift @ARGV or die "URI needed"; my $scraper = scraper { process "a[href]", "urls[]" => '@href'; result 'urls'; }; my $links = $scraper->scrape(URI->new($uri)); use YAML; warn Dump $links; Web-Scraper-0.38/eg/hatena-keyword.pl000755 000765 000024 00000001202 12421053223 020362 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use lib "lib"; use URI; use Web::Scraper; # same as http://d.hatena.ne.jp/secondlife/20060922/1158923779 my $keyword = scraper { process 'span.title > a:first-child', title => 'TEXT', url => '@href'; process 'span.furigana', furigana => 'TEXT'; process 'ul.list-circle > li:first-child > 
a', category => 'TEXT'; }; my $res = $keyword->scrape(URI->new("http://d.hatena.ne.jp/keyword/%BA%B0%CC%EE%A4%A2%A4%B5%C8%FE")); use YAML; warn Dump $res; __END__ --- category: アイドル furigana: こんのあさみ title: 紺野あさ美 url: /keyword/%ba%b0%cc%ee%a4%a2%a4%b5%c8%fe?kid=800 Web-Scraper-0.38/eg/jp-playstation-store.pl000755 000765 000024 00000000453 12421053223 021557 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use Web::Scraper; use URI; use YAML; my $stuff = URI->new("http://www.jp.playstation.com/store/"); my $scraper = scraper { process "#Sinfo p a", 'news[]' => { link => '@href', title => 'TEXT' }; }; my $result = $scraper->scrape($stuff); print YAML::Dump $result; Web-Scraper-0.38/eg/rel-tag.pl000755 000765 000024 00000000770 12421053223 017004 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl # Extract tags from web pages that have rel-tag microformat use strict; use warnings; use URI; use URI::Escape; use Web::Scraper; use YAML; my $uri = shift or die "Usage: rel-tag.pl URL\n"; my $scraper = scraper { process 'a[rel~="tag"]', 'tags[]' => sub { my $uri = URI->new($_->attr('href')); my $label = (grep length, split '/', $uri->path)[-1]; $label =~ s/\+/%20/g; uri_unescape($label); }; }; warn Dump $scraper->scrape(URI->new($uri)); Web-Scraper-0.38/eg/twitter-friends.pl000755 000765 000024 00000000723 12421053223 020601 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use lib "lib"; use URI; use Web::Scraper; my $nick = shift || "miyagawa"; my $uri = URI->new("http://twitter.com/$nick"); my $twitter = scraper { process 'a[rel~="contact"]', 'friends[]' => scraper { process 'a', url => '@href', name => '@title'; process 'img', src => '@src'; }; result 'friends'; }; my $friends = $twitter->scrape($uri); use YAML; warn Dump $friends; Web-Scraper-0.38/bin/scraper000755 000765 000024 00000005057 12421053223 016656 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use Config; use Term::ReadLine; use Data::Dumper; use HTML::Entities; use URI; use Web::Scraper; use YAML; sub WARN() { return sub { warn $_->isTextNode ? HTML::Entities::encode($_->as_XML, q("'<>&)) : $_->as_HTML(q('"&<>), "", {}); }; } my $print = sub { if ($ENV{PAGER}) { open my $pager, "|$ENV{PAGER}"; print $pager @_; } else { print @_; } }; my(@stack, $source); my $stuff = process_args($ARGV[0]) or die "Usage: scraper [URI-or-filename]\n"; my $term = Term::ReadLine->new("Web::Scraper"); my $scraper = scraper { run_loop($_[0], $term) }; $scraper->user_agent->env_proxy; my $result = $scraper->scrape($stuff); sub process_args { my $uri = shift; if (!-t STDIN and my $content = join "", <STDIN>) { $source = [ 'stdin' ]; return \$content; } elsif ($uri && $uri =~ m!^https?://!) { $source = [ "URI", $uri ]; return URI->new($uri); } elsif ($uri && -e $uri) { $source = [ 'file', $uri ]; open my $fh, "<", $uri or die "$uri: $!"; return join "", <$fh>; } return; } sub run_loop { my($tree, $term) = @_; while (defined(my $in = $term->readline("scraper> "))) { if ($in eq 'd') { $Data::Dumper::Indent = 1; warn Dumper result; } elsif ($in eq 'y') { warn Dump result; } elsif ($in eq 's') { $print->($tree->as_HTML(q('"&<>), " ", {})); } elsif ($in eq 'q') { return; } elsif ($in eq 'c') { print generate_code($source, $stack[-1]); } elsif ($in =~ /^c\s+all\s*$/) { print generate_code($source, @stack); } else { my $res = eval $in; warn $@ if $@; push @stack, $in unless $@; } } } sub generate_code { my($source, @stack) = @_; my $code_stack = join "\n", map { "    $_" . (/;$/ ? "" : ";") } @stack; my($var, $stuff) = $source->[0] eq 'stdin' ? ('$input', '\join "", <STDIN>') : $source->[0] eq 'URI' ? ('$uri', qq(URI->new("$source->[1]"))) : $source->[0] eq 'file' ? ('$file', qq(\\do { my \$file = "$source->[1]"; open my \$fh, \$file or die "\$file: \$!"; join '', <\$fh> })) : '...'; return <<CODE;
#!/usr/bin/perl
use strict;
use warnings;
use URI;
use Web::Scraper;

my $var = $stuff;
my \$scraper = scraper {
$code_stack
};

my \$result = \$scraper->scrape($var);
CODE
}