Web-Scraper-0.37/000755 000765 000024 00000000000 12040350116 014507 5ustar00miyagawastaff000000 000000 Web-Scraper-0.37/.gitignore000644 000765 000024 00000000063 11162227072 016506 0ustar00miyagawastaff000000 000000 META.yml Makefile inc/ pm_to_blib *~ t/libxml-*.t Web-Scraper-0.37/bin/000755 000765 000024 00000000000 12040350116 015257 5ustar00miyagawastaff000000 000000 Web-Scraper-0.37/Changes000644 000765 000024 00000015736 12040350075 016022 0ustar00miyagawastaff000000 000000 Revision history for Perl extension Web::Scraper 0.37 Fri Oct 19 15:09:17 PDT 2012 - Repack with the latest Module::Install 0.36 Sat Nov 19 12:12:54 PST 2011 - Support HTML5 tags by not ignoring unknown tags (leedo) 0.35 Mon Sep 26 18:40:06 PDT 2011 - Added support for comments() XPath #3 (Perlover) 0.34 Thu Feb 24 09:35:12 PST 2011 - Skip xml_simple.t if LibXML is not there (omega) 0.33 Thu Feb 17 09:12:55 PST 2011 - Remove failing invalid XPath tests 0.32 Wed Feb 3 22:13:01 PST 2010 - Removes poking around charset and LWP's decoded_content (Thanks to flatwhatson) - More docs (jshirley) 0.31 Sun Jul 19 00:43:54 PDT 2009 - Use new LWP's content_charset method instead of HTTP::Response::Encoding (Thanks to hanekomu) 0.30 Wed Jul 8 15:47:21 PDT 2009 - No warnings when use()d multiple times in the same package 0.29 Wed Jul 8 13:40:14 PDT 2009 - Adds Web::Scraper::LibXML which uses HTML::TreeBuilder::LibXML (without the replace_original hack) 0.28 Sat Mar 28 14:31:45 PDT 2009 - Call ->eof when parsing with HTML::TreeBuilder (Thanks to Tokuhiro Matsuno) 0.27 Tue Mar 24 12:09:04 PDT 2009 - Added tests to use HTML::TreeBuilder::LibXML (Thanks to Tokuhiro Matsuno) 0.26 Thu Jan 15 11:37:56 PST 2009 - Fixed an error message when GET request fails 0.25 Sun Jan 11 13:36:44 PST 2009 - scrape() now accepts HTTP::Response as well for Remedie/Plagger - repository moved to github http://github.com/miyagawa/web-scraper/tree/master 0.24 Sun Nov 25 15:58:38 PST 2007 - Support duck typing in filter args to 
take object that has 'filter' method This could give Web::Scraper::Filter::Pipe a better interface (Thanks to hanekomu and tokuhirom) 0.23 Sat Nov 24 17:21:14 PST 2007 - Upped Web::Scraper dependency - Skip & test until HTML::TreeBuilder::XPath fixes it - removed eg/search-cpan.pl 0.22 Wed Oct 17 17:51:54 PDT 2007 - 's' on scraper shell now prints to pager (e.g. less) if PAGER is set 0.21_01 Thu Oct 4 01:05:00 PDT 2007 - Added an experimental filter support (Thanks to hirose31, tokuhirom and Yappo for brainstorming) 0.21 Wed Oct 3 10:37:13 PDT 2007 - Bumped up HTML::TreeBuilder dependency to fix 12_html.t issues [rt.cpan.org #29733] 0.20 Wed Oct 3 00:28:13 PDT 2007 - Fixed a bug where URI is not absolutized with a hash reference value - Added eg/jp-playstation-store.pl 0.19 Thu Sep 20 22:42:30 PDT 2007 - Try to get HTML encoding from META tags as well, when there's no charset value in HTTP response header. 0.18 Thu Sep 20 19:49:11 PDT 2007 - Fixed a bug where URI is not absolutized when scraper is nested - Use as_XML not as_HTML in 'RAW' 0.17 Wed Sep 19 19:12:25 PDT 2007 - Reverted Term::Encoding support since it causes segfaults (double utf-8 encoding) in some environment 0.16 Tue Sep 18 04:48:47 PDT 2007 - Support 'RAW' and 'TEXT' for TextNode object - Call Term::Encoding from scraper shell if installed 0.15 Sat Sep 15 21:28:10 PDT 2007 - Call env_proxy in scraper CLI - Added $Web::Scraper::UserAgent and $scraper->user_agent accessor to deal with UserAgent object - Don't escape non-ASCII characters into &#xXXXX; in scraper shell 's' and WARN 0.14 Fri Sep 14 16:06:20 PDT 2007 - Fix bin/scraper to work with older Term::ReadLine. (Thanks to Tina Müller [RT:29079]) - Now link elements like img@src and a@href are automatically converted to absolute URI using the current URI as a base. 
Only effective when you do $s->scrape(URI) or $s->scrape(\$html, URI) - Added 'HTML' and its alias 'RAW' to get the HTML chunk inside the tag process "script", "code" => 'RAW'; Handy if you want the raw HTML code inside --- selector script --- expected function foo() { return bar; } === a --- html foo bar --- selector a --- expected foo bar === div --- html

foo bar

bar

--- selector #foo --- expected

foo bar

bar

=== non-ascii --- html

テスト

--- selector #foo --- expected テスト === textarea --- html --- selector textarea --- expected \n foo bar \n baz Web-Scraper-0.37/t/13_textnode.t000644 000765 000024 00000001520 11162225735 017306 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { selector => 'chomp', want => 'chomp', expected => 'chomp', }; run { my $block = shift; my $s = scraper { process $block->selector, want => $block->want; result 'want'; }; my $want = $s->scrape($block->html); is $want, $block->expected, $block->name; }; __DATA__ === TEXT --- html

foo bar

--- selector //p/node()[2] --- want TEXT --- expected bar === TEXT --- html

foo bar

--- selector //p/node()[2] --- want TEXT --- expected bar === TEXT --- html

foo bar & baz

--- selector //p/node()[2] --- want TEXT --- expected bar & baz === RAW HTML --- SKIP --- html

foo bar & baz

--- selector //p/node()[2] --- want RAW --- expected bar & baz Web-Scraper-0.37/t/14_absolute_nested.t000644 000765 000024 00000001157 11162225735 020643 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { selector => 'chomp', expected => 'chomp', }; run { my $block = shift; my $s = scraper { process $block->selector, want => scraper { process "img", image => '@src'; result "image"; }; result 'want'; }; my $want = $s->scrape($block->html, $block->url); is $want, $block->expected, $block->name; }; __DATA__ === --- url: http://example.com/ --- html --- selector a#foo --- expected http://example.com/foo.jpg Web-Scraper-0.37/t/15_absolute_hash.t000644 000765 000024 00000001041 11162225735 020275 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { expected => 'chomp', }; run { my $block = shift; my $s = scraper { process $block->selector, 'want[]' => { link => '@href' }; result 'want'; }; my $want = $s->scrape($block->html, $block->url); is $want->[0]->{link}, $block->expected, $block->name; }; __DATA__ === --- url: http://example.com/ --- html --- selector a#foo --- expected http://example.com/foo.html Web-Scraper-0.37/t/16_filter.t000644 000765 000024 00000002535 11162225735 016753 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { expected => 'chomp', want => 'eval', }; run { my $block = shift; my $s = scraper { process 'a', want => $block->want; result 'want'; }; my $want = $s->scrape('foo'); my $expected = $block->expected eq 'undef' ? undef : $block->expected; is $want, $expected, $block->name; }; BEGIN { package Web::Scraper::Filter::foo; use base qw( Web::Scraper::Filter ); sub filter { tr/a-z/b-za/ } package Web::Scraper::Filter::bar; use base qw( Web::Scraper::Filter ); sub filter { $_[1] . 
'bar' } } package main; __DATA__ === tr --- want ['TEXT', 'foo'] --- expected gpp === shift + return --- want ['TEXT', 'bar'] --- expected foobar === inline callback --- want ['TEXT', sub { return "baz" } ] --- expected baz === inline callback + s/// --- want ['TEXT', sub { s/foo/bax/ } ] --- expected bax === stack --- want ['TEXT', 'bar', 'foo' ] --- expected gppcbs === stack --- want ['TEXT', 'bar', sub { s/foo/bar/ } ] --- expected barbar === no match --- want ['TEXT', sub { s/xxx/yyy/g }] --- expected foo === undef --- want ['TEXT', sub { return }] --- expected undef === number --- want ['TEXT', sub { return 3 }] --- expected 3 === object --- want ['TEXT', Web::Scraper::Filter::foo->new] --- expected gpp Web-Scraper-0.37/t/17_filter_loop.t000644 000765 000024 00000000702 11162225735 017777 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use Web::Scraper; plan tests => 1 * blocks; filters { expected => 'yaml', want => 'eval', }; run { my $block = shift; my $s = scraper { process 'a', 'want[]' => $block->want; result 'want'; }; my $want = $s->scrape('foobar'); is_deeply $want, $block->expected, $block->name; }; __DATA__ === tr --- want ['TEXT', sub { tr/a-z/b-za/ }] --- expected - gpp - cbs Web-Scraper-0.37/t/18_http_response.t000644 000765 000024 00000001214 11162225735 020356 0ustar00miyagawastaff000000 000000 use strict; use warnings; use URI; use LWP::UserAgent; use Web::Scraper; use Test::More; plan skip_all => "LIVE_TEST not enabled" unless $ENV{LIVE_TEST} || $ENV{TEST_ALL}; plan tests => 2; my $ua = LWP::UserAgent->new; { my $res = $ua->get("http://www.yahoo.co.jp/"); my $result = scraper { process 'title', title => 'TEXT'; }->scrape($res); is $result->{title}, 'Yahoo! 
JAPAN'; } { my $res = $ua->get("http://b.hatena.ne.jp/"); my $result = scraper { process 'img.csschanger', image => '@src'; }->scrape($res); is $result->{image}, 'http://b.hatena.ne.jp/images/logo1.gif', 'Absolute URI'; } Web-Scraper-0.37/t/19_decode_content.t000644 000765 000024 00000001276 11332462511 020441 0ustar00miyagawastaff000000 000000 use strict; use warnings; use URI; use LWP::UserAgent; use Web::Scraper; use Test::More; plan skip_all => "LIVE_TEST not enabled" unless $ENV{LIVE_TEST} || $ENV{TEST_ALL}; plan tests => 2; my $ua = LWP::UserAgent->new; $ua->default_header('Accept-Encoding' => 'gzip'); { my $res = $ua->get("http://www.yahoo.co.jp/"); my $result = scraper { process 'title', title => 'TEXT'; }->scrape($res); is $result->{title}, 'Yahoo! JAPAN'; } { my $res = $ua->get("http://b.hatena.ne.jp/"); my $result = scraper { process 'img.csschanger', image => '@src'; }->scrape($res); is $result->{image}, 'http://b.hatena.ne.jp/images/logo1.gif', 'Absolute URI'; } Web-Scraper-0.37/t/20_comment_nodes.t000644 000765 000024 00000002272 11640224677 020316 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use utf8; use Web::Scraper; plan skip_all => "Please upgrade HTML::TreeBuilder::XPath and HTML::TreeBuilder::LibXML modules for comment nodes supporting" unless eval "use HTML::TreeBuilder::XPath 0.14; 1" && eval "use HTML::TreeBuilder::LibXML 0.13; 1"; plan tests => 1 * blocks; filters { selector => 'chomp', expected => [ 'chomp', 'newline' ], html => 'newline', }; sub newline { s/\\n\n/\n/g; } # For turning off of "Wide character warnings if test failed" my $builder = Test::More->builder; binmode $builder->output, ":utf8"; binmode $builder->failure_output, ":utf8"; binmode $builder->todo_output, ":utf8"; run { my $block = shift; my $s = scraper { process $block->selector, want => 'TEXT'; result 'want'; }; my $want = $s->scrape($block->html); is $want, $block->expected, $block->name; }; __DATA__ === comment in p --- html

This is a paragraph bla bla bla

--- selector //p/comment() --- expected This is the comment === non-ascii comment --- html

Bla bla bla

--- selector //p/comment() --- expected テスト Web-Scraper-0.37/t/21_html5.t000644 000765 000024 00000001157 11662006662 016513 0ustar00miyagawastaff000000 000000 use strict; use Test::Base; use utf8; use Web::Scraper; plan tests => 1 * blocks; filters { selector => 'chomp', expected => [ 'chomp', 'newline' ], html => 'newline', }; sub newline { s/\\n\n/\n/g; } run { my $block = shift; my $s = scraper { process $block->selector, want => 'HTML'; result 'want'; }; my $want = $s->scrape($block->html); is $want, $block->expected, $block->name; }; __DATA__ === header --- html
hello
--- selector header --- expected hello === section --- html
hello
--- selector header --- expected hello Web-Scraper-0.37/t/redefine.t000644 000765 000024 00000000177 11225220721 016727 0ustar00miyagawastaff000000 000000 BEGIN { use Test::More 'no_plan'; $SIG{__WARN__} = sub { fail shift }; } use Web::Scraper; use Web::Scraper; ok 1; Web-Scraper-0.37/t/xml-simple.t000644 000765 000024 00000001547 11531513276 017251 0ustar00miyagawastaff000000 000000 use strict; use Test::Requires qw(HTML::TreeBuilder::LibXML); use Test::Base; use Web::Scraper::LibXML; filters { expected => [ 'lines', 'chomp' ] }; plan tests => 1 * blocks; run { my $block = shift; my $s = scraper { process $block->selector, "value[]", $block->get; }; my $r = $s->scrape($block->input); is_deeply $r->{value}, [ $block->expected ]; }; __END__ === --- input bar --- selector: foo --- get: TEXT --- expected bar === --- input baz bax --- selector: foo>bar --- get: TEXT --- expected baz bax === --- input --- selector: bar --- get: @attr --- expected test bar Hello & World Web-Scraper-0.37/lib/Web/000755 000765 000024 00000000000 12040350116 015772 5ustar00miyagawastaff000000 000000 Web-Scraper-0.37/lib/Web/Scraper/000755 000765 000024 00000000000 12040350116 017371 5ustar00miyagawastaff000000 000000 Web-Scraper-0.37/lib/Web/Scraper.pm000644 000765 000024 00000027731 12040350102 017734 0ustar00miyagawastaff000000 000000 package Web::Scraper; use strict; use warnings; use 5.008001; use Carp; use Scalar::Util qw(blessed); use List::Util qw(first); use HTML::Entities; use HTML::Tagset; use HTML::TreeBuilder::XPath; use HTML::Selector::XPath; use UNIVERSAL::require; our $VERSION = '0.37'; sub import { my $class = shift; my $pkg = caller; no strict 'refs'; no warnings 'redefine'; *{"$pkg\::scraper"} = _build_scraper($class); *{"$pkg\::process"} = sub { goto &process }; *{"$pkg\::process_first"} = sub { goto &process_first }; *{"$pkg\::result"} = sub { goto &result }; } our $UserAgent; sub __ua { require LWP::UserAgent; $UserAgent ||= LWP::UserAgent->new(agent => __PACKAGE__ . 
"/" . $VERSION); $UserAgent; } sub user_agent { my $self = shift; $self->{user_agent} = shift if @_; $self->{user_agent} || __ua; } sub define { my($class, $coderef) = @_; bless { code => $coderef }, $class; } sub _build_scraper { my $class = shift; return sub(&) { my($coderef) = @_; bless { code => $coderef }, $class; }; } sub scrape { my $self = shift; my($stuff, $current) = @_; my($html, $tree); if (blessed($stuff) && $stuff->isa('URI')) { my $ua = $self->user_agent; my $res = $ua->get($stuff); return $self->scrape($res, $stuff->as_string); } elsif (blessed($stuff) && $stuff->isa('HTTP::Response')) { if ($stuff->is_success) { $html = $stuff->decoded_content; } else { croak "GET " . $stuff->request->uri . " failed: ", $stuff->status_line; } $current ||= $stuff->request->uri; } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) { $tree = $stuff->clone; } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') { $html = $$stuff; } else { $html = $stuff; } $tree ||= $self->build_tree($html); my $stash = {}; no warnings 'redefine'; local *process = create_process(0, $tree, $stash, $current); local *process_first = create_process(1, $tree, $stash, $current); my $retval; local *result = sub { $retval++; my @keys = @_; if (@keys == 1) { return $stash->{$keys[0]}; } elsif (@keys) { my %res; @res{@keys} = @{$stash}{@keys}; return \%res; } else { return $stash; } }; my $ret = $self->{code}->($tree); $tree->delete; # check user specified return value return $ret if $retval; return $stash; } sub build_tree { my($self, $html) = @_; my $t = HTML::TreeBuilder::XPath->new; $t->store_comments(1) if ($t->can('store_comments')); $t->ignore_unknown(0); $t->parse($html); $t->eof; $t; } sub create_process { my($first, $tree, $stash, $uri) = @_; sub { my($exp, @attr) = @_; my $xpath = $exp =~ m!^(?:/|id\()! ? 
$exp : HTML::Selector::XPath::selector_to_xpath($exp); my @nodes = eval { local $SIG{__WARN__} = sub { }; $tree->findnodes($xpath); }; if ($@) { die "'$xpath' doesn't look like a valid XPath expression: $@"; } @nodes or return; @nodes = ($nodes[0]) if $first; while (my($key, $val) = splice(@attr, 0, 2)) { if (!defined $val) { if (ref($key) && ref($key) eq 'CODE') { for my $node (@nodes) { local $_ = $node; $key->($node); } } else { die "Don't know what to do with $key => undef"; } } elsif ($key =~ s!\[\]$!!) { $stash->{$key} = [ map __get_value($_, $val, $uri), @nodes ]; } else { $stash->{$key} = __get_value($nodes[0], $val, $uri); } } return; }; } sub __get_value { my($node, $val, $uri) = @_; if (ref($val) && ref($val) eq 'CODE') { local $_ = $node; return $val->($node); } elsif (blessed($val) && $val->isa('Web::Scraper')) { return $val->scrape($node, $uri); } elsif ($val =~ s!^@!!) { my $value = $node->attr($val); if ($uri && is_link_element($node, $val)) { require URI; $value = URI->new_abs($value, $uri); } return $value; } elsif (lc($val) eq 'content' || lc($val) eq 'text') { # getValue method is used for getting a content of comment nodes # from HTML::TreeBuilder::XPath (version >= 0.14) # or HTML::TreeBuilder::LibXML (version >= 0.13) # getValue method works like as_text in both modules # for other node types return $node->isTextNode ? $node->string_value : ($node->can('getValue') ? 
$node->getValue : $node->as_text); } elsif (lc($val) eq 'raw' || lc($val) eq 'html') { if ($node->isTextNode) { if ($HTML::TreeBuilder::XPath::VERSION < 0.09) { return HTML::Entities::encode($node->as_XML, q("'<>&)); } else { return $node->as_XML; } } my $html = $node->as_XML; $html =~ s!^<.*?>!!; $html =~ s!\s*\n*$!!; return $html; } elsif (ref($val) eq 'HASH') { my $values; for my $key (keys %$val) { $values->{$key} = __get_value($node, $val->{$key}, $uri); } return $values; } elsif (ref($val) eq 'ARRAY') { my $how = $val->[0]; my $value = __get_value($node, $how, $uri); for my $filter (@$val[1..$#$val]) { $value = run_filter($value, $filter); } return $value; } else { Carp::croak "Unknown value type $val"; } } sub run_filter { my($value, $filter) = @_; ## sub { s/foo/bar/g } is a valid filter ## sub { DateTime::Format::Foo->parse_string(shift) } is valid too my $callback; my $module; if (ref($filter) eq 'CODE') { $callback = $filter; $module = "$filter"; } elsif (!ref($filter)) { $module = $filter =~ s/^\+// ? 
$filter : "Web::Scraper::Filter::$filter"; unless ($module->isa('Web::Scraper::Filter')) { $module->require or Carp::croak("Loading $module: $@"); } $callback = sub { $module->new->filter(shift) }; } elsif (blessed($filter) && $filter->can('filter')) { $callback = sub { $filter->filter(shift) }; } else { Carp::croak("Don't know filter type $filter"); } local $_ = $value; my $retval = eval { $callback->($value) }; if ($@) { Carp::croak("Filter $module had an error: $@"); } no warnings 'uninitialized'; # sub { s/foo/bar/ } returns number or PL_sv_no which is stringified to '' if (($retval =~ /^\d+$/ and $_ ne $value) or (defined($retval) and $retval eq '')) { $value = $_; } else { $value = $retval; } return $value; } sub is_link_element { my($node, $attr) = @_; my $link_elements = $HTML::Tagset::linkElements{$node->tag} || []; for my $elem (@$link_elements) { return 1 if $attr eq $elem; } return; } sub __stub { my $func = shift; return sub { croak "Can't call $func() outside scraper block"; }; } *process = __stub 'process'; *process_first = __stub 'process_first'; *result = __stub 'result'; 1; __END__ =for stopwords API SCRAPI Scrapi =head1 NAME Web::Scraper - Web Scraping Toolkit using HTML and CSS Selectors or XPath expressions =head1 SYNOPSIS use URI; use Web::Scraper; # First, create your scraper block my $tweets = scraper { # Parse all LIs with the class "status", store them into a resulting # array 'tweets'. We embed another scraper for each tweet. 
process "li.status", "tweets[]" => scraper { # And, in that array, pull in the elements with the class # "entry-content", "entry-date" and the link process ".entry-content", body => 'TEXT'; process ".entry-date", when => 'TEXT'; process 'a[rel="bookmark"]', link => '@href'; }; }; my $res = $tweets->scrape( URI->new("http://twitter.com/miyagawa") ); # The result has the populated tweets array for my $tweet (@{$res->{tweets}}) { print "$tweet->{body} $tweet->{when} (link: $tweet->{link})\n"; } The structure would resemble this (visually) { tweets => [ { body => $body, when => $date, link => $uri }, { body => $body, when => $date, link => $uri }, ] } =head1 DESCRIPTION Web::Scraper is a web scraper toolkit, inspired by Ruby's equivalent Scrapi. It provides a DSL-ish interface for traversing HTML documents and returning a neatly arranged Perl data structure. The I<scraper> and I<process> blocks provide a method to define what segments of a document to extract. It understands HTML and CSS Selectors as well as XPath expressions. =head1 METHODS =head2 scraper $scraper = scraper { ... }; Creates a new Web::Scraper object by wrapping the DSL code that will be fired when the I<scrape> method is called. =head2 scrape $res = $scraper->scrape(URI->new($uri)); $res = $scraper->scrape($html_content); $res = $scraper->scrape(\$html_content); $res = $scraper->scrape($http_response); $res = $scraper->scrape($html_element); Retrieves the HTML from URI, HTTP::Response, HTML::Tree or text strings and creates a DOM object, then fires the callback scraper code to retrieve the data structure. If you pass URI or HTTP::Response object, Web::Scraper will automatically guess the encoding of the content by looking at Content-Type headers and META tags. Otherwise you need to decode the HTML to Unicode before passing it to the I<scrape> method. You can optionally pass the base URL when you pass the HTML content as a string instead of URI or HTTP::Response. 
$res = $scraper->scrape($html_content, "http://example.com/foo"); This way Web::Scraper can resolve the relative links found in the document. =head2 process scraper { process "tag.class", key => 'TEXT'; process '//tag[contains(@foo, "bar")]', key2 => '@attr'; process '//comment()', 'comments[]' => 'TEXT'; }; I<process> is the method to find matching elements from HTML with CSS selector or XPath expression, then extract text or attributes into the result stash. If the first argument begins with "/" (for example "//tag") or "id(" it's treated as an XPath expression; otherwise it's treated as a CSS selector. # 2008/12/21 # date => "2008/12/21" process ".date", date => 'TEXT'; # # link => URI->new("http://example.com/") process ".body > a", link => '@href'; # # comment => " HTML Comment here " # # NOTES: Comment nodes are accessible only when # HTML::TreeBuilder::XPath (version >= 0.14) and/or # HTML::TreeBuilder::LibXML (version >= 0.13) is installed process "//div[contains(@class, 'body')]/comment()", comment => 'TEXT'; # # link => URI->new("http://example.com/"), text => "foo" process ".body > a", link => '@href', text => 'TEXT'; #
  • foo
  • bar
# list => [ "foo", "bar" ] process "li", "list[]" => "TEXT"; #
  • foo
  • bar
# list => [ { id => "1", text => "foo" }, { id => "2", text => "bar" } ]; process "li", "list[]" => { id => '@id', text => "TEXT" }; =head1 EXAMPLES There are many examples in the C dir packaged in this distribution. It is recommended to look through these. =head1 NESTED SCRAPERS TBD =head1 FILTERS TBD =head1 AUTHOR Tatsuhiko Miyagawa Emiyagawa@bulknews.netE =head1 LICENSE This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 SEE ALSO L L =cut Web-Scraper-0.37/lib/Web/Scraper/Filter.pm000644 000765 000024 00000002715 11162225735 021175 0ustar00miyagawastaff000000 000000 package Web::Scraper::Filter; use strict; use warnings; sub new { my $class = shift; bless {}, $class; } 1; __END__ =for stopwords namespace inline callback =head1 NAME Web::Scraper::Filter - Base class for Web::Scraper filters =head1 SYNOPSIS package Web::Scraper::Filter::YAML; use base qw( Web::Scraper::Filter ); use YAML (); sub filter { my($self, $value) = @_; YAML::Load($value); } 1; use Web::Scraper; my $scraper = scraper { process ".yaml-code", data => [ 'TEXT', 'YAML' ]; }; =head1 DESCRIPTION Web::Scraper::Filter is a base class for text filters in Web::Scraper. You can create your own text filter by subclassing this module. There are two ways to create and use your custom filter. If you name your filter Web::Scraper::Filter::Something, you just call: process $exp, $key => [ 'TEXT', 'Something' ]; If you declare your filter under your own namespace, like 'MyApp::Filter::Foo', process $exp, $key => [ 'TEXT', '+MyApp::Filter::Foo' ]; You can also inline your filter function without creating a filter class: process $exp, $key => [ 'TEXT', sub { s/foo/bar/ } ]; Note that this function munges C<$_> and returns the count of replacement. Filter code special cases if the return value of the callback is number and C<$_> value is updated. 
You can, of course, stack filters like: process $exp, $key => [ '@href', 'Foo', '+MyApp::Filter::Bar', \&baz ]; =head1 AUTHOR Tatsuhiko Miyagawa =cut Web-Scraper-0.37/lib/Web/Scraper/LibXML.pm000644 000765 000024 00000002263 11225234342 021027 0ustar00miyagawastaff000000 000000 package Web::Scraper::LibXML; use strict; use base qw( Web::Scraper ); use HTML::TreeBuilder::LibXML; sub build_tree { my($self, $html) = @_; my $t = HTML::TreeBuilder::LibXML->new; $t->parse($html); $t->eof; $t; } 1; __END__ =head1 NAME Web::Scraper::LibXML - Drop-in replacement for Web::Scraper to use LibXML =head1 SYNOPSIS use Web::Scraper::LibXML; # same as Web::Scraper my $scraper = scraper { ... }; =head1 DESCRIPTION Web::Scraper::LibXML is a drop-in replacement for Web::Scraper to use the fast libxml-based HTML tree builder, HTML::TreeBuilder::LibXML. This is almost identical to HTML::TreeBuilder::LibXML's I installer, like: use HTML::TreeBuilder::LibXML; HTML::TreeBuilder::LibXML->replace_original(); use Web::Scraper; my $scraper = scraper { ... }; # this code uses LibXML parser which overrides HTML::TreeBuilder::XPath's new() constructor so that L of your code using HTML::TreeBuilder::XPath is switched to the libxml based parser. This module, instead, gives you more control over which TreeBuilder to use, depending on the site etc. =head1 SEE ALSO L L =cut Web-Scraper-0.37/inc/Module/000755 000765 000024 00000000000 12040350116 016505 5ustar00miyagawastaff000000 000000 Web-Scraper-0.37/inc/Module/Install/000755 000765 000024 00000000000 12040350116 020113 5ustar00miyagawastaff000000 000000 Web-Scraper-0.37/inc/Module/Install.pm000644 000765 000024 00000030135 12040350115 020452 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install; # For any maintainers: # The load order for Module::Install is a bit magic. # It goes something like this... # # IF ( host has Module::Install installed, creating author mode ) { # 1. Makefile.PL calls "use inc::Module::Install" # 2. 
$INC{inc/Module/Install.pm} set to installed version of inc::Module::Install # 3. The installed version of inc::Module::Install loads # 4. inc::Module::Install calls "require Module::Install" # 5. The ./inc/ version of Module::Install loads # } ELSE { # 1. Makefile.PL calls "use inc::Module::Install" # 2. $INC{inc/Module/Install.pm} set to ./inc/ version of Module::Install # 3. The ./inc/ version of Module::Install loads # } use 5.005; use strict 'vars'; use Cwd (); use File::Find (); use File::Path (); use vars qw{$VERSION $MAIN}; BEGIN { # All Module::Install core packages now require synchronised versions. # This will be used to ensure we don't accidentally load old or # different versions of modules. # This is not enforced yet, but will be some time in the next few # releases once we can make sure it won't clash with custom # Module::Install extensions. $VERSION = '1.06'; # Storage for the pseudo-singleton $MAIN = undef; *inc::Module::Install::VERSION = *VERSION; @inc::Module::Install::ISA = __PACKAGE__; } sub import { my $class = shift; my $self = $class->new(@_); my $who = $self->_caller; #------------------------------------------------------------- # all of the following checks should be included in import(), # to allow "eval 'require Module::Install; 1' to test # installation of Module::Install. (RT #51267) #------------------------------------------------------------- # Whether or not inc::Module::Install is actually loaded, the # $INC{inc/Module/Install.pm} is what will still get set as long as # the caller loaded module this in the documented manner. # If not set, the caller may NOT have loaded the bundled version, and thus # they may not have a MI version that works with the Makefile.PL. This would # result in false errors or unexpected behaviour. And we don't want that. my $file = join( '/', 'inc', split /::/, __PACKAGE__ ) . 
'.pm'; unless ( $INC{$file} ) { die <<"END_DIE" } Please invoke ${\__PACKAGE__} with: use inc::${\__PACKAGE__}; not: use ${\__PACKAGE__}; END_DIE # This reportedly fixes a rare Win32 UTC file time issue, but # as this is a non-cross-platform XS module not in the core, # we shouldn't really depend on it. See RT #24194 for detail. # (Also, this module only supports Perl 5.6 and above). eval "use Win32::UTCFileTime" if $^O eq 'MSWin32' && $] >= 5.006; # If the script that is loading Module::Install is from the future, # then make will detect this and cause it to re-run over and over # again. This is bad. Rather than taking action to touch it (which # is unreliable on some platforms and requires write permissions) # for now we should catch this and refuse to run. if ( -f $0 ) { my $s = (stat($0))[9]; # If the modification time is only slightly in the future, # sleep briefly to remove the problem. my $a = $s - time; if ( $a > 0 and $a < 5 ) { sleep 5 } # Too far in the future, throw an error. my $t = time; if ( $s > $t ) { die <<"END_DIE" } Your installer $0 has a modification time in the future ($s > $t). This is known to create infinite loops in make. Please correct this, then run $0 again. END_DIE } # Build.PL was formerly supported, but no longer is due to excessive # difficulty in implementing every single feature twice. if ( $0 =~ /Build.PL$/i ) { die <<"END_DIE" } Module::Install no longer supports Build.PL. It was impossible to maintain duel backends, and has been deprecated. Please remove all Build.PL files and only use the Makefile.PL installer. END_DIE #------------------------------------------------------------- # To save some more typing in Module::Install installers, every... # use inc::Module::Install # ...also acts as an implicit use strict. 
$^H |= strict::bits(qw(refs subs vars)); #------------------------------------------------------------- unless ( -f $self->{file} ) { foreach my $key (keys %INC) { delete $INC{$key} if $key =~ /Module\/Install/; } local $^W; require "$self->{path}/$self->{dispatch}.pm"; File::Path::mkpath("$self->{prefix}/$self->{author}"); $self->{admin} = "$self->{name}::$self->{dispatch}"->new( _top => $self ); $self->{admin}->init; @_ = ($class, _self => $self); goto &{"$self->{name}::import"}; } local $^W; *{"${who}::AUTOLOAD"} = $self->autoload; $self->preload; # Unregister loader and worker packages so subdirs can use them again delete $INC{'inc/Module/Install.pm'}; delete $INC{'Module/Install.pm'}; # Save to the singleton $MAIN = $self; return 1; } sub autoload { my $self = shift; my $who = $self->_caller; my $cwd = Cwd::cwd(); my $sym = "${who}::AUTOLOAD"; $sym->{$cwd} = sub { my $pwd = Cwd::cwd(); if ( my $code = $sym->{$pwd} ) { # Delegate back to parent dirs goto &$code unless $cwd eq $pwd; } unless ($$sym =~ s/([^:]+)$//) { # XXX: it looks like we can't retrieve the missing function # via $$sym (usually $main::AUTOLOAD) in this case. # I'm still wondering if we should slurp Makefile.PL to # get some context or not ... my ($package, $file, $line) = caller; die <<"EOT"; Unknown function is found at $file line $line. Execution of $file aborted due to runtime errors. If you're a contributor to a project, you may need to install some Module::Install extensions from CPAN (or other repository). If you're a user of a module, please contact the author. 
EOT } my $method = $1; if ( uc($method) eq $method ) { # Do nothing return; } elsif ( $method =~ /^_/ and $self->can($method) ) { # Dispatch to the root M:I class return $self->$method(@_); } # Dispatch to the appropriate plugin unshift @_, ( $self, $1 ); goto &{$self->can('call')}; }; } sub preload { my $self = shift; unless ( $self->{extensions} ) { $self->load_extensions( "$self->{prefix}/$self->{path}", $self ); } my @exts = @{$self->{extensions}}; unless ( @exts ) { @exts = $self->{admin}->load_all_extensions; } my %seen; foreach my $obj ( @exts ) { while (my ($method, $glob) = each %{ref($obj) . '::'}) { next unless $obj->can($method); next if $method =~ /^_/; next if $method eq uc($method); $seen{$method}++; } } my $who = $self->_caller; foreach my $name ( sort keys %seen ) { local $^W; *{"${who}::$name"} = sub { ${"${who}::AUTOLOAD"} = "${who}::$name"; goto &{"${who}::AUTOLOAD"}; }; } } sub new { my ($class, %args) = @_; delete $INC{'FindBin.pm'}; { # to suppress the redefine warning local $SIG{__WARN__} = sub {}; require FindBin; } # ignore the prefix on extension modules built from top level. my $base_path = Cwd::abs_path($FindBin::Bin); unless ( Cwd::abs_path(Cwd::cwd()) eq $base_path ) { delete $args{prefix}; } return $args{_self} if $args{_self}; $args{dispatch} ||= 'Admin'; $args{prefix} ||= 'inc'; $args{author} ||= ($^O eq 'VMS' ? 
'_author' : '.author'); $args{bundle} ||= 'inc/BUNDLES'; $args{base} ||= $base_path; $class =~ s/^\Q$args{prefix}\E:://; $args{name} ||= $class; $args{version} ||= $class->VERSION; unless ( $args{path} ) { $args{path} = $args{name}; $args{path} =~ s!::!/!g; } $args{file} ||= "$args{base}/$args{prefix}/$args{path}.pm"; $args{wrote} = 0; bless( \%args, $class ); } sub call { my ($self, $method) = @_; my $obj = $self->load($method) or return; splice(@_, 0, 2, $obj); goto &{$obj->can($method)}; } sub load { my ($self, $method) = @_; $self->load_extensions( "$self->{prefix}/$self->{path}", $self ) unless $self->{extensions}; foreach my $obj (@{$self->{extensions}}) { return $obj if $obj->can($method); } my $admin = $self->{admin} or die <<"END_DIE"; The '$method' method does not exist in the '$self->{prefix}' path! Please remove the '$self->{prefix}' directory and run $0 again to load it. END_DIE my $obj = $admin->load($method, 1); push @{$self->{extensions}}, $obj; $obj; } sub load_extensions { my ($self, $path, $top) = @_; my $should_reload = 0; unless ( grep { ! ref $_ and lc $_ eq lc $self->{prefix} } @INC ) { unshift @INC, $self->{prefix}; $should_reload = 1; } foreach my $rv ( $self->find_extensions($path) ) { my ($file, $pkg) = @{$rv}; next if $self->{pathnames}{$pkg}; local $@; my $new = eval { local $^W; require $file; $pkg->can('new') }; unless ( $new ) { warn $@ if $@; next; } $self->{pathnames}{$pkg} = $should_reload ? delete $INC{$file} : $INC{$file}; push @{$self->{extensions}}, &{$new}($pkg, _top => $top ); } $self->{extensions} ||= []; } sub find_extensions { my ($self, $path) = @_; my @found; File::Find::find( sub { my $file = $File::Find::name; return unless $file =~ m!^\Q$path\E/(.+)\.pm\Z!is; my $subpath = $1; return if lc($subpath) eq lc($self->{dispatch}); $file = "$self->{path}/$subpath.pm"; my $pkg = "$self->{name}::$subpath"; $pkg =~ s!/!::!g; # If we have a mixed-case package name, assume case has been preserved # correctly. 
Otherwise, root through the file to locate the case-preserved # version of the package name. if ( $subpath eq lc($subpath) || $subpath eq uc($subpath) ) { my $content = Module::Install::_read($subpath . '.pm'); my $in_pod = 0; foreach ( split //, $content ) { $in_pod = 1 if /^=\w/; $in_pod = 0 if /^=cut/; next if ($in_pod || /^=cut/); # skip pod text next if /^\s*#/; # and comments if ( m/^\s*package\s+($pkg)\s*;/i ) { $pkg = $1; last; } } } push @found, [ $file, $pkg ]; }, $path ) if -d $path; @found; } ##################################################################### # Common Utility Functions sub _caller { my $depth = 0; my $call = caller($depth); while ( $call eq __PACKAGE__ ) { $depth++; $call = caller($depth); } return $call; } # Done in evals to avoid confusing Perl::MinimumVersion eval( $] >= 5.006 ? <<'END_NEW' : <<'END_OLD' ); die $@ if $@; sub _read { local *FH; open( FH, '<', $_[0] ) or die "open($_[0]): $!"; my $string = do { local $/; }; close FH or die "close($_[0]): $!"; return $string; } END_NEW sub _read { local *FH; open( FH, "< $_[0]" ) or die "open($_[0]): $!"; my $string = do { local $/; }; close FH or die "close($_[0]): $!"; return $string; } END_OLD sub _readperl { my $string = Module::Install::_read($_[0]); $string =~ s/(?:\015{1,2}\012|\015|\012)/\n/sg; $string =~ s/(\n)\n*__(?:DATA|END)__\b.*\z/$1/s; $string =~ s/\n\n=\w+.+?\n\n=cut\b.+?\n+/\n\n/sg; return $string; } sub _readpod { my $string = Module::Install::_read($_[0]); $string =~ s/(?:\015{1,2}\012|\015|\012)/\n/sg; return $string if $_[0] =~ /\.pod\z/; $string =~ s/(^|\n=cut\b.+?\n+)[^=\s].+?\n(\n=\w+|\z)/$1$2/sg; $string =~ s/\n*=pod\b[^\n]*\n+/\n\n/sg; $string =~ s/\n*=cut\b[^\n]*\n+/\n\n/sg; $string =~ s/^\n+//s; return $string; } # Done in evals to avoid confusing Perl::MinimumVersion eval( $] >= 5.006 ? <<'END_NEW' : <<'END_OLD' ); die $@ if $@; sub _write { local *FH; open( FH, '>', $_[0] ) or die "open($_[0]): $!"; foreach ( 1 .. 
$#_ ) { print FH $_[$_] or die "print($_[0]): $!"; } close FH or die "close($_[0]): $!"; } END_NEW sub _write { local *FH; open( FH, "> $_[0]" ) or die "open($_[0]): $!"; foreach ( 1 .. $#_ ) { print FH $_[$_] or die "print($_[0]): $!"; } close FH or die "close($_[0]): $!"; } END_OLD # _version is for processing module versions (eg, 1.03_05) not # Perl versions (eg, 5.8.1). sub _version ($) { my $s = shift || 0; my $d =()= $s =~ /(\.)/g; if ( $d >= 2 ) { # Normalise multipart versions $s =~ s/(\.)(\d{1,3})/sprintf("$1%03d",$2)/eg; } $s =~ s/^(\d+)\.?//; my $l = $1 || 0; my @v = map { $_ . '0' x (3 - length $_) } $s =~ /(\d{1,3})\D?/g; $l = $l . '.' . join '', @v if @v; return $l + 0; } sub _cmp ($$) { _version($_[1]) <=> _version($_[2]); } # Cloned from Params::Util::_CLASS sub _CLASS ($) { ( defined $_[0] and ! ref $_[0] and $_[0] =~ m/^[^\W\d]\w*(?:::\w+)*\z/s ) ? $_[0] : undef; } 1; # Copyright 2008 - 2012 Adam Kennedy. Web-Scraper-0.37/inc/Module/Install/AuthorTests.pm000644 000765 000024 00000002215 12040350115 022735 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::AuthorTests; use 5.005; use strict; use Module::Install::Base; use Carp (); #line 16 use vars qw{$VERSION $ISCORE @ISA}; BEGIN { $VERSION = '0.002'; $ISCORE = 1; @ISA = qw{Module::Install::Base}; } #line 42 sub author_tests { my ($self, @dirs) = @_; _add_author_tests($self, \@dirs, 0); } #line 56 sub recursive_author_tests { my ($self, @dirs) = @_; _add_author_tests($self, \@dirs, 1); } sub _wanted { my $href = shift; sub { /\.t$/ and -f $_ and $href->{$File::Find::dir} = 1 } } sub _add_author_tests { my ($self, $dirs, $recurse) = @_; return unless $Module::Install::AUTHOR; my @tests = $self->tests ? (split / /, $self->tests) : 't/*.t'; # XXX: pick a default, later -- rjbs, 2008-02-24 my @dirs = @$dirs ? 
@$dirs : Carp::confess "no dirs given to author_tests"; @dirs = grep { -d } @dirs; if ($recurse) { require File::Find; my %test_dir; File::Find::find(_wanted(\%test_dir), @dirs); $self->tests( join ' ', @tests, map { "$_/*.t" } sort keys %test_dir ); } else { $self->tests( join ' ', @tests, map { "$_/*.t" } sort @dirs ); } } #line 107 1; Web-Scraper-0.37/inc/Module/Install/Base.pm000644 000765 000024 00000002147 12040350115 021326 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::Base; use strict 'vars'; use vars qw{$VERSION}; BEGIN { $VERSION = '1.06'; } # Suspend handler for "redefined" warnings BEGIN { my $w = $SIG{__WARN__}; $SIG{__WARN__} = sub { $w }; } #line 42 sub new { my $class = shift; unless ( defined &{"${class}::call"} ) { *{"${class}::call"} = sub { shift->_top->call(@_) }; } unless ( defined &{"${class}::load"} ) { *{"${class}::load"} = sub { shift->_top->load(@_) }; } bless { @_ }, $class; } #line 61 sub AUTOLOAD { local $@; my $func = eval { shift->_top->autoload } or return; goto &$func; } #line 75 sub _top { $_[0]->{_top}; } #line 90 sub admin { $_[0]->_top->{admin} or Module::Install::Base::FakeAdmin->new; } #line 106 sub is_admin { ! 
$_[0]->admin->isa('Module::Install::Base::FakeAdmin'); } sub DESTROY {} package Module::Install::Base::FakeAdmin; use vars qw{$VERSION}; BEGIN { $VERSION = $Module::Install::Base::VERSION; } my $fake; sub new { $fake ||= bless(\@_, $_[0]); } sub AUTOLOAD {} sub DESTROY {} # Restore warning handler BEGIN { $SIG{__WARN__} = $SIG{__WARN__}->(); } 1; #line 159 Web-Scraper-0.37/inc/Module/Install/Can.pm000644 000765 000024 00000006157 12040350115 021162 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::Can; use strict; use Config (); use ExtUtils::MakeMaker (); use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } # check if we can load some module ### Upgrade this to not have to load the module if possible sub can_use { my ($self, $mod, $ver) = @_; $mod =~ s{::|\\}{/}g; $mod .= '.pm' unless $mod =~ /\.pm$/i; my $pkg = $mod; $pkg =~ s{/}{::}g; $pkg =~ s{\.pm$}{}i; local $@; eval { require $mod; $pkg->VERSION($ver || 0); 1 }; } # Check if we can run some command sub can_run { my ($self, $cmd) = @_; my $_cmd = $cmd; return $_cmd if (-x $_cmd or $_cmd = MM->maybe_command($_cmd)); for my $dir ((split /$Config::Config{path_sep}/, $ENV{PATH}), '.') { next if $dir eq ''; require File::Spec; my $abs = File::Spec->catfile($dir, $cmd); return $abs if (-x $abs or $abs = MM->maybe_command($abs)); } return; } # Can our C compiler environment build XS files sub can_xs { my $self = shift; # Ensure we have the CBuilder module $self->configure_requires( 'ExtUtils::CBuilder' => 0.27 ); # Do we have the configure_requires checker? local $@; eval "require ExtUtils::CBuilder;"; if ( $@ ) { # They don't obey configure_requires, so it is # someone old and delicate. Try to avoid hurting # them by falling back to an older simpler test. 
return $self->can_cc(); } # Do we have a working C compiler my $builder = ExtUtils::CBuilder->new( quiet => 1, ); unless ( $builder->have_compiler ) { # No working C compiler return 0; } # Write a C file representative of what XS becomes require File::Temp; my ( $FH, $tmpfile ) = File::Temp::tempfile( "compilexs-XXXXX", SUFFIX => '.c', ); binmode $FH; print $FH <<'END_C'; #include "EXTERN.h" #include "perl.h" #include "XSUB.h" int main(int argc, char **argv) { return 0; } int boot_sanexs() { return 1; } END_C close $FH; # Can the C compiler access the same headers XS does my @libs = (); my $object = undef; eval { local $^W = 0; $object = $builder->compile( source => $tmpfile, ); @libs = $builder->link( objects => $object, module_name => 'sanexs', ); }; my $result = $@ ? 0 : 1; # Clean up all the build files foreach ( $tmpfile, $object, @libs ) { next unless defined $_; 1 while unlink; } return $result; } # Can we locate a (the) C compiler sub can_cc { my $self = shift; my @chunks = split(/ /, $Config::Config{cc}) or return; # $Config{cc} may contain args; try to find out the program part while (@chunks) { return $self->can_run("@chunks") || (pop(@chunks), next); } return; } # Fix Cygwin bug on maybe_command(); if ( $^O eq 'cygwin' ) { require ExtUtils::MM_Cygwin; require ExtUtils::MM_Win32; if ( ! 
defined(&ExtUtils::MM_Cygwin::maybe_command) ) { *ExtUtils::MM_Cygwin::maybe_command = sub { my ($self, $file) = @_; if ($file =~ m{^/cygdrive/}i and ExtUtils::MM_Win32->can('maybe_command')) { ExtUtils::MM_Win32->maybe_command($file); } else { ExtUtils::MM_Unix->maybe_command($file); } } } } 1; __END__ #line 236 Web-Scraper-0.37/inc/Module/Install/Fetch.pm000644 000765 000024 00000004627 12040350115 021512 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::Fetch; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub get_file { my ($self, %args) = @_; my ($scheme, $host, $path, $file) = $args{url} =~ m|^(\w+)://([^/]+)(.+)/(.+)| or return; if ( $scheme eq 'http' and ! eval { require LWP::Simple; 1 } ) { $args{url} = $args{ftp_url} or (warn("LWP support unavailable!\n"), return); ($scheme, $host, $path, $file) = $args{url} =~ m|^(\w+)://([^/]+)(.+)/(.+)| or return; } $|++; print "Fetching '$file' from $host... 
"; unless (eval { require Socket; Socket::inet_aton($host) }) { warn "'$host' resolve failed!\n"; return; } return unless $scheme eq 'ftp' or $scheme eq 'http'; require Cwd; my $dir = Cwd::getcwd(); chdir $args{local_dir} or return if exists $args{local_dir}; if (eval { require LWP::Simple; 1 }) { LWP::Simple::mirror($args{url}, $file); } elsif (eval { require Net::FTP; 1 }) { eval { # use Net::FTP to get past firewall my $ftp = Net::FTP->new($host, Passive => 1, Timeout => 600); $ftp->login("anonymous", 'anonymous@example.com'); $ftp->cwd($path); $ftp->binary; $ftp->get($file) or (warn("$!\n"), return); $ftp->quit; } } elsif (my $ftp = $self->can_run('ftp')) { eval { # no Net::FTP, fallback to ftp.exe require FileHandle; my $fh = FileHandle->new; local $SIG{CHLD} = 'IGNORE'; unless ($fh->open("|$ftp -n")) { warn "Couldn't open ftp: $!\n"; chdir $dir; return; } my @dialog = split(/\n/, <<"END_FTP"); open $host user anonymous anonymous\@example.com cd $path binary get $file $file quit END_FTP foreach (@dialog) { $fh->print("$_\n") } $fh->close; } } else { warn "No working 'ftp' program available!\n"; chdir $dir; return; } unless (-f $file) { warn "Fetching failed: $@\n"; chdir $dir; return; } return if exists $args{size} and -s $file != $args{size}; system($args{run}) if exists $args{run}; unlink($file) if $args{remove}; print(((!exists $args{check_for} or -e $args{check_for}) ? "done!" : "failed! 
($!)"), "\n"); chdir $dir; return !$?; } 1; Web-Scraper-0.37/inc/Module/Install/Makefile.pm000644 000765 000024 00000027437 12040350115 022202 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::Makefile; use strict 'vars'; use ExtUtils::MakeMaker (); use Module::Install::Base (); use Fcntl qw/:flock :seek/; use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub Makefile { $_[0] } my %seen = (); sub prompt { shift; # Infinite loop protection my @c = caller(); if ( ++$seen{"$c[1]|$c[2]|$_[0]"} > 3 ) { die "Caught an potential prompt infinite loop ($c[1]|$c[2]|$_[0])"; } # In automated testing or non-interactive session, always use defaults if ( ($ENV{AUTOMATED_TESTING} or -! -t STDIN) and ! $ENV{PERL_MM_USE_DEFAULT} ) { local $ENV{PERL_MM_USE_DEFAULT} = 1; goto &ExtUtils::MakeMaker::prompt; } else { goto &ExtUtils::MakeMaker::prompt; } } # Store a cleaned up version of the MakeMaker version, # since we need to behave differently in a variety of # ways based on the MM version. my $makemaker = eval $ExtUtils::MakeMaker::VERSION; # If we are passed a param, do a "newer than" comparison. # Otherwise, just return the MakeMaker version. sub makemaker { ( @_ < 2 or $makemaker >= eval($_[1]) ) ? $makemaker : 0 } # Ripped from ExtUtils::MakeMaker 6.56, and slightly modified # as we only need to know here whether the attribute is an array # or a hash or something else (which may or may not be appendable). 
my %makemaker_argtype = ( C => 'ARRAY', CONFIG => 'ARRAY', # CONFIGURE => 'CODE', # ignore DIR => 'ARRAY', DL_FUNCS => 'HASH', DL_VARS => 'ARRAY', EXCLUDE_EXT => 'ARRAY', EXE_FILES => 'ARRAY', FUNCLIST => 'ARRAY', H => 'ARRAY', IMPORTS => 'HASH', INCLUDE_EXT => 'ARRAY', LIBS => 'ARRAY', # ignore '' MAN1PODS => 'HASH', MAN3PODS => 'HASH', META_ADD => 'HASH', META_MERGE => 'HASH', PL_FILES => 'HASH', PM => 'HASH', PMLIBDIRS => 'ARRAY', PMLIBPARENTDIRS => 'ARRAY', PREREQ_PM => 'HASH', CONFIGURE_REQUIRES => 'HASH', SKIP => 'ARRAY', TYPEMAPS => 'ARRAY', XS => 'HASH', # VERSION => ['version',''], # ignore # _KEEP_AFTER_FLUSH => '', clean => 'HASH', depend => 'HASH', dist => 'HASH', dynamic_lib=> 'HASH', linkext => 'HASH', macro => 'HASH', postamble => 'HASH', realclean => 'HASH', test => 'HASH', tool_autosplit => 'HASH', # special cases where you can use makemaker_append CCFLAGS => 'APPENDABLE', DEFINE => 'APPENDABLE', INC => 'APPENDABLE', LDDLFLAGS => 'APPENDABLE', LDFROM => 'APPENDABLE', ); sub makemaker_args { my ($self, %new_args) = @_; my $args = ( $self->{makemaker_args} ||= {} ); foreach my $key (keys %new_args) { if ($makemaker_argtype{$key}) { if ($makemaker_argtype{$key} eq 'ARRAY') { $args->{$key} = [] unless defined $args->{$key}; unless (ref $args->{$key} eq 'ARRAY') { $args->{$key} = [$args->{$key}] } push @{$args->{$key}}, ref $new_args{$key} eq 'ARRAY' ? 
@{$new_args{$key}} : $new_args{$key}; } elsif ($makemaker_argtype{$key} eq 'HASH') { $args->{$key} = {} unless defined $args->{$key}; foreach my $skey (keys %{ $new_args{$key} }) { $args->{$key}{$skey} = $new_args{$key}{$skey}; } } elsif ($makemaker_argtype{$key} eq 'APPENDABLE') { $self->makemaker_append($key => $new_args{$key}); } } else { if (defined $args->{$key}) { warn qq{MakeMaker attribute "$key" is overriden; use "makemaker_append" to append values\n}; } $args->{$key} = $new_args{$key}; } } return $args; } # For mm args that take multiple space-seperated args, # append an argument to the current list. sub makemaker_append { my $self = shift; my $name = shift; my $args = $self->makemaker_args; $args->{$name} = defined $args->{$name} ? join( ' ', $args->{$name}, @_ ) : join( ' ', @_ ); } sub build_subdirs { my $self = shift; my $subdirs = $self->makemaker_args->{DIR} ||= []; for my $subdir (@_) { push @$subdirs, $subdir; } } sub clean_files { my $self = shift; my $clean = $self->makemaker_args->{clean} ||= {}; %$clean = ( %$clean, FILES => join ' ', grep { length $_ } ($clean->{FILES} || (), @_), ); } sub realclean_files { my $self = shift; my $realclean = $self->makemaker_args->{realclean} ||= {}; %$realclean = ( %$realclean, FILES => join ' ', grep { length $_ } ($realclean->{FILES} || (), @_), ); } sub libs { my $self = shift; my $libs = ref $_[0] ? 
shift : [ shift ]; $self->makemaker_args( LIBS => $libs ); } sub inc { my $self = shift; $self->makemaker_args( INC => shift ); } sub _wanted_t { } sub tests_recursive { my $self = shift; my $dir = shift || 't'; unless ( -d $dir ) { die "tests_recursive dir '$dir' does not exist"; } my %tests = map { $_ => 1 } split / /, ($self->tests || ''); require File::Find; File::Find::find( sub { /\.t$/ and -f $_ and $tests{"$File::Find::dir/*.t"} = 1 }, $dir ); $self->tests( join ' ', sort keys %tests ); } sub write { my $self = shift; die "&Makefile->write() takes no arguments\n" if @_; # Check the current Perl version my $perl_version = $self->perl_version; if ( $perl_version ) { eval "use $perl_version; 1" or die "ERROR: perl: Version $] is installed, " . "but we need version >= $perl_version"; } # Make sure we have a new enough MakeMaker require ExtUtils::MakeMaker; if ( $perl_version and $self->_cmp($perl_version, '5.006') >= 0 ) { # This previous attempted to inherit the version of # ExtUtils::MakeMaker in use by the module author, but this # was found to be untenable as some authors build releases # using future dev versions of EU:MM that nobody else has. # Instead, #toolchain suggests we use 6.59 which is the most # stable version on CPAN at time of writing and is, to quote # ribasushi, "not terminally fucked, > and tested enough". # TODO: We will now need to maintain this over time to push # the version up as new versions are released. $self->build_requires( 'ExtUtils::MakeMaker' => 6.59 ); $self->configure_requires( 'ExtUtils::MakeMaker' => 6.59 ); } else { # Allow legacy-compatibility with 5.005 by depending on the # most recent EU:MM that supported 5.005. 
$self->build_requires( 'ExtUtils::MakeMaker' => 6.36 ); $self->configure_requires( 'ExtUtils::MakeMaker' => 6.36 ); } # Generate the MakeMaker params my $args = $self->makemaker_args; $args->{DISTNAME} = $self->name; $args->{NAME} = $self->module_name || $self->name; $args->{NAME} =~ s/-/::/g; $args->{VERSION} = $self->version or die <<'EOT'; ERROR: Can't determine distribution version. Please specify it explicitly via 'version' in Makefile.PL, or set a valid $VERSION in a module, and provide its file path via 'version_from' (or 'all_from' if you prefer) in Makefile.PL. EOT if ( $self->tests ) { my @tests = split ' ', $self->tests; my %seen; $args->{test} = { TESTS => (join ' ', grep {!$seen{$_}++} @tests), }; } elsif ( $Module::Install::ExtraTests::use_extratests ) { # Module::Install::ExtraTests doesn't set $self->tests and does its own tests via harness. # So, just ignore our xt tests here. } elsif ( -d 'xt' and ($Module::Install::AUTHOR or $ENV{RELEASE_TESTING}) ) { $args->{test} = { TESTS => join( ' ', map { "$_/*.t" } grep { -d $_ } qw{ t xt } ), }; } if ( $] >= 5.005 ) { $args->{ABSTRACT} = $self->abstract; $args->{AUTHOR} = join ', ', @{$self->author || []}; } if ( $self->makemaker(6.10) ) { $args->{NO_META} = 1; #$args->{NO_MYMETA} = 1; } if ( $self->makemaker(6.17) and $self->sign ) { $args->{SIGN} = 1; } unless ( $self->is_admin ) { delete $args->{SIGN}; } if ( $self->makemaker(6.31) and $self->license ) { $args->{LICENSE} = $self->license; } my $prereq = ($args->{PREREQ_PM} ||= {}); %$prereq = ( %$prereq, map { @$_ } # flatten [module => version] map { @$_ } grep $_, ($self->requires) ); # Remove any reference to perl, PREREQ_PM doesn't support it delete $args->{PREREQ_PM}->{perl}; # Merge both kinds of requires into BUILD_REQUIRES my $build_prereq = ($args->{BUILD_REQUIRES} ||= {}); %$build_prereq = ( %$build_prereq, map { @$_ } # flatten [module => version] map { @$_ } grep $_, ($self->configure_requires, $self->build_requires) ); # Remove any 
reference to perl, BUILD_REQUIRES doesn't support it delete $args->{BUILD_REQUIRES}->{perl}; # Delete bundled dists from prereq_pm, add it to Makefile DIR my $subdirs = ($args->{DIR} || []); if ($self->bundles) { my %processed; foreach my $bundle (@{ $self->bundles }) { my ($mod_name, $dist_dir) = @$bundle; delete $prereq->{$mod_name}; $dist_dir = File::Basename::basename($dist_dir); # dir for building this module if (not exists $processed{$dist_dir}) { if (-d $dist_dir) { # List as sub-directory to be processed by make push @$subdirs, $dist_dir; } # Else do nothing: the module is already present on the system $processed{$dist_dir} = undef; } } } unless ( $self->makemaker('6.55_03') ) { %$prereq = (%$prereq,%$build_prereq); delete $args->{BUILD_REQUIRES}; } if ( my $perl_version = $self->perl_version ) { eval "use $perl_version; 1" or die "ERROR: perl: Version $] is installed, " . "but we need version >= $perl_version"; if ( $self->makemaker(6.48) ) { $args->{MIN_PERL_VERSION} = $perl_version; } } if ($self->installdirs) { warn qq{old INSTALLDIRS (probably set by makemaker_args) is overriden by installdirs\n} if $args->{INSTALLDIRS}; $args->{INSTALLDIRS} = $self->installdirs; } my %args = map { ( $_ => $args->{$_} ) } grep {defined($args->{$_} ) } keys %$args; my $user_preop = delete $args{dist}->{PREOP}; if ( my $preop = $self->admin->preop($user_preop) ) { foreach my $key ( keys %$preop ) { $args{dist}->{$key} = $preop->{$key}; } } my $mm = ExtUtils::MakeMaker::WriteMakefile(%args); $self->fix_up_makefile($mm->{FIRST_MAKEFILE} || 'Makefile'); } sub fix_up_makefile { my $self = shift; my $makefile_name = shift; my $top_class = ref($self->_top) || ''; my $top_version = $self->_top->VERSION || ''; my $preamble = $self->preamble ? "# Preamble by $top_class $top_version\n" . $self->preamble : ''; my $postamble = "# Postamble by $top_class $top_version\n" . 
($self->postamble || ''); local *MAKEFILE; open MAKEFILE, "+< $makefile_name" or die "fix_up_makefile: Couldn't open $makefile_name: $!"; eval { flock MAKEFILE, LOCK_EX }; my $makefile = do { local $/; }; $makefile =~ s/\b(test_harness\(\$\(TEST_VERBOSE\), )/$1'inc', /; $makefile =~ s/( -I\$\(INST_ARCHLIB\))/ -Iinc$1/g; $makefile =~ s/( "-I\$\(INST_LIB\)")/ "-Iinc"$1/g; $makefile =~ s/^(FULLPERL = .*)/$1 "-Iinc"/m; $makefile =~ s/^(PERL = .*)/$1 "-Iinc"/m; # Module::Install will never be used to build the Core Perl # Sometimes PERL_LIB and PERL_ARCHLIB get written anyway, which breaks # PREFIX/PERL5LIB, and thus, install_share. Blank them if they exist $makefile =~ s/^PERL_LIB = .+/PERL_LIB =/m; #$makefile =~ s/^PERL_ARCHLIB = .+/PERL_ARCHLIB =/m; # Perl 5.005 mentions PERL_LIB explicitly, so we have to remove that as well. $makefile =~ s/(\"?)-I\$\(PERL_LIB\)\1//g; # XXX - This is currently unused; not sure if it breaks other MM-users # $makefile =~ s/^pm_to_blib\s+:\s+/pm_to_blib :: /mg; seek MAKEFILE, 0, SEEK_SET; truncate MAKEFILE, 0; print MAKEFILE "$preamble$makefile$postamble" or die $!; close MAKEFILE or die $!; 1; } sub preamble { my ($self, $text) = @_; $self->{preamble} = $text . 
$self->{preamble} if defined $text; $self->{preamble}; } sub postamble { my ($self, $text) = @_; $self->{postamble} ||= $self->admin->postamble; $self->{postamble} .= $text if defined $text; $self->{postamble} } 1; __END__ #line 544 Web-Scraper-0.37/inc/Module/Install/Metadata.pm000644 000765 000024 00000043277 12040350115 022205 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::Metadata; use strict 'vars'; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } my @boolean_keys = qw{ sign }; my @scalar_keys = qw{ name module_name abstract version distribution_type tests installdirs }; my @tuple_keys = qw{ configure_requires build_requires requires recommends bundles resources }; my @resource_keys = qw{ homepage bugtracker repository }; my @array_keys = qw{ keywords author }; *authors = \&author; sub Meta { shift } sub Meta_BooleanKeys { @boolean_keys } sub Meta_ScalarKeys { @scalar_keys } sub Meta_TupleKeys { @tuple_keys } sub Meta_ResourceKeys { @resource_keys } sub Meta_ArrayKeys { @array_keys } foreach my $key ( @boolean_keys ) { *$key = sub { my $self = shift; if ( defined wantarray and not @_ ) { return $self->{values}->{$key}; } $self->{values}->{$key} = ( @_ ? 
$_[0] : 1 ); return $self; }; } foreach my $key ( @scalar_keys ) { *$key = sub { my $self = shift; return $self->{values}->{$key} if defined wantarray and !@_; $self->{values}->{$key} = shift; return $self; }; } foreach my $key ( @array_keys ) { *$key = sub { my $self = shift; return $self->{values}->{$key} if defined wantarray and !@_; $self->{values}->{$key} ||= []; push @{$self->{values}->{$key}}, @_; return $self; }; } foreach my $key ( @resource_keys ) { *$key = sub { my $self = shift; unless ( @_ ) { return () unless $self->{values}->{resources}; return map { $_->[1] } grep { $_->[0] eq $key } @{ $self->{values}->{resources} }; } return $self->{values}->{resources}->{$key} unless @_; my $uri = shift or die( "Did not provide a value to $key()" ); $self->resources( $key => $uri ); return 1; }; } foreach my $key ( grep { $_ ne "resources" } @tuple_keys) { *$key = sub { my $self = shift; return $self->{values}->{$key} unless @_; my @added; while ( @_ ) { my $module = shift or last; my $version = shift || 0; push @added, [ $module, $version ]; } push @{ $self->{values}->{$key} }, @added; return map {@$_} @added; }; } # Resource handling my %lc_resource = map { $_ => 1 } qw{ homepage license bugtracker repository }; sub resources { my $self = shift; while ( @_ ) { my $name = shift or last; my $value = shift or next; if ( $name eq lc $name and ! $lc_resource{$name} ) { die("Unsupported reserved lowercase resource '$name'"); } $self->{values}->{resources} ||= []; push @{ $self->{values}->{resources} }, [ $name, $value ]; } $self->{values}->{resources}; } # Aliases for build_requires that will have alternative # meanings in some future version of META.yml. 
sub test_requires { shift->build_requires(@_) } sub install_requires { shift->build_requires(@_) } # Aliases for installdirs options sub install_as_core { $_[0]->installdirs('perl') } sub install_as_cpan { $_[0]->installdirs('site') } sub install_as_site { $_[0]->installdirs('site') } sub install_as_vendor { $_[0]->installdirs('vendor') } sub dynamic_config { my $self = shift; my $value = @_ ? shift : 1; if ( $self->{values}->{dynamic_config} ) { # Once dynamic we never change to static, for safety return 0; } $self->{values}->{dynamic_config} = $value ? 1 : 0; return 1; } # Convenience command sub static_config { shift->dynamic_config(0); } sub perl_version { my $self = shift; return $self->{values}->{perl_version} unless @_; my $version = shift or die( "Did not provide a value to perl_version()" ); # Normalize the version $version = $self->_perl_version($version); # We don't support the really old versions unless ( $version >= 5.005 ) { die "Module::Install only supports 5.005 or newer (use ExtUtils::MakeMaker)\n"; } $self->{values}->{perl_version} = $version; } sub all_from { my ( $self, $file ) = @_; unless ( defined($file) ) { my $name = $self->name or die( "all_from called with no args without setting name() first" ); $file = join('/', 'lib', split(/-/, $name)) . '.pm'; $file =~ s{.*/}{} unless -e $file; unless ( -e $file ) { die("all_from cannot find $file from $name"); } } unless ( -f $file ) { die("The path '$file' does not exist, or is not a file"); } $self->{values}{all_from} = $file; # Some methods pull from POD instead of code. 
# If there is a matching .pod, use that instead my $pod = $file; $pod =~ s/\.pm$/.pod/i; $pod = $file unless -e $pod; # Pull the different values $self->name_from($file) unless $self->name; $self->version_from($file) unless $self->version; $self->perl_version_from($file) unless $self->perl_version; $self->author_from($pod) unless @{$self->author || []}; $self->license_from($pod) unless $self->license; $self->abstract_from($pod) unless $self->abstract; return 1; } sub provides { my $self = shift; my $provides = ( $self->{values}->{provides} ||= {} ); %$provides = (%$provides, @_) if @_; return $provides; } sub auto_provides { my $self = shift; return $self unless $self->is_admin; unless (-e 'MANIFEST') { warn "Cannot deduce auto_provides without a MANIFEST, skipping\n"; return $self; } # Avoid spurious warnings as we are not checking manifest here. local $SIG{__WARN__} = sub {1}; require ExtUtils::Manifest; local *ExtUtils::Manifest::manicheck = sub { return }; require Module::Build; my $build = Module::Build->new( dist_name => $self->name, dist_version => $self->version, license => $self->license, ); $self->provides( %{ $build->find_dist_packages || {} } ); } sub feature { my $self = shift; my $name = shift; my $features = ( $self->{values}->{features} ||= [] ); my $mods; if ( @_ == 1 and ref( $_[0] ) ) { # The user used ->feature like ->features by passing in the second # argument as a reference. Accomodate for that. $mods = $_[0]; } else { $mods = \@_; } my $count = 0; push @$features, ( $name => [ map { ref($_) ? ( ref($_) eq 'HASH' ) ? %$_ : @$_ : $_ } @$mods ] ); return @$features; } sub features { my $self = shift; while ( my ( $name, $mods ) = splice( @_, 0, 2 ) ) { $self->feature( $name, @$mods ); } return $self->{values}->{features} ? 
@{ $self->{values}->{features} } : (); } sub no_index { my $self = shift; my $type = shift; push @{ $self->{values}->{no_index}->{$type} }, @_ if $type; return $self->{values}->{no_index}; } sub read { my $self = shift; $self->include_deps( 'YAML::Tiny', 0 ); require YAML::Tiny; my $data = YAML::Tiny::LoadFile('META.yml'); # Call methods explicitly in case user has already set some values. while ( my ( $key, $value ) = each %$data ) { next unless $self->can($key); if ( ref $value eq 'HASH' ) { while ( my ( $module, $version ) = each %$value ) { $self->can($key)->($self, $module => $version ); } } else { $self->can($key)->($self, $value); } } return $self; } sub write { my $self = shift; return $self unless $self->is_admin; $self->admin->write_meta; return $self; } sub version_from { require ExtUtils::MM_Unix; my ( $self, $file ) = @_; $self->version( ExtUtils::MM_Unix->parse_version($file) ); # for version integrity check $self->makemaker_args( VERSION_FROM => $file ); } sub abstract_from { require ExtUtils::MM_Unix; my ( $self, $file ) = @_; $self->abstract( bless( { DISTNAME => $self->name }, 'ExtUtils::MM_Unix' )->parse_abstract($file) ); } # Add both distribution and module name sub name_from { my ($self, $file) = @_; if ( Module::Install::_read($file) =~ m/ ^ \s* package \s* ([\w:]+) \s* ; /ixms ) { my ($name, $module_name) = ($1, $1); $name =~ s{::}{-}g; $self->name($name); unless ( $self->module_name ) { $self->module_name($module_name); } } else { die("Cannot determine name from $file\n"); } } sub _extract_perl_version { if ( $_[0] =~ m/ ^\s* (?:use|require) \s* v? 
([\d_\.]+) \s* ; /ixms ) { my $perl_version = $1; $perl_version =~ s{_}{}g; return $perl_version; } else { return; } } sub perl_version_from { my $self = shift; my $perl_version=_extract_perl_version(Module::Install::_read($_[0])); if ($perl_version) { $self->perl_version($perl_version); } else { warn "Cannot determine perl version info from $_[0]\n"; return; } } sub author_from { my $self = shift; my $content = Module::Install::_read($_[0]); if ($content =~ m/ =head \d \s+ (?:authors?)\b \s* ([^\n]*) | =head \d \s+ (?:licen[cs]e|licensing|copyright|legal)\b \s* .*? copyright .*? \d\d\d[\d.]+ \s* (?:\bby\b)? \s* ([^\n]*) /ixms) { my $author = $1 || $2; # XXX: ugly but should work anyway... if (eval "require Pod::Escapes; 1") { # Pod::Escapes has a mapping table. # It's in core of perl >= 5.9.3, and should be installed # as one of the Pod::Simple's prereqs, which is a prereq # of Pod::Text 3.x (see also below). $author =~ s{ E<( (\d+) | ([A-Za-z]+) )> } { defined $2 ? chr($2) : defined $Pod::Escapes::Name2character_number{$1} ? chr($Pod::Escapes::Name2character_number{$1}) : do { warn "Unknown escape: E<$1>"; "E<$1>"; }; }gex; } elsif (eval "require Pod::Text; 1" && $Pod::Text::VERSION < 3) { # Pod::Text < 3.0 has yet another mapping table, # though the table name of 2.x and 1.x are different. # (1.x is in core of Perl < 5.6, 2.x is in core of # Perl < 5.9.3) my $mapping = ($Pod::Text::VERSION < 2) ? \%Pod::Text::HTML_Escapes : \%Pod::Text::ESCAPES; $author =~ s{ E<( (\d+) | ([A-Za-z]+) )> } { defined $2 ? chr($2) : defined $mapping->{$1} ? 
$mapping->{$1} : do { warn "Unknown escape: E<$1>"; "E<$1>"; }; }gex; } else { $author =~ s{E}{<}g; $author =~ s{E}{>}g; } $self->author($author); } else { warn "Cannot determine author info from $_[0]\n"; } } #Stolen from M::B my %license_urls = ( perl => 'http://dev.perl.org/licenses/', apache => 'http://apache.org/licenses/LICENSE-2.0', apache_1_1 => 'http://apache.org/licenses/LICENSE-1.1', artistic => 'http://opensource.org/licenses/artistic-license.php', artistic_2 => 'http://opensource.org/licenses/artistic-license-2.0.php', lgpl => 'http://opensource.org/licenses/lgpl-license.php', lgpl2 => 'http://opensource.org/licenses/lgpl-2.1.php', lgpl3 => 'http://opensource.org/licenses/lgpl-3.0.html', bsd => 'http://opensource.org/licenses/bsd-license.php', gpl => 'http://opensource.org/licenses/gpl-license.php', gpl2 => 'http://opensource.org/licenses/gpl-2.0.php', gpl3 => 'http://opensource.org/licenses/gpl-3.0.html', mit => 'http://opensource.org/licenses/mit-license.php', mozilla => 'http://opensource.org/licenses/mozilla1.1.php', open_source => undef, unrestricted => undef, restrictive => undef, unknown => undef, ); sub license { my $self = shift; return $self->{values}->{license} unless @_; my $license = shift or die( 'Did not provide a value to license()' ); $license = __extract_license($license) || lc $license; $self->{values}->{license} = $license; # Automatically fill in license URLs if ( $license_urls{$license} ) { $self->resources( license => $license_urls{$license} ); } return 1; } sub _extract_license { my $pod = shift; my $matched; return __extract_license( ($matched) = $pod =~ m/ (=head \d \s+ L(?i:ICEN[CS]E|ICENSING)\b.*?) (=head \d.*|=cut.*|)\z /xms ) || __extract_license( ($matched) = $pod =~ m/ (=head \d \s+ (?:C(?i:OPYRIGHTS?)|L(?i:EGAL))\b.*?) 
(=head \d.*|=cut.*|)\z /xms ); } sub __extract_license { my $license_text = shift or return; my @phrases = ( '(?:under )?the same (?:terms|license) as (?:perl|the perl (?:\d )?programming language)' => 'perl', 1, '(?:under )?the terms of (?:perl|the perl programming language) itself' => 'perl', 1, 'Artistic and GPL' => 'perl', 1, 'GNU general public license' => 'gpl', 1, 'GNU public license' => 'gpl', 1, 'GNU lesser general public license' => 'lgpl', 1, 'GNU lesser public license' => 'lgpl', 1, 'GNU library general public license' => 'lgpl', 1, 'GNU library public license' => 'lgpl', 1, 'GNU Free Documentation license' => 'unrestricted', 1, 'GNU Affero General Public License' => 'open_source', 1, '(?:Free)?BSD license' => 'bsd', 1, 'Artistic license 2\.0' => 'artistic_2', 1, 'Artistic license' => 'artistic', 1, 'Apache (?:Software )?license' => 'apache', 1, 'GPL' => 'gpl', 1, 'LGPL' => 'lgpl', 1, 'BSD' => 'bsd', 1, 'Artistic' => 'artistic', 1, 'MIT' => 'mit', 1, 'Mozilla Public License' => 'mozilla', 1, 'Q Public License' => 'open_source', 1, 'OpenSSL License' => 'unrestricted', 1, 'SSLeay License' => 'unrestricted', 1, 'zlib License' => 'open_source', 1, 'proprietary' => 'proprietary', 0, ); while ( my ($pattern, $license, $osi) = splice(@phrases, 0, 3) ) { $pattern =~ s#\s+#\\s+#gs; if ( $license_text =~ /\b$pattern\b/i ) { return $license; } } return ''; } sub license_from { my $self = shift; if (my $license=_extract_license(Module::Install::_read($_[0]))) { $self->license($license); } else { warn "Cannot determine license info from $_[0]\n"; return 'unknown'; } } sub _extract_bugtracker { my @links = $_[0] =~ m#L<( https?\Q://rt.cpan.org/\E[^>]+| https?\Q://github.com/\E[\w_]+/[\w_]+/issues| https?\Q://code.google.com/p/\E[\w_\-]+/issues/list )>#gx; my %links; @links{@links}=(); @links=keys %links; return @links; } sub bugtracker_from { my $self = shift; my $content = Module::Install::_read($_[0]); my @links = _extract_bugtracker($content); unless ( @links ) { 
warn "Cannot determine bugtracker info from $_[0]\n"; return 0; } if ( @links > 1 ) { warn "Found more than one bugtracker link in $_[0]\n"; return 0; } # Set the bugtracker bugtracker( $links[0] ); return 1; } sub requires_from { my $self = shift; my $content = Module::Install::_readperl($_[0]); my @requires = $content =~ m/^use\s+([^\W\d]\w*(?:::\w+)*)\s+(v?[\d\.]+)/mg; while ( @requires ) { my $module = shift @requires; my $version = shift @requires; $self->requires( $module => $version ); } } sub test_requires_from { my $self = shift; my $content = Module::Install::_readperl($_[0]); my @requires = $content =~ m/^use\s+([^\W\d]\w*(?:::\w+)*)\s+([\d\.]+)/mg; while ( @requires ) { my $module = shift @requires; my $version = shift @requires; $self->test_requires( $module => $version ); } } # Convert triple-part versions (eg, 5.6.1 or 5.8.9) to # numbers (eg, 5.006001 or 5.008009). # Also, convert double-part versions (eg, 5.8) sub _perl_version { my $v = $_[-1]; $v =~ s/^([1-9])\.([1-9]\d?\d?)$/sprintf("%d.%03d",$1,$2)/e; $v =~ s/^([1-9])\.([1-9]\d?\d?)\.(0|[1-9]\d?\d?)$/sprintf("%d.%03d%03d",$1,$2,$3 || 0)/e; $v =~ s/(\.\d\d\d)000$/$1/; $v =~ s/_.+$//; if ( ref($v) ) { # Numify $v = $v + 0; } return $v; } sub add_metadata { my $self = shift; my %hash = @_; for my $key (keys %hash) { warn "add_metadata: $key is not prefixed with 'x_'.\n" . 
"Use appopriate function to add non-private metadata.\n" unless $key =~ /^x_/; $self->{values}->{$key} = $hash{$key}; } } ###################################################################### # MYMETA Support sub WriteMyMeta { die "WriteMyMeta has been deprecated"; } sub write_mymeta_yaml { my $self = shift; # We need YAML::Tiny to write the MYMETA.yml file unless ( eval { require YAML::Tiny; 1; } ) { return 1; } # Generate the data my $meta = $self->_write_mymeta_data or return 1; # Save as the MYMETA.yml file print "Writing MYMETA.yml\n"; YAML::Tiny::DumpFile('MYMETA.yml', $meta); } sub write_mymeta_json { my $self = shift; # We need JSON to write the MYMETA.json file unless ( eval { require JSON; 1; } ) { return 1; } # Generate the data my $meta = $self->_write_mymeta_data or return 1; # Save as the MYMETA.yml file print "Writing MYMETA.json\n"; Module::Install::_write( 'MYMETA.json', JSON->new->pretty(1)->canonical->encode($meta), ); } sub _write_mymeta_data { my $self = shift; # If there's no existing META.yml there is nothing we can do return undef unless -f 'META.yml'; # We need Parse::CPAN::Meta to load the file unless ( eval { require Parse::CPAN::Meta; 1; } ) { return undef; } # Merge the perl version into the dependencies my $val = $self->Meta->{values}; my $perl = delete $val->{perl_version}; if ( $perl ) { $val->{requires} ||= []; my $requires = $val->{requires}; # Canonize to three-dot version after Perl 5.6 if ( $perl >= 5.006 ) { $perl =~ s{^(\d+)\.(\d\d\d)(\d*)}{join('.', $1, int($2||0), int($3||0))}e } unshift @$requires, [ perl => $perl ]; } # Load the advisory META.yml file my @yaml = Parse::CPAN::Meta::LoadFile('META.yml'); my $meta = $yaml[0]; # Overwrite the non-configure dependency hashs delete $meta->{requires}; delete $meta->{build_requires}; delete $meta->{recommends}; if ( exists $val->{requires} ) { $meta->{requires} = { map { @$_ } @{ $val->{requires} } }; } if ( exists $val->{build_requires} ) { $meta->{build_requires} = { map { @$_ 
} @{ $val->{build_requires} } }; } return $meta; } 1; Web-Scraper-0.37/inc/Module/Install/Repository.pm000644 000765 000024 00000004256 12040350115 022636 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::Repository; use strict; use 5.005; use vars qw($VERSION); $VERSION = '0.06'; use base qw(Module::Install::Base); sub _execute { my ($command) = @_; `$command`; } sub auto_set_repository { my $self = shift; return unless $Module::Install::AUTHOR; my $repo = _find_repo(\&_execute); if ($repo) { $self->repository($repo); } else { warn "Cannot determine repository URL\n"; } } sub _find_repo { my ($execute) = @_; if (-e ".git") { # TODO support remote besides 'origin'? if ($execute->('git remote show -n origin') =~ /URL: (.*)$/m) { # XXX Make it public clone URL, but this only works with github my $git_url = $1; $git_url =~ s![\w\-]+\@([^:]+):!git://$1/!; return $git_url; } elsif ($execute->('git svn info') =~ /URL: (.*)$/m) { return $1; } } elsif (-e ".svn") { if (`svn info` =~ /URL: (.*)$/m) { return $1; } } elsif (-e "_darcs") { # defaultrepo is better, but that is more likely to be ssh, not http if (my $query_repo = `darcs query repo`) { if ($query_repo =~ m!Default Remote: (http://.+)!) { return $1; } } open my $handle, '<', '_darcs/prefs/repos' or return; while (<$handle>) { chomp; return $_ if m!^http://!; } } elsif (-e ".hg") { if ($execute->('hg paths') =~ /default = (.*)$/m) { my $mercurial_url = $1; $mercurial_url =~ s!^ssh://hg\@(bitbucket\.org/)!https://$1!; return $mercurial_url; } } elsif (-e "$ENV{HOME}/.svk") { # Is there an explicit way to check if it's an svk checkout? my $svk_info = `svk info` or return; SVK_INFO: { if ($svk_info =~ /Mirrored From: (.*), Rev\./) { return $1; } if ($svk_info =~ m!Merged From: (/mirror/.*), Rev\.!) 
{ $svk_info = `svk info /$1` or return; redo SVK_INFO; } } return; } } 1; __END__ =encoding utf-8 #line 128 Web-Scraper-0.37/inc/Module/Install/Scripts.pm000644 000765 000024 00000001011 12040350115 022070 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::Scripts; use strict 'vars'; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub install_script { my $self = shift; my $args = $self->makemaker_args; my $exe = $args->{EXE_FILES} ||= []; foreach ( @_ ) { if ( -f $_ ) { push @$exe, $_; } elsif ( -d 'script' and -f "script/$_" ) { push @$exe, "script/$_"; } else { die("Cannot find script '$_'"); } } } 1; Web-Scraper-0.37/inc/Module/Install/Win32.pm000644 000765 000024 00000003403 12040350115 021352 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::Win32; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } # determine if the user needs nmake, and download it if needed sub check_nmake { my $self = shift; $self->load('can_run'); $self->load('get_file'); require Config; return unless ( $^O eq 'MSWin32' and $Config::Config{make} and $Config::Config{make} =~ /^nmake\b/i and ! $self->can_run('nmake') ); print "The required 'nmake' executable not found, fetching it...\n"; require File::Basename; my $rv = $self->get_file( url => 'http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe', ftp_url => 'ftp://ftp.microsoft.com/Softlib/MSLFILES/Nmake15.exe', local_dir => File::Basename::dirname($^X), size => 51928, run => 'Nmake15.exe /o > nul', check_for => 'Nmake.exe', remove => 1, ); die <<'END_MESSAGE' unless $rv; ------------------------------------------------------------------------------- Since you are using Microsoft Windows, you will need the 'nmake' utility before installation. 
It's available at: http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe or ftp://ftp.microsoft.com/Softlib/MSLFILES/Nmake15.exe Please download the file manually, save it to a directory in %PATH% (e.g. C:\WINDOWS\COMMAND\), then launch the MS-DOS command line shell, "cd" to that directory, and run "Nmake15.exe" from there; that will create the 'nmake.exe' file needed by this module. You may then resume the installation process described in README. ------------------------------------------------------------------------------- END_MESSAGE } 1; Web-Scraper-0.37/inc/Module/Install/WriteAll.pm000644 000765 000024 00000002376 12040350115 022203 0ustar00miyagawastaff000000 000000 #line 1 package Module::Install::WriteAll; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = qw{Module::Install::Base}; $ISCORE = 1; } sub WriteAll { my $self = shift; my %args = ( meta => 1, sign => 0, inline => 0, check_nmake => 1, @_, ); $self->sign(1) if $args{sign}; $self->admin->WriteAll(%args) if $self->is_admin; $self->check_nmake if $args{check_nmake}; unless ( $self->makemaker_args->{PL_FILES} ) { # XXX: This still may be a bit over-defensive... unless ($self->makemaker(6.25)) { $self->makemaker_args( PL_FILES => {} ) if -f 'Build.PL'; } } # Until ExtUtils::MakeMaker support MYMETA.yml, make sure # we clean it up properly ourself. $self->realclean_files('MYMETA.yml'); if ( $args{inline} ) { $self->Inline->write; } else { $self->Makefile->write; } # The Makefile write process adds a couple of dependencies, # so write the META.yml files after the Makefile. 
if ( $args{meta} ) { $self->Meta->write; } # Experimental support for MYMETA if ( $ENV{X_MYMETA} ) { if ( $ENV{X_MYMETA} eq 'JSON' ) { $self->Meta->write_mymeta_json; } else { $self->Meta->write_mymeta_yaml; } } return 1; } 1; Web-Scraper-0.37/eg/dave-trailer-HD.pl000755 000765 000024 00000001204 11162225735 020323 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use lib "lib"; use Web::Scraper; use URI; use YAML; # extract HD trailers from Dave's trailer page my $uri = URI->new("http://www.drfoster.f2s.com/"); my $s = scraper { process "td>ul>li", "trailers[]" => scraper { process_first "li>b", title => "TEXT"; process_first "ul>li>a[href]", url => '@href'; process "ul>li>ul>li>a", "movies[]" => sub { my $elem = shift; return { text => $elem->as_text, href => $elem->attr('href'), }; }; }; result "trailers"; }; warn Dump $s->scrape($uri); Web-Scraper-0.37/eg/ebay-auction.pl000755 000765 000024 00000001145 11162225735 020037 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use URI; use lib "lib"; use Web::Scraper; my $ebay_auction = scraper { process "h3.ens>a", description => 'TEXT', url => '@href'; process "td.ebcPr>span", price => "TEXT"; process "div.ebPicture >a>img", image => '@src'; result 'description', 'url', 'price', 'image'; }; my $ebay = scraper { process "table.ebItemlist tr.single", "auctions[]" => $ebay_auction; result 'auctions'; }; my $auctions = $ebay->scrape( URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS") ); use YAML; warn Dump $auctions; Web-Scraper-0.37/eg/extract-links.pl000755 000765 000024 00000000447 11162225735 020253 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use URI; use lib "lib"; use Web::Scraper; my $uri = shift @ARGV or die "URI needed"; my $scraper = scraper { process "a[href]", "urls[]" => '@href'; result 'urls'; }; my $links = $scraper->scrape(URI->new($uri)); use YAML; warn Dump $links; 
Web-Scraper-0.37/eg/hatena-keyword.pl000755 000765 000024 00000001202 11162225735 020373 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use lib "lib"; use URI; use Web::Scraper; # same as http://d.hatena.ne.jp/secondlife/20060922/1158923779 my $keyword = scraper { process 'span.title > a:first-child', title => 'TEXT', url => '@href'; process 'span.furigana', furigana => 'TEXT'; process 'ul.list-circle > li:first-child > a', category => 'TEXT'; }; my $res = $keyword->scrape(URI->new("http://d.hatena.ne.jp/keyword/%BA%B0%CC%EE%A4%A2%A4%B5%C8%FE")); use YAML; warn Dump $res; __END__ --- category: アイドル furigana: こんのあさみ title: 紺野あさ美 url: /keyword/%ba%b0%cc%ee%a4%a2%a4%b5%c8%fe?kid=800 Web-Scraper-0.37/eg/jp-playstation-store.pl000755 000765 000024 00000000453 11162225735 021570 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use Web::Scraper; use URI; use YAML; my $stuff = URI->new("http://www.jp.playstation.com/store/"); my $scraper = scraper { process "#Sinfo p a", 'news[]' => { link => '@href', title => 'TEXT' }; }; my $result = $scraper->scrape($stuff); print YAML::Dump $result; Web-Scraper-0.37/eg/rel-tag.pl000755 000765 000024 00000000770 11162225735 017015 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl # Extract tags from web pages that have rel-tag microformat use strict; use warnings; use URI; use URI::Escape; use Web::Scraper; use YAML; my $uri = shift or die "Usage: rel-tag.pl URL\n"; my $scraper = scraper { process 'a[rel~="tag"]', 'tags[]' => sub { my $uri = URI->new($_->attr('href')); my $label = (grep length, split '/', $uri->path)[-1]; $label =~ s/\+/%20/g; uri_unescape($label); }; }; warn Dump $scraper->scrape(URI->new($uri)); Web-Scraper-0.37/eg/twitter-friends.pl000755 000765 000024 00000000723 11162225735 020612 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use lib "lib"; use URI; use Web::Scraper; my $nick = shift || "miyagawa"; my $uri = 
URI->new("http://twitter.com/$nick"); my $twitter = scraper { process 'a[rel=~"contact"]', 'friends[]' => scraper { process 'a', url => '@href', name => '@title'; process 'img', src => '@src'; }; result 'friends'; }; my $friends = $twitter->scrape($uri); use YAML; warn Dump $friends; Web-Scraper-0.37/bin/scraper000755 000765 000024 00000005057 11162225735 016667 0ustar00miyagawastaff000000 000000 #!/usr/bin/perl use strict; use warnings; use Config; use Term::ReadLine; use Data::Dumper; use HTML::Entities; use URI; use Web::Scraper; use YAML; sub WARN() { return sub { warn $_->isTextNode ? HTML::Entities::encode($_->as_XML, q("'<>&)) : $_->as_HTML(q('"&<>), "", {}); }; } my $print = sub { if ($ENV{PAGER}) { open my $pager, "|$ENV{PAGER}"; print $pager @_; } else { print @_; } }; my(@stack, $source); my $stuff = process_args($ARGV[0]) or die "Usage: scraper [URI-or-filename]\n"; my $term = Term::ReadLine->new("Web::Scraper"); my $scraper = scraper { run_loop($_[0], $term) }; $scraper->user_agent->env_proxy; my $result = $scraper->scrape($stuff); sub process_args { my $uri = shift; if (!-t STDIN and my $content = join "", ) { $source = [ 'stdin' ]; return \$content; } elsif ($uri && $uri =~ m!^https?://!) 
{ $source = [ "URI", $uri ]; return URI->new($uri); } elsif ($uri && -e $uri) { $source = [ 'file', $uri ]; open my $fh, "<", $uri or die "$uri: $!"; return join "", <$fh>; } return; } sub run_loop { my($tree, $term) = @_; while (defined(my $in = $term->readline("scraper> "))) { if ($in eq 'd') { $Data::Dumper::Indent = 1; warn Dumper result; } elsif ($in eq 'y') { warn Dump result; } elsif ($in eq 's') { $print->($tree->as_HTML(q('"&<>), " ", {})); } elsif ($in eq 'q') { return; } elsif ($in eq 'c') { print generate_code($source, $stack[-1]); } elsif ($in =~ /^c\s+all\s*$/) { print generate_code($source, @stack); } else { my $res = eval $in; warn $@ if $@; push @stack, $in unless $@; } } } sub generate_code { my($source, @stack) = @_; my $code_stack = join "\n", map { " $_" . (/;$/ ? "" : ";") } @stack; my($var, $stuff) = $source->[0] eq 'stdin' ? ('$input', '\join "", ') : $source->[0] eq 'URI' ? ('$uri', qq(URI->new("$source->[1]"))) : $source->[0] eq 'file' ? ('$file', qq(\\do { my \$file = "$source->[1]"; open my \$fh, \$file or die "\$file: \$!"; join '', <\$fh> })) : '...'; return <scrape($var); CODE }