Web-Scraper-0.37/ 000755 000765 000024 00000000000 12040350116 014507 5 ustar 00miyagawa staff 000000 000000 Web-Scraper-0.37/.gitignore 000644 000765 000024 00000000063 11162227072 016506 0 ustar 00miyagawa staff 000000 000000 META.yml
Makefile
inc/
pm_to_blib
*~
t/libxml-*.t
Web-Scraper-0.37/bin/ 000755 000765 000024 00000000000 12040350116 015257 5 ustar 00miyagawa staff 000000 000000 Web-Scraper-0.37/Changes 000644 000765 000024 00000015736 12040350075 016022 0 ustar 00miyagawa staff 000000 000000 Revision history for Perl extension Web::Scraper
0.37 Fri Oct 19 15:09:17 PDT 2012
- Repack with the latest Module::Install
0.36 Sat Nov 19 12:12:54 PST 2011
- Support HTML5 tags by not ignoring unknown tags (leedo)
0.35 Mon Sep 26 18:40:06 PDT 2011
- Added support for comments() XPath #3 (Perlover)
0.34 Thu Feb 24 09:35:12 PST 2011
- Skip xml_simple.t if LibXML is not there (omega)
0.33 Thu Feb 17 09:12:55 PST 2011
- Remove failing invalid XPath tests
0.32 Wed Feb 3 22:13:01 PST 2010
- Removes poking around charset and LWP's decoded_content
(Thanks to flatwhatson)
- More docs (jshirley)
0.31 Sun Jul 19 00:43:54 PDT 2009
- Use new LWP's content_charset method instead of HTTP::Response::Encoding
(Thanks to hanekomu)
0.30 Wed Jul 8 15:47:21 PDT 2009
- No warnings when use()d multiple times in the same package
0.29 Wed Jul 8 13:40:14 PDT 2009
- Adds Web::Scraper::LibXML which uses HTML::TreeBuilder::LibXML
(without the replace_original hack)
0.28 Sat Mar 28 14:31:45 PDT 2009
- Call ->eof when parsing with HTML::TreeBuilder
(Thanks to Tokuhiro Matsuno)
0.27 Tue Mar 24 12:09:04 PDT 2009
- Added tests to use HTML::TreeBuilder::LibXML
(Thanks to Tokuhiro Matsuno)
0.26 Thu Jan 15 11:37:56 PST 2009
- Fixed an error message when GET request fails
0.25 Sun Jan 11 13:36:44 PST 2009
- scrape() now accepts HTTP::Response as well for Remedie/Plagger
- repository moved to github http://github.com/miyagawa/web-scraper/tree/master
0.24 Sun Nov 25 15:58:38 PST 2007
- Support duck typing in filter args to take object that has 'filter' method
This could give Web::Scraper::Filter::Pipe a better interface
(Thanks to hanekomu and tokuhirom)
0.23 Sat Nov 24 17:21:14 PST 2007
- Upped Web::Scraper dependency
- Skip & test until HTML::TreeBuilder::XPath fixes it
- removed eg/search-cpan.pl
0.22 Wed Oct 17 17:51:54 PDT 2007
- 's' on scraper shell now prints to pager (e.g. less) if PAGER is set
0.21_01 Thu Oct 4 01:05:00 PDT 2007
- Added an experimental filter support
(Thanks to hirose31, tokuhirom and Yappo for brainstorming)
0.21 Wed Oct 3 10:37:13 PDT 2007
- Bumped up HTML::TreeBuilder dependency to fix 12_html.t issues
[rt.cpan.org #29733]
0.20 Wed Oct 3 00:28:13 PDT 2007
- Fixed a bug where URI is not absolutized with a hash reference value
- Added eg/jp-playstation-store.pl
0.19 Thu Sep 20 22:42:30 PDT 2007
- Try to get HTML encoding from META tags as well, when there's
no charset value in HTTP response header.
0.18 Thu Sep 20 19:49:11 PDT 2007
- Fixed a bug where URI is not absolutized when scraper is nested
- Use as_XML not as_HTML in 'RAW'
0.17 Wed Sep 19 19:12:25 PDT 2007
- Reverted Term::Encoding support since it causes segfaults
(double utf-8 encoding) in some environment
0.16 Tue Sep 18 04:48:47 PDT 2007
- Support 'RAW' and 'TEXT' for TextNode object
- Call Term::Encoding from scraper shell if installed
0.15 Sat Sep 15 21:28:10 PDT 2007
- Call env_proxy in scraper CLI
- Added $Web::Scraper::UserAgent and $scraper->user_agent accessor to deal
with UserAgent object
- Don't escape non-ASCII characters into XXXX; in scraper shell 's' and WARN
0.14 Fri Sep 14 16:06:20 PDT 2007
- Fix bin/scraper to work with older Term::ReadLine.
(Thanks to Tina Müller [RT:29079])
- Now link elements like img@src and a@href are automatically
converted to absolute URI using the current URI as a base.
Only effective when you do $s->scrape(URI) or $s->scrape(\$html, URI)
- Added 'HTML' and its alias 'RAW' to get the HTML chunk inside the tag
process "script", "code" => 'RAW';
Handy if you want the raw HTML code inside
--- selector
script
--- expected
function foo() {
return bar;
}
=== a
--- html
foo bar
--- selector
a
--- expected
foo bar
=== div
--- html
--- selector
#foo
--- expected
foo bar
bar
=== non-ascii
--- html
テスト
--- selector
#foo
--- expected
テスト
=== textarea
--- html
--- selector
textarea
--- expected
\n
foo
bar
\n
baz
Web-Scraper-0.37/t/13_textnode.t 000644 000765 000024 00000001520 11162225735 017306 0 ustar 00miyagawa staff 000000 000000 use strict;
use Test::Base;
use Web::Scraper;
# One assertion per data block found in the __DATA__ section.
plan tests => 1 * blocks;
# Strip the trailing newline from these sections before they are used.
filters {
selector => 'chomp',
want => 'chomp',
expected => 'chomp',
};
# For every block: build a scraper that applies the block's selector and
# extracts the value kind named in its "want" section, scrape the block's
# HTML, and compare the single result with the "expected" section.
run {
my $block = shift;
my $s = scraper {
process $block->selector, want => $block->want;
result 'want';
};
my $want = $s->scrape($block->html);
is $want, $block->expected, $block->name;
};
__DATA__
=== TEXT
--- html
foo bar
--- selector
//p/node()[2]
--- want
TEXT
--- expected
bar
=== TEXT
--- html
foo bar
--- selector
//p/node()[2]
--- want
TEXT
--- expected
bar
=== TEXT
--- html
foo bar & baz
--- selector
//p/node()[2]
--- want
TEXT
--- expected
bar & baz
=== RAW HTML
--- SKIP
--- html
foo bar & baz
--- selector
//p/node()[2]
--- want
RAW
--- expected
bar & baz
Web-Scraper-0.37/t/14_absolute_nested.t 000644 000765 000024 00000001157 11162225735 020643 0 ustar 00miyagawa staff 000000 000000 use strict;
use Test::Base;
use Web::Scraper;
# One assertion per data block found in the __DATA__ section.
plan tests => 1 * blocks;
# Strip the trailing newline from these sections before they are used.
filters {
selector => 'chomp',
expected => 'chomp',
};
# For every block: the outer scraper hands each matched node to a nested
# scraper that extracts the @src of an <img>. The block's url is passed as
# the second argument to scrape(), and the result is compared with
# "expected".
run {
my $block = shift;
my $s = scraper {
process $block->selector, want => scraper {
process "img", image => '@src';
result "image";
};
result 'want';
};
my $want = $s->scrape($block->html, $block->url);
is $want, $block->expected, $block->name;
};
__DATA__
===
--- url: http://example.com/
--- html
--- selector
a#foo
--- expected
http://example.com/foo.jpg
Web-Scraper-0.37/t/15_absolute_hash.t 000644 000765 000024 00000001041 11162225735 020275 0 ustar 00miyagawa staff 000000 000000 use strict;
use Test::Base;
use Web::Scraper;
# One assertion per data block found in the __DATA__ section.
plan tests => 1 * blocks;
# Strip the trailing newline from the expected value.
filters {
expected => 'chomp',
};
# For every block: collect matched nodes into 'want[]' using a hash-reference
# spec ({ link => '@href' }), passing the block's url as the second argument
# to scrape(), then compare the first element's link with "expected".
run {
my $block = shift;
my $s = scraper {
process $block->selector, 'want[]' => { link => '@href' };
result 'want';
};
my $want = $s->scrape($block->html, $block->url);
is $want->[0]->{link}, $block->expected, $block->name;
};
__DATA__
===
--- url: http://example.com/
--- html
--- selector
a#foo
--- expected
http://example.com/foo.html
Web-Scraper-0.37/t/16_filter.t 000644 000765 000024 00000002535 11162225735 016753 0 ustar 00miyagawa staff 000000 000000 use strict;
use Test::Base;
use Web::Scraper;
# One assertion per data block found in the __DATA__ section.
plan tests => 1 * blocks;

filters {
    expected => 'chomp',    # drop the trailing newline of the expected text
    want     => 'eval',     # "want" is Perl source for the extraction spec
};

# Each block supplies a [ 'TEXT', @filters ] spec. The text of the single <a>
# element ("foo") is run through those filters and compared with "expected".
run {
    my $block = shift;
    my $s = scraper {
        process 'a', want => $block->want;
        result 'want';
    };
    # The document has to contain an <a> element, otherwise `process 'a'`
    # matches nothing and the stash stays empty for every block.
    my $want = $s->scrape('<a>foo</a>');
    # The literal string "undef" in the data stands for a real undef.
    my $expected = $block->expected eq 'undef' ? undef : $block->expected;
    is $want, $expected, $block->name;
};

# Filter classes that the data blocks refer to by short name ('foo', 'bar').
BEGIN {
    package Web::Scraper::Filter::foo;
    use base qw( Web::Scraper::Filter );
    sub filter { tr/a-z/b-za/ }     # rotates the lowercase letters of $_ by one

    package Web::Scraper::Filter::bar;
    use base qw( Web::Scraper::Filter );
    sub filter { $_[1] . 'bar' }    # returns the value argument with "bar" appended
}

package main;
__DATA__
=== tr
--- want
['TEXT', 'foo']
--- expected
gpp
=== shift + return
--- want
['TEXT', 'bar']
--- expected
foobar
=== inline callback
--- want
['TEXT', sub { return "baz" } ]
--- expected
baz
=== inline callback + s///
--- want
['TEXT', sub { s/foo/bax/ } ]
--- expected
bax
=== stack
--- want
['TEXT', 'bar', 'foo' ]
--- expected
gppcbs
=== stack
--- want
['TEXT', 'bar', sub { s/foo/bar/ } ]
--- expected
barbar
=== no match
--- want
['TEXT', sub { s/xxx/yyy/g }]
--- expected
foo
=== undef
--- want
['TEXT', sub { return }]
--- expected
undef
=== number
--- want
['TEXT', sub { return 3 }]
--- expected
3
=== object
--- want
['TEXT', Web::Scraper::Filter::foo->new]
--- expected
gpp
Web-Scraper-0.37/t/17_filter_loop.t 000644 000765 000024 00000000702 11162225735 017777 0 ustar 00miyagawa staff 000000 000000 use strict;
use Test::Base;
use Web::Scraper;
# One assertion per data block found in the __DATA__ section.
plan tests => 1 * blocks;

filters {
    expected => 'yaml',    # "expected" is a YAML list of filtered values
    want     => 'eval',    # "want" is Perl source for the extraction spec
};

# Checks that a filter is applied to every node collected by a 'key[]' spec.
run {
    my $block = shift;
    my $s = scraper {
        process 'a', 'want[]' => $block->want;
        result 'want';
    };
    # Two <a> elements are needed so that the filter runs once per node and
    # yields the two expected entries (foo -> gpp, bar -> cbs).
    my $want = $s->scrape('<a>foo</a><a>bar</a>');
    is_deeply $want, $block->expected, $block->name;
};
__DATA__
=== tr
--- want
['TEXT', sub { tr/a-z/b-za/ }]
--- expected
- gpp
- cbs
Web-Scraper-0.37/t/18_http_response.t 000644 000765 000024 00000001214 11162225735 020356 0 ustar 00miyagawa staff 000000 000000 use strict;
use warnings;
use URI;
use LWP::UserAgent;
use Web::Scraper;
use Test::More;
# Live network test: only runs when LIVE_TEST or TEST_ALL is set in the
# environment.
plan skip_all => "LIVE_TEST not enabled"
unless $ENV{LIVE_TEST} || $ENV{TEST_ALL};
plan tests => 2;
my $ua = LWP::UserAgent->new;
# Case 1: pass the response object returned by $ua->get straight to scrape()
# and read the page title.
{
my $res = $ua->get("http://www.yahoo.co.jp/");
my $result = scraper {
process 'title', title => 'TEXT';
}->scrape($res);
is $result->{title}, 'Yahoo! JAPAN';
}
# Case 2: an image @src scraped from a response is expected to come back as
# an absolute URI even though no base URI is passed explicitly.
{
my $res = $ua->get("http://b.hatena.ne.jp/");
my $result = scraper {
process 'img.csschanger', image => '@src';
}->scrape($res);
is $result->{image}, 'http://b.hatena.ne.jp/images/logo1.gif', 'Absolute URI';
}
Web-Scraper-0.37/t/19_decode_content.t 000644 000765 000024 00000001276 11332462511 020441 0 ustar 00miyagawa staff 000000 000000 use strict;
use warnings;
use URI;
use LWP::UserAgent;
use Web::Scraper;
use Test::More;
# Live network test: only runs when LIVE_TEST or TEST_ALL is set in the
# environment.
plan skip_all => "LIVE_TEST not enabled"
unless $ENV{LIVE_TEST} || $ENV{TEST_ALL};
plan tests => 2;
my $ua = LWP::UserAgent->new;
# Ask servers for gzip-compressed bodies so the scrape below has to cope
# with an encoded response.
$ua->default_header('Accept-Encoding' => 'gzip');
# Case 1: the title must still be readable from the response.
{
my $res = $ua->get("http://www.yahoo.co.jp/");
my $result = scraper {
process 'title', title => 'TEXT';
}->scrape($res);
is $result->{title}, 'Yahoo! JAPAN';
}
# Case 2: link attributes must still be turned into absolute URIs.
{
my $res = $ua->get("http://b.hatena.ne.jp/");
my $result = scraper {
process 'img.csschanger', image => '@src';
}->scrape($res);
is $result->{image}, 'http://b.hatena.ne.jp/images/logo1.gif', 'Absolute URI';
}
Web-Scraper-0.37/t/20_comment_nodes.t 000644 000765 000024 00000002272 11640224677 020316 0 ustar 00miyagawa staff 000000 000000 use strict;
use Test::Base;
use utf8;
use Web::Scraper;
# Comment-node support needs both tree builders at the listed minimum
# versions; skip the whole file when either one fails to load.
plan skip_all => "Please upgrade HTML::TreeBuilder::XPath and HTML::TreeBuilder::LibXML modules for comment nodes supporting"
unless
eval "use HTML::TreeBuilder::XPath 0.14; 1" &&
eval "use HTML::TreeBuilder::LibXML 0.13; 1";
# One assertion per data block found in the __DATA__ section.
plan tests => 1 * blocks;
filters {
selector => 'chomp',
expected => [ 'chomp', 'newline' ],
html => 'newline',
};
# Custom Test::Base filter: a literal backslash-n followed by a real newline
# in the data collapses to a single newline.
sub newline {
s/\\n\n/\n/g;
}
# Make the test output handles UTF-8 aware so that a failing test with
# non-ASCII text does not emit "Wide character" warnings.
my $builder = Test::More->builder;
binmode $builder->output, ":utf8";
binmode $builder->failure_output, ":utf8";
binmode $builder->todo_output, ":utf8";
# For every block: extract TEXT from the node matched by the selector and
# compare it with "expected".
run {
my $block = shift;
my $s = scraper {
process $block->selector, want => 'TEXT';
result 'want';
};
my $want = $s->scrape($block->html);
is $want, $block->expected, $block->name;
};
__DATA__
=== comment in p
--- html
This is a paragraph bla bla bla
--- selector
//p/comment()
--- expected
This is the comment
=== non-ascii comment
--- html
Bla bla bla
--- selector
//p/comment()
--- expected
テスト
Web-Scraper-0.37/t/21_html5.t 000644 000765 000024 00000001157 11662006662 016513 0 ustar 00miyagawa staff 000000 000000 use strict;
use Test::Base;
use utf8;
use Web::Scraper;
# One assertion per data block found in the __DATA__ section.
plan tests => 1 * blocks;
filters {
selector => 'chomp',
expected => [ 'chomp', 'newline' ],
html => 'newline',
};
# Custom Test::Base filter: a literal backslash-n followed by a real newline
# in the data collapses to a single newline.
sub newline {
s/\\n\n/\n/g;
}
# For every block: extract the inner HTML of the node matched by the selector
# and compare it with "expected".
run {
my $block = shift;
my $s = scraper {
process $block->selector, want => 'HTML';
result 'want';
};
my $want = $s->scrape($block->html);
is $want, $block->expected, $block->name;
};
__DATA__
=== header
--- html
--- selector
header
--- expected
hello
=== section
--- html
--- selector
header
--- expected
hello
Web-Scraper-0.37/t/redefine.t 000644 000765 000024 00000000177 11225220721 016727 0 ustar 00miyagawa staff 000000 000000 BEGIN {
use Test::More 'no_plan';
$SIG{__WARN__} = sub { fail shift };
}
use Web::Scraper;
use Web::Scraper;
ok 1;
Web-Scraper-0.37/t/xml-simple.t 000644 000765 000024 00000001547 11531513276 017251 0 ustar 00miyagawa staff 000000 000000 use strict;
use Test::Requires qw(HTML::TreeBuilder::LibXML);
use Test::Base;
use Web::Scraper::LibXML;
# "expected" is split into lines and each line is chomped, giving a list.
filters { expected => [ 'lines', 'chomp' ] };
# One assertion per data block found after __END__.
plan tests => 1 * blocks;
# For every block: collect the values selected by "selector"/"get" into
# 'value[]' using the LibXML-backed scraper and compare the list with the
# expected lines.
run {
my $block = shift;
my $s = scraper {
process $block->selector, "value[]", $block->get;
};
my $r = $s->scrape($block->input);
is_deeply $r->{value}, [ $block->expected ];
};
__END__
===
--- input
bar
--- selector: foo
--- get: TEXT
--- expected
bar
===
--- input
baz
bax
--- selector: foo>bar
--- get: TEXT
--- expected
baz
bax
===
--- input
--- selector: bar
--- get: @attr
--- expected
test bar
Hello & World
Web-Scraper-0.37/lib/Web/ 000755 000765 000024 00000000000 12040350116 015772 5 ustar 00miyagawa staff 000000 000000 Web-Scraper-0.37/lib/Web/Scraper/ 000755 000765 000024 00000000000 12040350116 017371 5 ustar 00miyagawa staff 000000 000000 Web-Scraper-0.37/lib/Web/Scraper.pm 000644 000765 000024 00000027731 12040350102 017734 0 ustar 00miyagawa staff 000000 000000 package Web::Scraper;
use strict;
use warnings;
use 5.008001;
use Carp;
use Scalar::Util qw(blessed);
use List::Util qw(first);
use HTML::Entities;
use HTML::Tagset;
use HTML::TreeBuilder::XPath;
use HTML::Selector::XPath;
use UNIVERSAL::require;
our $VERSION = '0.37';
# Exports the scraping DSL into the package that says `use Web::Scraper`:
# a `scraper` block constructor bound to $class, plus process, process_first
# and result.
sub import {
    my($class) = @_;
    my $target = caller;

    # These are trampolines rather than direct aliases: scrape() swaps the
    # real implementations in with `local`, and the goto looks them up at
    # call time.
    my %trampoline_for = (
        process       => sub { goto &process },
        process_first => sub { goto &process_first },
    );

    no strict 'refs';
    no warnings 'redefine';

    *{"$target\::scraper"} = _build_scraper($class);
    while (my($name, $code) = each %trampoline_for) {
        *{"$target\::$name"} = $code;
    }
    *{"$target\::result"} = sub { goto &result };
}
# Shared user agent used when a scraper has none of its own. It can be
# replaced from outside by assigning to $Web::Scraper::UserAgent.
our $UserAgent;

# Returns the shared LWP::UserAgent, creating it on first use with an agent
# string of "<package>/<version>".
sub __ua {
    require LWP::UserAgent;
    unless ($UserAgent) {
        $UserAgent = LWP::UserAgent->new(agent => __PACKAGE__ . "/" . $VERSION);
    }
    return $UserAgent;
}
# Accessor for the per-scraper user agent.
#   $scraper->user_agent($ua)  stores $ua on the scraper
#   $scraper->user_agent       returns the stored agent, or the shared
#                              default from __ua when none (or a false one)
#                              is stored
sub user_agent {
    my($self, @args) = @_;
    if (@args) {
        $self->{user_agent} = $args[0];
    }
    return $self->{user_agent} || __ua();
}
# Class-method constructor: wraps $coderef (the scraper block) in a new
# object of $class, stored under the 'code' key.
sub define {
    my $class   = shift;
    my $coderef = shift;
    my %self = (code => $coderef);
    return bless \%self, $class;
}
# Returns the `scraper` keyword for $class: a block-prototyped (&) closure
# that wraps the given code reference in a new $class object.
sub _build_scraper {
    my($class) = @_;
    my $keyword = sub(&) {
        my $coderef = shift;
        return bless { code => $coderef }, $class;
    };
    return $keyword;
}
# Runs the scraper block against a document and returns the extracted data.
#
# $stuff may be:
#   - a URI object           : fetched with the scraper's user agent
#   - an HTTP::Response      : its decoded content is parsed; croaks if the
#                              response is not successful
#   - an HTML::Element       : cloned and used as the tree directly
#   - a scalar reference     : dereferenced and parsed as HTML
#   - anything else          : parsed as an HTML string
# $current is the optional base URI used to absolutize link attributes; for
# an HTTP::Response it defaults to the request URI.
#
# Returns whatever the block returned if the block called result(), otherwise
# the stash hash reference filled in by process/process_first. Exceptions
# thrown by the block are re-thrown after the tree has been released.
sub scrape {
    my $self = shift;
    my($stuff, $current) = @_;

    my($html, $tree);

    if (blessed($stuff) && $stuff->isa('URI')) {
        my $ua  = $self->user_agent;
        my $res = $ua->get($stuff);
        return $self->scrape($res, $stuff->as_string);
    } elsif (blessed($stuff) && $stuff->isa('HTTP::Response')) {
        if ($stuff->is_success) {
            $html = $stuff->decoded_content;
        } else {
            croak "GET " . $stuff->request->uri . " failed: ", $stuff->status_line;
        }
        $current ||= $stuff->request->uri;
    } elsif (blessed($stuff) && $stuff->isa('HTML::Element')) {
        # Work on a copy: the tree is deleted at the end of this call.
        $tree = $stuff->clone;
    } elsif (ref($stuff) && ref($stuff) eq 'SCALAR') {
        $html = $$stuff;
    } else {
        $html = $stuff;
    }

    $tree ||= $self->build_tree($html);

    my $stash = {};
    no warnings 'redefine';
    # Swap the real DSL implementations in for the duration of this call;
    # `local` restores the previous ones (stubs or an outer scraper's) on exit.
    local *process       = create_process(0, $tree, $stash, $current);
    local *process_first = create_process(1, $tree, $stash, $current);

    my $retval;
    local *result = sub {
        $retval++;    # remember that the block asked for an explicit result
        my @keys = @_;

        if (@keys == 1) {
            return $stash->{$keys[0]};
        } elsif (@keys) {
            my %res;
            @res{@keys} = @{$stash}{@keys};
            return \%res;
        } else {
            return $stash;
        }
    };

    # Run the block under eval so that the tree is always released, even
    # when the block dies; the exception is re-thrown afterwards.
    my $ret;
    my $ok  = eval { $ret = $self->{code}->($tree); 1 };
    my $err = $@;
    $tree->delete;
    die $err unless $ok;

    # check user specified return value
    return $ret if $retval;

    return $stash;
}
# Parses $html into a fresh HTML::TreeBuilder::XPath tree and returns it.
# Comments are kept when the builder supports store_comments, and unknown
# tags are not ignored.
sub build_tree {
    my $self = shift;
    my($html) = @_;

    my $builder = HTML::TreeBuilder::XPath->new;
    if ($builder->can('store_comments')) {
        $builder->store_comments(1);
    }
    $builder->ignore_unknown(0);
    $builder->parse($html);
    $builder->eof;

    return $builder;
}
# Builds the implementation of `process` ($first false) or `process_first`
# ($first true) bound to one tree, one result stash and an optional base URI.
#
# The returned closure takes an expression followed by key => value pairs.
# Expressions starting with "/" or "id(" are used as XPath as-is; anything
# else is converted from a CSS selector. For each pair:
#   - a code reference key with no value is called once per matched node
#     (node in $_ and as first argument)
#   - a key ending in "[]" stores a list with one value per matched node
#   - any other key stores the value taken from the first matched node
sub create_process {
    my($first, $tree, $stash, $uri) = @_;

    return sub {
        my($exp, @attr) = @_;

        my $xpath;
        if ($exp =~ m!^(?:/|id\()!) {
            $xpath = $exp;
        } else {
            $xpath = HTML::Selector::XPath::selector_to_xpath($exp);
        }

        # Warnings raised while evaluating the expression are silenced;
        # hard failures are reported below.
        my @nodes = eval {
            local $SIG{__WARN__} = sub { };
            $tree->findnodes($xpath);
        };
        die "'$xpath' doesn't look like a valid XPath expression: $@" if $@;

        return unless @nodes;
        splice(@nodes, 1) if $first;

        while (@attr) {
            my($key, $val) = splice(@attr, 0, 2);

            if (defined $val) {
                if ($key =~ s!\[\]$!!) {
                    $stash->{$key} = [ map { __get_value($_, $val, $uri) } @nodes ];
                } else {
                    $stash->{$key} = __get_value($nodes[0], $val, $uri);
                }
                next;
            }

            # No value given: only a bare callback makes sense here.
            unless (ref($key) && ref($key) eq 'CODE') {
                die "Don't know what to do with $key => undef";
            }
            for my $node (@nodes) {
                local $_ = $node;
                $key->($node);
            }
        }

        return;
    };
}
# Extracts one value from $node according to the spec $val.
#
# $val may be:
#   - a code reference      : called with the node (also in $_); its return
#                             value is used
#   - a Web::Scraper object : the node is scraped with that nested scraper
#   - '@name'               : the attribute "name"; when a base $uri is given
#                             and the attribute is a link attribute of that
#                             tag, it is returned as an absolute URI object
#   - 'TEXT' / 'CONTENT'    : the text of the node (case-insensitive)
#   - 'HTML' / 'RAW'        : the markup inside the node (case-insensitive)
#   - a hash reference      : { key => spec, ... } evaluated per key
#   - an array reference    : [ spec, @filters ], spec evaluated and then
#                             passed through each filter in order
# Croaks on any other kind of spec.
sub __get_value {
    my($node, $val, $uri) = @_;

    if (ref($val) && ref($val) eq 'CODE') {
        local $_ = $node;
        return $val->($node);
    } elsif (blessed($val) && $val->isa('Web::Scraper')) {
        return $val->scrape($node, $uri);
    } elsif ($val =~ s!^@!!) {
        my $value = $node->attr($val);
        if ($uri && is_link_element($node, $val)) {
            require URI;
            $value = URI->new_abs($value, $uri);
        }
        return $value;
    } elsif (lc($val) eq 'content' || lc($val) eq 'text') {
        # getValue method is used for getting a content of comment nodes
        # from HTML::TreeBuilder::XPath (version >= 0.14)
        # or HTML::TreeBuilder::LibXML (version >= 0.13)
        # getValue method works like as_text in both modules
        # for other node types
        return $node->isTextNode
            ? $node->string_value
            : ($node->can('getValue')
                ? $node->getValue
                : $node->as_text);
    } elsif (lc($val) eq 'raw' || lc($val) eq 'html') {
        if ($node->isTextNode) {
            if ($HTML::TreeBuilder::XPath::VERSION < 0.09) {
                return HTML::Entities::encode($node->as_XML, q("'<>&));
            } else {
                return $node->as_XML;
            }
        }
        my $html = $node->as_XML;
        # Strip the node's own opening tag ...
        $html =~ s!^<.*?>!!;
        # ... and its own closing tag (with surrounding whitespace), so only
        # the inner markup remains. The "</" must be part of the pattern or
        # a dangling "</" is left at the end of the result.
        $html =~ s!\s*</\w+>\n*$!!;
        return $html;
    } elsif (ref($val) eq 'HASH') {
        my $values;
        for my $key (keys %$val) {
            $values->{$key} = __get_value($node, $val->{$key}, $uri);
        }
        return $values;
    } elsif (ref($val) eq 'ARRAY') {
        my $how   = $val->[0];
        my $value = __get_value($node, $how, $uri);
        for my $filter (@$val[1..$#$val]) {
            $value = run_filter($value, $filter);
        }
        return $value;
    } else {
        Carp::croak "Unknown value type $val";
    }
}
# Applies one filter to $value and returns the filtered value.
#
# $filter may be:
#   - a code reference : called with the value as argument and in $_
#   - a plain string   : a class name; "Name" means
#                        Web::Scraper::Filter::Name, "+My::Class" means
#                        My::Class. The class is loaded on demand and
#                        Class->new->filter($value) is called.
#   - an object        : anything that can('filter');
#                        $object->filter($value) is called
# Croaks for any other filter type, when a filter class cannot be loaded,
# and when the filter itself dies.
#
# A filter may either return the new value or modify $_ in place (as in
# sub { s/foo/bar/ }); see the comment above the final test.
sub run_filter {
    my($value, $filter) = @_;

    ## sub { s/foo/bar/g } is a valid filter
    ## sub { DateTime::Format::Foo->parse_string(shift) } is valid too
    my $callback;
    my $module;    # label for the filter, used in error messages

    if (ref($filter) eq 'CODE') {
        $callback = $filter;
        $module   = "$filter";
    } elsif (!ref($filter)) {
        $module = $filter =~ s/^\+// ? $filter : "Web::Scraper::Filter::$filter";
        unless ($module->isa('Web::Scraper::Filter')) {
            $module->require or Carp::croak("Loading $module: $@");
        }
        $callback = sub { $module->new->filter(shift) };
    } elsif (blessed($filter) && $filter->can('filter')) {
        # Name object filters by their class so that failures are reported
        # as "Filter Some::Class had an error" rather than with an empty name.
        $module   = ref $filter;
        $callback = sub { $filter->filter(shift) };
    } else {
        Carp::croak("Don't know filter type $filter");
    }

    local $_ = $value;
    my $retval = eval { $callback->($value) };
    if ($@) {
        Carp::croak("Filter $module had an error: $@");
    }

    no warnings 'uninitialized';
    # sub { s/foo/bar/ } returns number or PL_sv_no which is stringified to ''
    if (($retval =~ /^\d+$/ and $_ ne $value) or (defined($retval) and $retval eq '')) {
        $value = $_;
    } else {
        $value = $retval;
    }

    return $value;
}
# Returns 1 when $attr is listed in %HTML::Tagset::linkElements for the
# node's tag, and an empty list / undef otherwise.
sub is_link_element {
    my($node, $attr) = @_;
    my $link_attrs = $HTML::Tagset::linkElements{$node->tag} || [];
    return 1 if grep { $attr eq $_ } @$link_attrs;
    return;
}
# Returns a placeholder implementation for the DSL function named $func.
# Calling the placeholder croaks, reporting that $func was used outside a
# scraper block.
sub __stub {
    my($func) = @_;
    my $placeholder = sub {
        croak "Can't call $func() outside scraper block";
    };
    return $placeholder;
}
# Default implementations of the DSL functions: each one croaks when called.
# scrape() replaces them with working versions (via `local`) while a scraper
# block is running.
*process = __stub 'process';
*process_first = __stub 'process_first';
*result = __stub 'result';
1;
__END__
=for stopwords API SCRAPI Scrapi
=head1 NAME
Web::Scraper - Web Scraping Toolkit using HTML and CSS Selectors or XPath expressions
=head1 SYNOPSIS
use URI;
use Web::Scraper;
# First, create your scraper block
my $tweets = scraper {
# Parse all LIs with the class "status", store them into a resulting
# array 'tweets'. We embed another scraper for each tweet.
process "li.status", "tweets[]" => scraper {
# And, in that array, pull in the elements with the classes
# "entry-content", "entry-date" and the link
process ".entry-content", body => 'TEXT';
process ".entry-date", when => 'TEXT';
process 'a[rel="bookmark"]', link => '@href';
};
};
my $res = $tweets->scrape( URI->new("http://twitter.com/miyagawa") );
# The result has the populated tweets array
for my $tweet (@{$res->{tweets}}) {
print "$tweet->{body} $tweet->{when} (link: $tweet->{link})\n";
}
The structure would resemble this (visually)
{
tweets => [
{ body => $body, when => $date, link => $uri },
{ body => $body, when => $date, link => $uri },
]
}
=head1 DESCRIPTION
Web::Scraper is a web scraper toolkit, inspired by Ruby's equivalent
Scrapi. It provides a DSL-ish interface for traversing HTML documents and
returning a neatly arranged Perl data structure.
The I<scraper> and I<process> blocks provide a method to define what segments
of a document to extract. It understands HTML and CSS Selectors as well as
XPath expressions.
=head1 METHODS
=head2 scraper
$scraper = scraper { ... };
Creates a new Web::Scraper object by wrapping the DSL code that will be fired when the I<scrape> method is called.
=head2 scrape
$res = $scraper->scrape(URI->new($uri));
$res = $scraper->scrape($html_content);
$res = $scraper->scrape(\$html_content);
$res = $scraper->scrape($http_response);
$res = $scraper->scrape($html_element);
Retrieves the HTML from URI, HTTP::Response, HTML::Tree or text
strings and creates a DOM object, then fires the callback scraper code
to retrieve the data structure.
If you pass a URI or HTTP::Response object, Web::Scraper will
automatically guess the encoding of the content by looking at
Content-Type headers and META tags. Otherwise you need to decode the
HTML to Unicode before passing it to the I<scrape> method.
You can optionally pass the base URL when you pass the HTML content as
a string instead of URI or HTTP::Response.
$res = $scraper->scrape($html_content, "http://example.com/foo");
This way Web::Scraper can resolve the relative links found in the document.
=head2 process
scraper {
process "tag.class", key => 'TEXT';
process '//tag[contains(@foo, "bar")]', key2 => '@attr';
process '//comment()', 'comments[]' => 'TEXT';
};
I<process> is the method to find matching elements from HTML with CSS
selector or XPath expression, then extract text or attributes into the
result stash.
If the first argument begins with "/" or "id(" it's treated as an
XPath expression; otherwise it's treated as a CSS selector.
# 2008/12/21
# date => "2008/12/21"
process ".date", date => 'TEXT';
#
# link => URI->new("http://example.com/")
process ".body > a", link => '@href';
#
# comment => " HTML Comment here "
#
# NOTE: comment nodes are only accessible when
# HTML::TreeBuilder::XPath (version >= 0.14) and/or
# HTML::TreeBuilder::LibXML (version >= 0.13) is installed
process "//div[contains(@class, 'body')]/comment()", comment => 'TEXT';
#
# link => URI->new("http://example.com/"), text => "foo"
process ".body > a", link => '@href', text => 'TEXT';
#
# list => [ "foo", "bar" ]
process "li", "list[]" => "TEXT";
#
# list => [ { id => "1", text => "foo" }, { id => "2", text => "bar" } ];
process "li", "list[]" => { id => '@id', text => "TEXT" };
=head1 EXAMPLES
There are many examples in the C<eg/> dir packaged in this distribution.
It is recommended to look through these.
=head1 NESTED SCRAPERS
TBD
=head1 FILTERS
TBD
=head1 AUTHOR
Tatsuhiko Miyagawa E<lt>miyagawa@bulknews.netE<gt>
=head1 LICENSE
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 SEE ALSO
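L<http://blog.labnotes.org/2006/07/11/scraping-with-style-scrapi-toolkit-for-ruby/>
L<HTML::TreeBuilder::XPath>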
=cut
Web-Scraper-0.37/lib/Web/Scraper/Filter.pm 000644 000765 000024 00000002715 11162225735 021175 0 ustar 00miyagawa staff 000000 000000 package Web::Scraper::Filter;
use strict;
use warnings;
# Constructor: returns an empty hash-based object blessed into the invoking
# class. Any further arguments are ignored.
sub new {
    my($class) = @_;
    my $self = {};
    return bless $self, $class;
}
1;
__END__
=for stopwords namespace inline callback
=head1 NAME
Web::Scraper::Filter - Base class for Web::Scraper filters
=head1 SYNOPSIS
package Web::Scraper::Filter::YAML;
use base qw( Web::Scraper::Filter );
use YAML ();
sub filter {
my($self, $value) = @_;
YAML::Load($value);
}
1;
use Web::Scraper;
my $scraper = scraper {
process ".yaml-code", data => [ 'TEXT', 'YAML' ];
};
=head1 DESCRIPTION
Web::Scraper::Filter is a base class for text filters in
Web::Scraper. You can create your own text filter by subclassing this
module.
There are two ways to create and use your custom filter. If you name
your filter Web::Scraper::Filter::Something, you just call:
process $exp, $key => [ 'TEXT', 'Something' ];
If you declare your filter under your own namespace, like
'MyApp::Filter::Foo',
process $exp, $key => [ 'TEXT', '+MyApp::Filter::Foo' ];
You can also inline your filter function without creating a filter
class:
process $exp, $key => [ 'TEXT', sub { s/foo/bar/ } ];
Note that this function munges C<$_> and returns the count of
replacement. Filter code special cases if the return value of the
callback is number and C<$_> value is updated.
You can, of course, stack filters like:
process $exp, $key => [ '@href', 'Foo', '+MyApp::Filter::Bar', \&baz ];
=head1 AUTHOR
Tatsuhiko Miyagawa
=cut
Web-Scraper-0.37/lib/Web/Scraper/LibXML.pm 000644 000765 000024 00000002263 11225234342 021027 0 ustar 00miyagawa staff 000000 000000 package Web::Scraper::LibXML;
use strict;
use base qw( Web::Scraper );
use HTML::TreeBuilder::LibXML;
# Overrides the parent's tree construction: parses $html with
# HTML::TreeBuilder::LibXML and returns the resulting tree.
sub build_tree {
    my $self = shift;
    my($html) = @_;

    my $builder = HTML::TreeBuilder::LibXML->new;
    $builder->parse($html);
    $builder->eof;

    return $builder;
}
1;
__END__
=head1 NAME
Web::Scraper::LibXML - Drop-in replacement for Web::Scraper to use LibXML
=head1 SYNOPSIS
use Web::Scraper::LibXML;
# same as Web::Scraper
my $scraper = scraper { ... };
=head1 DESCRIPTION
Web::Scraper::LibXML is a drop-in replacement for Web::Scraper to use
the fast libxml-based HTML tree builder, HTML::TreeBuilder::LibXML.
This is almost identical to HTML::TreeBuilder::LibXML's
I<replace_original> installer, like:
use HTML::TreeBuilder::LibXML;
HTML::TreeBuilder::LibXML->replace_original();
use Web::Scraper;
my $scraper = scraper { ... };
# this code uses LibXML parser
which overrides HTML::TreeBuilder::XPath's new() constructor so that
all of your code using HTML::TreeBuilder::XPath is switched to the
libxml based parser.
This module, instead, gives you more control over which TreeBuilder to
use, depending on the site etc.
=head1 SEE ALSO
L<Web::Scraper> L<HTML::TreeBuilder::LibXML>
=cut
Web-Scraper-0.37/inc/Module/ 000755 000765 000024 00000000000 12040350116 016505 5 ustar 00miyagawa staff 000000 000000 Web-Scraper-0.37/inc/Module/Install/ 000755 000765 000024 00000000000 12040350116 020113 5 ustar 00miyagawa staff 000000 000000 Web-Scraper-0.37/inc/Module/Install.pm 000644 000765 000024 00000030135 12040350115 020452 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install;
# For any maintainers:
# The load order for Module::Install is a bit magic.
# It goes something like this...
#
# IF ( host has Module::Install installed, creating author mode ) {
# 1. Makefile.PL calls "use inc::Module::Install"
# 2. $INC{inc/Module/Install.pm} set to installed version of inc::Module::Install
# 3. The installed version of inc::Module::Install loads
# 4. inc::Module::Install calls "require Module::Install"
# 5. The ./inc/ version of Module::Install loads
# } ELSE {
# 1. Makefile.PL calls "use inc::Module::Install"
# 2. $INC{inc/Module/Install.pm} set to ./inc/ version of Module::Install
# 3. The ./inc/ version of Module::Install loads
# }
use 5.005;
use strict 'vars';
use Cwd ();
use File::Find ();
use File::Path ();
use vars qw{$VERSION $MAIN};
# Compile-time setup: fixes the module version, clears the singleton slot and
# makes the inc::Module::Install namespace share this package's $VERSION and
# inherit from it.
BEGIN {
# All Module::Install core packages now require synchronised versions.
# This will be used to ensure we don't accidentally load old or
# different versions of modules.
# This is not enforced yet, but will be some time in the next few
# releases once we can make sure it won't clash with custom
# Module::Install extensions.
$VERSION = '1.06';
# Storage for the pseudo-singleton
$MAIN = undef;
# Alias the whole VERSION glob, so both package names see the same $VERSION.
*inc::Module::Install::VERSION = *VERSION;
@inc::Module::Install::ISA = __PACKAGE__;
}
# Entry point run by "use inc::Module::Install" in an installer script.
#
# Builds the Module::Install object from the import arguments, performs a
# series of sanity checks (each one dies with an explanatory message), turns
# on strict for the caller, and then either
#   - hands over to the admin backend when the bundled copy of this file
#     ($self->{file}) does not exist, or
#   - installs an AUTOLOAD handler into the calling package, preloads the
#     extension commands and stores the object in $MAIN.
# Returns 1 in the second case; the first case leaves via goto.
sub import {
my $class = shift;
my $self = $class->new(@_);
my $who = $self->_caller;
#-------------------------------------------------------------
# all of the following checks should be included in import(),
# to allow "eval 'require Module::Install; 1' to test
# installation of Module::Install. (RT #51267)
#-------------------------------------------------------------
# Whether or not inc::Module::Install is actually loaded, the
# $INC{inc/Module/Install.pm} is what will still get set as long as
# the caller loaded this module in the documented manner.
# If not set, the caller may NOT have loaded the bundled version, and thus
# they may not have a MI version that works with the Makefile.PL. This would
# result in false errors or unexpected behaviour. And we don't want that.
my $file = join( '/', 'inc', split /::/, __PACKAGE__ ) . '.pm';
unless ( $INC{$file} ) { die <<"END_DIE" }
Please invoke ${\__PACKAGE__} with:
use inc::${\__PACKAGE__};
not:
use ${\__PACKAGE__};
END_DIE
# This reportedly fixes a rare Win32 UTC file time issue, but
# as this is a non-cross-platform XS module not in the core,
# we shouldn't really depend on it. See RT #24194 for detail.
# (Also, this module only supports Perl 5.6 and above).
eval "use Win32::UTCFileTime" if $^O eq 'MSWin32' && $] >= 5.006;
# If the script that is loading Module::Install is from the future,
# then make will detect this and cause it to re-run over and over
# again. This is bad. Rather than taking action to touch it (which
# is unreliable on some platforms and requires write permissions)
# for now we should catch this and refuse to run.
if ( -f $0 ) {
# Element 9 of stat() is the last-modified time of the running script.
my $s = (stat($0))[9];
# If the modification time is only slightly in the future,
# sleep briefly to remove the problem.
my $a = $s - time;
if ( $a > 0 and $a < 5 ) { sleep 5 }
# Too far in the future, throw an error.
my $t = time;
if ( $s > $t ) { die <<"END_DIE" }
Your installer $0 has a modification time in the future ($s > $t).
This is known to create infinite loops in make.
Please correct this, then run $0 again.
END_DIE
}
# Build.PL was formerly supported, but no longer is due to excessive
# difficulty in implementing every single feature twice.
if ( $0 =~ /Build.PL$/i ) { die <<"END_DIE" }
Module::Install no longer supports Build.PL.
It was impossible to maintain duel backends, and has been deprecated.
Please remove all Build.PL files and only use the Makefile.PL installer.
END_DIE
#-------------------------------------------------------------
# To save some more typing in Module::Install installers, every...
# use inc::Module::Install
# ...also acts as an implicit use strict.
$^H |= strict::bits(qw(refs subs vars));
#-------------------------------------------------------------
# The bundled copy of this file is missing: forget every loaded
# Module::Install file, load the admin backend ($self->{dispatch}), create
# the author directory, and restart import with the prepared object.
unless ( -f $self->{file} ) {
foreach my $key (keys %INC) {
delete $INC{$key} if $key =~ /Module\/Install/;
}
local $^W;
require "$self->{path}/$self->{dispatch}.pm";
File::Path::mkpath("$self->{prefix}/$self->{author}");
$self->{admin} = "$self->{name}::$self->{dispatch}"->new( _top => $self );
$self->{admin}->init;
@_ = ($class, _self => $self);
goto &{"$self->{name}::import"};
}
# Warnings are switched off while the caller's AUTOLOAD is (re)defined.
local $^W;
*{"${who}::AUTOLOAD"} = $self->autoload;
$self->preload;
# Unregister loader and worker packages so subdirs can use them again
delete $INC{'inc/Module/Install.pm'};
delete $INC{'Module/Install.pm'};
# Save to the singleton
$MAIN = $self;
return 1;
}
# Builds the AUTOLOAD handler for the calling package and returns it.
#
# The handler is stored in the caller's %AUTOLOAD hash (reached through a
# symbolic reference; this file only enables strict 'vars') under the
# directory that was current when it was created. Because that assignment is
# the last statement, the new closure is also the return value.
#
# When invoked, the handler
#   - delegates to the handler registered for the current directory if that
#     is a different directory from the one it was created in,
#   - dies with a diagnostic when no function name can be taken from the
#     caller's $AUTOLOAD,
#   - ignores all-uppercase names,
#   - calls underscore-prefixed names on the Module::Install object itself
#     when it supports them,
#   - and otherwise dispatches to an extension through call().
sub autoload {
my $self = shift;
my $who = $self->_caller;
my $cwd = Cwd::cwd();
my $sym = "${who}::AUTOLOAD";
$sym->{$cwd} = sub {
my $pwd = Cwd::cwd();
if ( my $code = $sym->{$pwd} ) {
# Delegate back to parent dirs
goto &$code unless $cwd eq $pwd;
}
# Strip the trailing function name off the caller's $AUTOLOAD, leaving it
# in $1.
unless ($$sym =~ s/([^:]+)$//) {
# XXX: it looks like we can't retrieve the missing function
# via $$sym (usually $main::AUTOLOAD) in this case.
# I'm still wondering if we should slurp Makefile.PL to
# get some context or not ...
my ($package, $file, $line) = caller;
die <<"EOT";
Unknown function is found at $file line $line.
Execution of $file aborted due to runtime errors.
If you're a contributor to a project, you may need to install
some Module::Install extensions from CPAN (or other repository).
If you're a user of a module, please contact the author.
EOT
}
my $method = $1;
if ( uc($method) eq $method ) {
# Do nothing
return;
} elsif ( $method =~ /^_/ and $self->can($method) ) {
# Dispatch to the root M:I class
return $self->$method(@_);
}
# Dispatch to the appropriate plugin
unshift @_, ( $self, $1 );
goto &{$self->can('call')};
};
}
# Installs a named sub in the calling package for every public method offered
# by the loaded extensions.
#
# Extensions are loaded first if that has not happened yet; when none are
# found, the admin object is asked to load all of them. Every symbol in an
# extension's package that the object can() call, that does not start with an
# underscore and that is not all-uppercase is collected. Each generated sub
# sets the caller's $AUTOLOAD to its own fully qualified name and jumps to
# the caller's AUTOLOAD handler.
sub preload {
my $self = shift;
unless ( $self->{extensions} ) {
$self->load_extensions(
"$self->{prefix}/$self->{path}", $self
);
}
my @exts = @{$self->{extensions}};
unless ( @exts ) {
@exts = $self->{admin}->load_all_extensions;
}
my %seen;
foreach my $obj ( @exts ) {
# Walk the symbol table of the extension's class.
while (my ($method, $glob) = each %{ref($obj) . '::'}) {
next unless $obj->can($method);
next if $method =~ /^_/;
next if $method eq uc($method);
$seen{$method}++;
}
}
my $who = $self->_caller;
foreach my $name ( sort keys %seen ) {
local $^W;
*{"${who}::$name"} = sub {
${"${who}::AUTOLOAD"} = "${who}::$name";
goto &{"${who}::AUTOLOAD"};
};
}
}
# Constructor. Takes the class name and a hash of options, fills in defaults
# for any option not supplied, and returns a hash-based object.
# If a true '_self' option is passed, that value is returned instead of a
# new object.
sub new {
my ($class, %args) = @_;
# Force FindBin to be loaded afresh so $FindBin::Bin is recomputed.
# NOTE(review): presumably needed because the working directory can differ
# between invocations -- confirm.
delete $INC{'FindBin.pm'};
{
# to suppress the redefine warning
local $SIG{__WARN__} = sub {};
require FindBin;
}
# ignore the prefix on extension modules built from top level.
my $base_path = Cwd::abs_path($FindBin::Bin);
unless ( Cwd::abs_path(Cwd::cwd()) eq $base_path ) {
delete $args{prefix};
}
return $args{_self} if $args{_self};
# Defaults for options the caller did not supply.
$args{dispatch} ||= 'Admin';
$args{prefix} ||= 'inc';
$args{author} ||= ($^O eq 'VMS' ? '_author' : '.author');
$args{bundle} ||= 'inc/BUNDLES';
$args{base} ||= $base_path;
# Strip a leading "<prefix>::" from the class name.
$class =~ s/^\Q$args{prefix}\E:://;
$args{name} ||= $class;
$args{version} ||= $class->VERSION;
# Derive the relative module path from the name (Foo::Bar -> Foo/Bar).
unless ( $args{path} ) {
$args{path} = $args{name};
$args{path} =~ s!::!/!g;
}
$args{file} ||= "$args{base}/$args{prefix}/$args{path}.pm";
$args{wrote} = 0;
bless( \%args, $class );
}
# Dispatch $method to the extension object that implements it.
# Returns nothing when load() yields no object; otherwise tail-calls the
# extension's method with the extension as invocant and the remaining
# arguments unchanged.
sub call {
    my $self   = $_[0];
    my $method = $_[1];
    my $plugin = $self->load($method);
    return unless $plugin;
    # Swap the leading ($self, $method) pair for the plugin object so the
    # target method sees itself as an ordinary method call.
    splice @_, 0, 2, $plugin;
    my $target = $plugin->can($method);
    goto &$target;
}
# Find the extension object that can perform $method.
# Loads the bundled extensions on first use, returns the first one that
# implements the method, and otherwise asks the admin object to load it
# (dying with an explanatory message if there is no admin object).
sub load {
    my ($self, $method) = @_;
    unless ( $self->{extensions} ) {
        $self->load_extensions( "$self->{prefix}/$self->{path}", $self );
    }
    for my $ext ( @{ $self->{extensions} } ) {
        return $ext if $ext->can($method);
    }
    my $admin = $self->{admin};
    unless ( $admin ) {
        die <<"END_DIE";
The '$method' method does not exist in the '$self->{prefix}' path!
Please remove the '$self->{prefix}' directory and run $0 again to load it.
END_DIE
    }
    my $loaded = $admin->load($method, 1);
    push @{ $self->{extensions} }, $loaded;
    return $loaded;
}
# Require every extension module found under $path and append one object
# per module to $self->{extensions}. $top is handed to each constructor as
# its '_top' argument. Returns $self->{extensions} (an array ref, possibly
# empty).
sub load_extensions {
my ($self, $path, $top) = @_;
my $should_reload = 0;
# Make sure the prefix directory is in @INC (compared case-insensitively).
# If it had to be added, remember that so the %INC entries are removed below.
unless ( grep { ! ref $_ and lc $_ eq lc $self->{prefix} } @INC ) {
unshift @INC, $self->{prefix};
$should_reload = 1;
}
foreach my $rv ( $self->find_extensions($path) ) {
my ($file, $pkg) = @{$rv};
# Skip packages that have already been recorded.
next if $self->{pathnames}{$pkg};
local $@;
# Load the module with warnings off; a module without new() is skipped.
my $new = eval { local $^W; require $file; $pkg->can('new') };
unless ( $new ) {
warn $@ if $@;
next;
}
# Record where the package was loaded from; when the prefix was newly added
# to @INC the entry is deleted from %INC at the same time.
$self->{pathnames}{$pkg} =
$should_reload ? delete $INC{$file} : $INC{$file};
push @{$self->{extensions}}, &{$new}($pkg, _top => $top );
}
$self->{extensions} ||= [];
}
# Locate extension modules below $path.
#
# Returns a list of [ $file, $pkg ] pairs, where $file is the require-able
# path ("<path>/<subpath>.pm") and $pkg the matching package name. The
# dispatch module itself is skipped. Returns an empty list when $path is not
# a directory.
sub find_extensions {
    my ($self, $path) = @_;
    my @found;
    File::Find::find( sub {
        my $file = $File::Find::name;
        return unless $file =~ m!^\Q$path\E/(.+)\.pm\Z!is;
        my $subpath = $1;
        return if lc($subpath) eq lc($self->{dispatch});
        $file = "$self->{path}/$subpath.pm";
        my $pkg = "$self->{name}::$subpath";
        $pkg =~ s!/!::!g;
        # If we have a mixed-case package name, assume case has been preserved
        # correctly. Otherwise, root through the file to locate the case-preserved
        # version of the package name.
        if ( $subpath eq lc($subpath) || $subpath eq uc($subpath) ) {
            my $content = Module::Install::_read($subpath . '.pm');
            my $in_pod = 0;
            # Scan line by line. Splitting on the empty pattern would yield
            # single characters, and the line-anchored tests below could
            # then never match a package statement.
            foreach ( split /\n/, $content ) {
                $in_pod = 1 if /^=\w/;
                $in_pod = 0 if /^=cut/;
                next if ($in_pod || /^=cut/); # skip pod text
                next if /^\s*#/; # and comments
                if ( m/^\s*package\s+($pkg)\s*;/i ) {
                    $pkg = $1;
                    last;
                }
            }
        }
        push @found, [ $file, $pkg ];
    }, $path ) if -d $path;
    @found;
}
#####################################################################
# Common Utility Functions
# Walk up the call stack and return the package of the first frame that was
# not called from this package (undef once the stack runs out).
sub _caller {
    for ( my $level = 0 ; ; $level++ ) {
        my $pkg = caller($level);
        return $pkg unless $pkg eq __PACKAGE__;
    }
}
# _read($path) returns the entire contents of the file at $path as a single
# string, dying if the file cannot be opened or closed.
#
# Done in evals to avoid confusing Perl::MinimumVersion: the three-argument
# open() is only compiled on perl 5.006 or newer, the two-argument form
# otherwise.
eval( $] >= 5.006 ? <<'END_NEW' : <<'END_OLD' ); die $@ if $@;
sub _read {
    local *FH;
    open( FH, '<', $_[0] ) or die "open($_[0]): $!";
    # Slurp mode: with $/ undefined a single readline returns the whole file.
    my $string = do { local $/; <FH> };
    close FH or die "close($_[0]): $!";
    return $string;
}
END_NEW
sub _read {
    local *FH;
    open( FH, "< $_[0]" ) or die "open($_[0]): $!";
    # Slurp mode: with $/ undefined a single readline returns the whole file.
    my $string = do { local $/; <FH> };
    close FH or die "close($_[0]): $!";
    return $string;
}
END_OLD
# Read a file via Module::Install::_read and return it with the non-code
# parts removed: line endings normalised, the __DATA__/__END__ tail dropped,
# and POD sections collapsed.
sub _readperl {
my $string = Module::Install::_read($_[0]);
# Normalise CRLF (including CR CR LF), lone CR and LF line endings to "\n".
$string =~ s/(?:\015{1,2}\012|\015|\012)/\n/sg;
# Drop everything from a __DATA__ or __END__ marker to the end of the text.
$string =~ s/(\n)\n*__(?:DATA|END)__\b.*\z/$1/s;
# Replace each POD section (blank line + "=word" up to "=cut") by a blank line.
$string =~ s/\n\n=\w+.+?\n\n=cut\b.+?\n+/\n\n/sg;
return $string;
}
# Read a file via Module::Install::_read and return its POD text.
# A file whose name ends in ".pod" is returned whole (after line-ending
# normalisation); for any other file the non-POD parts are stripped.
sub _readpod {
my $string = Module::Install::_read($_[0]);
# Normalise CRLF (including CR CR LF), lone CR and LF line endings to "\n".
$string =~ s/(?:\015{1,2}\012|\015|\012)/\n/sg;
return $string if $_[0] =~ /\.pod\z/;
# Remove text that sits between the start (or a "=cut") and the next
# "=word" directive or the end of the text.
$string =~ s/(^|\n=cut\b.+?\n+)[^=\s].+?\n(\n=\w+|\z)/$1$2/sg;
# Collapse "=pod" and "=cut" lines into blank lines.
$string =~ s/\n*=pod\b[^\n]*\n+/\n\n/sg;
$string =~ s/\n*=cut\b[^\n]*\n+/\n\n/sg;
# Trim leading newlines.
$string =~ s/^\n+//s;
return $string;
}
# _write($path, @chunks) writes each chunk, in order, to the file at $path,
# dying if the file cannot be opened, printed to, or closed.
# Done in evals to avoid confusing Perl::MinimumVersion: the three-argument
# open() variant is compiled on perl 5.006 or newer, the two-argument
# variant otherwise.
eval( $] >= 5.006 ? <<'END_NEW' : <<'END_OLD' ); die $@ if $@;
sub _write {
local *FH;
open( FH, '>', $_[0] ) or die "open($_[0]): $!";
foreach ( 1 .. $#_ ) {
print FH $_[$_] or die "print($_[0]): $!";
}
close FH or die "close($_[0]): $!";
}
END_NEW
sub _write {
local *FH;
open( FH, "> $_[0]" ) or die "open($_[0]): $!";
foreach ( 1 .. $#_ ) {
print FH $_[$_] or die "print($_[0]): $!";
}
close FH or die "close($_[0]): $!";
}
END_OLD
# _version converts a module version string (eg, 1.03_05) into a plain
# number suitable for numeric comparison. It is not meant for Perl
# versions (eg, 5.8.1).
# A false argument is treated as 0. Versions with two or more dots have
# each dotted part zero-padded to three digits first.
sub _version ($) {
    my $version = shift || 0;
    my $dots = () = $version =~ /(\.)/g;
    if ( $dots >= 2 ) {
        # Normalise multipart versions
        $version =~ s/(\.)(\d{1,3})/sprintf("$1%03d",$2)/eg;
    }
    # Peel off the integer part; what is left is the fractional text.
    $version =~ s/^(\d+)\.?//;
    my $major = $1 || 0;
    # Split the remainder into groups of up to three digits, right-padding
    # each group with zeros to a width of three.
    my @groups = $version =~ /(\d{1,3})\D?/g;
    my @padded = map { $_ . '0' x (3 - length $_) } @groups;
    if ( @padded ) {
        $major = $major . '.' . join '', @padded;
    }
    return $major + 0;
}
# Compare two module version strings numerically after normalising each
# with _version(). Reads its operands from $_[1] and $_[2], i.e. it expects
# an invocant in the first slot. Returns -1, 0 or 1.
sub _cmp ($$) {
    my $left  = _version($_[1]);
    my $right = _version($_[2]);
    return $left <=> $right;
}
# Cloned from Params::Util::_CLASS
# Returns the argument when it is a defined, non-reference string that looks
# like a Perl class name; returns undef otherwise.
sub _CLASS ($) {
    my $candidate = $_[0];
    my $looks_like_class = defined $candidate
        && !ref $candidate
        && $candidate =~ m/^[^\W\d]\w*(?:::\w+)*\z/s;
    return $looks_like_class ? $candidate : undef;
}
1;
# Copyright 2008 - 2012 Adam Kennedy.
Web-Scraper-0.37/inc/Module/Install/AuthorTests.pm 000644 000765 000024 00000002215 12040350115 022735 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::AuthorTests;
use 5.005;
use strict;
use Module::Install::Base;
use Carp ();
#line 16
use vars qw{$VERSION $ISCORE @ISA};
BEGIN {
$VERSION = '0.002';
$ISCORE = 1;
@ISA = qw{Module::Install::Base};
}
#line 42
# Register the given directories as author-only test directories
# (non-recursive). Delegates to _add_author_tests.
sub author_tests {
    my $self = shift;
    my @dirs = @_;
    return _add_author_tests($self, \@dirs, 0);
}
#line 56
# Register the given directories, searched recursively, as author-only test
# directories. Delegates to _add_author_tests.
sub recursive_author_tests {
    my $self = shift;
    my @dirs = @_;
    return _add_author_tests($self, \@dirs, 1);
}
# Build a File::Find callback that records, as keys of the given hash ref,
# every directory containing at least one plain file whose name ends in ".t".
sub _wanted {
    my $dir_seen = shift;
    return sub {
        /\.t$/ and -f $_ and $dir_seen->{$File::Find::dir} = 1;
    };
}
# Append "<dir>/*.t" globs for the given directories to the distribution's
# test list. Does nothing unless $Module::Install::AUTHOR is true.
#   $dirs    - array ref of directories (required, confesses when empty);
#              entries that are not directories are ignored
#   $recurse - when true, every sub-directory holding .t files is added
# The current test list defaults to 't/*.t' when none has been set.
sub _add_author_tests {
    my ($self, $dirs, $recurse) = @_;
    return unless $Module::Install::AUTHOR;

    my @tests = $self->tests ? (split / /, $self->tests) : 't/*.t';

    # XXX: pick a default, later -- rjbs, 2008-02-24
    Carp::confess("no dirs given to author_tests") unless @$dirs;
    my @dirs = grep { -d } @$dirs;

    my @wanted_dirs;
    if ($recurse) {
        require File::Find;
        my %test_dir;
        File::Find::find(_wanted(\%test_dir), @dirs);
        @wanted_dirs = sort keys %test_dir;
    }
    else {
        @wanted_dirs = sort @dirs;
    }
    $self->tests( join ' ', @tests, map { "$_/*.t" } @wanted_dirs );
}
#line 107
1;
Web-Scraper-0.37/inc/Module/Install/Base.pm 000644 000765 000024 00000002147 12040350115 021326 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::Base;
use strict 'vars';
use vars qw{$VERSION};
BEGIN {
$VERSION = '1.06';
}
# Suspend handler for "redefined" warnings
# The current __WARN__ handler is saved and replaced by a sub that simply
# returns the saved handler; the BEGIN block at the end of this file calls
# that sub to put the original handler back.
BEGIN {
my $w = $SIG{__WARN__};
$SIG{__WARN__} = sub { $w };
}
#line 42
# Constructor for extension objects.
# Before blessing, makes sure the target class has call() and load()
# methods, installing versions that forward to the top-level object
# (via _top) when the class does not define its own.
# The remaining arguments become the object's hash contents.
sub new {
    my ($class, @fields) = @_;
    foreach my $name (qw{ call load }) {
        next if defined &{"${class}::$name"};
        *{"${class}::$name"} = sub { shift->_top->$name(@_) };
    }
    return bless { @fields }, $class;
}
#line 61
# Forward unknown method calls to the handler produced by the top-level
# object's autoload(). The invocant is removed from @_ before the jump.
# Returns nothing if no handler can be obtained.
sub AUTOLOAD {
    local $@;
    my $handler = eval { shift->_top->autoload };
    return unless $handler;
    goto &$handler;
}
#line 75
# Accessor for the top-level object stored under the '_top' key.
sub _top {
    my $self = shift;
    return $self->{_top};
}
#line 90
# Return the admin object held by the top-level object, or the shared
# Module::Install::Base::FakeAdmin stand-in when there is none.
sub admin {
    my $self = shift;
    my $real = $self->_top->{admin};
    return $real if $real;
    return Module::Install::Base::FakeAdmin->new;
}
#line 106
# True when a real admin object is available, i.e. admin() did not fall back
# to Module::Install::Base::FakeAdmin.
sub is_admin {
    my $self  = shift;
    my $admin = $self->admin;
    return !$admin->isa('Module::Install::Base::FakeAdmin');
}
# Explicit empty destructor, so object destruction never reaches AUTOLOAD.
sub DESTROY {}
# Stand-in admin class: every method call on it is accepted and does nothing.
package Module::Install::Base::FakeAdmin;
use vars qw{$VERSION};
BEGIN {
$VERSION = $Module::Install::Base::VERSION;
}
# The single shared instance, created on first use.
my $fake;
# Return the shared instance, blessing the argument list into the given
# class the first time through.
sub new {
$fake ||= bless(\@_, $_[0]);
}
# Any unknown method is a silent no-op.
sub AUTOLOAD {}
sub DESTROY {}
# Restore warning handler
# The handler installed by the BEGIN block near the top of this file returns
# the original handler when called; assign that back.
BEGIN {
$SIG{__WARN__} = $SIG{__WARN__}->();
}
1;
#line 159
Web-Scraper-0.37/inc/Module/Install/Can.pm 000644 000765 000024 00000006157 12040350115 021162 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::Can;
use strict;
use Config ();
use ExtUtils::MakeMaker ();
use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
$VERSION = '1.06';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
# Check whether a module can be loaded, optionally at a minimum version.
#   $mod - module name ("Foo::Bar") or file path ("Foo/Bar.pm")
#   $ver - minimum version, defaults to 0
# Returns true on success and a false value when the module cannot be
# loaded or is too old.
### Upgrade this to not have to load the module if possible
sub can_use {
    my ($self, $mod, $ver) = @_;

    # Derive the require-able file name ...
    ( my $file = $mod ) =~ s{::|\\}{/}g;
    $file .= '.pm' unless $file =~ /\.pm$/i;

    # ... and from it the package name.
    ( my $pkg = $file ) =~ s{/}{::}g;
    $pkg =~ s{\.pm$}{}i;

    local $@;
    return eval { require $file; $pkg->VERSION($ver || 0); 1 };
}
# Check whether a command can be run.
# Tries the name as given first, then each directory in $ENV{PATH} followed
# by the current directory. A candidate qualifies when it is executable or
# when MM->maybe_command accepts it. Returns the usable path, or nothing.
sub can_run {
    my ($self, $cmd) = @_;

    my $direct = $cmd;
    return $direct if -x $direct;
    $direct = MM->maybe_command($direct);
    return $direct if $direct;

    my @search = ( ( split /$Config::Config{path_sep}/, $ENV{PATH} ), '.' );
    foreach my $dir (@search) {
        next if $dir eq '';
        require File::Spec;
        my $candidate = File::Spec->catfile($dir, $cmd);
        return $candidate if -x $candidate;
        $candidate = MM->maybe_command($candidate);
        return $candidate if $candidate;
    }
    return;
}
# Can our C compiler environment build XS files
# Returns 1 when a small C file including the perl headers compiles and
# links, 0 when it does not or when no compiler is available. Falls back to
# can_cc() when ExtUtils::CBuilder cannot be loaded.
sub can_xs {
my $self = shift;
# Ensure we have the CBuilder module
$self->configure_requires( 'ExtUtils::CBuilder' => 0.27 );
# Do we have the configure_requires checker?
local $@;
eval "require ExtUtils::CBuilder;";
if ( $@ ) {
# They don't obey configure_requires, so it is
# someone old and delicate. Try to avoid hurting
# them by falling back to an older simpler test.
return $self->can_cc();
}
# Do we have a working C compiler
my $builder = ExtUtils::CBuilder->new(
quiet => 1,
);
unless ( $builder->have_compiler ) {
# No working C compiler
return 0;
}
# Write a C file representative of what XS becomes
require File::Temp;
my ( $FH, $tmpfile ) = File::Temp::tempfile(
"compilexs-XXXXX",
SUFFIX => '.c',
);
binmode $FH;
print $FH <<'END_C';
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
int main(int argc, char **argv) {
return 0;
}
int boot_sanexs() {
return 1;
}
END_C
close $FH;
# Can the C compiler access the same headers XS does
my @libs = ();
my $object = undef;
# Any exception from compile or link counts as failure.
eval {
local $^W = 0;
$object = $builder->compile(
source => $tmpfile,
);
@libs = $builder->link(
objects => $object,
module_name => 'sanexs',
);
};
my $result = $@ ? 0 : 1;
# Clean up all the build files
foreach ( $tmpfile, $object, @libs ) {
next unless defined $_;
# unlink defaults to $_; repeat until it reports nothing left to remove
1 while unlink;
}
return $result;
}
# Can we locate a (the) C compiler
# $Config{cc} may contain args; starting with the full string, trailing
# space-separated words are dropped one at a time until can_run() accepts
# what is left. Returns can_run()'s result, or nothing when no prefix is
# runnable.
sub can_cc {
    my $self   = shift;
    my @chunks = split(/ /, $Config::Config{cc}) or return;

    while (@chunks) {
        my $found = $self->can_run("@chunks");
        return $found if $found;
        pop @chunks;
    }
    return;
}
# Fix Cygwin bug on maybe_command();
# On cygwin only, and only when ExtUtils::MM_Cygwin does not already define
# maybe_command, install one that sends /cygdrive/ paths to the Win32
# implementation (when it has one) and everything else to the Unix one.
if ( $^O eq 'cygwin' ) {
require ExtUtils::MM_Cygwin;
require ExtUtils::MM_Win32;
if ( ! defined(&ExtUtils::MM_Cygwin::maybe_command) ) {
*ExtUtils::MM_Cygwin::maybe_command = sub {
my ($self, $file) = @_;
if ($file =~ m{^/cygdrive/}i and ExtUtils::MM_Win32->can('maybe_command')) {
ExtUtils::MM_Win32->maybe_command($file);
} else {
ExtUtils::MM_Unix->maybe_command($file);
}
}
}
}
1;
__END__
#line 236
Web-Scraper-0.37/inc/Module/Install/Fetch.pm 000644 000765 000024 00000004627 12040350115 021512 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::Fetch;
use strict;
use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
$VERSION = '1.06';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
# Download a file over http or ftp into the current (or a given) directory.
#
# Named arguments read by this sub:
#   url       - source URL (scheme://host/path/file)
#   ftp_url   - alternative URL used when url is http and LWP::Simple is
#               not available
#   local_dir - directory to change into before downloading
#   size      - expected file size; a mismatch counts as failure
#   run       - command passed to system() after the download
#   remove    - when true, delete the downloaded file afterwards
#   check_for - path that must exist for the final message to say "done!"
#
# Tries LWP::Simple, then Net::FTP, then an external 'ftp' program.
# Returns a true value when $? is zero at the end, and nothing on failure.
# The original working directory is restored on every exit taken after the
# directory change.
sub get_file {
    my ($self, %args) = @_;
    my ($scheme, $host, $path, $file) =
        $args{url} =~ m|^(\w+)://([^/]+)(.+)/(.+)| or return;
    if ( $scheme eq 'http' and ! eval { require LWP::Simple; 1 } ) {
        $args{url} = $args{ftp_url}
            or (warn("LWP support unavailable!\n"), return);
        ($scheme, $host, $path, $file) =
            $args{url} =~ m|^(\w+)://([^/]+)(.+)/(.+)| or return;
    }
    $|++;
    print "Fetching '$file' from $host... ";
    unless (eval { require Socket; Socket::inet_aton($host) }) {
        warn "'$host' resolve failed!\n";
        return;
    }
    return unless $scheme eq 'ftp' or $scheme eq 'http';
    require Cwd;
    my $dir = Cwd::getcwd();
    chdir $args{local_dir} or return if exists $args{local_dir};
    if (eval { require LWP::Simple; 1 }) {
        LWP::Simple::mirror($args{url}, $file);
    }
    elsif (eval { require Net::FTP; 1 }) { eval {
        # use Net::FTP to get past firewall
        my $ftp = Net::FTP->new($host, Passive => 1, Timeout => 600);
        $ftp->login("anonymous", 'anonymous@example.com');
        $ftp->cwd($path);
        $ftp->binary;
        $ftp->get($file) or (warn("$!\n"), return);
        $ftp->quit;
    } }
    elsif (my $ftp = $self->can_run('ftp')) { eval {
        # no Net::FTP, fallback to ftp.exe
        require FileHandle;
        my $fh = FileHandle->new;
        local $SIG{CHLD} = 'IGNORE';
        unless ($fh->open("|$ftp -n")) {
            warn "Couldn't open ftp: $!\n";
            chdir $dir; return;
        }
        my @dialog = split(/\n/, <<"END_FTP");
open $host
user anonymous anonymous\@example.com
cd $path
binary
get $file $file
quit
END_FTP
        foreach (@dialog) { $fh->print("$_\n") }
        $fh->close;
    } }
    else {
        warn "No working 'ftp' program available!\n";
        chdir $dir; return;
    }
    unless (-f $file) {
        warn "Fetching failed: $@\n";
        chdir $dir; return;
    }
    # A size mismatch is a failure too; go back to the original directory
    # before returning, as every other failure exit above does.
    if ( exists $args{size} and -s $file != $args{size} ) {
        chdir $dir; return;
    }
    system($args{run}) if exists $args{run};
    unlink($file) if $args{remove};
    print(((!exists $args{check_for} or -e $args{check_for})
        ? "done!" : "failed! ($!)"), "\n");
    chdir $dir; return !$?;
}
1;
Web-Scraper-0.37/inc/Module/Install/Makefile.pm 000644 000765 000024 00000027437 12040350115 022202 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::Makefile;
use strict 'vars';
use ExtUtils::MakeMaker ();
use Module::Install::Base ();
use Fcntl qw/:flock :seek/;
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
$VERSION = '1.06';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
# Returns the invocant itself.
sub Makefile {
    my $self = shift;
    return $self;
}
# Count of prompts issued per "caller file|caller line|message".
my %seen = ();

# Wrapper around ExtUtils::MakeMaker::prompt.
# Takes the same arguments after the invocant (message, optional default).
# Dies when the same prompt is issued from the same place more than three
# times. Under AUTOMATED_TESTING, or when STDIN is not a terminal, the
# prompt is answered with its default.
sub prompt {
    shift;
    # Infinite loop protection
    my @c = caller();
    if ( ++$seen{"$c[1]|$c[2]|$_[0]"} > 3 ) {
        die "Caught an potential prompt infinite loop ($c[1]|$c[2]|$_[0])";
    }
    # In automated testing or non-interactive session, always use defaults
    if ( ($ENV{AUTOMATED_TESTING} or ! -t STDIN) and ! $ENV{PERL_MM_USE_DEFAULT} ) {
        local $ENV{PERL_MM_USE_DEFAULT} = 1;
        # A plain call keeps the local() above in force while the prompt
        # runs; "goto &sub" would leave this sub first and undo it.
        # The & form bypasses any prototype so @_ is passed as a list.
        return &ExtUtils::MakeMaker::prompt(@_);
    }
    goto &ExtUtils::MakeMaker::prompt;
}
# The installed MakeMaker version, passed through eval to turn the version
# string into a plain number. Behaviour elsewhere in this file depends on it.
my $makemaker = eval $ExtUtils::MakeMaker::VERSION;

# Without a parameter, return the MakeMaker version.
# With one, return the MakeMaker version when it is at least the given
# version, and 0 otherwise.
sub makemaker {
    return $makemaker if @_ < 2;
    return $makemaker >= eval($_[1]) ? $makemaker : 0;
}
# Ripped from ExtUtils::MakeMaker 6.56, and slightly modified
# as we only need to know here whether the attribute is an array
# or a hash or something else (which may or may not be appendable).
# Maps a MakeMaker attribute name to 'ARRAY', 'HASH' or 'APPENDABLE';
# makemaker_args() uses the type to decide how new values are merged.
# Attributes not listed here are simply overwritten.
my %makemaker_argtype = (
C => 'ARRAY',
CONFIG => 'ARRAY',
# CONFIGURE => 'CODE', # ignore
DIR => 'ARRAY',
DL_FUNCS => 'HASH',
DL_VARS => 'ARRAY',
EXCLUDE_EXT => 'ARRAY',
EXE_FILES => 'ARRAY',
FUNCLIST => 'ARRAY',
H => 'ARRAY',
IMPORTS => 'HASH',
INCLUDE_EXT => 'ARRAY',
LIBS => 'ARRAY', # ignore ''
MAN1PODS => 'HASH',
MAN3PODS => 'HASH',
META_ADD => 'HASH',
META_MERGE => 'HASH',
PL_FILES => 'HASH',
PM => 'HASH',
PMLIBDIRS => 'ARRAY',
PMLIBPARENTDIRS => 'ARRAY',
PREREQ_PM => 'HASH',
CONFIGURE_REQUIRES => 'HASH',
SKIP => 'ARRAY',
TYPEMAPS => 'ARRAY',
XS => 'HASH',
# VERSION => ['version',''], # ignore
# _KEEP_AFTER_FLUSH => '',
clean => 'HASH',
depend => 'HASH',
dist => 'HASH',
dynamic_lib=> 'HASH',
linkext => 'HASH',
macro => 'HASH',
postamble => 'HASH',
realclean => 'HASH',
test => 'HASH',
tool_autosplit => 'HASH',
# special cases where you can use makemaker_append
CCFLAGS => 'APPENDABLE',
DEFINE => 'APPENDABLE',
INC => 'APPENDABLE',
LDDLFLAGS => 'APPENDABLE',
LDFROM => 'APPENDABLE',
);
# Merge new MakeMaker attributes into the stored set and return the stored
# hash ref. How a value is merged depends on the attribute's entry in
# %makemaker_argtype:
#   ARRAY      - appended to the existing list (scalars are wrapped first)
#   HASH       - keys copied over the existing hash
#   APPENDABLE - handed to makemaker_append
#   (unlisted) - overwritten, with a warning if a value was already set
sub makemaker_args {
    my ($self, %new_args) = @_;
    my $args = ( $self->{makemaker_args} ||= {} );

    foreach my $key (keys %new_args) {
        my $type = $makemaker_argtype{$key};

        if ( !$type ) {
            if (defined $args->{$key}) {
                warn qq{MakeMaker attribute "$key" is overriden; use "makemaker_append" to append values\n};
            }
            $args->{$key} = $new_args{$key};
        }
        elsif ( $type eq 'ARRAY' ) {
            $args->{$key} = [] unless defined $args->{$key};
            $args->{$key} = [ $args->{$key} ]
                unless ref $args->{$key} eq 'ARRAY';
            my @extra = ref $new_args{$key} eq 'ARRAY'
                ? @{$new_args{$key}}
                : $new_args{$key};
            push @{$args->{$key}}, @extra;
        }
        elsif ( $type eq 'HASH' ) {
            $args->{$key} = {} unless defined $args->{$key};
            foreach my $skey (keys %{ $new_args{$key} }) {
                $args->{$key}{$skey} = $new_args{$key}{$skey};
            }
        }
        elsif ( $type eq 'APPENDABLE' ) {
            $self->makemaker_append($key => $new_args{$key});
        }
    }
    return $args;
}
# For MakeMaker attributes that hold several space-separated values:
# append the given values to the attribute's current string (or start a new
# one). Returns the resulting string.
sub makemaker_append {
    my ($self, $name, @values) = @_;
    my $args  = $self->makemaker_args;
    my @parts = defined $args->{$name}
        ? ( $args->{$name}, @values )
        : @values;
    $args->{$name} = join( ' ', @parts );
}
# Add the given directories to the MakeMaker DIR list, creating the list if
# it does not exist yet.
sub build_subdirs {
    my ($self, @new_dirs) = @_;
    my $subdirs = $self->makemaker_args->{DIR} ||= [];
    foreach my $subdir (@new_dirs) {
        push @{$subdirs}, $subdir;
    }
}
# Add files to the space-separated FILES entry of the MakeMaker 'clean'
# attribute. Empty strings are dropped; existing entries are kept.
sub clean_files {
    my $self  = shift;
    my $clean = $self->makemaker_args->{clean} ||= {};
    my @files = grep { length $_ } ( $clean->{FILES} || (), @_ );
    %$clean = ( %$clean, FILES => join( ' ', @files ) );
}
# Add files to the space-separated FILES entry of the MakeMaker 'realclean'
# attribute. Empty strings are dropped; existing entries are kept.
sub realclean_files {
    my $self      = shift;
    my $realclean = $self->makemaker_args->{realclean} ||= {};
    my @files     = grep { length $_ } ( $realclean->{FILES} || (), @_ );
    %$realclean = ( %$realclean, FILES => join( ' ', @files ) );
}
# Set the MakeMaker LIBS attribute. A reference is passed through as given;
# any other value is wrapped in a one-element array ref.
sub libs {
    my ($self, $spec) = @_;
    my $libs = ref $spec ? $spec : [ $spec ];
    $self->makemaker_args( LIBS => $libs );
}
# Pass the given value to makemaker_args as the INC attribute.
sub inc {
    my ($self, $flags) = @_;
    $self->makemaker_args( INC => $flags );
}
# Empty placeholder; nothing in the visible code calls it.
sub _wanted_t {
}
# Add a "<dir>/*.t" glob to the test list for every directory at or below
# $dir (default 't') that contains a plain file ending in ".t".
# Existing test entries are kept; the final list is de-duplicated and
# sorted. Dies when $dir is not a directory.
sub tests_recursive {
    my $self = shift;
    my $dir  = shift || 't';
    -d $dir or die "tests_recursive dir '$dir' does not exist";

    my %tests;
    $tests{$_} = 1 foreach split / /, ($self->tests || '');

    require File::Find;
    my $collect = sub {
        /\.t$/ and -f $_ and $tests{"$File::Find::dir/*.t"} = 1;
    };
    File::Find::find( $collect, $dir );

    $self->tests( join ' ', sort keys %tests );
}
# Generate the Makefile.
# Collects the metadata gathered so far into the argument list for
# ExtUtils::MakeMaker::WriteMakefile, adapting it to the installed MakeMaker
# version, writes the Makefile and then post-processes it with
# fix_up_makefile(). Takes no arguments besides the invocant.
sub write {
my $self = shift;
die "&Makefile->write() takes no arguments\n" if @_;
# Check the current Perl version
my $perl_version = $self->perl_version;
if ( $perl_version ) {
eval "use $perl_version; 1"
or die "ERROR: perl: Version $] is installed, "
. "but we need version >= $perl_version";
}
# Make sure we have a new enough MakeMaker
require ExtUtils::MakeMaker;
if ( $perl_version and $self->_cmp($perl_version, '5.006') >= 0 ) {
# This previously attempted to inherit the version of
# ExtUtils::MakeMaker in use by the module author, but this
# was found to be untenable as some authors build releases
# using future dev versions of EU:MM that nobody else has.
# Instead, #toolchain suggests we use 6.59 which is the most
# stable version on CPAN at time of writing and is, to quote
# ribasushi, "not terminally fucked, > and tested enough".
# TODO: We will now need to maintain this over time to push
# the version up as new versions are released.
$self->build_requires( 'ExtUtils::MakeMaker' => 6.59 );
$self->configure_requires( 'ExtUtils::MakeMaker' => 6.59 );
} else {
# Allow legacy-compatibility with 5.005 by depending on the
# most recent EU:MM that supported 5.005.
$self->build_requires( 'ExtUtils::MakeMaker' => 6.36 );
$self->configure_requires( 'ExtUtils::MakeMaker' => 6.36 );
}
# Generate the MakeMaker params
my $args = $self->makemaker_args;
$args->{DISTNAME} = $self->name;
$args->{NAME} = $self->module_name || $self->name;
$args->{NAME} =~ s/-/::/g;
$args->{VERSION} = $self->version or die <<'EOT';
ERROR: Can't determine distribution version. Please specify it
explicitly via 'version' in Makefile.PL, or set a valid $VERSION
in a module, and provide its file path via 'version_from' (or
'all_from' if you prefer) in Makefile.PL.
EOT
# Work out which test files to run.
if ( $self->tests ) {
my @tests = split ' ', $self->tests;
my %seen;
$args->{test} = {
TESTS => (join ' ', grep {!$seen{$_}++} @tests),
};
} elsif ( $Module::Install::ExtraTests::use_extratests ) {
# Module::Install::ExtraTests doesn't set $self->tests and does its own tests via harness.
# So, just ignore our xt tests here.
} elsif ( -d 'xt' and ($Module::Install::AUTHOR or $ENV{RELEASE_TESTING}) ) {
$args->{test} = {
TESTS => join( ' ', map { "$_/*.t" } grep { -d $_ } qw{ t xt } ),
};
}
if ( $] >= 5.005 ) {
$args->{ABSTRACT} = $self->abstract;
$args->{AUTHOR} = join ', ', @{$self->author || []};
}
# The blocks below only set an attribute when makemaker() reports the
# installed MakeMaker is at least the given version.
if ( $self->makemaker(6.10) ) {
$args->{NO_META} = 1;
#$args->{NO_MYMETA} = 1;
}
if ( $self->makemaker(6.17) and $self->sign ) {
$args->{SIGN} = 1;
}
unless ( $self->is_admin ) {
delete $args->{SIGN};
}
if ( $self->makemaker(6.31) and $self->license ) {
$args->{LICENSE} = $self->license;
}
my $prereq = ($args->{PREREQ_PM} ||= {});
%$prereq = ( %$prereq,
map { @$_ } # flatten [module => version]
map { @$_ }
grep $_,
($self->requires)
);
# Remove any reference to perl, PREREQ_PM doesn't support it
delete $args->{PREREQ_PM}->{perl};
# Merge both kinds of requires into BUILD_REQUIRES
my $build_prereq = ($args->{BUILD_REQUIRES} ||= {});
%$build_prereq = ( %$build_prereq,
map { @$_ } # flatten [module => version]
map { @$_ }
grep $_,
($self->configure_requires, $self->build_requires)
);
# Remove any reference to perl, BUILD_REQUIRES doesn't support it
delete $args->{BUILD_REQUIRES}->{perl};
# Delete bundled dists from prereq_pm, add it to Makefile DIR
# NOTE(review): when DIR is unset, $subdirs is a fresh array that is never
# stored back into $args -- confirm that this is intended.
my $subdirs = ($args->{DIR} || []);
if ($self->bundles) {
my %processed;
foreach my $bundle (@{ $self->bundles }) {
my ($mod_name, $dist_dir) = @$bundle;
delete $prereq->{$mod_name};
$dist_dir = File::Basename::basename($dist_dir); # dir for building this module
if (not exists $processed{$dist_dir}) {
if (-d $dist_dir) {
# List as sub-directory to be processed by make
push @$subdirs, $dist_dir;
}
# Else do nothing: the module is already present on the system
$processed{$dist_dir} = undef;
}
}
}
# Without BUILD_REQUIRES support, fold the build prerequisites into
# PREREQ_PM instead.
unless ( $self->makemaker('6.55_03') ) {
%$prereq = (%$prereq,%$build_prereq);
delete $args->{BUILD_REQUIRES};
}
if ( my $perl_version = $self->perl_version ) {
eval "use $perl_version; 1"
or die "ERROR: perl: Version $] is installed, "
. "but we need version >= $perl_version";
if ( $self->makemaker(6.48) ) {
$args->{MIN_PERL_VERSION} = $perl_version;
}
}
if ($self->installdirs) {
warn qq{old INSTALLDIRS (probably set by makemaker_args) is overriden by installdirs\n} if $args->{INSTALLDIRS};
$args->{INSTALLDIRS} = $self->installdirs;
}
# Copy only the attributes that have a defined value.
my %args = map {
( $_ => $args->{$_} ) } grep {defined($args->{$_} )
} keys %$args;
# Hand any user-supplied dist PREOP to the admin object and merge what it
# returns back into the dist attribute.
my $user_preop = delete $args{dist}->{PREOP};
if ( my $preop = $self->admin->preop($user_preop) ) {
foreach my $key ( keys %$preop ) {
$args{dist}->{$key} = $preop->{$key};
}
}
my $mm = ExtUtils::MakeMaker::WriteMakefile(%args);
$self->fix_up_makefile($mm->{FIRST_MAKEFILE} || 'Makefile');
}
# Post-process a generated Makefile in place.
#   $makefile_name - path of the Makefile to rewrite
# Adds the 'inc' directory to the perl include paths used by the Makefile,
# blanks PERL_LIB, and wraps the content in the configured preamble and
# postamble. Dies if the file cannot be opened, written or closed.
# Returns 1.
sub fix_up_makefile {
    my $self          = shift;
    my $makefile_name = shift;
    my $top_class     = ref($self->_top) || '';
    my $top_version   = $self->_top->VERSION || '';
    my $preamble = $self->preamble
        ? "# Preamble by $top_class $top_version\n"
          . $self->preamble
        : '';
    my $postamble = "# Postamble by $top_class $top_version\n"
        . ($self->postamble || '');
    local *MAKEFILE;
    open MAKEFILE, "+< $makefile_name" or die "fix_up_makefile: Couldn't open $makefile_name: $!";
    # Locking is best-effort: flock may be unavailable on some platforms.
    eval { flock MAKEFILE, LOCK_EX };
    # Slurp the existing Makefile; without the readline here the original
    # content would be lost when the file is truncated below.
    my $makefile = do { local $/; <MAKEFILE> };
    $makefile =~ s/\b(test_harness\(\$\(TEST_VERBOSE\), )/$1'inc', /;
    $makefile =~ s/( -I\$\(INST_ARCHLIB\))/ -Iinc$1/g;
    $makefile =~ s/( "-I\$\(INST_LIB\)")/ "-Iinc"$1/g;
    $makefile =~ s/^(FULLPERL = .*)/$1 "-Iinc"/m;
    $makefile =~ s/^(PERL = .*)/$1 "-Iinc"/m;
    # Module::Install will never be used to build the Core Perl
    # Sometimes PERL_LIB and PERL_ARCHLIB get written anyway, which breaks
    # PREFIX/PERL5LIB, and thus, install_share. Blank them if they exist
    $makefile =~ s/^PERL_LIB = .+/PERL_LIB =/m;
    #$makefile =~ s/^PERL_ARCHLIB = .+/PERL_ARCHLIB =/m;
    # Perl 5.005 mentions PERL_LIB explicitly, so we have to remove that as well.
    $makefile =~ s/(\"?)-I\$\(PERL_LIB\)\1//g;
    # XXX - This is currently unused; not sure if it breaks other MM-users
    # $makefile =~ s/^pm_to_blib\s+:\s+/pm_to_blib :: /mg;
    # Rewind and empty the file, then write the new content.
    seek MAKEFILE, 0, SEEK_SET;
    truncate MAKEFILE, 0;
    print MAKEFILE "$preamble$makefile$postamble" or die $!;
    close MAKEFILE or die $!;
    1;
}
# Get the Makefile preamble, first prepending $text to it when $text is
# defined.
sub preamble {
    my ($self, $text) = @_;
    if ( defined $text ) {
        $self->{preamble} = $text . $self->{preamble};
    }
    return $self->{preamble};
}
# Get the Makefile postamble, first appending $text to it when $text is
# defined. While the stored postamble is false it is (re)initialised from
# the admin object's postamble.
sub postamble {
    my ($self, $text) = @_;
    unless ( $self->{postamble} ) {
        $self->{postamble} = $self->admin->postamble;
    }
    if ( defined $text ) {
        $self->{postamble} .= $text;
    }
    return $self->{postamble};
}
1;
__END__
#line 544
Web-Scraper-0.37/inc/Module/Install/Metadata.pm 000644 000765 000024 00000043277 12040350115 022205 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::Metadata;
use strict 'vars';
use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
$VERSION = '1.06';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
# Names of the metadata accessors generated further down in this file,
# grouped by the kind of value each one stores.

# Flags: calling the accessor with no argument outside a getter context
# sets the value to 1.
my @boolean_keys = qw{
sign
};
# Single values.
my @scalar_keys = qw{
name
module_name
abstract
version
distribution_type
tests
installdirs
};
# Lists of [ name, value ] pairs.
my @tuple_keys = qw{
configure_requires
build_requires
requires
recommends
bundles
resources
};
# Shortcuts for individual entries of the 'resources' list.
my @resource_keys = qw{
homepage
bugtracker
repository
};
# Plain lists of values.
my @array_keys = qw{
keywords
author
};
# 'authors' is an alias for the generated 'author' accessor.
*authors = \&author;

# Returns the invocant itself.
sub Meta {
    my $self = shift;
    return $self;
}

# Accessors for the key-name lists above.
sub Meta_BooleanKeys  { return @boolean_keys }
sub Meta_ScalarKeys   { return @scalar_keys }
sub Meta_TupleKeys    { return @tuple_keys }
sub Meta_ResourceKeys { return @resource_keys }
sub Meta_ArrayKeys    { return @array_keys }
# Generate one accessor per boolean key.
# Called without arguments in a context that wants a value, the accessor
# returns the stored flag. Otherwise it stores the first argument (or 1 when
# there is none) and returns the object.
foreach my $flag ( @boolean_keys ) {
    my $accessor = sub {
        my $self = shift;
        return $self->{values}->{$flag} if defined wantarray and not @_;
        $self->{values}->{$flag} = @_ ? $_[0] : 1;
        return $self;
    };
    *$flag = $accessor;
}
# Generate one accessor per scalar key.
# Called without arguments in a context that wants a value, the accessor
# returns the stored value. Otherwise it stores its first argument and
# returns the object.
foreach my $field ( @scalar_keys ) {
    my $accessor = sub {
        my $self = shift;
        if ( defined wantarray and !@_ ) {
            return $self->{values}->{$field};
        }
        $self->{values}->{$field} = shift;
        return $self;
    };
    *$field = $accessor;
}
# Generate one accessor per array key.
# Called without arguments in a context that wants a value, the accessor
# returns the stored array ref (if any). Otherwise it appends its arguments
# to the stored list and returns the object.
foreach my $field ( @array_keys ) {
    my $accessor = sub {
        my $self = shift;
        if ( defined wantarray and !@_ ) {
            return $self->{values}->{$field};
        }
        $self->{values}->{$field} ||= [];
        push @{$self->{values}->{$field}}, @_;
        return $self;
    };
    *$field = $accessor;
}
# Generate one accessor per resource key (homepage, bugtracker, ...).
# Without arguments the accessor returns the values of every stored
# resource entry with that name (an empty list when no resources are
# stored). With an argument it records the value through resources() and
# returns 1; a false value is rejected with an exception.
foreach my $key ( @resource_keys ) {
    *$key = sub {
        my $self = shift;
        unless ( @_ ) {
            return () unless $self->{values}->{resources};
            # Resources are stored as a list of [ name, value ] pairs.
            return map { $_->[1] }
                grep { $_->[0] eq $key }
                @{ $self->{values}->{resources} };
        }
        my $uri = shift or die(
            "Did not provide a value to $key()"
        );
        $self->resources( $key => $uri );
        return 1;
    };
}
# Generate one accessor per tuple key, except 'resources' which has its own
# implementation.
# Without arguments the accessor returns the stored list of
# [ module, version ] pairs. With arguments it reads (module, version)
# pairs until a false module name is met, defaulting a false version to 0,
# stores them, and returns the newly added pairs as a flat list.
foreach my $field ( grep { $_ ne "resources" } @tuple_keys) {
    my $accessor = sub {
        my $self = shift;
        return $self->{values}->{$field} unless @_;
        my @added;
        while ( @_ ) {
            my $module = shift;
            last unless $module;
            my $version = shift;
            push @added, [ $module, $version || 0 ];
        }
        push @{ $self->{values}->{$field} }, @added;
        return map {@$_} @added;
    };
    *$field = $accessor;
}
# Resource handling
# The all-lowercase resource names that are accepted; any other
# all-lowercase name is rejected as reserved.
my %lc_resource = map { $_ => 1 } qw{
    homepage
    license
    bugtracker
    repository
};

# Record (name, value) resource pairs and return the stored list of
# [ name, value ] pairs. Processing stops at the first false name; pairs
# with a false value are skipped. Dies on an unsupported lowercase name.
sub resources {
    my $self = shift;
    while ( @_ ) {
        my ($name, $value) = splice @_, 0, 2;
        last unless $name;
        next unless $value;
        if ( $name eq lc $name and ! $lc_resource{$name} ) {
            die("Unsupported reserved lowercase resource '$name'");
        }
        $self->{values}->{resources} ||= [];
        push @{ $self->{values}->{resources} }, [ $name, $value ];
    }
    return $self->{values}->{resources};
}
# Aliases for build_requires that will have alternative
# meanings in some future version of META.yml.
sub test_requires {
    my $self = shift;
    return $self->build_requires(@_);
}

sub install_requires {
    my $self = shift;
    return $self->build_requires(@_);
}
# Aliases for installdirs options: each sets installdirs to a fixed value.
sub install_as_core {
    my $self = $_[0];
    return $self->installdirs('perl');
}

sub install_as_cpan {
    my $self = $_[0];
    return $self->installdirs('site');
}

sub install_as_site {
    my $self = $_[0];
    return $self->installdirs('site');
}

sub install_as_vendor {
    my $self = $_[0];
    return $self->installdirs('vendor');
}
# Set the dynamic_config flag (default: on).
# Returns 1 when the flag was stored, and 0 without changing anything when
# the flag is already on.
sub dynamic_config {
    my ($self, @rest) = @_;
    my $value = @rest ? $rest[0] : 1;

    # Once dynamic we never change to static, for safety
    return 0 if $self->{values}->{dynamic_config};

    $self->{values}->{dynamic_config} = $value ? 1 : 0;
    return 1;
}
# Convenience command: same as dynamic_config(0).
sub static_config {
    my $self = shift;
    return $self->dynamic_config(0);
}
# Get or set the minimum perl version.
# Without an argument, returns the stored value. With one, normalises it via
# _perl_version(), rejects anything older than 5.005, stores it and returns
# the stored value. Dies when given a false value.
sub perl_version {
    my $self = shift;
    unless ( @_ ) {
        return $self->{values}->{perl_version};
    }
    my $requested = shift;
    $requested or die(
        "Did not provide a value to perl_version()"
    );

    # Normalize the version
    my $version = $self->_perl_version($requested);

    # We don't support the really old versions
    if ( $version < 5.005 ) {
        die "Module::Install only supports 5.005 or newer (use ExtUtils::MakeMaker)\n";
    }
    return $self->{values}->{perl_version} = $version;
}
# Fill in every metadata value that has not been set yet from one module
# file: name, version, perl version, author, license and abstract.
# $file is optional; when omitted it is derived from the distribution name
# ("Foo-Bar" -> "lib/Foo/Bar.pm", falling back to "Bar.pm").
# Dies when no usable file is found. Returns 1.
sub all_from {
my ( $self, $file ) = @_;
unless ( defined($file) ) {
my $name = $self->name or die(
"all_from called with no args without setting name() first"
);
$file = join('/', 'lib', split(/-/, $name)) . '.pm';
# Not found under lib/: try just the file name in the current directory.
$file =~ s{.*/}{} unless -e $file;
unless ( -e $file ) {
die("all_from cannot find $file from $name");
}
}
unless ( -f $file ) {
die("The path '$file' does not exist, or is not a file");
}
$self->{values}{all_from} = $file;
# Some methods pull from POD instead of code.
# If there is a matching .pod, use that instead
my $pod = $file;
$pod =~ s/\.pm$/.pod/i;
$pod = $file unless -e $pod;
# Pull the different values
$self->name_from($file) unless $self->name;
$self->version_from($file) unless $self->version;
$self->perl_version_from($file) unless $self->perl_version;
$self->author_from($pod) unless @{$self->author || []};
$self->license_from($pod) unless $self->license;
$self->abstract_from($pod) unless $self->abstract;
return 1;
}
# Get the 'provides' hash ref, first merging in any key/value pairs passed
# as arguments. The hash is created on first use.
sub provides {
    my ($self, @pairs) = @_;
    my $provides = ( $self->{values}->{provides} ||= {} );
    if ( @pairs ) {
        %$provides = ( %$provides, @pairs );
    }
    return $provides;
}
# Fill 'provides' with the packages Module::Build finds in the distribution.
# Only acts for admins; returns the object unchanged when not an admin or
# when there is no MANIFEST file. Otherwise returns what provides() returns.
sub auto_provides {
my $self = shift;
return $self unless $self->is_admin;
unless (-e 'MANIFEST') {
warn "Cannot deduce auto_provides without a MANIFEST, skipping\n";
return $self;
}
# Avoid spurious warnings as we are not checking manifest here.
local $SIG{__WARN__} = sub {1};
require ExtUtils::Manifest;
# Temporarily replace manicheck with a no-op for the rest of this sub.
local *ExtUtils::Manifest::manicheck = sub { return };
require Module::Build;
my $build = Module::Build->new(
dist_name => $self->name,
dist_version => $self->version,
license => $self->license,
);
$self->provides( %{ $build->find_dist_packages || {} } );
}
# Declare an optional feature.
#   $name - feature name
#   rest  - the feature's module list, given either as a flat list or as a
#           single reference (the calling style of features())
# Array and hash references inside the module list are flattened.
# Appends ( $name => [ modules... ] ) to the stored feature list and returns
# the whole list.
sub feature {
    my $self     = shift;
    my $name     = shift;
    my $features = ( $self->{values}->{features} ||= [] );
    my $mods;
    if ( @_ == 1 and ref( $_[0] ) ) {
        # The user used ->feature like ->features by passing in the second
        # argument as a reference. Accommodate for that.
        $mods = $_[0];
    } else {
        $mods = \@_;
    }
    push @$features, (
        $name => [
            map {
                ref($_) ? ( ref($_) eq 'HASH' ) ? %$_ : @$_ : $_
            } @$mods
        ]
    );
    return @$features;
}
# Declare several features at once from (name => \@modules) pairs, then
# return the complete stored feature list (empty when none are stored).
sub features {
    my $self = shift;
    while ( @_ ) {
        my ( $name, $mods ) = splice( @_, 0, 2 );
        $self->feature( $name, @$mods );
    }
    my $stored = $self->{values}->{features};
    return $stored ? @{$stored} : ();
}
# Add entries to the no_index list of the given type and return the whole
# no_index structure. Nothing is added when $type is false.
sub no_index {
    my ($self, $type, @entries) = @_;
    if ( $type ) {
        push @{ $self->{values}->{no_index}->{$type} }, @entries;
    }
    return $self->{values}->{no_index};
}
# Load META.yml and feed every key that has a matching method on this
# object to that method. Hash values are passed one key/value pair per
# call; everything else is passed as a single argument.
# Returns the object.
sub read {
    my $self = shift;
    $self->include_deps( 'YAML::Tiny', 0 );
    require YAML::Tiny;
    my $data = YAML::Tiny::LoadFile('META.yml');

    # Call methods explicitly in case user has already set some values.
    while ( my ( $key, $value ) = each %$data ) {
        next unless $self->can($key);
        unless ( ref $value eq 'HASH' ) {
            $self->can($key)->($self, $value);
            next;
        }
        while ( my ( $module, $version ) = each %$value ) {
            $self->can($key)->($self, $module => $version );
        }
    }
    return $self;
}
# Ask the admin object to write the metadata file. Does nothing for
# non-admins. Returns the object either way.
sub write {
    my $self = shift;
    if ( $self->is_admin ) {
        $self->admin->write_meta;
    }
    return $self;
}
# Take the distribution version from the $VERSION found in $file, and
# register the file as MakeMaker's VERSION_FROM.
sub version_from {
    require ExtUtils::MM_Unix;
    my ( $self, $file ) = @_;
    my @parsed = ExtUtils::MM_Unix->parse_version($file);
    $self->version(@parsed);
    # for version integrity check
    $self->makemaker_args( VERSION_FROM => $file );
}
# Take the distribution abstract from $file, using ExtUtils::MM_Unix's
# parse_abstract on a minimal object that carries only the distribution
# name.
sub abstract_from {
    require ExtUtils::MM_Unix;
    my ( $self, $file ) = @_;
    my $parser = bless(
        { DISTNAME => $self->name },
        'ExtUtils::MM_Unix'
    );
    $self->abstract( $parser->parse_abstract($file) );
}
# Set both the distribution name and the module name from the first
# package statement found in $file. The distribution name is the package
# name with "::" turned into "-"; an already-set module name is kept.
# Dies when no package statement is found.
sub name_from {
    my ($self, $file) = @_;
    my $source = Module::Install::_read($file);
    unless (
        $source =~ m/
^ \s*
package \s*
([\w:]+)
\s* ;
/ixms
    ) {
        die("Cannot determine name from $file\n");
    }
    my $module_name = $1;
    ( my $name = $module_name ) =~ s{::}{-}g;
    $self->name($name);
    unless ( $self->module_name ) {
        $self->module_name($module_name);
    }
}
# Find the first "use VERSION;" / "require VERSION;" statement in the
# given source text and return the version with underscores removed
# (an optional leading "v" is dropped).  Returns nothing when there is no
# such statement.
sub _extract_perl_version {
    my ($perl_version) = $_[0] =~ m/
        ^\s*
        (?:use|require) \s*
        v?
        ([\d_\.]+)
        \s* ;
        /ixms
        or return;

    $perl_version =~ tr/_//d;
    return $perl_version;
}
# Set the minimum perl version from the "use VERSION;" statement of the
# given file.  Warns and returns nothing when no version can be found.
sub perl_version_from {
    my ( $self, $file ) = @_;

    my $perl_version = _extract_perl_version( Module::Install::_read($file) );
    unless ($perl_version) {
        warn "Cannot determine perl version info from $file\n";
        return;
    }

    $self->perl_version($perl_version);
}
# Guess the author from the POD of the given file and store it through
# $self->author.
#
# The text following a "=head AUTHOR(S)" heading is preferred; failing
# that, the text following a copyright notice inside a LICENSE /
# LICENSING / COPYRIGHT / LEGAL section is used.  POD E<...> escapes in
# the result are decoded with whichever mapping table can be loaded.
# Warns when no author can be found.
sub author_from {
    my $self    = shift;
    my $content = Module::Install::_read($_[0]);
    if ($content =~ m/
        =head \d \s+ (?:authors?)\b \s*
        ([^\n]*)
        |
        =head \d \s+ (?:licen[cs]e|licensing|copyright|legal)\b \s*
        .*? copyright .*? \d\d\d[\d.]+ \s* (?:\bby\b)? \s*
        ([^\n]*)
    /ixms) {
        my $author = $1 || $2;

        # XXX: ugly but should work anyway...
        if (eval "require Pod::Escapes; 1") {
            # Pod::Escapes has a mapping table.
            # It's in core of perl >= 5.9.3, and should be installed
            # as one of the Pod::Simple's prereqs, which is a prereq
            # of Pod::Text 3.x (see also below).
            $author =~ s{ E<( (\d+) | ([A-Za-z]+) )> }
            {
                defined $2
                ? chr($2)
                : defined $Pod::Escapes::Name2character_number{$1}
                ? chr($Pod::Escapes::Name2character_number{$1})
                : do {
                    warn "Unknown escape: E<$1>";
                    "E<$1>";
                };
            }gex;
        }
        elsif (eval "require Pod::Text; 1" && $Pod::Text::VERSION < 3) {
            # Pod::Text < 3.0 has yet another mapping table,
            # though the table name of 2.x and 1.x are different.
            # (1.x is in core of Perl < 5.6, 2.x is in core of
            # Perl < 5.9.3)
            my $mapping = ($Pod::Text::VERSION < 2)
                ? \%Pod::Text::HTML_Escapes
                : \%Pod::Text::ESCAPES;
            $author =~ s{ E<( (\d+) | ([A-Za-z]+) )> }
            {
                defined $2
                ? chr($2)
                : defined $mapping->{$1}
                ? $mapping->{$1}
                : do {
                    warn "Unknown escape: E<$1>";
                    "E<$1>";
                };
            }gex;
        }
        else {
            # No mapping table could be loaded: decode only the two
            # escapes for "<" and ">".  Matching a bare "E" here would
            # rewrite every capital E in the author's name.
            $author =~ s{E<lt>}{<}g;
            $author =~ s{E<gt>}{>}g;
        }
        $self->author($author);
    } else {
        warn "Cannot determine author info from $_[0]\n";
    }
}
# Map from the short license identifier stored in the "license" metadata
# field to a URL for that license.  The table was taken from M::B
# (presumably Module::Build -- confirm).  Identifiers mapped to undef have
# no URL; the license() setter only records a "license" resource when the
# entry here is true.
my %license_urls = (
perl => 'http://dev.perl.org/licenses/',
apache => 'http://apache.org/licenses/LICENSE-2.0',
apache_1_1 => 'http://apache.org/licenses/LICENSE-1.1',
artistic => 'http://opensource.org/licenses/artistic-license.php',
artistic_2 => 'http://opensource.org/licenses/artistic-license-2.0.php',
lgpl => 'http://opensource.org/licenses/lgpl-license.php',
lgpl2 => 'http://opensource.org/licenses/lgpl-2.1.php',
lgpl3 => 'http://opensource.org/licenses/lgpl-3.0.html',
bsd => 'http://opensource.org/licenses/bsd-license.php',
gpl => 'http://opensource.org/licenses/gpl-license.php',
gpl2 => 'http://opensource.org/licenses/gpl-2.0.php',
gpl3 => 'http://opensource.org/licenses/gpl-3.0.html',
mit => 'http://opensource.org/licenses/mit-license.php',
mozilla => 'http://opensource.org/licenses/mozilla1.1.php',
open_source => undef,
unrestricted => undef,
restrictive => undef,
unknown => undef,
);
# Get or set the distribution license.
#
# Without arguments, returns the stored license identifier.  With an
# argument, the value is first run through __extract_license (so a phrase
# such as "same terms as perl" becomes "perl"); when that finds nothing the
# lower-cased argument is stored as is.  If %license_urls has a URL for the
# identifier it is recorded as the "license" resource.  Dies on a false
# argument; returns 1 after setting.
sub license {
    my ( $self, @args ) = @_;
    return $self->{values}->{license} unless @args;

    my $license = $args[0] or die(
        'Did not provide a value to license()'
    );

    my $identifier = __extract_license($license) || lc $license;
    $self->{values}->{license} = $identifier;

    # Automatically fill in license URLs
    if ( my $url = $license_urls{$identifier} ) {
        $self->resources( license => $url );
    }

    return 1;
}
# Work out the license identifier from a chunk of POD.
#
# The LICENSE / LICENSING section is examined first; only if it yields
# nothing is the COPYRIGHT(S) / LEGAL section tried.  Each section runs
# from its =head line up to the next =head or =cut (or the end of the
# text) and is handed to __extract_license, whose result is returned.
sub _extract_license {
    my ($pod) = @_;

    my ($license_section) = $pod =~ m/
        (=head \d \s+ L(?i:ICEN[CS]E|ICENSING)\b.*?)
        (=head \d.*|=cut.*|)\z
        /xms;
    my $from_license = __extract_license($license_section);
    return $from_license if $from_license;

    my ($legal_section) = $pod =~ m/
        (=head \d \s+ (?:C(?i:OPYRIGHTS?)|L(?i:EGAL))\b.*?)
        (=head \d.*|=cut.*|)\z
        /xms;
    return __extract_license($legal_section);
}
# Map free-form license text to a short license identifier.
#
# The phrases below are tried in order, case-insensitively and on word
# boundaries, with any run of whitespace in a phrase matching any run of
# whitespace in the text.  The identifier of the first phrase that matches
# is returned; '' is returned when none matches, and nothing at all when
# the text itself is false.
sub __extract_license {
    my ($license_text) = @_;
    return unless $license_text;

    # Each entry: [ phrase, identifier, flag ].  The third field is carried
    # along from the original table but not used by the lookup.
    my @phrase_table = (
        [ '(?:under )?the same (?:terms|license) as (?:perl|the perl (?:\d )?programming language)' => 'perl', 1 ],
        [ '(?:under )?the terms of (?:perl|the perl programming language) itself' => 'perl', 1 ],
        [ 'Artistic and GPL'                  => 'perl',         1 ],
        [ 'GNU general public license'        => 'gpl',          1 ],
        [ 'GNU public license'                => 'gpl',          1 ],
        [ 'GNU lesser general public license' => 'lgpl',         1 ],
        [ 'GNU lesser public license'         => 'lgpl',         1 ],
        [ 'GNU library general public license' => 'lgpl',        1 ],
        [ 'GNU library public license'        => 'lgpl',         1 ],
        [ 'GNU Free Documentation license'    => 'unrestricted', 1 ],
        [ 'GNU Affero General Public License' => 'open_source',  1 ],
        [ '(?:Free)?BSD license'              => 'bsd',          1 ],
        [ 'Artistic license 2\.0'             => 'artistic_2',   1 ],
        [ 'Artistic license'                  => 'artistic',     1 ],
        [ 'Apache (?:Software )?license'      => 'apache',       1 ],
        [ 'GPL'                               => 'gpl',          1 ],
        [ 'LGPL'                              => 'lgpl',         1 ],
        [ 'BSD'                               => 'bsd',          1 ],
        [ 'Artistic'                          => 'artistic',     1 ],
        [ 'MIT'                               => 'mit',          1 ],
        [ 'Mozilla Public License'            => 'mozilla',      1 ],
        [ 'Q Public License'                  => 'open_source',  1 ],
        [ 'OpenSSL License'                   => 'unrestricted', 1 ],
        [ 'SSLeay License'                    => 'unrestricted', 1 ],
        [ 'zlib License'                      => 'open_source',  1 ],
        [ 'proprietary'                       => 'proprietary',  0 ],
    );

    for my $entry (@phrase_table) {
        my ( $phrase, $license ) = @$entry;

        # Let any whitespace in the phrase match any whitespace in the text.
        ( my $pattern = $phrase ) =~ s#\s+#\\s+#gs;
        return $license if $license_text =~ /\b$pattern\b/i;
    }

    return '';
}
# Set the license from the POD of the given file.
# Warns and returns 'unknown' when no license can be recognised.
sub license_from {
    my ( $self, $file ) = @_;

    my $license = _extract_license( Module::Install::_read($file) );
    unless ($license) {
        warn "Cannot determine license info from $file\n";
        return 'unknown';
    }

    $self->license($license);
}
# Collect the bug tracker links found in POD L<...> markup.
#
# Recognised are rt.cpan.org URLs, GitHub ".../issues" URLs and Google
# Code ".../issues/list" URLs.  GitHub user and repository names may
# contain word characters, "-" and ".".  Each distinct link is returned
# once, in the order it first appears in the text.
sub _extract_bugtracker {
    my @links = $_[0] =~ m#L<(
        https?\Q://rt.cpan.org/\E[^>]+|
        https?\Q://github.com/\E[\w.\-]+/[\w.\-]+/issues|
        https?\Q://code.google.com/p/\E[\w_\-]+/issues/list
    )>#gx;

    # Drop duplicates while keeping first-seen order.
    my %seen;
    return grep { !$seen{$_}++ } @links;
}
# Set the bug tracker from the links found in the POD of the given file.
#
# Exactly one recognised link must be present.  Warns and returns 0 when
# there is none or more than one; returns 1 after setting the tracker.
sub bugtracker_from {
    my $self    = shift;
    my $content = Module::Install::_read($_[0]);
    my @links   = _extract_bugtracker($content);
    unless ( @links ) {
        warn "Cannot determine bugtracker info from $_[0]\n";
        return 0;
    }
    if ( @links > 1 ) {
        warn "Found more than one bugtracker link in $_[0]\n";
        return 0;
    }

    # Set the bugtracker.  This has to be a method call on $self, like the
    # other *_from helpers; calling bugtracker() as a plain function would
    # pass the URL where the object is expected.
    $self->bugtracker( $links[0] );
    return 1;
}
# Scan a Perl source file for "use Module VERSION" statements and register
# each module/version pair through $self->requires.
sub requires_from {
    my ( $self, $file ) = @_;
    my $content = Module::Install::_readperl($file);

    # Flat list of captures: module, version, module, version, ...
    my @requires = $content =~ m/^use\s+([^\W\d]\w*(?:::\w+)*)\s+(v?[\d\.]+)/mg;
    while ( @requires ) {
        my ( $module, $version ) = splice( @requires, 0, 2 );
        $self->requires( $module => $version );
    }
}
# Scan a Perl source file for "use Module VERSION" statements and register
# each module/version pair through $self->test_requires.
sub test_requires_from {
    my ( $self, $file ) = @_;
    my $content = Module::Install::_readperl($file);

    # Flat list of captures: module, version, module, version, ...
    my @requires = $content =~ m/^use\s+([^\W\d]\w*(?:::\w+)*)\s+([\d\.]+)/mg;
    while ( @requires ) {
        my ( $module, $version ) = splice( @requires, 0, 2 );
        $self->test_requires( $module => $version );
    }
}
# Normalise a perl version given as the last argument.
#
#   - two-part versions   (eg, 5.8)            become 5.008
#   - three-part versions (eg, 5.6.1 or 5.8.9) become 5.006001 or 5.008009
#   - a trailing "000" sub-version is dropped  (5.008000 becomes 5.008)
#   - anything from the first underscore on is removed
#
# A value that is still a reference after these steps is numified.
sub _perl_version {
    my $version = $_[-1];

    $version =~ s{^([1-9])\.([1-9]\d?\d?)$}{sprintf("%d.%03d",$1,$2)}e;
    $version =~ s{^([1-9])\.([1-9]\d?\d?)\.(0|[1-9]\d?\d?)$}{sprintf("%d.%03d%03d",$1,$2,$3 || 0)}e;
    $version =~ s{(\.\d\d\d)000$}{$1};
    $version =~ s{_.+$}{};

    # Numify
    $version = $version + 0 if ref($version);

    return $version;
}
# Store arbitrary key/value pairs directly in the metadata.
#
# Every pair is written to $self->{values}.  Keys that do not start with
# "x_" are still stored, but trigger a warning.
sub add_metadata {
    my ( $self, %metadata ) = @_;

    for my $key ( keys %metadata ) {
        if ( $key !~ /^x_/ ) {
            warn "add_metadata: $key is not prefixed with 'x_'.\n" .
                 "Use appopriate function to add non-private metadata.\n";
        }
        $self->{values}->{$key} = $metadata{$key};
    }
}
######################################################################
# MYMETA Support
# Deprecated entry point: dies unconditionally, so any remaining caller
# fails loudly.
sub WriteMyMeta {
die "WriteMyMeta has been deprecated";
}
# Write MYMETA.yml into the current directory.
#
# Quietly returns 1 when YAML::Tiny cannot be loaded or when
# _write_mymeta_data has nothing to offer.
sub write_mymeta_yaml {
    my $self = shift;

    # We need YAML::Tiny to write the MYMETA.yml file
    return 1 unless eval { require YAML::Tiny; 1; };

    # Generate the data
    my $meta = $self->_write_mymeta_data or return 1;

    # Save as the MYMETA.yml file
    print "Writing MYMETA.yml\n";
    YAML::Tiny::DumpFile( 'MYMETA.yml', $meta );
}
# Write MYMETA.json into the current directory.
#
# Quietly returns 1 when JSON cannot be loaded or when
# _write_mymeta_data has nothing to offer.
sub write_mymeta_json {
    my $self = shift;

    # We need JSON to write the MYMETA.json file
    return 1 unless eval { require JSON; 1; };

    # Generate the data
    my $meta = $self->_write_mymeta_data or return 1;

    # Save as the MYMETA.json file
    print "Writing MYMETA.json\n";
    my $json = JSON->new->pretty(1)->canonical->encode($meta);
    Module::Install::_write( 'MYMETA.json', $json );
}
# Build the data structure for a MYMETA file.
#
# Starts from the existing META.yml and replaces its requires /
# build_requires with the values currently held by the Meta extension
# (recommends is dropped).  A recorded perl_version is removed from the
# values and put at the front of the requires list as a "perl" entry.
# Returns undef when META.yml is missing or Parse::CPAN::Meta cannot be
# loaded.
sub _write_mymeta_data {
    my $self = shift;

    # If there's no existing META.yml there is nothing we can do
    return undef unless -f 'META.yml';

    # We need Parse::CPAN::Meta to load the file
    return undef unless eval { require Parse::CPAN::Meta; 1; };

    # Merge the perl version into the dependencies
    my $values = $self->Meta->{values};
    my $perl   = delete $values->{perl_version};
    if ($perl) {
        my $requires = ( $values->{requires} ||= [] );

        # Canonize to three-dot version after Perl 5.6
        if ( $perl >= 5.006 ) {
            $perl =~ s{^(\d+)\.(\d\d\d)(\d*)}{join('.', $1, int($2||0), int($3||0))}e
        }
        unshift @$requires, [ perl => $perl ];
    }

    # Load the advisory META.yml file; only its first document is used
    my ($meta) = Parse::CPAN::Meta::LoadFile('META.yml');

    # Overwrite the non-configure dependency hashes
    delete $meta->{$_} for qw( requires build_requires recommends );
    for my $phase (qw( requires build_requires )) {
        next unless exists $values->{$phase};
        $meta->{$phase} = { map { @$_ } @{ $values->{$phase} } };
    }

    return $meta;
}
1;
Web-Scraper-0.37/inc/Module/Install/Repository.pm 000644 000765 000024 00000004256 12040350115 022636 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::Repository;
use strict;
use 5.005;
use vars qw($VERSION);
$VERSION = '0.06';
use base qw(Module::Install::Base);
# Run a command line through backticks and return what it printed, in the
# caller's context.
sub _execute {
    my $command = shift;
    return qx{$command};
}
# Detect the repository URL of the current checkout and store it via
# $self->repository.  Only runs when $Module::Install::AUTHOR is true;
# warns when no URL can be determined.
sub auto_set_repository {
    my ($self) = @_;
    return unless $Module::Install::AUTHOR;

    my $repo = _find_repo( \&_execute );
    unless ($repo) {
        warn "Cannot determine repository URL\n";
        return;
    }

    $self->repository($repo);
}
# Work out the repository URL for the checkout in the current directory.
#
# $execute is a code reference that takes a command line and returns its
# output; every external command goes through it.  The kind of checkout is
# chosen by the first marker found: .git, .svn, _darcs, .hg, then
# $ENV{HOME}/.svk.  Returns the URL, or nothing when none can be found.
sub _find_repo {
    my ($execute) = @_;

    if (-e ".git") {
        # TODO support remote besides 'origin'?
        if ($execute->('git remote show -n origin') =~ /URL: (.*)$/m) {
            # XXX Make it public clone URL, but this only works with github
            my $git_url = $1;
            $git_url =~ s![\w\-]+\@([^:]+):!git://$1/!;
            return $git_url;
        } elsif ($execute->('git svn info') =~ /URL: (.*)$/m) {
            return $1;
        }
    } elsif (-e ".svn") {
        if ($execute->('svn info') =~ /URL: (.*)$/m) {
            return $1;
        }
    } elsif (-e "_darcs") {
        # defaultrepo is better, but that is more likely to be ssh, not http
        if (my $query_repo = $execute->('darcs query repo')) {
            if ($query_repo =~ m!Default Remote: (http://.+)!) {
                return $1;
            }
        }

        # Fall back to the first http:// entry of the repos preference file.
        # A lexical line variable keeps the caller's $_ untouched.
        open my $handle, '<', '_darcs/prefs/repos' or return;
        while (my $line = <$handle>) {
            chomp $line;
            return $line if $line =~ m!^http://!;
        }
    } elsif (-e ".hg") {
        if ($execute->('hg paths') =~ /default = (.*)$/m) {
            my $mercurial_url = $1;
            $mercurial_url =~ s!^ssh://hg\@(bitbucket\.org/)!https://$1!;
            return $mercurial_url;
        }
    } elsif (-e "$ENV{HOME}/.svk") {
        # Is there an explicit way to check if it's an svk checkout?
        my $svk_info = $execute->('svk info') or return;
        SVK_INFO: {
            if ($svk_info =~ /Mirrored From: (.*), Rev\./) {
                return $1;
            }

            # A merged path: look up the info of the mirror path and retry.
            if ($svk_info =~ m!Merged From: (/mirror/.*), Rev\.!) {
                $svk_info = $execute->("svk info /$1") or return;
                redo SVK_INFO;
            }
        }
        return;
    }

    return;
}
1;
__END__
=encoding utf-8
#line 128
Web-Scraper-0.37/inc/Module/Install/Scripts.pm 000644 000765 000024 00000001011 12040350115 022070 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::Scripts;
use strict 'vars';
use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
$VERSION = '1.06';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
# Add scripts to the EXE_FILES list of the MakeMaker arguments.
#
# Each argument is taken as given when it names an existing file;
# otherwise the same name is looked up inside a "script" directory.
# Dies for a script found in neither place.
sub install_script {
    my ( $self, @scripts ) = @_;
    my $args = $self->makemaker_args;
    my $exe  = $args->{EXE_FILES} ||= [];

    for my $script (@scripts) {
        my $path
            = ( -f $script )                            ? $script
            : ( -d 'script' and -f "script/$script" )   ? "script/$script"
            :   die("Cannot find script '$script'");
        push @$exe, $path;
    }
}
1;
Web-Scraper-0.37/inc/Module/Install/Win32.pm 000644 000765 000024 00000003403 12040350115 021352 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::Win32;
use strict;
use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
$VERSION = '1.06';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
# Determine if the user needs nmake, and download it if needed.
#
# Only acts on MSWin32 when perl's configured make is nmake and no nmake
# executable can be run.  In that case Nmake15.exe is fetched next to the
# perl binary through get_file; if that fails the sub dies with manual
# installation instructions.
sub check_nmake {
    my $self = shift;
    $self->load('can_run');
    $self->load('get_file');

    require Config;
    return unless $^O eq 'MSWin32';
    my $make = $Config::Config{make};
    return unless $make and $make =~ /^nmake\b/i;
    return if $self->can_run('nmake');

    print "The required 'nmake' executable not found, fetching it...\n";

    require File::Basename;
    my %download = (
        url       => 'http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe',
        ftp_url   => 'ftp://ftp.microsoft.com/Softlib/MSLFILES/Nmake15.exe',
        local_dir => File::Basename::dirname($^X),
        size      => 51928,
        run       => 'Nmake15.exe /o > nul',
        check_for => 'Nmake.exe',
        remove    => 1,
    );
    my $rv = $self->get_file(%download);

    die <<'END_MESSAGE' unless $rv;
-------------------------------------------------------------------------------
Since you are using Microsoft Windows, you will need the 'nmake' utility
before installation. It's available at:
http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe
or
ftp://ftp.microsoft.com/Softlib/MSLFILES/Nmake15.exe
Please download the file manually, save it to a directory in %PATH% (e.g.
C:\WINDOWS\COMMAND\), then launch the MS-DOS command line shell, "cd" to
that directory, and run "Nmake15.exe" from there; that will create the
'nmake.exe' file needed by this module.
You may then resume the installation process described in README.
-------------------------------------------------------------------------------
END_MESSAGE
}
1;
Web-Scraper-0.37/inc/Module/Install/WriteAll.pm 000644 000765 000024 00000002376 12040350115 022203 0 ustar 00miyagawa staff 000000 000000 #line 1
package Module::Install::WriteAll;
use strict;
use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
$VERSION = '1.06';
@ISA = qw{Module::Install::Base};
$ISCORE = 1;
}
# Write out all generated files for the distribution.
#
# Named options (defaults in parentheses): meta (1), sign (0), inline (0),
# check_nmake (1).  The steps run in this order: signing flag, admin
# WriteAll, nmake check, PL_FILES safeguard, Makefile (or Inline) writing,
# META writing and, when $ENV{X_MYMETA} is set, MYMETA writing.
# Returns 1.
sub WriteAll {
    my ( $self, @options ) = @_;
    my %args = (
        meta        => 1,
        sign        => 0,
        inline      => 0,
        check_nmake => 1,
        @options,
    );

    $self->sign(1)                if $args{sign};
    $self->admin->WriteAll(%args) if $self->is_admin;
    $self->check_nmake            if $args{check_nmake};

    # XXX: This still may be a bit over-defensive...
    # With no PL_FILES given, makemaker(6.25) false and a Build.PL present,
    # set PL_FILES to an empty hash.
    if (    !$self->makemaker_args->{PL_FILES}
        and !$self->makemaker(6.25)
        and -f 'Build.PL' )
    {
        $self->makemaker_args( PL_FILES => {} );
    }

    # Until ExtUtils::MakeMaker support MYMETA.yml, make sure
    # we clean it up properly ourself.
    $self->realclean_files('MYMETA.yml');

    if ( $args{inline} ) {
        $self->Inline->write;
    } else {
        $self->Makefile->write;
    }

    # The Makefile write process adds a couple of dependencies,
    # so write the META.yml files after the Makefile.
    $self->Meta->write if $args{meta};

    # Experimental support for MYMETA
    if ( my $format = $ENV{X_MYMETA} ) {
        my $writer = $format eq 'JSON' ? 'write_mymeta_json' : 'write_mymeta_yaml';
        $self->Meta->$writer;
    }

    return 1;
}
1;
Web-Scraper-0.37/eg/dave-trailer-HD.pl 000755 000765 000024 00000001204 11162225735 020323 0 ustar 00miyagawa staff 000000 000000 #!/usr/bin/perl
use strict;
use warnings;
use lib "lib";
use Web::Scraper;
use URI;
use YAML;
# Extract HD trailers from Dave's trailer page and dump them as YAML.
my $page = URI->new("http://www.drfoster.f2s.com/");

# Turns one movie link into a { text, href } record.
my $movie_link = sub {
    my ($anchor) = @_;
    my %movie = (
        text => $anchor->as_text,
        href => $anchor->attr('href'),
    );
    return \%movie;
};

# One trailer entry: its title, its main link and all of its movie links.
my $trailer = scraper {
    process_first "li>b", title => "TEXT";
    process_first "ul>li>a[href]", url => '@href';
    process "ul>li>ul>li>a", "movies[]" => $movie_link;
};

# The page as a whole: every list item becomes one trailer entry.
my $trailer_page = scraper {
    process "td>ul>li", "trailers[]" => $trailer;
    result "trailers";
};

warn Dump $trailer_page->scrape($page);
Web-Scraper-0.37/eg/ebay-auction.pl 000755 000765 000024 00000001145 11162225735 020037 0 ustar 00miyagawa staff 000000 000000 #!/usr/bin/perl
use strict;
use warnings;
use URI;
use lib "lib";
use Web::Scraper;
# One auction row: description and link, price, and picture.
my $auction_row = scraper {
    process "h3.ens>a", description => 'TEXT', url => '@href';
    process "td.ebcPr>span", price => "TEXT";
    process "div.ebPicture >a>img", image => '@src';
    result 'description', 'url', 'price', 'image';
};

# The result listing: apply the row scraper to every matching table row.
my $listing = scraper {
    process "table.ebItemlist tr.single", "auctions[]" => $auction_row;
    result 'auctions';
};

my $search_url = URI->new("http://search.ebay.com/apple-ipod-nano_W0QQssPageNameZWLRS");
my $auctions   = $listing->scrape($search_url);

use YAML;
warn Dump $auctions;
Web-Scraper-0.37/eg/extract-links.pl 000755 000765 000024 00000000447 11162225735 020253 0 ustar 00miyagawa staff 000000 000000 #!/usr/bin/perl
use strict;
use warnings;
use URI;
use lib "lib";
use Web::Scraper;
# Dump the href of every link on the page named on the command line.
my $target = shift @ARGV or die "URI needed";

my $link_scraper = scraper {
    process "a[href]", "urls[]" => '@href';
    result 'urls';
};

my $page_uri = URI->new($target);
my $links    = $link_scraper->scrape($page_uri);

use YAML;
warn Dump $links;
Web-Scraper-0.37/eg/hatena-keyword.pl 000755 000765 000024 00000001202 11162225735 020373 0 ustar 00miyagawa staff 000000 000000 #!/usr/bin/perl
use strict;
use warnings;
use lib "lib";
use URI;
use Web::Scraper;
# same as http://d.hatena.ne.jp/secondlife/20060922/1158923779
# Picks the title and link, the furigana and the first category from a
# Hatena keyword page.
my $keyword_page = URI->new("http://d.hatena.ne.jp/keyword/%BA%B0%CC%EE%A4%A2%A4%B5%C8%FE");

my $keyword = scraper {
    process 'span.title > a:first-child', title => 'TEXT', url => '@href';
    process 'span.furigana', furigana => 'TEXT';
    process 'ul.list-circle > li:first-child > a', category => 'TEXT';
};

my $res = $keyword->scrape($keyword_page);

use YAML;
warn Dump $res;
__END__
---
category: アイドル
furigana: こんのあさみ
title: 紺野あさ美
url: /keyword/%ba%b0%cc%ee%a4%a2%a4%b5%c8%fe?kid=800
Web-Scraper-0.37/eg/jp-playstation-store.pl 000755 000765 000024 00000000453 11162225735 021570 0 ustar 00miyagawa staff 000000 000000 #!/usr/bin/perl
use strict;
use Web::Scraper;
use URI;
use YAML;
# Print the news links (target and title) of the PlayStation store page
# as YAML.
my $store_page = URI->new("http://www.jp.playstation.com/store/");

# Fields extracted from every news link.
my %news_fields = ( link => '@href', title => 'TEXT' );

my $scraper = scraper {
    process "#Sinfo p a", 'news[]' => \%news_fields;
};

my $result = $scraper->scrape($store_page);
print YAML::Dump $result;
Web-Scraper-0.37/eg/rel-tag.pl 000755 000765 000024 00000000770 11162225735 017015 0 ustar 00miyagawa staff 000000 000000 #!/usr/bin/perl
# Extract tags from web pages that have rel-tag microformat
use strict;
use warnings;
use URI;
use URI::Escape;
use Web::Scraper;
use YAML;
my $page = shift or die "Usage: rel-tag.pl URL\n";

# For every rel-tag link, the tag is the last non-empty path segment of
# its href, with "+" treated as an escaped space and the result unescaped.
my $tag_from_link = sub {
    my $href  = URI->new($_->attr('href'));
    my @parts = grep length, split '/', $href->path;
    my $label = $parts[-1];
    $label =~ s/\+/%20/g;
    uri_unescape($label);
};

my $scraper = scraper {
    process 'a[rel~="tag"]', 'tags[]' => $tag_from_link;
};

warn Dump $scraper->scrape(URI->new($page));
Web-Scraper-0.37/eg/twitter-friends.pl 000755 000765 000024 00000000723 11162225735 020612 0 ustar 00miyagawa staff 000000 000000 #!/usr/bin/perl
use strict;
use warnings;
use lib "lib";
use URI;
use Web::Scraper;
# Dump the contacts listed on a Twitter profile page as YAML.
# The nick is taken from the command line and defaults to "miyagawa".
my $nick = shift || "miyagawa";
my $uri = URI->new("http://twitter.com/$nick");

my $twitter = scraper {
    # "~=" is the CSS attribute operator for "the whitespace-separated
    # list contains this word"; "=~" is not a CSS operator.
    process 'a[rel~="contact"]',
        'friends[]' => scraper {
            process 'a', url => '@href', name => '@title';
            process 'img', src => '@src';
        };
    result 'friends';
};

my $friends = $twitter->scrape($uri);

use YAML;
warn Dump $friends;
Web-Scraper-0.37/bin/scraper 000755 000765 000024 00000005057 11162225735 016667 0 ustar 00miyagawa staff 000000 000000 #!/usr/bin/perl
use strict;
use warnings;
use Config;
use Term::ReadLine;
use Data::Dumper;
use HTML::Entities;
use URI;
use Web::Scraper;
use YAML;
# Returns a callback that warns the markup of the node held in $_:
# a text node is shown as its entity-encoded XML, any other node as HTML.
sub WARN() {
    return sub {
        my $markup = $_->isTextNode
            ? HTML::Entities::encode($_->as_XML, q("'<>&))
            : $_->as_HTML(q('"&<>), "", {});
        warn $markup;
    };
}
# Output helper: send the text through $ENV{PAGER} when one is configured,
# and print it to STDOUT otherwise or when the pager cannot be started.
my $print = sub {
    if ($ENV{PAGER}) {
        # Three-argument pipe open, so the mode cannot be altered by the
        # contents of PAGER.
        if (open my $pager, '|-', $ENV{PAGER}) {
            print $pager @_;
            close $pager;
            return;
        }
        warn "Cannot run pager '$ENV{PAGER}': $!\n";
    }
    print @_;
};
# @stack collects the statements successfully evaluated in the interactive
# loop; $source records where the input came from and is set by process_args.
my(@stack, $source);
# Resolve the input (piped STDIN, URL or existing file) or stop with usage.
my $stuff = process_args($ARGV[0])
or die "Usage: scraper [URI-or-filename]\n";
my $term = Term::ReadLine->new("Web::Scraper");
# The scraper block only hands its first argument to the interactive loop.
my $scraper = scraper { run_loop($_[0], $term) };
# NOTE(review): presumably picks up proxy settings from the environment -- confirm.
$scraper->user_agent->env_proxy;
my $result = $scraper->scrape($stuff);
# Decide what to scrape and remember the origin in $source.
#
#   - content piped on STDIN : returns a reference to the content
#   - an http(s) URL         : returns a URI object
#   - an existing file name  : returns the file's content
#
# Returns nothing when none of these applies.
sub process_args {
    my $uri = shift;

    # Piped input wins over the command-line argument.
    if (!-t STDIN and my $content = join "", <STDIN>) {
        $source = [ 'stdin' ];
        return \$content;
    } elsif ($uri && $uri =~ m!^https?://!) {
        $source = [ "URI", $uri ];
        return URI->new($uri);
    } elsif ($uri && -e $uri) {
        $source = [ 'file', $uri ];
        open my $fh, "<", $uri or die "$uri: $!";
        return join "", <$fh>;
    }

    return;
}
# Interactive loop run from inside the scraper block.
# $tree is the value the scraper block received as its first argument;
# $term is the Term::ReadLine object used for prompting.  Commands:
#   d      - warn the current result through Data::Dumper
#   y      - warn the current result through YAML's Dump
#   s      - show $tree as HTML through $print
#   q      - leave the loop
#   c      - print generated code for the most recently stacked statement
#   c all  - print generated code for every stacked statement
# Any other input is evaluated as Perl; if it did not die it is pushed
# onto @stack.
sub run_loop {
my($tree, $term) = @_;
while (defined(my $in = $term->readline("scraper> "))) {
if ($in eq 'd') {
$Data::Dumper::Indent = 1;
warn Dumper result;
} elsif ($in eq 'y') {
warn Dump result;
} elsif ($in eq 's') {
$print->($tree->as_HTML(q('"&<>), " ", {}));
} elsif ($in eq 'q') {
return;
} elsif ($in eq 'c') {
print generate_code($source, $stack[-1]);
} elsif ($in =~ /^c\s+all\s*$/) {
print generate_code($source, @stack);
} else {
# String eval: the typed code is compiled in this scope, so it can see
# the lexicals visible here (such as $tree).
my $res = eval $in;
warn $@ if $@;
push @stack, $in unless $@;
}
}
}
# Build the text of a stand-alone script that replays the given statements.
#
# $source is the origin record set by process_args ([ 'stdin' ],
# [ 'URI', $uri ] or [ 'file', $name ]); @stack holds the statements to
# place inside the scraper block.  Undefined entries are skipped, and a
# ";" is appended to statements that lack one.  Returns the script text.
sub generate_code {
    my($source, @stack) = @_;

    my $code_stack = join "\n",
        map { " $_" . (/;$/ ? "" : ";") }
        grep { defined } @stack;

    # $var is the variable name used in the generated script, $stuff the
    # expression that initialises it.
    my($var, $stuff) =
        $source->[0] eq 'stdin' ? ('$input', '\join "", <STDIN>') :
        $source->[0] eq 'URI'   ? ('$uri', qq(URI->new("$source->[1]"))) :
        $source->[0] eq 'file'  ? ('$file', qq(\\do { my \$file = "$source->[1]"; open my \$fh, \$file or die "\$file: \$!"; join '', <\$fh> })) :
        '...';

    return <<CODE;
#!/usr/bin/perl
use strict;
use Web::Scraper;
use URI;

my $var = $stuff;
my \$scraper = scraper {
$code_stack
};

my \$result = \$scraper->scrape($var);
CODE
}