tags
## unless we're stacking (bug #5723)
if(@TEXT and exists $TAGS{$Tag}) {
$TEXT[-1]->{_TEXT} .= $T->[-1] ; # $TEXT[-1]->{_TEXT} .= $T->as_is;
my $pop = pop @TEXT;
$TEXT[-1]->{_TEXT} .= $pop->{_TEXT} if @TEXT;
$pop->{_TEXT} = _stripHTML( \$pop->{_TEXT} ) if $self->strip;
$self->{_cb}->($self, $pop) if exists $self->{_cb};
}
}
if(defined $NL) {
$$NL{tag} = $Tag;
my $base = $self->{_base};
for my $at( @VALID_URL_ATTRIBUTES ) {
if( exists $$NL{$at} ) {
$$NL{$at} = URI->new_abs( $$NL{$at}, $base) if $base;
}
}
if(exists $self->{_cb}) {
$self->{_cb}->($self, $NL ) if not $got_TAGS_IN_NEED or not @TEXT; #bug#5470
} else {
push @{$self->{_LINKS}}, $NL;
}
}
}## endof while (my $token = $p->get_token)
undef $self->{_tp};
return();
}
sub links {
my $self = shift;
## just like HTML::LinkExtor's
return $self->{_LINKS};
}
sub _stripHTML {
my $HtmlRef = shift;
my $tp = new HTML::TokeParser( $HtmlRef ); # my $tp = new HTML::TokeParser::Simple( $HtmlRef );
my $t = $tp->get_token(); # MUST BE A START TAG (@TAGS_IN_NEED)
# otherwise it ain't come from LinkExtractor
if($t->[0] eq 'S' ) { # if($t->is_start_tag) {
return $tp->get_trimmed_text( '/'.$t->[1] ); # return $tp->get_trimmed_text( '/'.$t->return_tag );
} else {
require Data::Dumper;
local $Data::Dumper::Indent=1;
die " IMPOSSIBLE!!!! ",
Data::Dumper::Dumper(
'$HtmlRef',$HtmlRef,
'$t', $t,
);
}
}
1;
package main;
unless(caller()) {
require Data::Dumper;
if(@ARGV) {
for my $file( @ARGV ) {
if( -e $file ) {
my $LX = new HTML::LinkExtractor( );
$LX->parse( $file );
print Data::Dumper::Dumper($LX->links);
undef $LX;
} else {
warn "The file `$file' doesn't exist\n";
}
}
} else {
my $INPUT = q{
COUNT THEM BOYS AND GIRLS, LINKS OUTGHT TO HAVE 9 ELEMENTS.
1
2
3
4 Perlmonks.org
5
hello there
6
7 now
8 To be or not to be.
9
Just Another Perl Hacker,
};
my $LX = new HTML::LinkExtractor();
$LX->parse(\$INPUT);
print scalar(@{$LX->links()})." we GOT\n";
print Data::Dumper::Dumper( $LX->links() );
}
}
__END__
=head1 NAME
HTML::LinkExtractor - Extract I> from an HTML document
=head1 DESCRIPTION
HTML::LinkExtractor is used for extracting links from HTML.
It is very similar to L,
except that besides getting the URL, you also get the link-text.
Example ( B ):
use HTML::LinkExtractor;
use Data::Dumper;
my $input = q{If I am a LINK!!! };
my $LX = new HTML::LinkExtractor();
$LX->parse(\$input);
print Dumper($LX->links);
__END__
# the above example will yield
$VAR1 = [
{
'_TEXT' => ' I am a LINK!!! ',
'href' => bless(do{\(my $o = 'http://perl.com/')}, 'URI::http'),
'tag' => 'a'
}
];
C will also correctly extract nested
I> tags.
=head1 SYNOPSIS
## the demo
perl LinkExtractor.pm
perl LinkExtractor.pm file.html othefile.html
## or if the module is installed, but you don't know where
perl -MHTML::LinkExtractor -e" system $^X, $INC{q{HTML/LinkExtractor.pm}} "
perl -MHTML::LinkExtractor -e' system $^X, $INC{q{HTML/LinkExtractor.pm}} '
## or
use HTML::LinkExtractor;
use LWP qw( get ); # use LWP::Simple qw( get );
my $base = 'http://search.cpan.org';
my $html = get($base.'/recent');
my $LX = new HTML::LinkExtractor();
$LX->parse(\$html);
print qq{\n};
for my $Link( @{ $LX->links } ) {
## new modules are linked by /author/NAME/Dist
if( $$Link{href}=~ m{^\/author\/\w+} ) {
print $$Link{_TEXT}."\n";
}
}
undef $LX;
__END__
## or
use HTML::LinkExtractor;
use Data::Dumper;
my $input = q{If I am a LINK!!! };
my $LX = new HTML::LinkExtractor(
sub {
print Data::Dumper::Dumper(@_);
},
'http://perlFox.org/',
);
$LX->parse(\$input);
$LX->strip(1);
$LX->parse(\$input);
__END__
#### Calculate to total size of a web-page
#### adds up the sizes of all the images and stylesheets and stuff
use strict;
use LWP; # use LWP::Simple;
use HTML::LinkExtractor;
#
my $url = shift || 'http://www.google.com';
my $html = get($url);
my $Total = length $html;
#
print "initial size $Total\n";
#
my $LX = new HTML::LinkExtractor(
sub {
my( $X, $tag ) = @_;
#
unless( grep {$_ eq $tag->{tag} } @HTML::LinkExtractor::TAGS_IN_NEED ) {
#
print "$$tag{tag}\n";
#
for my $urlAttr ( @{$HTML::LinkExtractor::TAGS{$$tag{tag}}} ) {
if( exists $$tag{$urlAttr} ) {
my $size = (head( $$tag{$urlAttr} ))[1];
$Total += $size if $size;
print "adding $size\n" if $size;
}
}
}
},
$url,
0
);
#
$LX->parse(\$html);
#
print "The total size of \n$url\n is $Total bytes\n";
__END__
=head1 METHODS
=head2 C<$LX-Enew([\&callback, [$baseUrl, [1]]])>
Accepts 3 arguments, all of which are optional.
If for example you want to pass a C<$baseUrl>, but don't
want to have a callback invoked, just put C in place of a subref.
This is the only class method.
=over 4
=item 1
a callback ( a sub reference, as in C, or C<\&sub>)
which is to be called each time a new LINK is encountered
( for C<@HTML::LinkExtractor::TAGS_IN_NEED> this means
after the closing tag is encountered )
The callback receives an object reference(C<$LX>) and a link hashref.
=item 2
and a base URL ( URI->new, so its up to you to make sure it's valid
which is used to convert all relative URI's to absolute ones.
$ALinkP{href} = URI->new_abs( $ALink{href}, $base );
=item 3
A "boolean" (just stick with 1).
See the example in L<"DESCRIPTION">.
Normally, you'd get back _TEXT that looks like
'_TEXT' => ' I am a LINK!!! ',
If you turn this option on, you'll get the following instead
'_TEXT' => ' I am a LINK!!! ',
The private utility function C<_stripHTML> does this
by using Ls
method get_trimmed_text.
You can turn this feature on an off by using
C<$LX-Estrip(undef EE 0 EE 1)>
=back
=head2 C<$LX-Eparse( $filename EE *FILEHANDLE EE \$FileContent )>
Each time you call C, you should pass it a
C<$filename> a C<*FILEHANDLE> or a C<\$FileContent>
Each time you call C a new C object
is created and stored in C<$this-E{_tp}>.
You shouldn't need to mess with the TokeParser object.
=head2 C<$LX-Elinks()>
Only after you call C will this method return anything.
This method returns a reference to an ArrayOfHashes,
which basically looks like (Data::Dumper output)
$VAR1 = [ { tag => 'img', src => 'image.png' }, ];
Please note that if yo provide a callback this array will be empty.
=head2 C<$LX-Estrip( [ 0 || 1 ])>
If you pass in C (or nothing), returns the state of the option.
Passing in a true or false value sets the option.
If you wanna know what the option does see
Lnew([\&callback, [$baseUrl, [1]]])>|/"METHODS">
=head1 WHAT'S A LINK-type tag
Take a look at C<%HTML::LinkExtractor::TAGS> to see
what I consider to be link-type-tag.
Take a look at C<@HTML::LinkExtractor::VALID_URL_ATTRIBUTES> to see
all the possible tag attributes which can contain URI's (the links!!)
Take a look at C<@HTML::LinkExtractor::TAGS_IN_NEED> to see
the tags for which the C<'_TEXT'> attribute is provided,
like Ca href="#"E TEST E/aE>
=head2 How can that be?!?!
I took at look at L|HTML::Tagset>
and the following URL's
http://www.blooberry.com/indexdot/html/tagindex/all.htm
http://www.blooberry.com/indexdot/html/tagpages/a/a-hyperlink.htm
http://www.blooberry.com/indexdot/html/tagpages/a/applet.htm
http://www.blooberry.com/indexdot/html/tagpages/a/area.htm
http://www.blooberry.com/indexdot/html/tagpages/b/base.htm
http://www.blooberry.com/indexdot/html/tagpages/b/bgsound.htm
http://www.blooberry.com/indexdot/html/tagpages/d/del.htm
http://www.blooberry.com/indexdot/html/tagpages/d/div.htm
http://www.blooberry.com/indexdot/html/tagpages/e/embed.htm
http://www.blooberry.com/indexdot/html/tagpages/f/frame.htm
http://www.blooberry.com/indexdot/html/tagpages/i/ins.htm
http://www.blooberry.com/indexdot/html/tagpages/i/image.htm
http://www.blooberry.com/indexdot/html/tagpages/i/iframe.htm
http://www.blooberry.com/indexdot/html/tagpages/i/ilayer.htm
http://www.blooberry.com/indexdot/html/tagpages/i/inputimage.htm
http://www.blooberry.com/indexdot/html/tagpages/l/layer.htm
http://www.blooberry.com/indexdot/html/tagpages/l/link.htm
http://www.blooberry.com/indexdot/html/tagpages/o/object.htm
http://www.blooberry.com/indexdot/html/tagpages/q/q.htm
http://www.blooberry.com/indexdot/html/tagpages/s/script.htm
http://www.blooberry.com/indexdot/html/tagpages/s/sound.htm
And the special cases
http://www.blooberry.com/indexdot/html/tagpages/d/doctype.htm
'!doctype' is really a process instruction, but is still listed
in %TAGS with 'url' as the attribute
and
http://www.blooberry.com/indexdot/html/tagpages/m/meta.htm
If there is a valid url, 'url' is set as the attribute.
The meta tag has no 'attributes' listed in %TAGS.
=head1 SEE ALSO
L, L, L.
=head1 AUTHOR
D.H (PodMaster)
Please use http://rt.cpan.org/ to report bugs.
Just go to
http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-Scrubber
to see a bug list and/or repot new ones.
=head1 LICENSE
Copyright (c) 2003, 2004 by D.H. (PodMaster).
All rights reserved.
This module is free software;
you can redistribute it and/or modify it under
the same terms as Perl itself.
The LICENSE file contains the full text of the license.
=cut
HTML-LinkExtractor-0.13/Makefile.PL 0100777 0000620 0000621 00000001547 10126723435 015743 0 ustar __ mkpasswd use ExtUtils::MakeMaker;
# See lib/ExtUtils/MakeMaker.pm for details of how to influence
# the contents of the Makefile that is written.
WriteMakefile(
'NAME' => 'HTML::LinkExtractor',
'VERSION_FROM' => 'LinkExtractor.pm', # finds $VERSION
PREREQ_PM => {
# 'HTML::TokeParser::Simple' => '2', # or a minimum workable version
'HTML::TokeParser' => 0,
},
($] >= 5.005 ? ## Add these new keywords supported since 5.005
(ABSTRACT_FROM => 'LinkExtractor.pm', # retrieve abstract from module
AUTHOR => 'PodMaster') : ()),
);
__END__
perl Makefile.PL && nmake realclean && cls && perl Makefile.PL && nmake disttest
nmake dist TAR=ptar TARFLAGS=-zcvf COMPRESS=echo
perl -e"my $f=glob q,*.tar,;warn rename $f, qq,$f.gz,"
chmod 7777 *.gz
perl -le" `cpan-upload $_` for( (sort glob q,*.gz,)[-1]) "
HTML-LinkExtractor-0.13/MANIFEST 0100777 0000620 0000621 00000000233 07767367025 015131 0 ustar __ mkpasswd Changes
LinkExtractor.pm
Makefile.PL
MANIFEST
README
test.pl
LICENSE
META.yml Module meta-data (added by MakeMaker)
HTML-LinkExtractor-0.13/META.yml 0100777 0000620 0000621 00000000547 10167356226 015246 0 ustar __ mkpasswd # http://module-build.sourceforge.net/META-spec.html
#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX#
name: HTML-LinkExtractor
version: 0.13
version_from: LinkExtractor.pm
installdirs: site
requires:
HTML::TokeParser: 0
distribution_type: module
generated_by: ExtUtils::MakeMaker version 6.25
HTML-LinkExtractor-0.13/README 0100777 0000620 0000621 00000000516 10017737313 014643 0 ustar __ mkpasswd INSTALLATION
To install this module type the following:
perl Makefile.PL
make
make test
make install
If you are on a windows box you should use 'nmake' rather than 'make'.
For documentation, try
perldoc HTML::LinkExtractor
perldoc LinkExtractor.pm
pod2text LinkExtractor.pm >>README
HTML-LinkExtractor-0.13/test.pl 0100777 0000620 0000621 00000003500 10126723325 015272 0 ustar __ mkpasswd # Before `make install' is performed this script should be runnable with
# `make test'. After `make install' it should work as `perl test.pl'
#########################
# change 'tests => 1' to 'tests => last_test_to_print';
use Test;
BEGIN {
plan(
tests => 12,
onfail => sub { exit @_ },
);
}
use HTML::LinkExtractor;
ok(1); # If we made it this far, we're ok.
my $output = `$^X $INC{'HTML/LinkExtractor.pm'}`;
#use Data::Dumper;die Dumper $output;
ok( $output =~ m{9 we GOT} or 0 );
ok( $output =~ m{\Q'cite' => 'http://www.stonehenge.com/merlyn/'} or 0 );
ok( $output =~ m{\Q'url' => 'http://www.foo.com/foo.html'} or 0 );
my $LX = HTML::LinkExtractor::->new(undef,undef,1);
ok( $LX->strip or 0 );
ok( $LX->strip(1) && $LX->strip or 0 );
$LX->parse(\ q{ stuff that matters } );
#use Data::Dumper;warn Dumper( scalar $LX->links );
ok( $LX->links->[0]->{_TEXT} eq "stuff that matters" or 0);
$LX = HTML::LinkExtractor::->new(
sub {
my( $lx, $link ) = @_;
$output = $link->{_TEXT};
},
'http://use.perl.org', 1
);
ok(1);
$LX->parse(\ q{
perl guy
} );
ok( $output eq 'perl guy' or 0 );
ok( @{ $LX->links } == 0 ? 1 : 0 );
# bug#5470
$output = [];
$LX = HTML::LinkExtractor::->new(
sub {
my( $lx, $link ) = @_;
push @$output,$link;
},
'http://use.perl.org', 1
);
$LX->parse(\ q{
} );
ok( @$output == 2 );
$LX = HTML::LinkExtractor::->new(undef, 'http://use.perl.org', 1 );
$LX->parse(\ q{
} );
ok( @{ $LX->links } == 2 );