SubtableHead Zero | SubtableHead One | SubtableHead Two | SubtableHead Three | SubtableHead Four | SubtableHead Five | SubtableHead Six | SubtableHead Seven | SubtableHead Eight | SubtableHead Nine |
(1,0) | (1,1) | (1,2) | (1,3) | (1,4) | (1,5) | (1,6) | (1,7) | (1,8) | (1,9) |
(2,0) | (2,1) | (2,2) | (2,3) | (2,4) | (2,5) | (2,6) | (2,7) | (2,8) | (2,9) |
(3,0) | (3,1) | (3,2) | (3,3) | (3,4) | (3,5) | (3,6) | (3,7) | (3,8) | (3,9) |
(4,0) | (4,1) | (4,2) | (4,3) | (4,4) | (4,5) | (4,6) | (4,7) | (4,8) | (4,9) |
(5,0) | (5,1) | (5,2) | (5,3) | (5,4) | (5,5)Header Zero | Header One | Header Two | Header Three | Header Four | Header Five | Header Six | Header Seven | Header Eight | Header Nine | (1,0) | (1,1) | (1,2) | (1,3) | (1,4) | (1,5) | (1,6) | (1,7) | (1,8) | (1,9) | (2,0) | (2,1) | (2,2) | (2,3) | (2,4) | (2,5) | (2,6) | (2,7) | (2,8) | (2,9) | (3,0) | (3,1) | (3,2) | (3,3) | (3,4) | (3,5) | (3,6) | (3,7) | (3,8) | (3,9) | (4,0) | (4,1) | (4,2) | (4,3) | (4,4) | (4,5) | (4,6) | (4,7) | (4,8) | (4,9) | (5,0) | (5,1) | (5,2) | (5,3) | (5,4) | (5,5) | (5,6) | (5,7) | (5,8) | (5,9) | (6,0) | (6,1) | (6,2) | (6,3) | (6,4) | (6,5) | (6,6) | (6,7) | (6,8) | (6,9) | (7,0) | (7,1) | (7,2) | (7,3) | (7,4) | (7,5) | (7,6) | (7,7) | (7,8) | (7,9) | (8,0) | (8,1) | (8,2) | (8,3) | (8,4) | (8,5) | (8,6) | (8,7) | (8,8) | (8,9) | (9,0) | (9,1) | (9,2) | (9,3) | (9,4) | (9,5) | (9,6) | (9,7) | (9,8) | (9,9) |
| (5,6) | (5,7) | (5,8) | (5,9) |
(6,0) | (6,1) | (6,2) | (6,3) | (6,4) | (6,5) | (6,6) | (6,7) | (6,8) | (6,9) |
(7,0) | (7,1) | (7,2) | (7,3) | (7,4) | (7,5) | (7,6) | (7,7)Header Zero | Header One | Header Two | Header Three | Header Four | Header Five | Header Six | Header Seven | Header Eight | Header Nine | (1,0) | (1,1) | (1,2) | (1,3) | (1,4) | (1,5) | (1,6) | (1,7) | (1,8) | (1,9) | (2,0) | (2,1) | (2,2) | (2,3) | (2,4) | (2,5) | (2,6) | (2,7) | (2,8) | (2,9) | (3,0) | (3,1) | (3,2) | (3,3) | (3,4) | (3,5) | (3,6) | (3,7) | (3,8) | (3,9) | (4,0) | (4,1) | (4,2) | (4,3) | (4,4) | (4,5) | (4,6) | (4,7) | (4,8) | (4,9) | (5,0) | (5,1) | (5,2) | (5,3) | (5,4) | (5,5) | (5,6) | (5,7) | (5,8) | (5,9) | (6,0) | (6,1) | (6,2) | (6,3) | (6,4) | (6,5) | (6,6) | (6,7) | (6,8) | (6,9) | (7,0) | (7,1) | (7,2) | (7,3) | (7,4) | (7,5) | (7,6) | (7,7) | (7,8) | (7,9) | (8,0) | (8,1) | (8,2) | (8,3) | (8,4) | (8,5) | (8,6) | (8,7) | (8,8) | (8,9) | (9,0) | (9,1) | (9,2) | (9,3) | (9,4) | (9,5) | (9,6) | (9,7) | (9,8) | (9,9) |
| (7,8) | (7,9) |
(8,0) | (8,1) | (8,2) | (8,3) | (8,4) | (8,5) | (8,6) | (8,7) | (8,8) | (8,9) |
(9,0) | (9,1) | (9,2) | (9,3) | (9,4) | (9,5) | (9,6) | (9,7) | (9,8) | (9,9) |
HTML-TableExtract-2.11/t/15_depth_count.t 0000755 0001750 0001750 00000000765 10321276257 016556 0 ustar sisk sisk #!/usr/bin/perl
use strict;
use lib './lib';
use Test::More tests => 112;
use FindBin;
use lib $FindBin::RealBin;
use testload;
use HTML::TableExtract;

# Select tables by depth and count together, then verify that exactly
# one table was extracted and that its contents pass good_data().

my $html_path = "$Dat_Dir/basic.html";
my $desc      = 'by depth and count';

my $extractor = HTML::TableExtract->new(depth => 0, count => 2);
ok($extractor->parse_file($html_path), "$desc (parse_file)");

my @found = $extractor->tables;
cmp_ok(scalar(@found), '==', 1, "$desc (extract count)");

# Data check for every extracted table.
good_data($_, "$desc (data)") for @found;
HTML-TableExtract-2.11/t/14_headers.t 0000755 0001750 0001750 00000001406 10376136540 015645 0 ustar sisk sisk #!/usr/bin/perl
use strict;
use lib './lib';
use Test::More tests => 464;
use FindBin;
use lib $FindBin::RealBin;
use testload;
use HTML::TableExtract;

# Select tables by header text. Two passes over the same document:
# first with five headers (full data check), then with two headers
# (slice data check).

my $html_path = "$Dat_Dir/basic.html";
my $desc      = 'by headers';

# Pass 1: five headers.
my $extractor = HTML::TableExtract->new(
  headers => [qw(Eight Six Four Two Zero)],
);
ok($extractor->parse_file($html_path), "$desc (parse_file)");
my @found = $extractor->tables;
cmp_ok(scalar(@found), '==', 5, "$desc (extract count)");
good_data($_, "$desc (data)") for @found;

# Pass 2: two headers, checked as a column slice.
$extractor = HTML::TableExtract->new(
  headers => [qw(Eight Two)],
);
ok($extractor->parse_file($html_path), "$desc (parse_file)");
@found = $extractor->tables;
cmp_ok(scalar(@found), '==', 5, "$desc (extract count)");
good_slice_data($_, "$desc (data)", 0, 3) for @found;
HTML-TableExtract-2.11/lib/ 0000755 0001750 0001750 00000000000 11625004044 014027 5 ustar sisk sisk HTML-TableExtract-2.11/lib/HTML/ 0000755 0001750 0001750 00000000000 11625004044 014573 5 ustar sisk sisk HTML-TableExtract-2.11/lib/HTML/TableExtract.pm 0000644 0001750 0001750 00000153512 11625003542 017524 0 ustar sisk sisk package HTML::TableExtract;
# This package extracts tables from HTML. Tables of interest may be
# specified using header information, depth, order in a depth, table tag
# attributes, or some combination of the four. See the POD for more
# information.
#
# Author: Matthew P. Sisk. See the POD for copyright information.
use strict;
use Carp;
use vars qw($VERSION @ISA);
$VERSION = '2.11';
use HTML::Parser;
@ISA = qw(HTML::Parser);
use HTML::Entities;
# Parent-class selection at import time. The class inherits from
# HTML::Parser unless it is loaded as 'use HTML::TableExtract qw(tree);',
# in which case @ISA is switched to HTML::TreeBuilder. TREE() is a mode
# constant: it returns 1 when import was given arguments, 0 otherwise.
BEGIN { *TREE = sub { 0 } }

# import(CLASS, [MODE])
#
# With no MODE: installs TREE() => 0 and returns nothing.
# With MODE 'tree': installs TREE() => 1, loads HTML::TreeBuilder and
# HTML::ElementTable (version 1.17 or later), re-parents the class and
# returns CLASS. Any other MODE, or a failure to load either module,
# croaks.
sub import {
  my($class, @args) = @_;
  no warnings;

  if (!@args) {
    *TREE = sub { 0 };
    return;
  }
  *TREE = sub { 1 };

  my $mode = $args[0];
  $mode eq 'tree' or croak "Unknown mode '$mode'\n";

  eval "use HTML::TreeBuilder";
  if ($@) {
    croak "Problem loading HTML::TreeBuilder : $@\n";
  }

  eval "use HTML::ElementTable 1.17";
  if ($@) {
    croak "problem loading HTML::ElementTable : $@\n";
  }

  @ISA = ('HTML::TreeBuilder');
  return $class;
}
# Backwards compatibility for deprecated methods. Each old name on the
# left is made a typeglob alias of the current name on the right, so a
# call through the old name reaches whatever is installed under the new
# name.
*table_state = *table;
*table_states = *tables;
*first_table_state_found = *first_table_found;
###
# Parameters recognized by the constructor, with the value each takes
# when the caller does not supply one (or supplies undef).
my %Defaults = (
  headers             => undef,
  depth               => undef,
  count               => undef,
  attribs             => undef,
  subtables           => undef,
  gridmap             => 1,
  decode              => 1,
  automap             => 1,
  slice_columns       => 1,
  keep_headers        => 0,
  br_translate        => 1,
  error_handle        => \*STDOUT,
  debug               => 0,
  keep_html           => 0,
  strip_html_on_match => 1,
);

# Regex source matching any one recognized parameter name. The
# alternation is wrapped in a non-capturing group so that anchors placed
# around it (as in /^$Dpat$/) apply to every alternative; without the
# group, '^' would bind only to the first name and '$' only to the last,
# letting any string that merely contains a parameter name match.
my $Dpat = '(?:' . join('|', map { quotemeta } sort keys %Defaults) . ')';
### Constructor
# new(KEY => VALUE, ...)
#
# Constructor. Arguments are key/value pairs. A key that is exactly one
# of the names in %Defaults is consumed here and stored on the object;
# every other pair is passed through unchanged to the parent class
# constructor. The 'headers' value must be an array reference, otherwise
# the call croaks. Parameters that are omitted or undef take their value
# from %Defaults. Returns the new object.
sub new {
  my $that  = shift;
  my $class = ref($that) || $that;

  my(%pass, %parms);
  while (my($k, $v) = splice(@_, 0, 2)) {
    if ($k eq 'headers') {
      ref $v eq 'ARRAY'
        or croak "Param '$k' must be passed in ref to array\n";
      $parms{$k} = $v;
    }
    # Exact lookup rather than a pattern match: an unparenthesized
    # alternation between '^' and '$' anchors only its first and last
    # alternatives, which would claim any key that merely contains a
    # parameter name instead of handing it to the parent class.
    elsif (exists $Defaults{$k}) {
      $parms{$k} = $v;
    }
    else {
      $pass{$k} = $v;
    }
  }

  my $self = $class->SUPER::new(%pass);
  bless $self, $class;

  # %parms only ever holds keys of %Defaults, so one pass over the
  # defaults covers everything; an undef value falls back to the default.
  foreach my $name (keys %Defaults) {
    $self->{$name} = defined $parms{$name} ? $parms{$name} : $Defaults{$name};
  }

  if ($self->{headers}) {
    $self->_emsg("TE here, headers: ", join(',', @{$self->{headers}}), "\n")
      if $self->{debug};
    # gridmap is always switched on when headers are given.
    $self->{gridmap} = 1;
  }

  # Initialize counts and containers
  $self->_reset_state;

  $self;
}
### HTML::Parser overrides
sub start {
my $self = shift;
my @res;
@res = $self->SUPER::start(@_) if TREE();
# Create a new table state if entering a table.
if ($_[0] eq 'table') {
my $ts = $self->_enter_table(@_);
$ts->tree($res[0]) if @res;
}
# Rows and cells are next.
if ($self->{_in_a_table}) {
my $ts = $self->current_table;
my $skiptag = 0;
if ($_[0] eq 'tr') {
$ts->_enter_row;
++$skiptag;
}
elsif ($_[0] eq 'td' || $_[0] eq 'th') {
$ts->_enter_cell(@_);
my %attrs = ref $_[1] ? %{$_[1]} : {};
my $rspan = $attrs{rowspan} || 1;
my $cspan = $attrs{colspan} || 1;
$ts->_rasterizer->($ts->row_count, $rspan, $cspan);
$ts->_anchor_item(@res);
++$skiptag;
}
if ($self->{keep_html} && !$skiptag) {
$self->text($_[3]);
}
}
# Replace