HTML-Strip-1.06/Changes 0100644 0001767 0001767 00000003212 10373073265 013267 0 ustar alex alex Revision history for Perl extension HTML::Strip.
1.06 Fri Feb 10 11:18:35 2006
- documented 'set_decode_entities' method
1.05 Thu Feb 9 12:11:50 2006
- added 'set_decode_entities' method
1.04 Mon Jan 24 16:41:51 2005
- Replaced all instances of strcmp with strcasecmp to make the
module case-insensitive towards HTML tag names
1.03 Wed Jul 7 13:42:26 2004
- Added 'emit_spaces' configuration option which can turn off
attempted conversion of HTML tags into spaces
- Constructor options now passed in a hash
1.02 Tue Feb 24 16:24:18 2004
- Yet more checks to prevent extraneous whitespace
- Added many more tests
1.01 Mon Jul 7 18:15:59 2003
- Removed provision for escaped quotes in attributes values
- More checks to prevent the outputting of extraneous whitespace
1.00 Wed Jun 11 12:05:47 2003
- rewritten in C, using a struct for each object to keep track
of state and striptags
0.05 Thu May 22 19:49:25 2003
- removed "XSOPT => '-C++'" from Makefile.PL as it was
unnecessary and causing problems for some people
- added "#include " to strip_html.cpp as its
absence was causing problems for some people
0.04 Sun Mar 23 12:45:13 2003
- Tweaked docs, added FAQ explaining why 0.03 failed cpan testing
0.03 Sat Mar 22 11:20:34 2003
- rewritten in C++ to make striptags an attribute of each
object
0.02 Mon Mar 17 18:20:01 2003
- added set_striptags() method
- documented module
0.01 Tue Mar 4 18:17:38 2003
- original version; created by h2xs 1.21 with options
-A -n HTML::Strip html_strip.h
HTML-Strip-1.06/Makefile.PL 0100644 0001767 0001767 00000001414 10373072621 013743 0 ustar alex alex use ExtUtils::MakeMaker;
# See lib/ExtUtils/MakeMaker.pm for details of how to influence
# the contents of the Makefile that is written.
# Build configuration for the HTML::Strip distribution.
# See lib/ExtUtils/MakeMaker.pm for the meaning of each key.
my %makefile_args = (
    NAME         => 'HTML::Strip',
    VERSION_FROM => 'Strip.pm',       # $VERSION is read from the module
    PREREQ_PM    => {},               # e.g., Module::Name => 1.1
    LIBS         => [''],             # e.g., '-lm'
    DEFINE       => '',               # e.g., '-DHAVE_SOMETHING'
    INC          => '',               # e.g., '-I/usr/include/other'
    OBJECT       => '$(O_FILES)',     # link all the C files too
);

# ABSTRACT_FROM and AUTHOR are only passed on perl 5.005 and later.
if ( $] >= 5.005 ) {
    $makefile_args{ABSTRACT_FROM} = 'Strip.pm';    # abstract taken from the module
    $makefile_args{AUTHOR}        = 'Alex Bowley ';
}

WriteMakefile(%makefile_args);
HTML-Strip-1.06/MANIFEST 0100644 0001767 0001767 00000000140 10373072621 013115 0 ustar alex alex Changes
Makefile.PL
MANIFEST
README
Strip.pm
Strip.xs
strip_html.h
strip_html.c
typemap
test.pl
HTML-Strip-1.06/README 0100644 0001767 0001767 00000001046 10373072621 012652 0 ustar alex alex HTML::Strip
===========
This module strips HTML-like markup from text.
It is written in XS, and thus about five times quicker than using
regular expressions for the same task.
INSTALLATION
To install this module type the following:
perl Makefile.PL
make
make test
make install
COPYRIGHT AND LICENCE
Please report any bugs/suggestions to Alex Bowley
Copyright (c) 2003 Alex Bowley. All rights reserved.
This program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself. HTML-Strip-1.06/Strip.pm 0100644 0001767 0001767 00000013713 10373073210 013430 0 ustar alex alex package HTML::Strip;
use 5.006;
use warnings;
use strict;
use Carp qw( carp croak );
require Exporter;
require DynaLoader;
our @ISA = qw(Exporter DynaLoader);
# Export configuration. Every list below is empty, so nothing is exported
# either by default or on request. The ':all' tag is still declared so
# that the declaration  use HTML::Strip ':all';  is allowed.
our %EXPORT_TAGS = ( all => [] );
our @EXPORT_OK   = @{ $EXPORT_TAGS{all} };
our @EXPORT      = ();

our $VERSION = '1.06';

# Load the compiled part of the module (bootstrap comes from DynaLoader
# through @ISA).
HTML::Strip->bootstrap($VERSION);
# Preloaded methods go here.
# True when the optional HTML::Entities module could be loaded, undef
# otherwise; parse() checks this flag before decoding entities. A block
# eval is used instead of a string eval so no code is compiled from a
# string at runtime; the trailing 1 gives a true value on success.
my $_html_entities_p = eval { require HTML::Entities; 1 };

# Settings that new() applies before any caller-supplied overrides.
my %defaults = (
    striptags       => [qw( title style script applet )],
    emit_spaces     => 1,
    decode_entities => 1,
);
# Constructor. Takes optional key/value settings which are layered over
# %defaults; each key KEY is applied by calling the object's set_KEY
# method. A key with no matching setter only produces a warning via carp.
sub new {
    my ( $class, @overrides ) = @_;

    my $self = create();
    bless $self, $class;

    # Later pairs win, so caller-supplied settings override the defaults.
    my %settings = ( %defaults, @overrides );

    for my $name ( keys %settings ) {
        my $setter = "set_${name}";
        unless ( $self->can($setter) ) {
            carp "Invalid setting '$name'";
            next;
        }
        $self->$setter( $settings{$name} );
    }

    return $self;
}
# Sets the list of strip tags. Accepts either a single array reference
# or a plain list of tag names; both forms are handed to
# set_striptags_ref() as an array reference.
sub set_striptags {
    my $self = shift;

    # A leading array ref is passed through as-is; anything else is
    # treated as a flat list and copied into a fresh array.
    my $tags_ref = ref( $_[0] ) eq 'ARRAY' ? $_[0] : [@_];

    return $self->set_striptags_ref($tags_ref);
}
# Strips markup from $text with strip_html() and returns the result.
# Entities are decoded afterwards only when the object's decode_entities
# flag is set and HTML::Entities was successfully loaded.
sub parse {
    my $self = shift;
    my ($text) = @_;

    my $stripped = $self->strip_html($text);

    # Skip decoding when it is switched off or the module is unavailable.
    return $stripped
        unless $self->decode_entities && $_html_entities_p;

    $stripped = HTML::Entities::decode($stripped);
    return $stripped;
}
# Marks the end of a document: delegates to the object's reset() method
# (implemented outside this file) and returns whatever it returns.
sub eof {
    my ($self) = @_;
    return $self->reset();
}
1;
__END__
# Below is stub documentation for your module. You better edit it!
=head1 NAME
HTML::Strip - Perl extension for stripping HTML markup from text.
=head1 SYNOPSIS
use HTML::Strip;
my $hs = HTML::Strip->new();
my $clean_text = $hs->parse( $raw_html );
$hs->eof;
=head1 DESCRIPTION
This module simply strips HTML-like markup from text in a very quick
and brutal manner. It could quite easily be used to strip XML or SGML
from text as well; but removing HTML markup is a much more common
problem, hence this module lives in the HTML:: namespace.
It is written in XS, and thus about five times quicker than using
regular expressions for the same task.
It does I<not> do any syntax checking (if you want that, use
L<HTML::Parser>), instead it merely applies the following rules:
=over 4
=item 1
Anything that looks like a tag, or group of tags will be replaced with
a single space character. Tags are considered to be anything that
starts with a C<E<lt>> and ends with a C<E<gt>>; with the caveat that a
C<E<gt>> character may appear in either of the following without
ending the tag:
=over 4
=item Quote
Quotes are considered to start with either a C<'> or a C<"> character,
and end with a matching character I<not> preceded by an odd number of
escaping slashes (i.e. C<\"> does not end the quote but C<\\\\"> does).
=item Comment
If the tag starts with an exclamation mark, it is assumed to be a
declaration or a comment. Within such tags, C<E<gt>> characters do not
end the tag if they appear within pairs of double dashes (e.g. C<E<lt>!--
E<lt>a href="old.htm"E<gt>old pageE<lt>/aE<gt> --E<gt>> would be
stripped completely).
=back
=item 2
Anything that appears within so-called I<strip tags> is stripped as
well. By default, these tags are C, Cbar' ), 'bar' );
# NOTE(review): this is the tail of the test script; its beginning is not
# visible here. Several string literals below look as if markup or entity
# text was lost from them (input and expected output are often identical)
# -- confirm against the original test.pl before relying on them.
$hs->eof;
ok( $hs->parse( '<# just data #>bar' ), 'bar' );
$hs->eof;
# Disabled test case, left commented out:
#ok( $hs->parse( '>>>>>>>>>>> ]]>bar' ), 'bar' );
#$hs->eof;
ok( $hs->parse( 'bar' ), 'bar' );
$hs->eof;
# Skip reason for the entity tests: empty string (do not skip) when
# HTML::Entities can be loaded, otherwise an explanatory message.
my $html_entities_p = eval 'require HTML::Entities' ? '' : 'HTML::Entities not available';
skip( $html_entities_p, $hs->parse( '<foo>' ), '' );
$hs->eof;
skip( $html_entities_p, $hs->parse( '<foo>' ), '' );
$hs->eof;
# Switch entity decoding off and run the same two inputs again with a
# different expected result.
$hs->set_decode_entities(0);
skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' );
$hs->eof;
skip( $html_entities_p, $hs->parse( '<foo>' ), '<foo>' );
$hs->eof;
# Second stripper whose strip tags are given as an array reference.
my $hs2 = new HTML::Strip;
$hs2->set_striptags( [ 'foo' ] );
ok( $hs2->parse( 'bar' ), 'foo bar' );
$hs2->eof;
ok( $hs2->parse( 'foobar' ), 'bar' );
$hs2->eof;
# Back to the first stripper, which was not given the 'foo' strip tag above.
ok( $hs->parse( 'bar' ), 'bar' );
$hs->eof;
# Strip tags may also be passed to set_striptags() as a plain list.
my @striptags = qw(baz quux);
$hs->set_striptags( @striptags );
ok( $hs->parse( 'fumblebarfoo' ), 'bar' );
$hs->eof;
ok( $hs->parse( 'fumblefoobar' ), 'bar' );
$hs->eof;
ok( $hs->parse( ' baz ' ), ' baz ' );
$hs->eof;