HTML-Clean-1.4/0000755000175000017500000000000013536677554012167 5ustar pavelpavelHTML-Clean-1.4/lib/0000755000175000017500000000000013535413447012721 5ustar pavelpavelHTML-Clean-1.4/lib/HTML/0000755000175000017500000000000013535413447013465 5ustar pavelpavelHTML-Clean-1.4/lib/HTML/Clean.pm0000644000175000017500000003634313536677411015062 0ustar pavelpavelpackage HTML::Clean; use Carp; use IO::File; use Fcntl; use strict; require 5.004; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); require Exporter; require AutoLoader; # Items to export to callers namespace @EXPORT = qw(); $VERSION = '1.4'; =pod =head1 NAME HTML::Clean - Cleans up HTML code for web browsers, not humans =head1 SYNOPSIS use HTML::Clean; $h = HTML::Clean->new($filename); # or.. $h = HTML::Clean->new($htmlcode); $h->compat(); $h->strip(); $data = $h->data(); print $$data; =head1 DESCRIPTION The HTML::Clean module encapsulates a number of common techniques for minimizing the size of HTML files. You can typically save between 10% and 50% of the size of an HTML file using these methods. It provides the following features: =over 8 =item Remove unneeded whitespace (beginning of line, etc) =item Remove unneeded META elements. =item Remove HTML comments (except for styles, javascript and SSI) =item Replace tags with equivalent shorter tags (<strong> --> <b>) =item etc. =back The entire process is configurable, so you can pick and choose what you want to clean. =cut =head1 THE HTML::Clean CLASS =head2 $h = HTML::Clean->new($dataorfile, [$level]); This creates a new HTML::Clean object. It is a prerequisite for all other functions in this module. The $dataorfile parameter supplies the input HTML, either a filename, or a reference to a scalar value holding the HTML, for example: $h = HTML::Clean->new("/htdocs/index.html"); $html = "Hello!"; $h = HTML::Clean->new(\$html); An optional 'level' parameter controls the level of optimization performed. Levels range from 1 to 9. Level 1 includes only simple fast optimizations.
Level 9 includes all optimizations.

=cut

# Constructor.
#
#   $this  - class name, or an existing object whose class is reused
#   $data  - optional input: a file name, a scalar reference holding the
#            HTML, or an HTML string that contains at least one newline
#   $level - optional optimization level; any false value selects 9
#
# Returns the new object, or undef when initialize() rejects $data.
sub new {
  my $this  = shift;
  my $class = ref($this) || $this;
  my $self  = bless {}, $class;

  my $data  = shift;
  my $level = shift;

  # Kept as an explicit undef (not a bare return) so existing callers see
  # exactly the same value in every context.
  return undef unless $self->initialize($data);

  # set the default level
  $self->level($level || 9);
  return $self;
}

#
# Set up the data in the self hash..
#

=head2 $h->initialize($dataorfile)

This function allows you to reinitialize the HTML data used by the
current object. This is useful if you are processing many files.

$dataorfile has the same usage as the new method.

Return 0 for an error, 1 for success.

=cut

# Load the HTML to work on into $self->{'DATA'} as a scalar reference.
#
# Accepted forms of $data:
#   false value      - nothing to load; DATA stays undef, returns 1
#   reference        - stored as-is (the caller's scalar is edited in place)
#   string with "\n" - treated as literal HTML; a reference to a private
#                      copy is stored
#   anything else    - treated as the name of a plain file, which is read
#                      in full
#
# Returns 1 on success, 0 when the named file does not exist or cannot be
# opened.
sub initialize {
  my($self, $data) = @_;
  $self->{'DATA'} = undef;

  # Not defined? Just return true.
  return(1) if (!$data);

  # Check if it's a ref
  if (ref($data)) {
    $self->{'DATA'} = $data;
    return(1);
  }

  # Newline char, really an error, but just go with it..
  # Must return here: falling through to the -f test below would report
  # failure for every inline HTML string.
  if ($data =~ /\n/) {
    $self->{'DATA'} = \$data;
    return(1);
  }

  # No newline? Must be a filename
  return(0) unless (-f $data);          # file not found?

  # A numeric mode makes IO::File use sysopen, so characters in the file
  # name are never interpreted as an open() mode.
  my $in = IO::File->new($data, O_RDONLY) or return(0);
  my $storage = do { local $/; <$in> };  # slurp the whole file
  $in->close;

  # Guarantee DATA refers to a defined string even if nothing was read.
  $storage = '' unless defined($storage);

  $self->{'DATA'} = \$storage;
  return(1);
}

=head2 $h->level([$level])

Get/set the optimization level. $level is a number from 1 to 9.

=cut

# Combined getter/setter for the optimization level.  A value outside
# 1..9 (or no value at all) leaves the stored level untouched; the level
# currently stored is always returned.
sub level {
  my($self, $level) = @_;

  if (defined($level) && ($level > 0) && ($level < 10)) {
    $self->{'LEVEL'} = $level;
  }
  return($self->{'LEVEL'});
}

=head2 $myref = $h->data()

Returns the current HTML data as a scalar reference.

=cut

# Accessor: returns whatever is stored under the DATA key.  initialize()
# sets that to a scalar reference, or leaves it undef when no input was
# given.
sub data {
  my($self) = @_;
  return $self->{'DATA'};
}

# Junk HTML comments (INTERNAL)
#
# Takes the text of one comment and returns it unchanged when one of the
# keep-patterns below matches, otherwise returns the empty string.
# The argument is copied into the global $_ (not localized) so that the
# bare m,, matches test it.
sub _commentcheck($) {
  my($comment) = @_;
  $_ = $comment;

  # Server side include
  # NOTE(review): as written this pattern only matches an empty string (or
  # a lone newline), which does not fit the label above -- the pattern
  # text looks truncated in this copy; confirm against the upstream file.
  return($comment) if (m,^$,si);
  # Keep comments that mention navigator.appname / navigator.appversion
  # (case-insensitive).
  return($comment) if (m,navigator\.app(name|version),si);

  # Stylesheet
  # Looks for "name:name" followed by optional whitespace and a {...}
  # group.  Note the range A-z also covers the six ASCII characters
  # between 'Z' and 'a' ( [ \ ] ^ _ ` ).
  return($comment) if (m,[A-z0-9]+\:[A-z0-9]+\s*\{.*\},si);
  return('');
}

# Remove javascript comments (INTERNAL)
#
# Takes a chunk of script text and returns it with // comments removed.
sub _jscomments {
  my($js) = @_;
  # A line holding only optional whitespace and a // comment is dropped.
  $js =~ s,\n\s*//.*?\n,\n,sig;
  # Whitespace followed by // up to the end of the line becomes a bare
  # newline.  NOTE(review): this matches any "//" preceded by whitespace,
  # including one inside a string literal.
  $js =~ s,\s+//.*?\n,\n,sig;

  # insure javascript is hidden
  # NOTE(review): the statement below is not balanced in this copy (the
  # condition is never closed and the block has no opening brace); part of
  # the pattern and the block body appear to be missing -- restore from
  # the upstream file before relying on this sub.
  if ($js =~ m,\n,si; }
  return($js);
}

# Clean up other javascript stuff..
#
# Takes a chunk of script text and returns it with line-level whitespace
# squeezed out.
sub _javascript {
  my($js) = @_;

  # remove excess whitespace at the beginning and end of lines
  # (any run of newlines with surrounding whitespace becomes one newline)
  $js =~ s,\s*\n+\s*,\n,sig;

  # braces/semicolon at end of line, join next line
  $js =~ s,([;{}])\n,$1,sig;

  # What else is safe to do?
  return($js);
}

# replace #000000 -> black, etc..
# Does the browser render faster with RGB? You would think so..
#
# Takes a string and returns it with the listed six-digit hex colors
# replaced by their names.  None of the substitutions uses /g, so each
# one rewrites at most the first remaining occurrence.  The /i flag is
# present only on some patterns; the ones without it contain digits only.
sub _defcolorcheck ($) {
  my($c) = @_;
  $c =~ s/\#000000/black/;
  $c =~ s/\#c0c0c0/silver/i;
  $c =~ s/\#808080/gray/;
  $c =~ s/\#ffffff/white/i;
  $c =~ s/\#800000/maroon/;
  $c =~ s/\#ff0000/red/i;
  $c =~ s/\#800080/purple/;
  $c =~ s/\#ff00ff/fuchsia/i;
  # Same substitution repeated; it only has an effect when the string
  # holds a second #ff00ff.
  $c =~ s/\#ff00ff/fuchsia/i;
  $c =~ s/\#008000/green/;
  $c =~ s/\#00ff00/lime/i;
  $c =~ s/\#808000/olive/;
  $c =~ s/\#ffff00/yellow/i;
  $c =~ s/\#000080/navy/;
  $c =~ s/\#0000ff/blue/i;
  $c =~ s/\#008080/teal/i;
  $c =~ s/\#00ffff/aqua/i;
  return($c);
}

# For replacing entities with numerics
#
# Entity name => character code, for codes 192..255.  Codes 215 and 247
# have no entry.  Keys are case-sensitive ('Agrave' and 'agrave' map to
# different codes).
use vars qw/ %_ENTITIES/;
%_ENTITIES = (
  'Agrave' => 192, 'Aacute' => 193, 'Acirc' => 194, 'Atilde' => 195,
  'Auml' => 196, 'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199,
  'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202, 'Euml' => 203,
  'Igrave' => 204, 'Iacute' => 205, 'Icirc' => 206, 'Iuml' => 207,
  'ETH' => 208, 'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211,
  'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214, 'Oslash' => 216,
  'Ugrave' => 217, 'Uacute' => 218, 'Ucirc' => 219, 'Uuml' => 220,
  'Yacute' => 221, 'THORN' => 222, 'szlig' => 223, 'agrave' => 224,
  'aacute' => 225, 'acirc' => 226, 'atilde' => 227, 'auml' => 228,
  'aring' => 229, 'aelig' => 230, 'ccedil' => 231, 'egrave' => 232,
  'eacute' => 233, 'ecirc' => 234, 'euml' => 235, 'igrave' => 236,
  'iacute' => 237, 'icirc' => 238, 'iuml' => 239, 'eth' => 240,
  'ntilde' => 241, 'ograve' => 242, 'oacute' => 243, 'ocirc' => 244,
  'otilde' => 245, 'ouml' => 246, 'oslash' => 248, 'ugrave' => 249,
  'uacute' => 250, 'ucirc' => 251, 'uuml' => 252, 'yacute' => 253,
  'thorn' => 254, 'yuml' => 255 );

=head2 strip(\%options);

Removes excess space from HTML

You can control the optimizations used by specifying them in the
%options hash reference.

The following options are recognized:

=over 8

=item boolean values (0 or 1 values)

  whitespace Remove excess whitespace
  shortertags -> , etc..
  blink No blink tags.
  contenttype Remove default contenttype.
  comments Remove excess comments.
  entities " -> ", etc.
  dequote remove quotes from tag parameters where possible.
  defcolor recode colors in shorter form. (#ffffff -> white, etc.)
  javascript remove excess spaces and newlines in javascript code.
  htmldefaults remove default values for some html tags
  lowercasetags translate all HTML tags to lowercase

=item parameterized values

  meta Takes a space separated list of meta tags to remove,
  default "GENERATOR FORMATTER"

  emptytags Takes a space separated list of tags to remove when there is no
  content between the start and end tag, like this: .
The default is 'b i font center' =back =cut use vars qw/ $do_whitespace $do_shortertags $do_meta $do_blink $do_contenttype $do_comments $do_entities $do_dequote $do_defcolor $do_emptytags $do_javascript $do_htmldefaults $do_lowercasetags $do_defbaseurl /; $do_whitespace = 1; $do_shortertags = 1; $do_meta = "generator formatter"; $do_blink = 1; $do_contenttype = 1; $do_comments = 1; $do_entities = 1; $do_dequote = 1; $do_defcolor = 1; $do_emptytags = 'b i font center'; $do_javascript = 1; $do_htmldefaults = 1; $do_lowercasetags = 1; $do_defbaseurl = ''; sub strip { my($self, $options) = @_; my $h = $self->{'DATA'}; my $level = $self->{'LEVEL'}; # Select a set of options based on $level, and then modify based on # user supplied options. _level_defaults($level); if(defined($options)) { no strict 'refs'; for (keys(%$options)) { ${"do_" . lc($_)} = $options->{$_} if defined ${"do_" . lc($_)}; } } if ($do_shortertags) { $$h =~ s,,,sgi; $$h =~ s,,,sgi; $$h =~ s,,,sgi; $$h =~ s,,,sgi; } if ($do_whitespace) { $$h =~ s,[\r\n]+,\n,sg; # Carriage/LF -> LF $$h =~ s,\s+\n,\n,sg; # empty line $$h =~ s,\n\s+<,\n<,sg; # space before tag $$h =~ s,\n\s+,\n ,sg; # other spaces $$h =~ s,>\n\s*<,><,sg; # LF/spaces between tags.. # Remove excess spaces within tags.. note, we could parse out the elements # and rewrite for excess spaces between elements. perhaps next version. # removed due to problems with > and < in tag elements.. #$$h =~ s,\s+>,>,sg; #$$h =~ s,<\s+,<,sg; # do this again later.. } if ($do_entities) { $$h =~ s,",\",sg; # Simplify long entity names if using default charset... $$h =~ m,charset=([^\"]+)\",; if (!defined($1) || ($1 eq 'iso-8859-1')) { $$h =~ s,&([A-z]+);,($_ENTITIES{$1}) ? chr($_ENTITIES{$1}) : $&,sige; } } if ($do_meta) { foreach my $m (split(/\s+/, $do_meta)) { $$h =~ s,]*?>,,sig; } } if ($do_contenttype) { # Don't need this, since it is the default for most web servers # Also gets rid of 'blinking pages' in older versions of netscape. 
$$h =~ s,,,sig; } if ($do_defcolor) { $$h =~ s,(<[^<]+?color=['"]?\#[0-9A-Fa-f]+["']?),_defcolorcheck($&),sige; } if ($do_comments) { # don't strip server side includes.. # try not to get javascript, or styles... $$h =~ s,,_commentcheck($&),sige; # Remove javascript comments $$h =~ s,]*(java|ecma)script[^>]*>.*?,_jscomments($&),sige; } if ($do_javascript) { # $$h =~ s,]*(java|ecma)script[^>]*>.*?,_javascript($&),sige; } if ($do_blink) { $$h =~ s,,,sgi; $$h =~ s,,,sgi; } if ($do_dequote) { while ($$h =~ s,<([A-z]+ [A-z]+=)(['"])([A-z0-9]+)\2(\s*?[^>]*?>),<$1$3$4,sig) { # Remove alphanumeric quotes. Note, breaks DTD.. ; } } # remove , etc.. if ($do_emptytags) { my $pat = $do_emptytags; $pat =~ s/\s+/|/g; while ($$h =~ s,<($pat)(\s+[^>]*?)?>\s*,,siog){} } if ($do_htmldefaults) { # Tables # seems to break things.. #$$h =~ s,(]*)\s+border=0([^>]*>),$1$2,sig; $$h =~ s,(]*)\s+rowspan=1([^>]*>),$1$2,sig; $$h =~ s,(]*)\s+colspan=1([^>]*>),$1$2,sig; # # P, TABLE tags are default left aligned.. # lynx is inconsistent in this manner though.. $$h =~ s,<(P|table|td)( [^>]*)align=\"?left\"?([^>]*)>,<$1$2$3>,sig; # OL start=1 $$h =~ s,(
    ]*)start=\"?1\"?([^>]*>),$1$2,sig; # FORM $$h =~ s,(
    ]*)method=\"?get\"?([^>]*>),$1$2,sig; $$h =~ s,(]*)enctype=\"application/x-www-form-urlencoded\"([^>]*>),$1$2,sig; # hr $$h =~ s,(
    ]*)align=\"?center\"?([^>]*>),$1$2,sig; $$h =~ s,(
    ]*)width=\"?100%\"?([^>]*>),$1$2,sig; # URLs $$h =~ s,(href|src)(=\"?http://[^/:]+):80/,$1$2/,sig; } if ($do_whitespace) { # remove space within tags
    becomes
    $$h =~ s,\s+>,>,sg; $$h =~ s,<\s+,<,sg; # join lines with a space at the beginning/end of the line # and a line that begins with a tag $$h =~ s,>\n ,> ,sig; $$h =~ s, \n<, <,sig; } if ($do_lowercasetags) { # translate tags to lowercase to (hopefully) improve compressability.. # simple tags

    ,

    etc. $$h =~ s,(<[/]?[a-zA-Z][a-zA-Z0-9_-]*\s*>),\L$1\E,sg; # the rest.. $$h =~ s/(<[a-zA-Z][a-zA-Z0-9_-]*)(\s+.*?>)/_lowercasetag($1,$2)/sge; } } sub _lowercasetag { my($prefix, $body) = @_; $prefix =~ s/^(.+)$/\L$1\E/; $body =~ s/(\s+[a-zA-Z][a-zA-Z0-9_-]*)(\s*=\s*[^"\s]+|\s*=\s*"[^"]*"|>|\s)/\L$1\E$2/sg; return $prefix.$body; } # set options based on the level provided.. INTERNAL sub _level_defaults($) { my ($level) = @_; $do_whitespace = 1; # always do this... # level 2 $do_shortertags = ($level > 1) ? 1 : 0; $do_meta = ($level > 1) ? "generator formatter" : ""; $do_contenttype = ($level > 1) ? 1 : 0; # level 3 $do_entities = ($level > 2) ? 1 : 0; $do_blink = ($level > 2) ? 1 : 0; # level 4 $do_comments = ($level > 3) ? 1 : 0; $do_dequote = ($level > 3) ? 1 : 0; $do_defcolor = ($level > 3) ? 1 : 0; $do_emptytags = ($level > 3) ? 'b i font center' : 0; $do_javascript = ($level > 3) ? 1 : 0; $do_htmldefaults = ($level > 3) ? 1 : 0; $do_lowercasetags = ($level > 3) ? 1 : 0; # higher levels reserved for more intensive optimizations. } ###################################################################### =head2 compat() This function improves the cross-platform compatibility of your HTML. Currently checks for the following problems: =over 8 =item Insuring all IMG tags have ALT elements. =item Use of Arial, Futura, or Verdana as a font face. =item Positioning the tag immediately after the <head> tag. =back =cut sub compat { my($self, $level, $options) = @_; my $h = $self->{'DATA'}; $$h =~ s/face="arial"/face="arial,helvetica,sansserif"/sgi; $$h =~ s/face="(verdana|futura)"/face="$1,arial,helvetica,sansserif"/sgi; # insure that <title> tag is directly after the <head> tag # Some search engines only search the first N chars. (PLweb for instance..) if ($$h =~ s,<title>(.*),,si) { my $title = $1; $$h =~ s,,$title,si; } # Look for IMG without ALT tags. 
$$h =~ s/(]+>)/_imgalt($1)/segi; } sub _imgalt { my($tag) = @_; $tag =~ s/>/ alt="">/ if ($tag !~ /alt=/i); return($tag); } =head2 defrontpage(); This function converts pages created with Microsoft Frontpage to something a Unix server will understand a bit better. This function currently does the following: =over 8 =item Converts Frontpage 'hit counters' into a unix specific format. =item Removes some frontpage specific html comments =back =cut sub defrontpage { my($self) = @_; my $h = $self->{'DATA'}; while ($$h =~ s,,,xis) { print "Converted a Hitcounter.. $1, $2, $3\n"; } $$h =~ s,,,sgx; } =head1 SEE ALSO =head2 Modules FrontPage::Web, FrontPage::File =head2 Web Sites =over 6 =item Distribution Site - http://people.itu.int/~lindner/ =back =head1 AUTHORS and CO-AUTHORS Paul Lindner for the International Telecommunication Union (ITU) Pavel Kuptsov =head1 COPYRIGHT The HTML::Strip module is Copyright (c) 1998,99 by the ITU, Geneva Switzerland. All rights reserved. You may distribute under the terms of either the GNU General Public License or the Artistic License, as specified in the Perl README file. =cut 1; __END__ HTML-Clean-1.4/MANIFEST0000644000175000017500000000045413535415121013276 0ustar pavelpavelREADME TODO Changes MANIFEST Makefile.PL bin/htmlclean lib/HTML/Clean.pm t/basic.t t/files.t t/testpages/altavista.html t/testpages/ibm.html t/testpages/yahoo.html t/testpages/microsoft.html t/testpages/infoseek.html t/testpages/itu.html t/testpages/cnn.html t/testpages/hairy.html META.yml META.jsonHTML-Clean-1.4/Changes0000644000175000017500000000545013536677501013456 0ustar pavelpavelRevision history for Perl module HTML::Clean -------------------------------------------- 1.3 Tue Sep 13 14:36 MSK 2019 - Fix meta description 1.3 Tue Sep 9 14:10 MSK 2019 - Fix pod and typo 1.1 Tue Sep 9 12:30 MSK 2019 - Fix pod. Add github repository 0.9 Tue Aug 20 09:24 PDT 2019 - Fix warnings - use IO::File instead deprecated module IO. 
0.8 Tue Sep 5 08:02:05 PDT 2000 - Fix dequote regex to be stricter. Make sure quote/double quote match, only use alpha as the parameter name. Reported by Renzo Tomà. - Removed regexes for removing space within tags and border=0 case. This caused text in attributes to fail, and table borders get messed up on IE. Reported by Gregory Stark. - Fix for improper parsing of emptytags from Jens Quade. - Fix from Tobias Weber to clean javascript 0.7 Sat Apr 24 03:04:45 MET DST 1999 - Add missing 'hairy.html' file to fix broken test cases. - Add alt="" items to images without alt tags. - Apply same compatibility option to Verdana and Futura font. 0.6 Fri Apr 23 14:03:30 MET DST 1999 - Now removes whitespace from within tags, like this: becomes
    - Do a second pass at whitespace removal - Remove some default elements from specific tags border=0 from table, method=get from forms, etc. - Remove default port 80 from URLs - New option, lowercasetags to make all tags lowercase. Quantitative testing shows that this improves compressibility, it should make pages download faster over modems with compression turned on. - Expanded tests. Use lynx to quickly see if the changed HTML 'looks' correct. 0.5 Mon Feb 22 13:12:32 MET 1999 - Now removes empty tag sets. For instance is now eliminated. (From Philippe Verdret) - Cleans up excess space in inline javascript functions. Does a better job of removing javascript comments. (idea from Phillippe Verdret) - Added a larger list of default color names to replace. 0.4 Mon Jan 18 15:10:35 MET 1999 - Bug Fix: use upper case filehandle names (from numerous people..) - Enabled level and options (patch from Mike Heins) strip() function changed. No longer accepts level param. htmlclean shell script takes -1 .. -9 as command line options. - Clean up HTML colors, replace with shorter text names. For example, bgcolor="#ffffff" -> bgcolor=white - When using the iso-8859-1 charset remap character entities like É to the eight bit equivalent. - More documentation 0.3 Mon Jan 11 14:05:15 MET 1999 - Fixed serious htmlclean script bug. - Added a little more documentation. 0.2 Tue Dec 29 10:13:16 MET 1998 - expanded number of strip options - First CPAN release.. 
0.1 Fri Apr 17 13:42:11 1998 - original version HTML-Clean-1.4/META.yml0000664000175000017500000000124513536677423013437 0ustar pavelpavel--- abstract: HTML::Clean - Cleans up HTML code for web browsers, not humans author: - Paul Lindner x_contributors: - Paul Lindner - Pavel Kuptsov dynamic_config: 0 generated_by: 'Module::Build version 0.4216, CPAN::Meta::Converter version 2.150001' keywords: - html - html clean - html strip license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 name: HTML-Clean provides: HTML::Clean: file: lib/HTML/Clean.pm version: 1.4 resources: license: http://dev.perl.org/licenses/ repository: https://github.com/poizon/HTML-Clean.git version: 1.4HTML-Clean-1.4/bin/0000755000175000017500000000000013535413430012713 5ustar pavelpavelHTML-Clean-1.4/bin/htmlclean0000755000175000017500000000633213535413447014624 0ustar pavelpavel#!/usr/local/bin/perl # htmlclean # Copyright (C) 1998 by ITU use strict; sub usage { print STDERR < =head1 DESCRIPTION This program provides a command-line interface to the HTML::Clean module, which can help you to provide more compatible, smaller HTML files at the expense of reducing the human readability of the HTML code. In some cases you may be able to reduce the size of your HTML by up to 50%! The HTML::Clean library provides a number of features that improve your HTML for browsing and serving: B passes each file given on the command line to the library and writes out the new HTML according to the specified options. The default is to create a backup file and replace the file with cleaned HTML. =over 6 =item Removing whitespace, Comments and other useless or redundant constructs =item Insuring that font tags work across multiple operating systems =back For full details see the documentations for L itself. =head1 OPTIONS =over 4 =item C<-V> Print the version of the program. =item C<-v> Verbose mode. Print out the original and final file sizes, plus the compression percent. 
For example: 5261 4065 22% /tmp/development-es.html 5258 4061 22% /tmp/development-fr.html 4651 3683 20% /tmp/development.html =back =head1 SEE ALSO For the library, see L =head1 AUTHOR C is written by Paul Lindner, =head1 COPYRIGHT Copyright (c> 1998 by ITU under the same terms as Perl. =cut usage() if ($#ARGV == -1); usage() if ($#ARGV >=0 && $ARGV[0] eq '-?'); use HTML::Clean; use Getopt::Long; my (%opts); $Getopt::Long::getopt_compat = 1; # avoid parsing +'s as options (doesn't work!) &Getopt::Long::config(qw(no_ignore_case no_getopt_compat)); &GetOptions(\%opts, qw(v V t=s 1 2 3 4 5 6 7 8 9)); &print_version if ($opts{'V'}); &usage if ($#ARGV == -1); # we MUST have at least one file my($verbose) = $opts{'v'}; my $level = 9; foreach my $i (1, 2, 3, 4, 5, 6, 7, 8, 9) { $level = $i if ($opts{$i}); } &main($level, \@ARGV); exit 0; sub main { my($level, $files) = @_; my $h = new HTML::Clean(); # Just a empty holder.. print_error('initializing...') if (!$h); $h->level($level); foreach my $f (@$files) { my $result = $h->initialize($f); print_error($f) if ($result == 0); my $d = $h->data(); my $origlen = length($$d); # add options to control these... $h->compat(); $h->strip(); my $newlen = length($$d); my $pct = 0; if ($origlen > 0) { $pct = (100 * ($origlen - $newlen)) / $origlen; } printf "%6d %6d %2d%% %s\n", $origlen, $newlen, $pct, $f if ($verbose); # Okay, now move the files around.. 
rename($f, "$f.bak") || die "Cannot rename '$f': $!\n"; open(output, ">$f") || die "Cannot overwrite '$f': $!\n"; print output $$d; close(output); } } sub print_error { my($msg) = @_; print STDERR <" ], "x_contributors" : [ "Paul Lindner ", "Pavel Kuptsov " ], "dynamic_config" : 0, "generated_by" : "Module::Build version 0.4216", "keywords" : [ "html", "html clean", "html strip" ], "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : "2" }, "name" : "HTML-Clean", "provides" : { "HTML::Clean" : { "file" : "lib/HTML/Clean.pm", "version" : "1.4" } }, "release_status" : "stable", "resources" : { "license" : [ "http://dev.perl.org/licenses/" ], "repository" : { "type" : "git", "url" : "https://github.com/poizon/HTML-Clean.git", "web" : "https://github.com/poizon/HTML-Clean" } }, "version" : "1.4" }HTML-Clean-1.4/t/0000755000175000017500000000000013535413430012406 5ustar pavelpavelHTML-Clean-1.4/t/basic.t0000644000175000017500000000267413535413447013675 0ustar pavelpavel# Before `make install' is performed this script should be runnable with # `make test'. After `make install' it should work as `perl test.pl' ######################### We start with some black magic to print on failure. # Change 1..1 below to 1..last_test_to_print . BEGIN { $| = 1; print "1..6\n"; } END {print "not ok 1\n" unless $loaded;} use HTML::Clean; $loaded = 1; ; # Test constructors my $h = new HTML::Clean(); if (!defined($h)) { print "not ok 1\n"; } else { print "ok 1\n"; } my $data = "testing, 1 2 3\n"; $h = new HTML::Clean(\$data); if (!defined($h)) { print "not ok 2\n"; } else { print "ok 2\n"; } # test level operator $h->level(2); if ($h->level() != 2) { print "not ok 3\n"; print "Level is " . $h->level() . "\n"; } else { print "ok 3\n"; } $h->level(9); # Test stripping.. 
# first val is text to manipulate # second val is good result my @data = ( "some bold textsome italic text", "some bold textsome italic text", "À©ñ", "À©ñ", "Some text with empty tags ", "Some text with empty tags ", ); my $test = 3; while (1) { $test++; my $orig = shift(@data) || last; my $good = shift(@data); $h->initialize(\$orig); $h->compat(); $h->strip(); if ($orig eq $good) { print "ok $test\n"; } else { print "not ok $test\n"; print "got:\n$orig\n\nexpected:\n$good\n\n"; } } exit; HTML-Clean-1.4/t/files.t0000644000175000017500000000247013535413447013710 0ustar pavelpavel#!/usr/local/bin/perl ######################### We start with some black magic to print on failure. # Change 1..1 below to 1..last_test_to_print . BEGIN { $| = 1; print "1..9\n"; } END {print "not ok 1\n" unless $loaded;} use HTML::Clean; $loaded = 1; $test = 1; print "ok 1\n"; foreach $page ('hairy', 'altavista', 'microsoft', 'ibm', 'yahoo', 'infoseek', 'itu', 'cnn') { $test ++; my $h = new HTML::Clean("t/testpages/$page.html"); print "not ok $test\n" if (! defined($h)); # compat changes the 'look' of the page for lynx.. # $h->compat(); $h->strip(); if (open(OUTFILE, ">t/testpages/t$page.html")) { print OUTFILE ${$h->data()}; close(OUTFILE); } else { print "not ok $test\n"; } # if we can open lynx test that.. if (open(P, "lynx -nolist -dump t/testpages/$page.html |")) { my $cvtpage = ''; my $origpage = ''; while (

    ) { $origpage .= $_; } close(P); if (open(P, "lynx -nolist -dump t/testpages/t$page.html |")) { while (

    ) { $cvtpage .= $_; } close(P); if (abs(length($origpage) - length($cvtpage)) > 30) { print STDERR "\nWarning, lynx detects different page sizes for $page " . length($origpage) . ", " . length($cvtpage) . "\n"; } } } print "ok $test\n"; } HTML-Clean-1.4/t/testpages/0000755000175000017500000000000013535413430014405 5ustar pavelpavelHTML-Clean-1.4/t/testpages/ibm.html0000644000175000017500000002322613535413447016057 0ustar pavelpavel IBM Corporation

    [ IBM.com for Tue, 22 Dec 1998 14:10:37  ]
    HomeNewsProductsSupportSolutionsPartnersIBM
    [ Search ]
    Shop
    Contact IBM
    Download
    Find a job
    ---
    www.rei.com is an IBM e-business
    [ I hope that one's for me: IBM products make great holiday gifts ]
    Discover IBM / Today's News
    Microelectronics Microelectronics
    The New IBM Microelectronics Gallery: Take a virtual tour
    think leadership think leadership
    The Trendspotter's Guide
    Other Voices Other Voices
    Tech Nation: Simpler than what?
    IBM announces OEM storage shipment milestone
    IBM and United Way launch national early learning initiative
    Golf fans: Connect to the 1998 PGA Tour

    Text only version
    HTML-Clean-1.4/t/testpages/cnn.html0000644000175000017500000016745513535413447016103 0ustar pavelpavel CNN Interactive
    SHOP@CNN
    Browse by
    ad info
    Click here to try four free issues of Sports Illustrated.
    Click here to try four free issues of Sports Illustrated.

    CNNin
    * MAIN PAGE
     WORLD
     U.S.
     LOCAL
     POLITICS
     WEATHER
     BUSINESS
     SPORTS
     SCI-TECH
     ENTERTAINMENT
     BOOKS
     TRAVEL
     FOOD
     HEALTH
     STYLE
     IN-DEPTH

     custom news
     Headline News brief
     daily almanac
     CNN networks
     on-air transcripts
     news quiz
     jobs
     ad info

      CNN WEB SITES:
    CNN Websites
    Free Email Open a pop-up controller to bring you CNN news anytime Personalize your CNN Home Page Listen to the CNN Networks LIVE on your desktop View video news and CNN Programs on demand
    Video, Audio, Customize, Remote, Email
       For a QUICK read of the Headline News, click here.
     Search CNN  go 
    January 29, 1999 -- Updated 4:55 a.m. EST, 0955 GMT, @455 Swatch internet time
    Impeachment
    Republicans force through their trial road map

    In a party-line 54-44 vote, Republicans Thursday pushed through their proposal for proceeding with the deposition phase of the impeachment trial of President Bill Clinton. Under the plan, witnesses will be deposed starting Monday and February 12 has been set as the trial's target end date.

    The Senate's OK of the Republican plan was the last of three rapid-fire votes Thursday evening after negotiations failed to yield a bipartisan agreement. The Democrats' proposal was first rejected, also by a 54-44 margin. The senators next voted down a Democratic attempt to move immediately to a final vote on the articles of impeachment.

    FULL STORY video icon


     
    An artist on the cutting edge
    saw artist

    In other news:

    Kosovo crisis talks launched as military threat looms
    Magnitude of Colombian quake 'exceeds all calculations' video icon
    Violence sweeps East Timor as independence drive gains momentum
    U.S. fighters attack Iraqi anti-aircraft site
    Investigators on Salt Lake Olympic money trail
    Federal budget surplus tops $100 billion
    Charles, Camilla appear together for first time

     PATHFINDER SITES:
     MORE SERVICES:
     video on demand
     video archive
     audio on demand
     news email services
     free email accounts
     desktop headlines
     pointcast
     pagenet

     DISCUSSION:
     message boards
     chat
     feedback

     SITE GUIDES:
     help
     contents
     search

     FASTER ACCESS:
     europe
     japan

     WEB SERVICES:


    barnesandnoble.com
     


    SHOP@CNN
    Browse by

    SPECIAL DEAL:
    HOW TO GET WHAT YOU WANT...
    by John Gray
    (hardcover)

    Our Price: $17.46
    (30% off!)

    Barnes and Noble
     
      QUICK VOTE:     ON CNN:

    Should people avoid using words that sound like slurs but aren't -- like "niggardly"?

    Yes -- it's what people hear that counts
    No -- it's what people mean that counts
    View Results
    tv What's on CNN?
    What's on CNN Int'l?
    Domestic Schedule

    WORLD:   U.S.:

      LINK OF THE DAY:     CUSTOM NEWS:
    Link of the day

    FIND LOW FARES FAST WITH:

    Air deals at a glance
    Speedy roundtrip flight search
    Low-fare desktop ticker

    The online booking source for busy people


    Personalize CNN.com
    Visit YOUR CustomNews
      sample stories:



    SPORTS:   BUSINESS:

    POLITICS:   WEATHER:

     ROMANTIC CAPERS:  MOVIES:
    Cleopatra
    Ideas for planning a surprise Valentine's escape
      horner
    Oscar has already recognized him, as have the Golden Globes. Is a Grammy next for this "Titanic" composer?

     SCIENCE AND NATURE:    COMPUTING:

    SPACE:   IN-DEPTH:

    HEALTH:   ENTERTAINMENT:

      PICTURE OF THE DAY:   COMPUTING:
    Staying close to mom
    Staying close to mom
    Presented by Oracle
      chaos
    Chaos theory can bring more privacy to Internet communications

    STYLE:   TRAVEL:

    BOOKS:   FOOD:

     ALLPOLITICS TOONS:  RECIPE:
    Allpolitics toon
    Political 'toonist Mike Luckovich has a question for Monica Lewinsky
      Apple Crisp
    Break out the vanilla ice cream for this one:
    Apple crisp

    COUNTDOWN TO 2000:   COLD WAR:
     

    VIEWS:   MEDIA SHOWCASE:

     FRINGE:    DISCUSSION:

     AUDIO:    VIDEO ON-DEMAND:

    Listen to CNN live...

    Plus sports, technology, health and entertainment news on-demand.

    All in Audioselect

     
    Videoselect
    Judge expected to decide on charges against Anwar
    Larry King Live
    Crossfire


      
    Back to the top
    © 1999 Cable News Network. All Rights Reserved.
    Terms under which this service is provided to you.
    Read our privacy guidelines.

    HTML-Clean-1.4/t/testpages/hairy.html0000644000175000017500000000262413535413447016423 0ustar pavelpavel This is a test page that touches most options of HTML::Clean This is a bold line with some empty tags .

    How about some excess space within tags?

    This is a table with default values. Remove the border=0 from the page

    Tests for align=left in various tags

    1. item 1
    2. item 2

    Link to the ITU Home page with :80 in the URL
    An image without an alt tag.

    Voilá test me
    HTML-Clean-1.4/t/testpages/yahoo.html0000644000175000017500000002416213535413447016427 0ustar pavelpavelYahoo!
    Yahoo
    Yahoo! Auctions
    furby, beanies, zelda
    OfficeMax.com: Up to $300 IBM Aptiva rebatesPark Your
    Domain Free
    options
    Shopping - Yellow Pages - People Search - Maps - Travel Agent - Classifieds - Personals - Games - Chat
    Email - Calendar - Pager - My Yahoo! - Today's News - Sports - Weather - TV - Stock Quotes - more...
    Yahoo! Shopping : Apparel, Books, CDs, Computers, Electronics, Games, Toys, Videos, more...
    Featured Stores : Service Merchandise - Cambridge SoundWorks - FAO Schwarz
    Arts & Humanities
    Literature, Photography...

    Business & Economy
    Companies, Finance, Jobs...

    Computers & Internet
    Internet, WWW, Software, Games...

    Education
    Universities, K-12, College Entrance...

    Entertainment
    Cool Links, Movies, Humor, Music...

    Government
    Military, Politics, Law, Taxes...

    Health
    Medicine, Diseases, Drugs, Fitness...
    News & Media
    Full Coverage, Newspapers, TV...

    Recreation & Sports
    Sports, Travel, Autos, Outdoors...

    Reference
    Libraries, Dictionaries, Quotations...

    Regional
    Countries, Regions, US States...

    Science
    Biology, Astronomy, Engineering...

    Social Science
    Archaeology, Economics, Languages...

    Society & Culture
    People, Environment, Religion...
    In the News
    ·President Clinton is impeached
    ·US, UK end air strikes in Iraq
    ·Woman gives birth to octuplets
    ·NFL , NHL
    more...

    Inside Yahoo!
    ·Y! Games - crossword, hearts, euchre
    ·Holiday guide - food and drink, gift ideas
    ·Y! Clubs - create your own community
    more...
    World Yahoo!sAmericas : Canada - Spanish
    Europe : Denmark - France - Germany - Italy - Norway - Spain - Sweden - UK & Ireland
    Pacific Rim : Asia - Australia & NZ - Chinese - Japan - Korea
    Yahoo! Get LocalLA - NYC - SF Bay - Chicago - more...    
    Other GuidesAutos - Computers - Employment - Local Events - Net Events - Message Boards
    Movies - Real Estate - Small Business - Ski & Snow - Y! Internet Life - Yahooligans!
    Smart Shopping with
    How to Suggest a Site - Company Info - Privacy Policy - Contributors - Openings at Yahoo!
    HTML-Clean-1.4/t/testpages/itu.html0000644000175000017500000003137013535413447016110 0ustar pavelpavel International Telecommunication Union (ITU) Home Page
    ITU Home Page

    ITU Highlights

     ITU Menu Bar

    [Swisscom]

      
    The ITU, headquartered in Geneva, Switzerland is an international organization within which governments and the private sector coordinate global telecom networks and services.

    ITU Meetings and Conferences

    ITU Publications
    The ITU is the leading publisher of telecommunication technology, regulatory and standards information. Many publications can be purchased through our Electronic Bookshop or the ITU Publications Online subscription service.

    ITU Newsroom

    Internet Broadcasting Service

    Information Exchange Services (TIES)

    Job Vacancies

    Selected Web Sites

    United Nations System
    Official site listing of all UN organizations.

    Geneva Permanent Missions to the UN

    [Add Active Channel]

    Add ITU Now!

      

    Banner

    1998 Plenipotentiary Conference

    World TELECOM 99

    Year 2000 "Millennium Compliance"

    International Mobile Telecommunications (IMT)
    The ITU vision of global wireless access in the 21st century

    ITU Global Directory

    ISO 3166-Based Top Level Domains Survey

    For more ITU Features see our Highlights page.


    Search ITU's Site:


    Home | Search | Site Map | Help | Contact | Comments | © Copyright

    English | Français | Español

    Last Modified: 1998-12-03

    About the ITU Web Site

    HTML-Clean-1.4/t/testpages/microsoft.html0000644000175000017500000004507213535413447017320 0ustar pavelpavel Welcome to Microsoft's Homepage
    microsoft.com Home   All Products  |   Support  |   Search  |   microsoft.com Home  
    Microsoft
      Home  |   Events  |   Training  |   Downloads  |   Newsletters  |   U.S. & International  |   About Our Site  |

    Internet Explorer
    Download it free!

    Product Families
    BackOffice
    Developer Tools
    Office
    MSN
    Windows

    Business Solutions
    Industries
    Small Business

    Developers
    Software Developers
    Web Site Builders

    Education
    Academic Products
    Education Resellers
    Higher Education
    K-12 Education

    IT Professionals
    Digital Nervous System
    IT Professionals/Execs
    Solution Providers
    Year 2000

    Partners & Resellers
    Becoming a Partner
    Find a Services Partner
    Resellers Consultants

    Personal Use
    Games
    Kids
    Personal Computing
    Seniors

    About Microsoft
    Company Overview
    Jobs
    Press Information
    Privacy/Security
    Investor Relations
    US Offices & Web Sites

    Starship Rediscover the Planet with Starship Voyage
    Climb aboard the high-tech vessel Starship, packed to the gunwales with PCs, network equipment, and state-of-the art satellite transmission equipment. Thanks to an intranet and Web interface designed by Microsoft, you can take a virtual voyage this holiday season.

    Join the 100,000 Who've Signed Up for Office 2000 Preview
    Get What You Need with MSN Shopping's Last Minute Gift Guide
    Get a Palm-size PC, and a Whole Lot More, with our Holiday Offer
    Top 10 Tips to Get Your New Computer Running Smoothly
    Get Software Ideas for Your New PC With Our CD-ROM Sampler
    Microsoft and fine.com Help Girl Fight Cancer

    Subscribe to Our Free E-Mail Newsletter!

    For a text-only version of the home page click here.
    Microsoft and the freedom to innovate - What's your opinion?



    Last Updated: Monday, December 21, 1998
    ©1998 Microsoft Corporation. All rights reserved. Terms of Use Privacy Policy
    HTML-Clean-1.4/t/testpages/altavista.html0000644000175000017500000003121313535413447017273 0ustar pavelpavel AltaVista: Main Page
    AltaVista Click Here

    Ask AltaVistaTM a question.  Or enter a few words in Help - Advanced
    Example: What happened today on All My Children?
    Specialty
    Searches

    AV Family Filter - AV Photo Finder - AV Tools & Gadgets
    Entertainment - Health - Holiday Shopping - Careers - Maps
    People Finder - Stock Quotes - Travel - Usenet - Yellow Pages

    CATEGORIES

    Automotive

    Business & Finance

    Computers & Internet

    Health & Fitness

    Hobbies & Interests

    Home & Family

    Media & Amusements

    People & Chat

    Reference & Education

    Shopping & Services

    Society & Politics

    Sports & Recreation

    Travel & Vacations

    NEWS BY ABCNEWS.com
    White House Wants Censure
    The Cost of Containing Iraq
    China Tells Balloon To Land
    Virus Attacks MCI Systems

    ALTAVISTA HIGHLIGHTS

    Take an entertainment break:
    Classic Arcade-Game Crackdown
    World: Noble Peace Prize Concert
    How to avoid the winter bloat:
    Staying Fit in Spite of Wintertime Gloom

    OTHER SERVICES

    AltaVista Discovery - Video Search Demo
    FREE Email - AV Translation Services
    Make Us Your Homepage - Create A Card
    Photo Albums! - Asian Languages

    Click Here

    Featured Sponsors

    Click HereNow open, the Levi's® Online Store.

    Click HereAndy's Garage - New Stuff, Dirt Cheap!

    Click HereShop DOCKERS.com The definitive online store.

    Click HereDeerskin brings you great leather values

    AltaVista Home | Help | Feedback | Advertising Info | Set your Preferences | Text-Only Version
    COMPAQ | Disclaimer | Privacy | Our Search Network | About AltaVista | Add a Page
    HTML-Clean-1.4/t/testpages/infoseek.html0000644000175000017500000006166713535413447017126 0ustar pavelpavel Infoseek
     Homepage  Homepage  Stocks  News  Yellow Pages  Maps  Chat  Free Web Page  Help  UPS Service Center

        People Finder - Shareware - Company Capsules - More... GO Network beta
     
    Use Extra Search Precision within 
     
    Search Tips
    Advanced Search


     

    News Headlines
     

    - Iraq Threatens To Continue Its Air War With West
    - U.S. Fighter Planes Clash With Iraq In No-Fly Zone
    - NFL Wrap-Up: Jacksonville Rookies Rip Apart Steelers


    Patch Adams stars Robin Williams and opens December 25Click here for 9NETAVENUE!
     

    Infoseek Today
     
     -  Monday Night Football
     -  The latest business news
     -  Happy Kwanzaa!
     -  Virus or hoax?
     

    Infoseek Tools
     
    - The FASTEST way to FIND what you're LOOKING for! Try Express by Infoseek NOW!
    - Free Web Advertising with Infoseek Clicks
    - Fast access to searching! Try Infoseek Quickseek
    - Make Infoseek a part of your internet experience with Free Infoseek tools
     
      
    GONetwork  Try the beta!
    Automotive
    Buy a car, Insurance Center...
     
    Business
    Business resources, small businesses...
     
    Careers
    At work, resumes, find an ideal job...
     
    Communications
    Business solutions, mobile & wireless, networking...
     
    Computer
    Buy a PC, buy software, download...
     
    Education
    College, continuing ed, K-12, science & nature
     
    Entertainment
    Movies, music, TV, books...
     
    The Good Life
    Gardening, recipes, relationships, restaurants...
     
    Health
    Medical info, wellness, fitness...
     
    Internet
    Intranet, web publishing, Netcenter...
     
    Kids & Family
    Fun for kids, parenting...
     
    News
    Business, tech, world...
     
    Personal Finance
    Insurance Center, investing, mutual funds, credit...
     
    Real Estate
    Buy a home, Loan Center, Insurance Center...
     
    Shopping
    Apparel, auctions, CDs, gifts, flowers...
     
    Sports
    NFL, NHL, NBA, MLB, more...
     
    Travel
    Air travel, money savers, vacation ideas...
     
    Women's
    Family issues, get fit, job strategies...
     







    INFOSEEK WORLDWIDE: Brasil |  Danmark |  Deutschland |  en español |  France |  Italia |  Japan  |   México  |   Nederland |  Sverige |  United Kingdom 


    Feedback  |   Make Infoseek your start page  |   Add URL  |   Help  |   About Infoseek  |   Advertise on Infoseek  |   Jobs @ Infoseek  |   Intranet software  |   Infoseek Instant Messaging  |   Chat   |   Infoseek Clicks

    Best viewed with:
     Microsoft Internet Explorer   Netscape Tune-Up!

    Copyright © 1994-98 Infoseek Corporation.
    All rights reserved. 
    Disclaimer    Privacy Policy
    Local information:
     TRUSTe Program
    HTML-Clean-1.4/Makefile.PL0000644000175000017500000000054513535413447014131 0ustar pavelpaveluse ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( 'NAME' => 'HTML::Clean', 'VERSION_FROM' => 'lib/HTML/Clean.pm', # finds $VERSION 'EXE_FILES' => [ 'bin/htmlclean' ], 'dist' => { COMPRESS => 'gzip', SUFFIX => 'gz' }, ); HTML-Clean-1.4/TODO0000644000175000017500000000124313535413447012643 0ustar pavelpavelTODO list for the HTML::Clean Module ------------------------------------ * May need to be more selective with some of the regexps, so as to not clobber JavaScript. * Add length/width elements to IMG tags? * Add a real parser/grammar system, like a real compiler, then we can optimize repeated HTML elements, like this:
    sometext
    some more text
    This would also allow specific handlers for specific content types, e.g. PRE blocks, JavaScript, stylesheets, ASP, etc. * Replace
    with just
    * Add counters so we can collect statistics on the usefulness of the various optimizations