libhtml-clean-perl-0.8.orig/0040775000175000017500000000000007247704261014230 5ustar srzsrzlibhtml-clean-perl-0.8.orig/bin/0040775000175000017500000000000007247704261015000 5ustar srzsrzlibhtml-clean-perl-0.8.orig/bin/htmlclean0100775000175000017500000000633207226702725016676 0ustar srzsrz#!/usr/local/bin/perl # htmlclean # Copyright (C) 1998 by ITU use strict; sub usage { print STDERR < =head1 DESCRIPTION This program provides a command-line interface to the HTML::Clean module, which can help you to provide more compatible, smaller HTML files at the expense of reducing the human readability of the HTML code. In some cases you may be able to reduce the size of your HTML by up to 50%! The HTML::Clean library provides a number of features that improve your HTML for browsing and serving: B passes each file given on the command line to the library and writes out the new HTML according to the specified options. The default is to create a backup file and replace the file with cleaned HTML. =over 6 =item Removing whitespace, Comments and other useless or redundant constructs =item Insuring that font tags work across multiple operating systems =back For full details see the documentations for L itself. =head1 OPTIONS =over 4 =item C<-V> Print the version of the program. =item C<-v> Verbose mode. Print out the original and final file sizes, plus the compression percent. For example: 5261 4065 22% /tmp/development-es.html 5258 4061 22% /tmp/development-fr.html 4651 3683 20% /tmp/development.html =back =head1 SEE ALSO For the library, see L =head1 AUTHOR C is written by Paul Lindner, =head1 COPYRIGHT Copyright (c> 1998 by ITU under the same terms as Perl. =cut usage() if ($#ARGV == -1); usage() if ($#ARGV >=0 && $ARGV[0] eq '-?'); use HTML::Clean; use Getopt::Long; my (%opts); $Getopt::Long::getopt_compat = 1; # avoid parsing +'s as options (doesn't work!) &Getopt::Long::config(qw(no_ignore_case no_getopt_compat)); &GetOptions(\%opts, qw(v V t=s 1 2 3 4 5 6 7 8 9)); &print_version if ($opts{'V'}); &usage if ($#ARGV == -1); # we MUST have at least one file my($verbose) = $opts{'v'}; my $level = 9; foreach my $i (1, 2, 3, 4, 5, 6, 7, 8, 9) { $level = $i if ($opts{$i}); } &main($level, \@ARGV); exit 0; sub main { my($level, $files) = @_; my $h = new HTML::Clean(); # Just a empty holder.. print_error('initializing...') if (!$h); $h->level($level); foreach my $f (@$files) { my $result = $h->initialize($f); print_error($f) if ($result == 0); my $d = $h->data(); my $origlen = length($$d); # add options to control these... $h->compat(); $h->strip(); my $newlen = length($$d); my $pct = 0; if ($origlen > 0) { $pct = (100 * ($origlen - $newlen)) / $origlen; } printf "%6d %6d %2d%% %s\n", $origlen, $newlen, $pct, $f if ($verbose); # Okay, now move the files around.. rename($f, "$f.bak") || die "Cannot rename '$f': $!\n"; open(output, ">$f") || die "Cannot overwrite '$f': $!\n"; print output $$d; close(output); } } sub print_error { my($msg) = @_; print STDERR < becomes - Do a second pass at whitespace removal - Remove some default elements from specific tags border=0 from table, method=get from forms, etc. - Remove default port 80 from URLs - New option, lowercasetags to make all tags lowercase. Quantitative testing shows that this improves compressibility, it should make pages download faster over modems with compression turned on. - Expanded tests. Use lynx to quickly see if the changed HTML 'looks' correct. 0.5 Mon Feb 22 13:12:32 MET 1999 - Now removes empty tag sets. For instance is now eliminated. (From Philippe Verdret) - Cleans up excess space in inline javascript functions. Does a better job of removing javascript comments. (idea from Phillippe Verdret) - Added a larger list of default color names to replace. 0.4 Mon Jan 18 15:10:35 MET 1999 - Bug Fix: use upper case filehandle names (from numerous people..) - Enabled level and options (patch from Mike Heins) strip() function changed. No longer accepts level param. htmlclean shell script takes -1 .. -9 as command line options. - Clean up HTML colors, replace with shorter text names. For example, bgcolor="#ffffff" -> bgcolor=white - When using the iso-8859-1 charset remap character entities like É to the eight bit equivalent. - More documentation 0.3 Mon Jan 11 14:05:15 MET 1999 - Fixed serious htmlclean script bug. - Added a little more documentation. 0.2 Tue Dec 29 10:13:16 MET 1998 - expanded number of strip options - First CPAN release.. 0.1 Fri Apr 17 13:42:11 1998 - original version libhtml-clean-perl-0.8.orig/MANIFEST0100664000175000017500000000043207226702725015355 0ustar srzsrzREADME TODO Changes MANIFEST Makefile.PL bin/htmlclean lib/HTML/Clean.pm t/basic.t t/files.t t/testpages/altavista.html t/testpages/ibm.html t/testpages/yahoo.html t/testpages/microsoft.html t/testpages/infoseek.html t/testpages/itu.html t/testpages/cnn.html t/testpages/hairy.html libhtml-clean-perl-0.8.orig/Makefile.PL0100664000175000017500000000054507226702725016203 0ustar srzsrzuse ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( 'NAME' => 'HTML::Clean', 'VERSION_FROM' => 'lib/HTML/Clean.pm', # finds $VERSION 'EXE_FILES' => [ 'bin/htmlclean' ], 'dist' => { COMPRESS => 'gzip', SUFFIX => 'gz' }, ); libhtml-clean-perl-0.8.orig/README0100664000175000017500000000445307226702725015113 0ustar srzsrzHTML::Clean - Cleans up HTML code for web browsers, not humans ------------------------------------------------------------------- 1. INTRODUCTION -- ``The HTML Compiler!'' The majority of the web pages of the internet today are much larger than they need to be. The reason for this is that HTML tends to be stored in a human readable format, with indenting, newlines and comments. However, all of these comments, whitespace etc. are ignored by the browser, and needlessly lengthen download times. Second, many people are using WYSIWYG HTML editors these days. This makes creating content easy. However these editors can cause a number of compatibility problems by tying themselves to a particular browser or operating system. Enter HTML::Clean. The HTML::Clean module encapsulates a number of HTML optimizations and cleanups. The end result is HTML that loads faster, displays properly in more browsers. Think of it as a compiler that translates HTML input into optimized machine readable code. Uses You can use the library in your perl-cgi scripts to optimize data. Or use the included htmlclean script to preprocess HTML files or templates on your server. The results should be pages that load faster, less load on your web server and better cross-platform HTML. To read about the latest features, see the Changes file. To find out about known bugs and to see what's planned for future versions, see the TODO file. 2. AVAILABILITY You can get the latest version of the HTML::Clean module from the Comprehensive Perl Archive Network (CPAN) or from the author's homepage: http://www.perl.org/CPAN/modules/by-module/HTML/ http://people.itu.int/~lindner/ 3. PREREQUISITES The HTML module requires the following: Perl 5.004 The HTML::Clean module may work with earlier versions of the above, but the author hasn't tested this. The latest versions can be obtained from CPAN: http://www.perl.org/CPAN/ 4. INSTALLATION To build this module, run the following commands: perl Makefile.PL make make test make install 5. COPYRIGHT Copyright (c) 1999 ITU. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. 6. AUTHOR INFORMATION Paul Lindner paul.lindner@itu.int http://people.itu.int/~lindner/ libhtml-clean-perl-0.8.orig/TODO0100664000175000017500000000124307226702725014715 0ustar srzsrzTODO list for the HTML::Clean Module ------------------------------------ * May need to be more selective with some of the regexps, so as to not clobber JavaScript. * Add length/width elements to IMG tags? * Add a real parser/grammar system, like a real compiler, then we can optimize repeated HTML elements, like this:
sometext
some more text
This would also allow specific handlers for specific content types i.e. PRE blocks, Javascript, Stylesheets, ASP, etc... * Replace
with just
* Add counters so we can collect statistics on the usefullness of the various optimizations libhtml-clean-perl-0.8.orig/lib/0040775000175000017500000000000007247704261014776 5ustar srzsrzlibhtml-clean-perl-0.8.orig/lib/HTML/0040775000175000017500000000000007247704261015542 5ustar srzsrzlibhtml-clean-perl-0.8.orig/lib/HTML/Clean.pm0100664000175000017500000003644407226703377017136 0ustar srzsrzpackage HTML::Clean; use Carp; use IO; use Fcntl; use strict; require 5.004; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK); require Exporter; require AutoLoader; # Items to export to callers namespace @EXPORT = qw(); $VERSION = '0.8'; =head1 NAME HTML::Clean - Cleans up HTML code for web browsers, not humans =head1 SYNOPSIS use HTML::Clean; $h = new HTML::Clean($filename); # or.. $h = new HTML::Clean($htmlcode); $h->compat(); $h->strip(); $data = $h->data(); print $$data; =head1 DESCRIPTION The HTML::Clean module encapsulates a number of common techniques for minimizing the size of HTML files. You can typically save between 10% and 50% of the size of a HTML file using these methods. It provides the following features: =over 8 =item Remove unneeded whitespace (begining of line, etc) =item Remove unneeded META elements. =item Remove HTML comments (except for styles, javascript and SSI) =item Replace tags with equivilant shorter tags ( --> ) =item etc. =back The entire proces is configurable, so you can pick and choose what you want to clean. =head1 THE HTML::Clean CLASS =over 4 =cut ###################################################################### =head2 $h = new HTML::Clean($dataorfile, [$level]); This creates a new HTML::Clean object. A Prerequisite for all other functions in this module. The $dataorfile parameter supplies the input HTML, either a filename, or a reference to a scalar value holding the HTML, for example: $h = new HTML::Clean("/htdocs/index.html"); $html = "Hello!"; $h = new HTML::Clean(\$html); An optional 'level' parameter controls the level of optimization performed. Levels range from 1 to 9. Level 1 includes only simple fast optimizations. Level 9 includes all optimizations. =cut sub new { my $this = shift; my $class = ref($this) || $this; my $self = {}; bless $self, $class; my $data = shift; my $level = shift; if ($self->initialize($data)) { # set the default level $level = 9 if (!$level); $self->level($level); return $self; } else { undef $self; return undef; } } # # Set up the data in the self hash.. # =head2 $h->initialize($dataorfile) This function allows you to reinitialize the HTML data used by the current object. This is useful if you are processing many files. $dataorfile has the same usage as the new method. Return 0 for an error, 1 for success. =cut sub initialize { my($self, $data) = @_; $self->{'DATA'} = undef; # Not defined? Just return true. return(1) if (!$data); # Check if it's a ref if (ref($data)) { $self->{DATA} = $data; return(1); } # Newline char, really an error, but just go with it.. if ($data =~ /\n/) { $self->{'DATA'} = \$data; } # No newline? Must be a filename if (-f $data) { my $storage; sysopen(IN, "$data", O_RDONLY) || return(0); while () { $storage .= $_; } close(IN); $self->{'DATA'} = \$storage; return(1); } return(0); # file not found? } =head2 $h->level([$level]) Get/set the optimization level. $level is a number from 1 to 9. =cut sub level { my($self, $level) = @_; if (defined($level) && ($level > 0) && ($level < 10)) { $self->{'LEVEL'} = $level } return($self->{'LEVEL'}); } =head2 $myref = $h->data() Returns the current HTML data as a scalar reference. =cut sub data { my($self) = @_; return $self->{'DATA'}; } # Junk HTML comments (INTERNAL) sub _commentcheck($) { my($comment) = @_; $_ = $comment; # Server side include return($comment) if (m,^$,si); return($comment) if (m,navigator\.app(name|version),si); # Stylesheet return($comment) if (m,[A-z0-9]+\:[A-z0-9]+\s*\{.*\},si); return(''); } # Remove javascript comments (INTERNAL) sub _jscomments { my($js) = @_; $js =~ s,\n\s*//.*?\n,\n,sig; $js =~ s,\s+//.*?\n,\n,sig; # insure javascript is hidden if ($js =~ m,\n,si; } return($js); } # Clean up other javascript stuff.. sub _javascript { my($js) = @_; # remove excess whitespace at the beginning and end of lines $js =~ s,\s*\n+\s*,\n,sig; # braces/semicolon at end of line, join next line $js =~ s,([;{}])\n,$1,sig; # What else is safe to do? return($js); } # replace #000000 -> black, etc.. # Does the browser render faster with RGB? You would think so.. sub _defcolorcheck ($) { my($c) = @_; $c =~ s/\#000000/black/; $c =~ s/\#c0c0c0/silver/i; $c =~ s/\#808080/gray/; $c =~ s/\#ffffff/white/i; $c =~ s/\#800000/maroon/; $c =~ s/\#ff0000/red/i; $c =~ s/\#800080/purple/; $c =~ s/\#ff00ff/fuchsia/i; $c =~ s/\#ff00ff/fuchsia/i; $c =~ s/\#008000/green/; $c =~ s/\#00ff00/lime/i; $c =~ s/\#808000/olive/; $c =~ s/\#ffff00/yellow/i; $c =~ s/\#000080/navy/; $c =~ s/\#0000ff/blue/i; $c =~ s/\#008080/teal/i; $c =~ s/\#00ffff/aqua/i; return($c); } # For replacing entities with numerics use vars qw/ %_ENTITIES/; %_ENTITIES = ( 'Agrave' => 192, 'Aacute' => 193, 'Acirc' => 194, 'Atilde' => 195, 'Auml' => 196, 'Aring' => 197, 'AElig' => 198, 'Ccedil' => 199, 'Egrave' => 200, 'Eacute' => 201, 'Ecirc' => 202, 'Euml' => 203, 'Igrave' => 204, 'Iacute' => 205, 'Icirc' => 206, 'Iuml' => 207, 'ETH' => 208, 'Ntilde' => 209, 'Ograve' => 210, 'Oacute' => 211, 'Ocirc' => 212, 'Otilde' => 213, 'Ouml' => 214, 'Oslash' => 216, 'Ugrave' => 217, 'Uacute' => 218, 'Ucirc' => 219, 'Uuml' => 220, 'Yacute' => 221, 'THORN' => 222, 'szlig' => 223, 'agrave' => 224, 'aacute' => 225, 'acirc' => 226, 'atilde' => 227, 'auml' => 228, 'aring' => 229, 'aelig' => 230, 'ccedil' => 231, 'egrave' => 232, 'eacute' => 233, 'ecirc' => 234, 'euml' => 235, 'igrave' => 236, 'iacute' => 237, 'icirc' => 238, 'iuml' => 239, 'eth' => 240, 'ntilde' => 241, 'ograve' => 242, 'oacute' => 243, 'ocirc' => 244, 'otilde' => 245, 'ouml' => 246, 'oslash' => 248, 'ugrave' => 249, 'uacute' => 250, 'ucirc' => 251, 'uuml' => 252, 'yacute' => 253, 'thorn' => 254, 'yuml' => 255 ); =head2 strip(\%options); Removes excess space from HTML You can control the optimizations used by specifying them in the %options hash reference. The following options are recognized: =over 8 =item boolean values (0 or 1 values) whitespace Remove excess whitespace shortertags -> , etc.. blink No blink tags. contenttype Remove default contenttype. comments Remove excess comments. entities " -> ", etc. dequote remove quotes from tag parameters where possible. defcolor recode colors in shorter form. (#ffffff -> white, etc.) javascript remove excess spaces and newlines in javascript code. htmldefaults remove default values for some html tags lowercasetags translate all HTML tags to lowercase =item parameterized values meta Takes a space separated list of meta tags to remove, default "GENERATOR FORMATTER" emptytags Takes a space separated list of tags to remove when there is no content between the start and end tag, like this: . The default is 'b i font center' =back =cut use vars qw/ $do_whitespace $do_shortertags $do_meta $do_blink $do_contenttype $do_comments $do_entities $do_dequote $do_defcolor $do_emptytags $do_javascript $do_htmldefaults $do_lowercasetags $do_defbaseurl /; $do_whitespace = 1; $do_shortertags = 1; $do_meta = "generator formatter"; $do_blink = 1; $do_contenttype = 1; $do_comments = 1; $do_entities = 1; $do_dequote = 1; $do_defcolor = 1; $do_emptytags = 'b i font center'; $do_javascript = 1; $do_htmldefaults = 1; $do_lowercasetags = 1; $do_defbaseurl = ''; sub strip { my($self, $options) = @_; my $h = $self->{'DATA'}; my $level = $self->{'LEVEL'}; # Select a set of options based on $level, and then modify based on # user supplied options. _level_defaults($level); if(defined($options)) { no strict 'refs'; for (keys(%$options)) { ${"do_" . lc($_)} = $options->{$_} if defined ${"do_" . lc($_)}; } } if ($do_shortertags) { $$h =~ s,,,sgi; $$h =~ s,,,sgi; $$h =~ s,,,sgi; $$h =~ s,,,sgi; } if ($do_whitespace) { $$h =~ s,[\r\n]+,\n,sg; # Carriage/LF -> LF $$h =~ s,\s+\n,\n,sg; # empty line $$h =~ s,\n\s+<,\n<,sg; # space before tag $$h =~ s,\n\s+,\n ,sg; # other spaces $$h =~ s,>\n\s*<,><,sg; # LF/spaces between tags.. # Remove excess spaces within tags.. note, we could parse out the elements # and rewrite for excess spaces between elements. perhaps next version. # removed due to problems with > and < in tag elements.. #$$h =~ s,\s+>,>,sg; #$$h =~ s,<\s+,<,sg; # do this again later.. } if ($do_entities) { $$h =~ s,",\",sg; # Simplify long entity names if using default charset... $$h =~ m,charset=([^\"]+)\",; if (!defined($1) || ($1 eq 'iso-8859-1')) { $$h =~ s,&([A-z]+);,($_ENTITIES{$1}) ? chr($_ENTITIES{$1}) : $&,sige; } } if ($do_meta) { foreach my $m (split(/\s+/, $do_meta)) { $$h =~ s,]*?>,,sig; } } if ($do_contenttype) { # Don't need this, since it is the default for most web servers # Also gets rid of 'blinking pages' in older versions of netscape. $$h =~ s,,,sig; } if ($do_defcolor) { $$h =~ s,(<[^<]+?color=['"]?\#[0-9A-Fa-f]+["']?),_defcolorcheck($&),sige; } if ($do_comments) { # don't strip server side includes.. # try not to get javascript, or styles... $$h =~ s,,_commentcheck($&),sige; # Remove javascript comments $$h =~ s,]*(java|ecma)script[^>]*>.*?,_jscomments($&),sige; } if ($do_javascript) { # $$h =~ s,]*(java|ecma)script[^>]*>.*?,_javascript($&),sige; } if ($do_blink) { $$h =~ s,,,sgi; $$h =~ s,,,sgi; } if ($do_dequote) { while ($$h =~ s,<([A-z]+ [A-z]+=)(['"])([A-z0-9]+)\2(\s*?[^>]*?>),<$1$3$4,sig) { # Remove alphanumeric quotes. Note, breaks DTD.. ; } } # remove , etc.. if ($do_emptytags) { my $pat = $do_emptytags; $pat =~ s/\s+/|/g; while ($$h =~ s,<($pat)(\s+[^>]*?)?>\s*,,siog){} } if ($do_htmldefaults) { # Tables # seems to break things.. #$$h =~ s,(]*)\s+border=0([^>]*>),$1$2,sig; $$h =~ s,(]*)\s+rowspan=1([^>]*>),$1$2,sig; $$h =~ s,(]*)\s+colspan=1([^>]*>),$1$2,sig; # # P, TABLE tags are default left aligned.. # lynx is inconsistent in this manner though.. $$h =~ s,<(P|table|td)( [^>]*)align=\"?left\"?([^>]*)>,<$1$2$3>,sig; # OL start=1 $$h =~ s,(
    ]*)start=\"?1\"?([^>]*>),$1$2,sig; # FORM $$h =~ s,(
    ]*)method=\"?get\"?([^>]*>),$1$2,sig; $$h =~ s,(]*)enctype=\"application/x-www-form-urlencoded\"([^>]*>),$1$2,sig; # hr $$h =~ s,(
    ]*)align=\"?center\"?([^>]*>),$1$2,sig; $$h =~ s,(
    ]*)width=\"?100%\"?([^>]*>),$1$2,sig; # URLs $$h =~ s,(href|src)(=\"?http://[^/:]+):80/,$1$2/,sig; } if ($do_whitespace) { # remove space within tags
    becomes
    $$h =~ s,\s+>,>,sg; $$h =~ s,<\s+,<,sg; # join lines with a space at the beginning/end of the line # and a line that begins with a tag $$h =~ s,>\n ,> ,sig; $$h =~ s, \n<, <,sig; } if ($do_lowercasetags) { # translate tags to lowercase to (hopefully) improve compressability.. # simple tags

    ,

    etc. $$h =~ s,(<[/]?[a-zA-Z][a-zA-Z0-9_-]*\s*>),\L$1\E,sg; # the rest.. $$h =~ s/(<[a-zA-Z][a-zA-Z0-9_-]*)(\s+.*?>)/_lowercasetag($1,$2)/sge; } } sub _lowercasetag { my($prefix, $body) = @_; $prefix =~ s/^(.+)$/\L$1\E/; $body =~ s/(\s+[a-zA-Z][a-zA-Z0-9_-]*)(\s*=\s*[^"\s]+|\s*=\s*"[^"]*"|>|\s)/\L$1\E$2/sg; return $prefix.$body; } # set options based on the level provided.. INTERNAL sub _level_defaults($) { my ($level) = @_; $do_whitespace = 1; # always do this... # level 2 $do_shortertags = ($level > 1) ? 1 : 0; $do_meta = ($level > 1) ? "generator formatter" : ""; $do_contenttype = ($level > 1) ? 1 : 0; # level 3 $do_entities = ($level > 2) ? 1 : 0; $do_blink = ($level > 2) ? 1 : 0; # level 4 $do_comments = ($level > 3) ? 1 : 0; $do_dequote = ($level > 3) ? 1 : 0; $do_defcolor = ($level > 3) ? 1 : 0; $do_emptytags = ($level > 3) ? 'b i font center' : 0; $do_javascript = ($level > 3) ? 1 : 0; $do_htmldefaults = ($level > 3) ? 1 : 0; $do_lowercasetags = ($level > 3) ? 1 : 0; # higher levels reserved for more intensive optimizations. } ###################################################################### =head2 compat() This function improves the cross-platform compatibility of your HTML. Currently checks for the following problems: =over 8 =item Insuring all IMG tags have ALT elements. =item Use of Arial, Futura, or Verdana as a font face. =item Positioning the tag immediately after the <head> tag. =back =cut sub compat { my($self, $level, $options) = @_; my $h = $self->{'DATA'}; $$h =~ s/face="arial"/face="arial,helvetica,sansserif"/sgi; $$h =~ s/face="(verdana|futura)"/face="$1,arial,helvetica,sansserif"/sgi; # insure that <title> tag is directly after the <head> tag # Some search engines only search the first N chars. (PLweb for instance..) if ($$h =~ s,<title>(.*),,si) { my $title = $1; $$h =~ s,,$title,si; } # Look for IMG without ALT tags. $$h =~ s/(]+>)/_imgalt($1)/segi; } sub _imgalt { my($tag) = @_; $tag =~ s/>/ alt="">/ if ($tag !~ /alt=/i); return($tag); } =head2 defrontpage(); This function converts pages created with Microsoft Frontpage to something a Unix server will understand a bit better. This function currently does the following: =over 8 =item Converts Frontpage 'hit counters' into a unix specific format. =item Removes some frontpage specific html comments =back =cut sub defrontpage { my($self) = @_; my $h = $self->{'DATA'}; while ($$h =~ s,,,xis) { print "Converted a Hitcounter.. $1, $2, $3\n"; } $$h =~ s,,,sgx; } =back =head1 SEE ALSO =head2 Modules FrontPage::Web, FrontPage::File =head2 Web Sites =over 6 =item Distribution Site - http://people.itu.int/~lindner/ =back =head1 AUTHORS Paul Lindner for the International Telecommunication Union (ITU) =head1 COPYRIGHT The HTML::Strip module is Copyright (c) 1998,99 by the ITU, Geneva Switzerland. All rights reserved. You may distribute under the terms of either the GNU General Public License or the Artistic License, as specified in the Perl README file. =cut 1; __END__ libhtml-clean-perl-0.8.orig/t/0040775000175000017500000000000007247704261014473 5ustar srzsrzlibhtml-clean-perl-0.8.orig/t/basic.t0100664000175000017500000000267407226702725015747 0ustar srzsrz# Before `make install' is performed this script should be runnable with # `make test'. After `make install' it should work as `perl test.pl' ######################### We start with some black magic to print on failure. # Change 1..1 below to 1..last_test_to_print . BEGIN { $| = 1; print "1..6\n"; } END {print "not ok 1\n" unless $loaded;} use HTML::Clean; $loaded = 1; ; # Test constructors my $h = new HTML::Clean(); if (!defined($h)) { print "not ok 1\n"; } else { print "ok 1\n"; } my $data = "testing, 1 2 3\n"; $h = new HTML::Clean(\$data); if (!defined($h)) { print "not ok 2\n"; } else { print "ok 2\n"; } # test level operator $h->level(2); if ($h->level() != 2) { print "not ok 3\n"; print "Level is " . $h->level() . "\n"; } else { print "ok 3\n"; } $h->level(9); # Test stripping.. # first val is text to manipulate # second val is good result my @data = ( "some bold textsome italic text", "some bold textsome italic text", "À©ñ", "À©ñ", "Some text with empty tags ", "Some text with empty tags ", ); my $test = 3; while (1) { $test++; my $orig = shift(@data) || last; my $good = shift(@data); $h->initialize(\$orig); $h->compat(); $h->strip(); if ($orig eq $good) { print "ok $test\n"; } else { print "not ok $test\n"; print "got:\n$orig\n\nexpected:\n$good\n\n"; } } exit; libhtml-clean-perl-0.8.orig/t/files.t0100664000175000017500000000247007226702725015762 0ustar srzsrz#!/usr/local/bin/perl ######################### We start with some black magic to print on failure. # Change 1..1 below to 1..last_test_to_print . BEGIN { $| = 1; print "1..9\n"; } END {print "not ok 1\n" unless $loaded;} use HTML::Clean; $loaded = 1; $test = 1; print "ok 1\n"; foreach $page ('hairy', 'altavista', 'microsoft', 'ibm', 'yahoo', 'infoseek', 'itu', 'cnn') { $test ++; my $h = new HTML::Clean("t/testpages/$page.html"); print "not ok $test\n" if (! defined($h)); # compat changes the 'look' of the page for lynx.. # $h->compat(); $h->strip(); if (open(OUTFILE, ">t/testpages/t$page.html")) { print OUTFILE ${$h->data()}; close(OUTFILE); } else { print "not ok $test\n"; } # if we can open lynx test that.. if (open(P, "lynx -nolist -dump t/testpages/$page.html |")) { my $cvtpage = ''; my $origpage = ''; while (

    ) { $origpage .= $_; } close(P); if (open(P, "lynx -nolist -dump t/testpages/t$page.html |")) { while (

    ) { $cvtpage .= $_; } close(P); if (abs(length($origpage) - length($cvtpage)) > 30) { print STDERR "\nWarning, lynx detects different page sizes for $page " . length($origpage) . ", " . length($cvtpage) . "\n"; } } } print "ok $test\n"; } libhtml-clean-perl-0.8.orig/t/testpages/0040775000175000017500000000000007247704261016472 5ustar srzsrzlibhtml-clean-perl-0.8.orig/t/testpages/altavista.html0100664000175000017500000003121307226702725021345 0ustar srzsrz AltaVista: Main Page

AltaVista Click Here

Ask AltaVistaTM a question.  Or enter a few words in Help - Advanced
Example: What happened today on All My Children?
Specialty
Searches

AV Family Filter - AV Photo Finder - AV Tools & Gadgets
Entertainment - Health - Holiday Shopping - Careers - Maps
People Finder - Stock Quotes - Travel - Usenet - Yellow Pages

CATEGORIES

Automotive

Business & Finance

Computers & Internet

Health & Fitness

Hobbies & Interests

Home & Family

Media & Amusements

People & Chat

Reference & Education

Shopping & Services

Society & Politics

Sports & Recreation

Travel & Vacations

NEWS BY ABCNEWS.com
White House Wants Censure
The Cost of Containing Iraq
China Tells Balloon To Land
Virus Attacks MCI Systems

ALTAVISTA HIGHLIGHTS

Take an entertainment break:
Classic Arcade-Game Crackdown
World: Noble Peace Prize Concert
How to avoid the winter bloat:
Staying Fit in Spite of Wintertime Gloom

OTHER SERVICES

AltaVista Discovery - Video Search Demo
FREE Email - AV Translation Services
Make Us Your Homepage - Create A Card
Photo Albums! - Asian Languages

Click Here

Featured Sponsors

Click HereNow open, the Levi's® Online Store.

Click HereAndy's Garage - New Stuff, Dirt Cheap!

Click HereShop DOCKERS.com The definitive online store.

Click HereDeerskin brings you great leather values

AltaVista Home | Help | Feedback | Advertising Info | Set your Preferences | Text-Only Version
COMPAQ | Disclaimer | Privacy | Our Search Network | About AltaVista | Add a Page
libhtml-clean-perl-0.8.orig/t/testpages/cnn.html0100664000175000017500000016745507226702725020155 0ustar srzsrz CNN Interactive
SHOP@CNN
Browse by
ad info
Click here to try four free issues of Sports Illustrated.
Click here to try four free issues of Sports Illustrated.

CNNin
* MAIN PAGE
 WORLD
 U.S.
 LOCAL
 POLITICS
 WEATHER
 BUSINESS
 SPORTS
 SCI-TECH
 ENTERTAINMENT
 BOOKS
 TRAVEL
 FOOD
 HEALTH
 STYLE
 IN-DEPTH

 custom news
 Headline News brief
 daily almanac
 CNN networks
 on-air transcripts
 news quiz
 jobs
 ad info

  CNN WEB SITES:
CNN Websites
Free Email Open a pop-up controller to bring you CNN news anytime Personalize your CNN Home Page Listen to the CNN Networks LIVE on your desktop View video news and CNN Programs on demand
Video, Audio, Customize, Remote, Email
   For a QUICK read of the Headline News, click here.
 Search CNN  go 
January 29, 1999 -- Updated 4:55 a.m. EST, 0955 GMT, @455 Swatch internet time
Impeachment
Republicans force through their trial road map

In a party-line 54-44 vote, Republicans Thursday pushed through their proposal for proceeding with the deposition phase of the impeachment trial of President Bill Clinton. Under the plan, witnesses will be deposed starting Monday and February 12 has been set as the trial's target end date.

The Senate's OK of the Republican plan was the last of three rapid-fire votes Thursday evening after negotiations failed to yield a bipartisan agreement. The Democrats' proposal was first rejected, also by a 54-44 margin. The senators next voted down a Democratic attempt to move immediately to a final vote on the articles of impeachment.

FULL STORY video icon


 
An artist on the cutting edge
saw artist

In other news:

Kosovo crisis talks launched as military threat looms
Magnitude of Colombian quake 'exceeds all calculations' video icon
Violence sweeps East Timor as independence drive gains momentum
U.S. fighters attack Iraqi anti-aircraft site
Investigators on Salt Lake Olympic money trail
Federal budget surplus tops $100 billion
Charles, Camilla appear together for first time

 PATHFINDER SITES:
 MORE SERVICES:
 video on demand
 video archive
 audio on demand
 news email services
 free email accounts
 desktop headlines
 pointcast
 pagenet

 DISCUSSION:
 message boards
 chat
 feedback

 SITE GUIDES:
 help
 contents
 search

 FASTER ACCESS:
 europe
 japan

 WEB SERVICES:


barnesandnoble.com
 


SHOP@CNN
Browse by

SPECIAL DEAL:
HOW TO GET WHAT YOU WANT...
by John Gray
(hardcover)

Our Price: $17.46
(30% off!)

Barnes and Noble
 
  QUICK VOTE:     ON CNN:

Should people avoid using words that sound like slurs but aren't -- like "niggardly"?

Yes -- it's what people hear that counts
No -- it's what people mean that counts
View Results
tv What's on CNN?
What's on CNN Int'l?
Domestic Schedule

WORLD:   U.S.:

  LINK OF THE DAY:     CUSTOM NEWS:
Link of the day

FIND LOW FARES FAST WITH:

Air deals at a glance
Speedy roundtrip flight search
Low-fare desktop ticker

The online booking source for busy people


Personalize CNN.com
Visit YOUR CustomNews
  sample stories:



SPORTS:   BUSINESS:

POLITICS:   WEATHER:

 ROMANTIC CAPERS:  MOVIES:
Cleopatra
Ideas for planning a surprise Valentine's escape
  horner
Oscar has already recognized him, as have the Golden Globes. Is a Grammy next for this "Titanic" composer?

 SCIENCE AND NATURE:    COMPUTING:

SPACE:   IN-DEPTH:

HEALTH:   ENTERTAINMENT:

  PICTURE OF THE DAY:   COMPUTING:
Staying close to mom
Staying close to mom
Presented by Oracle
  chaos
Chaos theory can bring more privacy to Internet communications

STYLE:   TRAVEL:

BOOKS:   FOOD:

 ALLPOLITICS TOONS:  RECIPE:
Allpolitics toon
Political 'toonist Mike Luckovich has a question for Monica Lewinsky
  Apple Crisp
Break out the vanilla ice cream for this one:
Apple crisp

COUNTDOWN TO 2000:   COLD WAR:
 

VIEWS:   MEDIA SHOWCASE:

 FRINGE:    DISCUSSION:

 AUDIO:    VIDEO ON-DEMAND:

Listen to CNN live...

Plus sports, technology, health and entertainment news on-demand.

All in Audioselect

 
Videoselect
Judge expected to decide on charges against Anwar
Larry King Live
Crossfire


  
Back to the top
© 1999 Cable News Network. All Rights Reserved.
Terms under which this service is provided to you.
Read our privacy guidelines.

libhtml-clean-perl-0.8.orig/t/testpages/hairy.html0100664000175000017500000000262407226703377020501 0ustar srzsrz This is a test page that touches most options of HTML::Clean This is a bold line with some empty tags .

How about some excess space within tags?

This is a table with default values. Remove the border=0 from the page

Tests for align=left in various tags

  1. item 1
  2. item 2

Link to the ITU Home page with :80 in the URL
An image without an alt tag.

Voilá test me
libhtml-clean-perl-0.8.orig/t/testpages/ibm.html0100664000175000017500000002322607226702725020131 0ustar srzsrz IBM Corporation
[ IBM.com for Tue, 22 Dec 1998 14:10:37  ]
HomeNewsProductsSupportSolutionsPartnersIBM
[ Search ]
Shop
Contact IBM
Download
Find a job
---
www.rei.com is an IBM e-business
[ I hope that one's for me: IBM products make great holiday gifts ]
Discover IBM / Today's News
Microelectronics Microelectronics
The New IBM Microelectronics Gallery: Take a virtual tour
think leadership think leadership
The Trendspotter's Guide
Other Voices Other Voices
Tech Nation: Simpler than what?
IBM announces OEM storage shipment milestone
IBM and United Way launch national early learning initiative
Golf fans: Connect to the 1998 PGA Tour

Text only version
libhtml-clean-perl-0.8.orig/t/testpages/infoseek.html0100664000175000017500000006166707226702725021200 0ustar srzsrz Infoseek
 Homepage  Homepage  Stocks  News  Yellow Pages  Maps  Chat  Free Web Page  Help  UPS Service Center

    People Finder - Shareware - Company Capsules - More... GO Network beta
 
Use Extra Search Precision within 
 
Search Tips
Advanced Search


 

News Headlines
 

- Iraq Threatens To Continue Its Air War With West
- U.S. Fighter Planes Clash With Iraq In No-Fly Zone
- NFL Wrap-Up: Jacksonville Rookies Rip Apart Steelers


Patch Adams stars Robin Williams and opens December 25Click here for 9NETAVENUE!
 

Infoseek Today
 
 -  Monday Night Football
 -  The latest business news
 -  Happy Kwanzaa!
 -  Virus or hoax?
 

Infoseek Tools
 
- The FASTEST way to FIND what you're LOOKING for! Try Express by Infoseek NOW!
- Free Web Advertising with Infoseek Clicks
- Fast access to searching! Try Infoseek Quickseek
- Make Infoseek a part of your internet experience with Free Infoseek tools
 
  
GONetwork  Try the beta!
Automotive
Buy a car, Insurance Center...
 
Business
Business resources, small businesses...
 
Careers
At work, resumes, find an ideal job...
 
Communications
Business solutions, mobile & wireless, networking...
 
Computer
Buy a PC, buy software, download...
 
Education
College, continuing ed, K-12, science & nature
 
Entertainment
Movies, music, TV, books...
 
The Good Life
Gardening, recipes, relationships, restaurants...
 
Health
Medical info, wellness, fitness...
 
Internet
Intranet, web publishing, Netcenter...
 
Kids & Family
Fun for kids, parenting...
 
News
Business, tech, world...
 
Personal Finance
Insurance Center, investing, mutual funds, credit...
 
Real Estate
Buy a home, Loan Center, Insurance Center...
 
Shopping
Apparel, auctions, CDs, gifts, flowers...
 
Sports
NFL, NHL, NBA, MLB, more...
 
Travel
Air travel, money savers, vacation ideas...
 
Women's
Family issues, get fit, job strategies...
 







INFOSEEK WORLDWIDE: Brasil |  Danmark |  Deutschland |  en español |  France |  Italia |  Japan  |   México  |   Nederland |  Sverige |  United Kingdom 


Feedback  |   Make Infoseek your start page  |   Add URL  |   Help  |   About Infoseek  |   Advertise on Infoseek  |   Jobs @ Infoseek  |   Intranet software  |   Infoseek Instant Messaging  |   Chat   |   Infoseek Clicks

Best viewed with:
 Microsoft Internet Explorer   Netscape Tune-Up!

Copyright © 1994-98 Infoseek Corporation.
All rights reserved. 
Disclaimer    Privacy Policy
Local information:
 TRUSTe Program
libhtml-clean-perl-0.8.orig/t/testpages/itu.html0100664000175000017500000003137007226702725020162 0ustar srzsrz International Telecommunication Union (ITU) Home Page
ITU Home Page

ITU Highlights

 ITU Menu Bar

[Swisscom]

  
The ITU, headquartered in Geneva, Switzerland is an international organization within which governments and the private sector coordinate global telecom networks and services.

ITU Meetings and Conferences

ITU Publications
The ITU is the leading publisher of telecommunication technology, regulatory and standards information. Many publications can be purchased through our Electronic Bookshop or the ITU Publications Online subscription service.

ITU Newsroom

Internet Broadcasting Service

Information Exchange Services (TIES)

Job Vacancies

Selected Web Sites

United Nations System
Official site listing of all UN organizations.

Geneva Permanent Missions to the UN

[Add Active Channel]

Add ITU Now!

  

Banner

1998 Plenipotentiary Conference

World TELECOM 99

Year 2000 "Millennium Compliance"

International Mobile Telecommunications (IMT)
The ITU vision of global wireless access in the 21st century

ITU Global Directory

ISO 3166-Based Top Level Domains Survey

For more ITU Features see our Highlights page.


Search ITU's Site:


Home | Search | Site Map | Help | Contact | Comments | © Copyright

English | Français | Español

Last Modified: 1998-12-03

About the ITU Web Site

libhtml-clean-perl-0.8.orig/t/testpages/microsoft.html0100664000175000017500000004507207226702725021372 0ustar srzsrz Welcome to Microsoft's Homepage
microsoft.com Home   All Products  |   Support  |   Search  |   microsoft.com Home  
Microsoft
  Home  |   Events  |   Training  |   Downloads  |   Newsletters  |   U.S. & International  |   About Our Site  |

Internet Explorer
Download it free!

Product Families
BackOffice
Developer Tools
Office
MSN
Windows

Business Solutions
Industries
Small Business

Developers
Software Developers
Web Site Builders

Education
Academic Products
Education Resellers
Higher Education
K-12 Education

IT Professionals
Digital Nervous System
IT Professionals/Execs
Solution Providers
Year 2000

Partners & Resellers
Becoming a Partner
Find a Services Partner
Resellers Consultants

Personal Use
Games
Kids
Personal Computing
Seniors

About Microsoft
Company Overview
Jobs
Press Information
Privacy/Security
Investor Relations
US Offices & Web Sites

Starship Rediscover the Planet with Starship Voyage
Climb aboard the high-tech vessel Starship, packed to the gunwales with PCs, network equipment, and state-of-the art satellite transmission equipment. Thanks to an intranet and Web interface designed by Microsoft, you can take a virtual voyage this holiday season.

Join the 100,000 Who've Signed Up for Office 2000 Preview
Get What You Need with MSN Shopping's Last Minute Gift Guide
Get a Palm-size PC, and a Whole Lot More, with our Holiday Offer
Top 10 Tips to Get Your New Computer Running Smoothly
Get Software Ideas for Your New PC With Our CD-ROM Sampler
Microsoft and fine.com Help Girl Fight Cancer

Subscribe to Our Free E-Mail Newsletter!

For a text-only version of the home page click here.
Microsoft and the freedom to innovate - What's your opinion?



Last Updated: Monday, December 21, 1998
©1998 Microsoft Corporation. All rights reserved. Terms of Use Privacy Policy
libhtml-clean-perl-0.8.orig/t/testpages/yahoo.html0100664000175000017500000002416207226702725020501 0ustar srzsrzYahoo!
Yahoo
Yahoo! Auctions
furby, beanies, zelda
OfficeMax.com: Up to $300 IBM Aptiva rebatesPark Your
Domain Free
options
Shopping - Yellow Pages - People Search - Maps - Travel Agent - Classifieds - Personals - Games - Chat
Email - Calendar - Pager - My Yahoo! - Today's News - Sports - Weather - TV - Stock Quotes - more...
Yahoo! Shopping : Apparel, Books, CDs, Computers, Electronics, Games, Toys, Videos, more...
Featured Stores : Service Merchandise - Cambridge SoundWorks - FAO Schwarz
Arts & Humanities
Literature, Photography...

Business & Economy
Companies, Finance, Jobs...

Computers & Internet
Internet, WWW, Software, Games...

Education
Universities, K-12, College Entrance...

Entertainment
Cool Links, Movies, Humor, Music...

Government
Military, Politics, Law, Taxes...

Health
Medicine, Diseases, Drugs, Fitness...
News & Media
Full Coverage, Newspapers, TV...

Recreation & Sports
Sports, Travel, Autos, Outdoors...

Reference
Libraries, Dictionaries, Quotations...

Regional
Countries, Regions, US States...

Science
Biology, Astronomy, Engineering...

Social Science
Archaeology, Economics, Languages...

Society & Culture
People, Environment, Religion...
In the News
·President Clinton is impeached
·US, UK end air strikes in Iraq
·Woman gives birth to octuplets
·NFL , NHL
more...

Inside Yahoo!
·Y! Games - crossword, hearts, euchre
·Holiday guide - food and drink, gift ideas
·Y! Clubs - create your own community
more...
World Yahoo!sAmericas : Canada - Spanish
Europe : Denmark - France - Germany - Italy - Norway - Spain - Sweden - UK & Ireland
Pacific Rim : Asia - Australia & NZ - Chinese - Japan - Korea
Yahoo! Get LocalLA - NYC - SF Bay - Chicago - more...    
Other GuidesAutos - Computers - Employment - Local Events - Net Events - Message Boards
Movies - Real Estate - Small Business - Ski & Snow - Y! Internet Life - Yahooligans!
Smart Shopping with
How to Suggest a Site - Company Info - Privacy Policy - Contributors - Openings at Yahoo!