HTML-Diff-0.561/0000755000076500000240000000000010524070222013215 5ustar ezrastaff00000000000000HTML-Diff-0.561/Changes0000644000076500000240000000234010524065112014511 0ustar ezrastaff000000000000000.56, November 7, 2006: * In build process, fixed dependency check for prerequisite Algorithm::Diff (#7340). * Produce correct output when a diff chunk is precisely '0' (#6756). * Install the htmldiff tool into your bin directory on make install (#7341). * Fixed some bugs where html_word_diff gave the wrong flag for a chunk (e.g. ['+', 'b', 'b c']). 0.55, April 27, 2004: * Fixed a problem where some "same" chunks were considered "different." This occured if you had container tags that used uppercase letters. * First element of result always contained a 'different' chunk between undef values. It's now suppressed. 0.54, June 15, 2003: * Following XTHML spec, added all the "single" (unclosable) tags, such as IMG, BASE, LINK, etc. to the list of such tags used by the module. * Converted this list into an exposed package variable, so that push @HTML::Diff::UNBALANCED_TAGS, $new_tag_name allows you to modify the list of "single" tags. 0.52, April 17, 2003: * Switched to Makefile.PL, thereby fixing broken tests. v0.51, April 16, 2003: * Added htmldiff script contributed by Maurice Aubrey. v0.5, March 31, 2003 -- initial release HTML-Diff-0.561/htmldiff0000644000076500000240000000150310523746351014750 0ustar ezrastaff00000000000000#!/usr/bin/perl # # htmldiff uses HTML::Diff to create an HTML file that shows the difference # between two HTML files, given on the command line. # # Contributed by Maurice Aubrey # use strict; use HTML::Diff; @ARGV == 2 or die "Usage: $0 \n"; my @txt; foreach (@ARGV) { open my $fh, $_ or die "unable to read '$_': $!"; local $/; push @txt, scalar <$fh>; } print qq{\n}; foreach (@{ html_word_diff(@txt) }) { my($type, $left, $right) = @$_; # debug #$left =~ s/\n/ /g; #$right =~ s/\n/ /g; #print "TYPE:$type\nLEFT: $left\nRIGHT: $right\n\n"; #next; if ($type eq 'u') { print $left; } else { print "$left" if length $left; print "$right" if length $right; } } HTML-Diff-0.561/lib/0000755000076500000240000000000010524070222013763 5ustar ezrastaff00000000000000HTML-Diff-0.561/lib/HTML/0000755000076500000240000000000010524070222014527 5ustar ezrastaff00000000000000HTML-Diff-0.561/lib/HTML/Diff.pm0000644000076500000240000001574310524070062015751 0ustar ezrastaff00000000000000#!/usr/bin/perl package HTML::Diff; $VERSION = '0.561'; use strict; use Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(line_diff word_diff html_word_diff); # This list of tags is taken from the XHTML spec and includes # all those for which no closing tag is expected. In addition # the pattern below matches any tag which ends with a slash / our @UNBALANCED_TAGS = qw(br hr p li base basefont meta link col colgroup frame input isindex area embed img bgsound marquee); use Algorithm::Diff 'sdiff'; sub member { my ($item, @list) = @_; return scalar(grep {$_ eq $item} @list); } sub html_word_diff { my ($left, $right) = @_; # Split the two texts into words and tags. my (@leftchks) = $left =~ m/(<[^>]*>\s*|[^<]+)/gm; my (@rightchks) = $right =~ m/(<[^>]*>\s*|[^<]+)/gm; @leftchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) } @leftchks; @rightchks = map { $_ =~ /^<[^>]*>$/ ? $_ : ($_ =~ m/(\S+\s*)/gm) } @rightchks; # Remove blanks; maybe the above regexes could handle this? @leftchks = grep { $_ ne '' } @leftchks; @rightchks = grep { $_ ne '' } @rightchks; # Now we process each segment by turning it into a pair. The first element # is the text as we want it to read in the result. The second element is # the value we will to use in comparisons. It contains an identifier # for each of the balanced tags that it lies within. # This subroutine holds state in the tagstack variable my $tagstack = []; my $smear_tags = sub { if ($_ =~ /^<.*>/) { if ($_ =~ m|^]*)|; $tag = lc $tag; # print STDERR "Found closer of $tag with " . (scalar @$tagstack) . " stack items\n"; # If we found the closer for the tag on top # of the stack, pop it off. if ((scalar @$tagstack) > 0 && $$tagstack[-1] eq $tag) { my $stacktag = pop @$tagstack; } return [$_, $tag]; } else { my ($tag) = m|^<\s*([^\s>]*)|; $tag = lc $tag; # print STDERR "Found opener of $tag with " . (scalar @$tagstack) . " stack items\n"; if (member($tag, @UNBALANCED_TAGS) || $tag =~ m#/\s*>$#) { # (tags without correspond closer tags) return [$_, $tag]; } else { push @$tagstack, $tag; } return [$_, $_]; } } else { my $result = [$_, (join "!!!", (@$tagstack, $_)) ]; return $result; } }; # Now do the "smear tags" operation across each of the chunk-lists $tagstack = []; @leftchks = map { &$smear_tags } @leftchks; # TBD: better modularity would preclude having to reset the stack $tagstack = []; @rightchks = map { &$smear_tags } @rightchks; # print STDERR Data::Dumper::Dumper(\@leftchks); # print STDERR Data::Dumper::Dumper(\@rightchks); # Now do the diff, using the "comparison" half of the pair to # compare two chuncks. my $chunks = sdiff(\@leftchks, \@rightchks, sub { $_ = elem_cmprsn(shift); $_ =~ s/\s+$/ /g; $_ }); # print STDERR Data::Dumper::Dumper($chunks); # Finally, process the output of sdiff by concatenating # consecutive chunks that were "unchanged." my $lastsignal = ''; my $lbuf = ""; my $rbuf = ""; my @result; my $ch; foreach $ch (@$chunks) { my ($signal, $left, $right) = @$ch; if ($signal ne $lastsignal && $lastsignal ne '') { if ($signal ne 'u' && $lastsignal ne 'u') { $signal = 'c'; } else { push @result, [$lastsignal, $lbuf, $rbuf]; $lbuf = ""; $rbuf = ""; } } # if ($signal eq 'u' && $lastsignal ne 'u') { # push @result, [$lastsignal, $lbuf, $rbuf] # unless $lastsignal eq ''; # $lbuf = ""; # $rbuf = ""; # } elsif ($signal ne 'u' && $lastsignal eq 'u') { # push @result, [$lastsignal, $lbuf, $rbuf]; # $lbuf = ""; # $rbuf = ""; # } my $lelem = elem_mkp($left); my $relem = elem_mkp($right); $lbuf .= (defined $lelem ? $lelem : ''); $rbuf .= (defined $relem ? $relem : ''); $lastsignal = $signal; } push @result, [$lastsignal, $lbuf, $rbuf]; return \@result; } # these are like "accessors" for the two halves of the diff-chunk pairs sub elem_mkp { my ($e) = @_; return undef unless ref $e eq 'ARRAY'; my ($mkp, $cmp) = @$e; return $mkp; } sub elem_cmprsn { my ($e) = @_; return undef unless ref $e eq 'ARRAY'; my ($mkp, $cmp) = @$e; return $cmp; } # Finally a couple of non-HTML diff routines sub line_diff { my ($left, $right) = @_; my (@leftchks) = $left =~ m/(.*\n?)/gm; my (@rightchks) = $right =~ m/(.*\n?)/gm; my $result = sdiff(\@leftchks, \@rightchks); # my @result = map { [ $_->[1], $_->[2] ] } @$result; return $result; } sub word_diff { my ($left, $right) = @_; my (@leftchks) = $left =~ m/([^\s]*\s?)/gm; my (@rightchks) = $right =~ m/([^\s]*\s?)/gm; my $result = sdiff(\@leftchks, \@rightchks); my @result = (map { [ $_->[1], $_->[2] ] } @$result); return $result; } 1; =pod =head1 HTML::Diff This module compares two strings of HTML and returns a list of a chunks which indicate the diff between the two input strings, where changes in formatting are considered changes. HTML::Diff does not strictly parse the HTML. Instead, it uses regular expressions to make a decent effort at understanding the given HTML. As a result, there are many valid HTML documents for which it will not produce the correct answer. But there may be some invalid HTML documents for which it gives you the answer you're looking for. Your mileage may vary; test it on lots of inputs from your domain before relying on it. =head1 SYNOPSIS $result = html_word_diff($left_text, $right_text); =head1 DESCRIPTION Returns a reference to a list of triples [, , ]. Each triple represents a check of the input texts. The flag tells you whether it represents a deletion, insertion, a modification, or an unchanged chunk. Every character of each input text is accounted for by some triple in the output. Specifically, Concatenating all the members from the return value should produce C<$left_text>, and likewise the members concatenate together to produce C<$right_text>. The is either C<'u'>, C<'+'>, C<'-'>, or C<'c'>, indicating whether the two chunks are the same, the $right_text contained this chunk and the left chunk didn't, or vice versa, or the two chunks are simply different. This follows the usage of Algorithm::Diff. The difference is computed on a word-by-word basis, "breaking" on visible words in the HTML text. If a tag only is changed, it will not be returned as an independent chunk but will be shown as a change to one of the neighboring words. For balanced tags, such as , it is intended that a change to the tag will be treated as a change to all words in between. =head1 AUTHOR Whipped up by Ezra elias kilty Cooper, . Patch contributed by Adam . =head1 SEE ALSO Algorithm::Diff =cut HTML-Diff-0.561/Makefile.PL0000644000076500000240000000033410524070052015170 0ustar ezrastaff00000000000000#!/usr/bin/perl use ExtUtils::MakeMaker; WriteMakefile(NAME => "HTML::Diff", VERSION => 0.561, PREREQ_PM => { 'Algorithm::Diff' => '1.15' }, INST_LIB => "./lib", EXE_FILES => ["htmldiff"]); HTML-Diff-0.561/MANIFEST0000644000076500000240000000023010524065602014347 0ustar ezrastaff00000000000000Changes Makefile.PL MANIFEST README htmldiff t/01-diff.t lib/HTML/Diff.pm META.yml Module meta-data (added by MakeMaker) HTML-Diff-0.561/META.yml0000644000076500000240000000042210524070222014464 0ustar ezrastaff00000000000000#XXXXXXX This is a prototype!!! It will change in the future!!! XXXXX# name: HTML-Diff version: 0.561 version_from: installdirs: site requires: Algorithm::Diff: 1.15 distribution_type: module generated_by: ExtUtils::MakeMaker version 6.12 HTML-Diff-0.561/README0000644000076500000240000000075110523760270014110 0ustar ezrastaff00000000000000HTML::Diff is a Perl module for finding changes between two strings of HTML. To install, simply copy HTML/Diff.pm to somewhere in your perl include path. Or, give the following commands at a shell prompt: % perl ./Makefile.PL % make % make test % make install HTML::Diff is copyright (c) 2003-2006 Ezra elias kilty Cooper . All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. HTML-Diff-0.561/t/0000755000076500000240000000000010524070222013460 5ustar ezrastaff00000000000000HTML-Diff-0.561/t/01-diff.t0000755000076500000240000002603410523757316015022 0ustar ezrastaff00000000000000#!/usr/bin/perl use strict; use Test; BEGIN { plan tests => 10 } use Getopt::Long; my ($verbose); GetOptions("verbose!" => \$verbose) or die "Parsing command line failed."; use Data::Dumper; use HTML::Diff qw(line_diff word_diff html_word_diff); my $test_text_a = "Four score and seven years ago, our forefathers brought forth on this continent a new nation conceived in liberty and dedicated to the proposition that all men are created equal. Now our great nation is engaged in civil war."; my $test_text_b = "Four score and seven years ago, our forefathers brought forth on this continent dedicated to the proposition that all men are created equal. Now our great nation is enagaged in civil war"; my $test_text_c = 'PEOPLE said, "The evening-bell is sounding, the sun is setting." A strange wondrous tone was heard in the narrow streets of a large town. It was like the sound of a church-bell: but it was only heard for a moment, for the rolling of the carriages, and the voices of the multitude made too great a noise.'; my $test_text_d = 'PEOPLE said, "The bell is sounding." A strange wondrous was heard tone in the narrow streets of a large town. A long time passed. It was like the sound of a church-bell: but it was only heard for a moment, for the rolling of the carriages, and the voices of the multitude made too great a noise.'; # This next pair tests the usual HTML changes # (e.g. "a b c d" -> "a b c d" considers all of "a b c d" # as a change). It also tests that whitespace changes are effectively ignored. my $test_html_a = '

Strategic Directions for Research in Theory of Computing

September 23, 1996

< this is an unkown html tag /> Anne Condon, University of Wisconsin
Faith Fich, University of Toronto
Greg N. Frederickson, Purdue University
Andrew V. Goldberg, NEC Research Institute
David S. Johnson, AT&T Bell Laboratories
Michael C. Loui, University of Illinois at Urbana-Champaign
Steven Mahaney, DIMACS

Prabhakar Raghavan, IBM Almaden Research Center
John Savage, Brown University
Alan Selman, SUNY at Buffalo
David B. Shmoys, Cornell University

Abstract. This report focuses on two core areas of theory of computing: discrete algorithms and computational complexity theory. The report reviews the purposes and goals of theoretical research, summarizes selected past and recent achievements, explains the importance of sustaining core research, and identifies promising opportunities for future research. Some research opportunities build bridges between theory of computing and other areas of computer science, and other science and engineering disciplines.

'; my $test_html_b = '

Strategic Directions for Research in Theory of Computing

September 23, 1996

Anne Condon, University of Wisconsin
Faith Fich, University of Toronto
Greg N. Frederickson, Purdue University
Andrew V. Goldberg, NEC Research Institute
David S. Johnson, AT&T Bell Laboratories
Michael C. Loui, University of Illinois at Urbana-Champaign
Steven Mahaney, DIMACS
Prabhakar Raghavan, IBM Almaden Research Center
John Savage, Brown University
Alan Selman, SUNY at Buffalo
David B. Shmoys, Cornell University

Abstract. This report focuses on two core areas of theory of computing: discrete algorithms and computational complexity theory. The report reviews the purposes and goals of theoretical research, summarizes selected past and recent achievements, explains the importance of sustaining core research, and identifies promising opportunities for future research. Some research opportunities build bridges between theory of computing and other areas of computer science, and other science and engineering disciplines.

'; sub print_diff { my $ch; my ($chunks) = @_; foreach $ch (@$chunks) { my ($flag, $m, $o) = @$ch; unless ($flag eq 'u') { print "<< old\n"; print "$o"; print ">> new\n"; print "$m"; print "==\n"; # TBD: make some kind of warning about lacking a newline at the end } else { print "$m"; } } } sub test_diff_continuity { my ($a, $b, $diffalgo, $ignore_whitespace) = @_; my $chunks = &$diffalgo($a, $b); my ($runningb, $runninga); $runninga = $runningb = ""; my $ch; foreach $ch (@$chunks) { my ($flag, $ach, $bch) = @$ch; $runninga .= $ach || ''; $runningb .= $bch || ''; } if ($ignore_whitespace) { $a =~ s/\s\s+/ /g; $b =~ s/\s\s+/ /g; $runninga =~ s/\s\s+/ /g; $runningb =~ s/\s\s+/ /g; } return ($a eq $runninga) && ($b eq $runningb); } sub expect_diff { my ($a, $b, $algo, $expectation) = @_; } if ($verbose) { my $chunks = HTML::Diff::line_diff($test_text_a, $test_text_b); print_diff($chunks); $chunks = HTML::Diff::word_diff($test_text_c, $test_text_d); print "\n"; print_diff($chunks); } ok(deep_compare(html_word_diff('', ''), [['', undef, undef]])); ok(deep_compare(html_word_diff('0', ''), [['-', '0', '']])); ok(deep_compare(html_word_diff('', '0'), [['+', '', '0']])); ok(deep_compare(html_word_diff('0', '0'), [['u', '0', '0']])); ok(deep_compare(html_word_diff('a b b', 'b b c'), [['-', 'a ', ''], ['u', 'b ', 'b '], ['c', 'b', 'b c']])); print "Testing line_diff on test_text_a and test_text_b\n" if $verbose; ok(test_diff_continuity($test_text_a, $test_text_b, \&HTML::Diff::line_diff)); print "Testing html_word_diff on test_text_a and test_text_b\n" if $verbose; ok(test_diff_continuity($test_text_a, $test_text_b, \&HTML::Diff::html_word_diff)); print "Testing html_word_diff on test_html_a and test_html_b\n" if $verbose; ok(test_diff_continuity($test_html_a, $test_html_b, \&HTML::Diff::html_word_diff, 1)); my $result = HTML::Diff::html_word_diff($test_html_a, $test_html_b); # Use the following lines to capture a "correct" result (when you # think you've got one) which can be used to validate future tests # open OUT, ">expect"; # print OUT Dumper($result); # close OUT; # This value is the result we expect from HTML::Diff::html_word_diff() # If the actual result differs by one byte, it's a failure. # When the diff code is changed, you'll need to calculate a new expected # value using the lines above, and paste the resulting value below. my $expect = [ [ '-', ' ', '' ], [ 'u', '

', '

' ], [ 'c', '', '' ], [ 'u', 'Strategic Directions for ', 'Strategic Directions for ' ], [ 'c', ' Research in Theory of Computing ', ' Research in Theory of Computing ' ], [ 'u', '

', '

' ], [ '-', ' ', '' ], [ 'u', 'September 23, 1996

', 'September 23, 1996

' ], [ '-', '< this is an unkown html tag /> ', '' ], [ 'u', 'Anne Condon, University of Wisconsin
Faith Fich, University of Toronto
Greg N. Frederickson, Purdue University
Andrew V. Goldberg, NEC Research Institute
David S. Johnson, AT&T Bell Laboratories
Michael C. Loui, University of Illinois at Urbana-Champaign
Steven Mahaney, DIMACS ', 'Anne Condon, University of Wisconsin
Faith Fich, University of Toronto
Greg N. Frederickson, Purdue University
Andrew V. Goldberg, NEC Research Institute
David S. Johnson, AT&T Bell Laboratories
Michael C. Loui, University of Illinois at Urbana-Champaign
Steven Mahaney, DIMACS ' ], [ 'c', '

', '
' ], [ 'u', 'Prabhakar Raghavan, IBM Almaden Research Center
John Savage, Brown University
Alan Selman, SUNY at Buffalo
David B. Shmoys, Cornell University

Abstract. This report focuses on two core areas of theory of computing: discrete algorithms and computational complexity theory. The report reviews the purposes and goals of theoretical research, summarizes selected past and recent achievements, ', 'Prabhakar Raghavan, IBM Almaden Research Center
John Savage, Brown University
Alan Selman, SUNY at Buffalo
David B. Shmoys, Cornell University

Abstract. This report focuses on two core areas of theory of computing: discrete algorithms and computational complexity theory. The report reviews the purposes and goals of theoretical research, summarizes selected past and recent achievements, ' ], [ 'c', 'explains the importance of ', 'explains the importance of ' ], [ 'u', 'sustaining core research, and identifies promising opportunities for future research. Some research opportunities build bridges between theory of computing and other areas of computer science, and other science and engineering disciplines.

', 'sustaining core research, and identifies promising opportunities for future research. Some research opportunities build bridges between theory of computing and other areas of computer science, and other science and engineering disciplines.

' ], ]; ok(deep_compare($result, $expect)); # Given two array refs of array refs, of array refs... return true if # the two structures are isomorphic and all the corresponding scalars # are equal # TBD: make it more efficient; builds up call stack too much. # TBD: Take a binary test as an arg, to replace eq sub deep_compare { my ($a, $b) = @_; my ($x, $y); if (!ref($a) && !ref($b)) { return $a eq $b; } else { return 0 unless ((ref($a) eq 'ARRAY') && (ref($b) eq 'ARRAY')); while ($x = shift @$a) { $y = shift @$b; return 0 unless deep_compare($x, $y); } } return 1; } my $diffchunks = HTML::Diff::html_word_diff($test_html_a, $test_html_b); if ($verbose) { print "Result of diff:\n"; print "[$_]\n" foreach (map {join "||", @$_} @$diffchunks); } sub check_diff_integrity { my $failure = 0; foreach my $chunk (@{$_[0]}) { my ($mark, $left, $right) = @$chunk; if ($mark ne 'u' && $left eq $right) { print "[$left] is [$right] but HTML::Diff thinks they're different!\n"; $failure = 1; } } return !$failure; } my $A = "

  • monkey

Search

"; my $B = "
  • monkey
  • llama

Search

"; $result = html_word_diff($A, $B); ok(check_diff_integrity($result)); sub diff_file { my ($left, $right) = @_; open LEFT, $left; open RIGHT, $right; $/ = undef; my $Left = ; my $Right = ; close LEFT; close RIGHT; my $diff_chunks = html_word_diff($Left, $Right); print_diff($diff_chunks); print "\n"; } 1;