HTML-HTML5-Parser-0.301/0000755000175000017500000000000012166545250012464 5ustar taitaiHTML-HTML5-Parser-0.301/inc/0000755000175000017500000000000012166545247013243 5ustar taitaiHTML-HTML5-Parser-0.301/inc/YAML/0000755000175000017500000000000012166545247014005 5ustar taitaiHTML-HTML5-Parser-0.301/inc/YAML/Tiny.pm0000644000175000017500000003534412166545147015276 0ustar taitai#line 1 package YAML::Tiny; use strict; # UTF Support? sub HAVE_UTF8 () { $] >= 5.007003 } BEGIN { if ( HAVE_UTF8 ) { # The string eval helps hide this from Test::MinimumVersion eval "require utf8;"; die "Failed to load UTF-8 support" if $@; } # Class structure require 5.004; require Exporter; require Carp; $YAML::Tiny::VERSION = '1.51'; # $YAML::Tiny::VERSION = eval $YAML::Tiny::VERSION; @YAML::Tiny::ISA = qw{ Exporter }; @YAML::Tiny::EXPORT = qw{ Load Dump }; @YAML::Tiny::EXPORT_OK = qw{ LoadFile DumpFile freeze thaw }; # Error storage $YAML::Tiny::errstr = ''; } # The character class of all characters we need to escape # NOTE: Inlined, since it's only used once # my $RE_ESCAPE = '[\\x00-\\x08\\x0b-\\x0d\\x0e-\\x1f\"\n]'; # Printed form of the unprintable characters in the lowest range # of ASCII characters, listed by ASCII ordinal position. my @UNPRINTABLE = qw( z x01 x02 x03 x04 x05 x06 a x08 t n v f r x0e x0f x10 x11 x12 x13 x14 x15 x16 x17 x18 x19 x1a e x1c x1d x1e x1f ); # Printable characters for escapes my %UNESCAPES = ( z => "\x00", a => "\x07", t => "\x09", n => "\x0a", v => "\x0b", f => "\x0c", r => "\x0d", e => "\x1b", '\\' => '\\', ); # Special magic boolean words my %QUOTE = map { $_ => 1 } qw{ null Null NULL y Y yes Yes YES n N no No NO true True TRUE false False FALSE on On ON off Off OFF }; ##################################################################### # Implementation # Create an empty YAML::Tiny object sub new { my $class = shift; bless [ @_ ], $class; } # Create an object from a file sub read { my $class = ref $_[0] ? 
ref shift : shift; # Check the file my $file = shift or return $class->_error( 'You did not specify a file name' ); return $class->_error( "File '$file' does not exist" ) unless -e $file; return $class->_error( "'$file' is a directory, not a file" ) unless -f _; return $class->_error( "Insufficient permissions to read '$file'" ) unless -r _; # Slurp in the file local $/ = undef; local *CFG; unless ( open(CFG, $file) ) { return $class->_error("Failed to open file '$file': $!"); } my $contents = ; unless ( close(CFG) ) { return $class->_error("Failed to close file '$file': $!"); } $class->read_string( $contents ); } # Create an object from a string sub read_string { my $class = ref $_[0] ? ref shift : shift; my $self = bless [], $class; my $string = $_[0]; eval { unless ( defined $string ) { die \"Did not provide a string to load"; } # Byte order marks # NOTE: Keeping this here to educate maintainers # my %BOM = ( # "\357\273\277" => 'UTF-8', # "\376\377" => 'UTF-16BE', # "\377\376" => 'UTF-16LE', # "\377\376\0\0" => 'UTF-32LE' # "\0\0\376\377" => 'UTF-32BE', # ); if ( $string =~ /^(?:\376\377|\377\376|\377\376\0\0|\0\0\376\377)/ ) { die \"Stream has a non UTF-8 BOM"; } else { # Strip UTF-8 bom if found, we'll just ignore it $string =~ s/^\357\273\277//; } # Try to decode as utf8 utf8::decode($string) if HAVE_UTF8; # Check for some special cases return $self unless length $string; unless ( $string =~ /[\012\015]+\z/ ) { die \"Stream does not end with newline character"; } # Split the file into lines my @lines = grep { ! /^\s*(?:\#.*)?\z/ } split /(?:\015{1,2}\012|\015|\012)/, $string; # Strip the initial YAML header @lines and $lines[0] =~ /^\%YAML[: ][\d\.]+.*\z/ and shift @lines; # A nibbling parser while ( @lines ) { # Do we have a document header? 
if ( $lines[0] =~ /^---\s*(?:(.+)\s*)?\z/ ) { # Handle scalar documents shift @lines; if ( defined $1 and $1 !~ /^(?:\#.+|\%YAML[: ][\d\.]+)\z/ ) { push @$self, $self->_read_scalar( "$1", [ undef ], \@lines ); next; } } if ( ! @lines or $lines[0] =~ /^(?:---|\.\.\.)/ ) { # A naked document push @$self, undef; while ( @lines and $lines[0] !~ /^---/ ) { shift @lines; } } elsif ( $lines[0] =~ /^\s*\-/ ) { # An array at the root my $document = [ ]; push @$self, $document; $self->_read_array( $document, [ 0 ], \@lines ); } elsif ( $lines[0] =~ /^(\s*)\S/ ) { # A hash at the root my $document = { }; push @$self, $document; $self->_read_hash( $document, [ length($1) ], \@lines ); } else { die \"YAML::Tiny failed to classify the line '$lines[0]'"; } } }; if ( ref $@ eq 'SCALAR' ) { return $self->_error(${$@}); } elsif ( $@ ) { require Carp; Carp::croak($@); } return $self; } # Deparse a scalar string to the actual scalar sub _read_scalar { my ($self, $string, $indent, $lines) = @_; # Trim trailing whitespace $string =~ s/\s*\z//; # Explitic null/undef return undef if $string eq '~'; # Single quote if ( $string =~ /^\'(.*?)\'(?:\s+\#.*)?\z/ ) { return '' unless defined $1; $string = $1; $string =~ s/\'\'/\'/g; return $string; } # Double quote. # The commented out form is simpler, but overloaded the Perl regex # engine due to recursion and backtracking problems on strings # larger than 32,000ish characters. Keep it for reference purposes. # if ( $string =~ /^\"((?:\\.|[^\"])*)\"\z/ ) { if ( $string =~ /^\"([^\\"]*(?:\\.[^\\"]*)*)\"(?:\s+\#.*)?\z/ ) { # Reusing the variable is a little ugly, # but avoids a new variable and a string copy. 
$string = $1; $string =~ s/\\"/"/g; $string =~ s/\\([never\\fartz]|x([0-9a-fA-F]{2}))/(length($1)>1)?pack("H2",$2):$UNESCAPES{$1}/gex; return $string; } # Special cases if ( $string =~ /^[\'\"!&]/ ) { die \"YAML::Tiny does not support a feature in line '$string'"; } return {} if $string =~ /^{}(?:\s+\#.*)?\z/; return [] if $string =~ /^\[\](?:\s+\#.*)?\z/; # Regular unquoted string if ( $string !~ /^[>|]/ ) { if ( $string =~ /^(?:-(?:\s|$)|[\@\%\`])/ or $string =~ /:(?:\s|$)/ ) { die \"YAML::Tiny found illegal characters in plain scalar: '$string'"; } $string =~ s/\s+#.*\z//; return $string; } # Error die \"YAML::Tiny failed to find multi-line scalar content" unless @$lines; # Check the indent depth $lines->[0] =~ /^(\s*)/; $indent->[-1] = length("$1"); if ( defined $indent->[-2] and $indent->[-1] <= $indent->[-2] ) { die \"YAML::Tiny found bad indenting in line '$lines->[0]'"; } # Pull the lines my @multiline = (); while ( @$lines ) { $lines->[0] =~ /^(\s*)/; last unless length($1) >= $indent->[-1]; push @multiline, substr(shift(@$lines), length($1)); } my $j = (substr($string, 0, 1) eq '>') ? ' ' : "\n"; my $t = (substr($string, 1, 1) eq '-') ? '' : "\n"; return join( $j, @multiline ) . 
$t; } # Parse an array sub _read_array { my ($self, $array, $indent, $lines) = @_; while ( @$lines ) { # Check for a new document if ( $lines->[0] =~ /^(?:---|\.\.\.)/ ) { while ( @$lines and $lines->[0] !~ /^---/ ) { shift @$lines; } return 1; } # Check the indent level $lines->[0] =~ /^(\s*)/; if ( length($1) < $indent->[-1] ) { return 1; } elsif ( length($1) > $indent->[-1] ) { die \"YAML::Tiny found bad indenting in line '$lines->[0]'"; } if ( $lines->[0] =~ /^(\s*\-\s+)[^\'\"]\S*\s*:(?:\s+|$)/ ) { # Inline nested hash my $indent2 = length("$1"); $lines->[0] =~ s/-/ /; push @$array, { }; $self->_read_hash( $array->[-1], [ @$indent, $indent2 ], $lines ); } elsif ( $lines->[0] =~ /^\s*\-(\s*)(.+?)\s*\z/ ) { # Array entry with a value shift @$lines; push @$array, $self->_read_scalar( "$2", [ @$indent, undef ], $lines ); } elsif ( $lines->[0] =~ /^\s*\-\s*\z/ ) { shift @$lines; unless ( @$lines ) { push @$array, undef; return 1; } if ( $lines->[0] =~ /^(\s*)\-/ ) { my $indent2 = length("$1"); if ( $indent->[-1] == $indent2 ) { # Null array entry push @$array, undef; } else { # Naked indenter push @$array, [ ]; $self->_read_array( $array->[-1], [ @$indent, $indent2 ], $lines ); } } elsif ( $lines->[0] =~ /^(\s*)\S/ ) { push @$array, { }; $self->_read_hash( $array->[-1], [ @$indent, length("$1") ], $lines ); } else { die \"YAML::Tiny failed to classify line '$lines->[0]'"; } } elsif ( defined $indent->[-2] and $indent->[-1] == $indent->[-2] ) { # This is probably a structure like the following... # --- # foo: # - list # bar: value # # ... 
so lets return and let the hash parser handle it return 1; } else { die \"YAML::Tiny failed to classify line '$lines->[0]'"; } } return 1; } # Parse an array sub _read_hash { my ($self, $hash, $indent, $lines) = @_; while ( @$lines ) { # Check for a new document if ( $lines->[0] =~ /^(?:---|\.\.\.)/ ) { while ( @$lines and $lines->[0] !~ /^---/ ) { shift @$lines; } return 1; } # Check the indent level $lines->[0] =~ /^(\s*)/; if ( length($1) < $indent->[-1] ) { return 1; } elsif ( length($1) > $indent->[-1] ) { die \"YAML::Tiny found bad indenting in line '$lines->[0]'"; } # Get the key unless ( $lines->[0] =~ s/^\s*([^\'\" ][^\n]*?)\s*:(\s+(?:\#.*)?|$)// ) { if ( $lines->[0] =~ /^\s*[?\'\"]/ ) { die \"YAML::Tiny does not support a feature in line '$lines->[0]'"; } die \"YAML::Tiny failed to classify line '$lines->[0]'"; } my $key = $1; # Do we have a value? if ( length $lines->[0] ) { # Yes $hash->{$key} = $self->_read_scalar( shift(@$lines), [ @$indent, undef ], $lines ); } else { # An indent shift @$lines; unless ( @$lines ) { $hash->{$key} = undef; return 1; } if ( $lines->[0] =~ /^(\s*)-/ ) { $hash->{$key} = []; $self->_read_array( $hash->{$key}, [ @$indent, length($1) ], $lines ); } elsif ( $lines->[0] =~ /^(\s*)./ ) { my $indent2 = length("$1"); if ( $indent->[-1] >= $indent2 ) { # Null hash entry $hash->{$key} = undef; } else { $hash->{$key} = {}; $self->_read_hash( $hash->{$key}, [ @$indent, length($1) ], $lines ); } } } } return 1; } # Save an object to a file sub write { my $self = shift; my $file = shift or return $self->_error('No file name provided'); # Write it to the file open( CFG, '>' . $file ) or return $self->_error( "Failed to open file '$file' for writing: $!" ); print CFG $self->write_string; close CFG; return 1; } # Save an object to a string sub write_string { my $self = shift; return '' unless @$self; # Iterate over the documents my $indent = 0; my @lines = (); foreach my $cursor ( @$self ) { push @lines, '---'; # An empty document if ( ! 
defined $cursor ) { # Do nothing # A scalar document } elsif ( ! ref $cursor ) { $lines[-1] .= ' ' . $self->_write_scalar( $cursor, $indent ); # A list at the root } elsif ( ref $cursor eq 'ARRAY' ) { unless ( @$cursor ) { $lines[-1] .= ' []'; next; } push @lines, $self->_write_array( $cursor, $indent, {} ); # A hash at the root } elsif ( ref $cursor eq 'HASH' ) { unless ( %$cursor ) { $lines[-1] .= ' {}'; next; } push @lines, $self->_write_hash( $cursor, $indent, {} ); } else { Carp::croak("Cannot serialize " . ref($cursor)); } } join '', map { "$_\n" } @lines; } sub _write_scalar { my $string = $_[1]; return '~' unless defined $string; return "''" unless length $string; if ( $string =~ /[\x00-\x08\x0b-\x0d\x0e-\x1f\"\'\n]/ ) { $string =~ s/\\/\\\\/g; $string =~ s/"/\\"/g; $string =~ s/\n/\\n/g; $string =~ s/([\x00-\x1f])/\\$UNPRINTABLE[ord($1)]/g; return qq|"$string"|; } if ( $string =~ /(?:^\W|\s|:\z)/ or $QUOTE{$string} ) { return "'$string'"; } return $string; } sub _write_array { my ($self, $array, $indent, $seen) = @_; if ( $seen->{refaddr($array)}++ ) { die "YAML::Tiny does not support circular references"; } my @lines = (); foreach my $el ( @$array ) { my $line = (' ' x $indent) . '-'; my $type = ref $el; if ( ! $type ) { $line .= ' ' . 
$self->_write_scalar( $el, $indent + 1 ); push @lines, $line; } elsif ( $type eq 'ARRAY' ) { if ( @$el ) { push @lines, $line; push @lines, $self->_write_array( $el, $indent + 1, $seen ); } else { $line .= ' []'; push @lines, $line; } } elsif ( $type eq 'HASH' ) { if ( keys %$el ) { push @lines, $line; push @lines, $self->_write_hash( $el, $indent + 1, $seen ); } else { $line .= ' {}'; push @lines, $line; } } else { die "YAML::Tiny does not support $type references"; } } @lines; } sub _write_hash { my ($self, $hash, $indent, $seen) = @_; if ( $seen->{refaddr($hash)}++ ) { die "YAML::Tiny does not support circular references"; } my @lines = (); foreach my $name ( sort keys %$hash ) { my $el = $hash->{$name}; my $line = (' ' x $indent) . "$name:"; my $type = ref $el; if ( ! $type ) { $line .= ' ' . $self->_write_scalar( $el, $indent + 1 ); push @lines, $line; } elsif ( $type eq 'ARRAY' ) { if ( @$el ) { push @lines, $line; push @lines, $self->_write_array( $el, $indent + 1, $seen ); } else { $line .= ' []'; push @lines, $line; } } elsif ( $type eq 'HASH' ) { if ( keys %$el ) { push @lines, $line; push @lines, $self->_write_hash( $el, $indent + 1, $seen ); } else { $line .= ' {}'; push @lines, $line; } } else { die "YAML::Tiny does not support $type references"; } } @lines; } # Set error sub _error { $YAML::Tiny::errstr = $_[1]; undef; } # Retrieve error sub errstr { $YAML::Tiny::errstr; } ##################################################################### # YAML Compatibility sub Dump { YAML::Tiny->new(@_)->write_string; } sub Load { my $self = YAML::Tiny->read_string(@_); unless ( $self ) { Carp::croak("Failed to load YAML document from string"); } if ( wantarray ) { return @$self; } else { # To match YAML.pm, return the last document return $self->[-1]; } } BEGIN { *freeze = *Dump; *thaw = *Load; } sub DumpFile { my $file = shift; YAML::Tiny->new(@_)->write($file); } sub LoadFile { my $self = YAML::Tiny->read($_[0]); unless ( $self ) { Carp::croak("Failed to load 
YAML document from '" . ($_[0] || '') . "'"); } if ( wantarray ) { return @$self; } else { # Return only the last document to match YAML.pm, return $self->[-1]; } } ##################################################################### # Use Scalar::Util if possible, otherwise emulate it BEGIN { local $@; eval { require Scalar::Util; }; my $v = eval("$Scalar::Util::VERSION") || 0; if ( $@ or $v < 1.18 ) { eval <<'END_PERL'; # Scalar::Util failed to load or too old sub refaddr { my $pkg = ref($_[0]) or return undef; if ( !! UNIVERSAL::can($_[0], 'can') ) { bless $_[0], 'Scalar::Util::Fake'; } else { $pkg = undef; } "$_[0]" =~ /0x(\w+)/; my $i = do { local $^W; hex $1 }; bless $_[0], $pkg if defined $pkg; $i; } END_PERL } else { *refaddr = *Scalar::Util::refaddr; } } 1; __END__ #line 1175 HTML-HTML5-Parser-0.301/inc/Module/0000755000175000017500000000000012166545247014470 5ustar taitaiHTML-HTML5-Parser-0.301/inc/Module/AutoInstall.pm0000644000175000017500000006216212166545200017261 0ustar taitai#line 1 package Module::AutoInstall; use strict; use Cwd (); use File::Spec (); use ExtUtils::MakeMaker (); use vars qw{$VERSION}; BEGIN { $VERSION = '1.06'; } # special map on pre-defined feature sets my %FeatureMap = ( '' => 'Core Features', # XXX: deprecated '-core' => 'Core Features', ); # various lexical flags my ( @Missing, @Existing, %DisabledTests, $UnderCPAN, $InstallDepsTarget, $HasCPANPLUS ); my ( $Config, $CheckOnly, $SkipInstall, $AcceptDefault, $TestOnly, $AllDeps, $UpgradeDeps ); my ( $PostambleActions, $PostambleActionsNoTest, $PostambleActionsUpgradeDeps, $PostambleActionsUpgradeDepsNoTest, $PostambleActionsListDeps, $PostambleActionsListAllDeps, $PostambleUsed, $NoTest); # See if it's a testing or non-interactive session _accept_default( $ENV{AUTOMATED_TESTING} or ! 
-t STDIN ); _init(); sub _accept_default { $AcceptDefault = shift; } sub _installdeps_target { $InstallDepsTarget = shift; } sub missing_modules { return @Missing; } sub do_install { __PACKAGE__->install( [ $Config ? ( UNIVERSAL::isa( $Config, 'HASH' ) ? %{$Config} : @{$Config} ) : () ], @Missing, ); } # initialize various flags, and/or perform install sub _init { foreach my $arg ( @ARGV, split( /[\s\t]+/, $ENV{PERL_AUTOINSTALL} || $ENV{PERL_EXTUTILS_AUTOINSTALL} || '' ) ) { if ( $arg =~ /^--config=(.*)$/ ) { $Config = [ split( ',', $1 ) ]; } elsif ( $arg =~ /^--installdeps=(.*)$/ ) { __PACKAGE__->install( $Config, @Missing = split( /,/, $1 ) ); exit 0; } elsif ( $arg =~ /^--upgradedeps=(.*)$/ ) { $UpgradeDeps = 1; __PACKAGE__->install( $Config, @Missing = split( /,/, $1 ) ); exit 0; } elsif ( $arg =~ /^--default(?:deps)?$/ ) { $AcceptDefault = 1; } elsif ( $arg =~ /^--check(?:deps)?$/ ) { $CheckOnly = 1; } elsif ( $arg =~ /^--skip(?:deps)?$/ ) { $SkipInstall = 1; } elsif ( $arg =~ /^--test(?:only)?$/ ) { $TestOnly = 1; } elsif ( $arg =~ /^--all(?:deps)?$/ ) { $AllDeps = 1; } } } # overrides MakeMaker's prompt() to automatically accept the default choice sub _prompt { goto &ExtUtils::MakeMaker::prompt unless $AcceptDefault; my ( $prompt, $default ) = @_; my $y = ( $default =~ /^[Yy]/ ); print $prompt, ' [', ( $y ? 'Y' : 'y' ), '/', ( $y ? 'n' : 'N' ), '] '; print "$default\n"; return $default; } # the workhorse sub import { my $class = shift; my @args = @_ or return; my $core_all; print "*** $class version " . $class->VERSION . "\n"; print "*** Checking for Perl dependencies...\n"; my $cwd = Cwd::cwd(); $Config = []; my $maxlen = length( ( sort { length($b) <=> length($a) } grep { /^[^\-]/ } map { ref($_) ? ( ( ref($_) eq 'HASH' ) ? 
keys(%$_) : @{$_} ) : '' } map { +{@args}->{$_} } grep { /^[^\-]/ or /^-core$/i } keys %{ +{@args} } )[0] ); # We want to know if we're under CPAN early to avoid prompting, but # if we aren't going to try and install anything anyway then skip the # check entirely since we don't want to have to load (and configure) # an old CPAN just for a cosmetic message $UnderCPAN = _check_lock(1) unless $SkipInstall || $InstallDepsTarget; while ( my ( $feature, $modules ) = splice( @args, 0, 2 ) ) { my ( @required, @tests, @skiptests ); my $default = 1; my $conflict = 0; if ( $feature =~ m/^-(\w+)$/ ) { my $option = lc($1); # check for a newer version of myself _update_to( $modules, @_ ) and return if $option eq 'version'; # sets CPAN configuration options $Config = $modules if $option eq 'config'; # promote every features to core status $core_all = ( $modules =~ /^all$/i ) and next if $option eq 'core'; next unless $option eq 'core'; } print "[" . ( $FeatureMap{ lc($feature) } || $feature ) . "]\n"; $modules = [ %{$modules} ] if UNIVERSAL::isa( $modules, 'HASH' ); unshift @$modules, -default => &{ shift(@$modules) } if ( ref( $modules->[0] ) eq 'CODE' ); # XXX: bugward combatability while ( my ( $mod, $arg ) = splice( @$modules, 0, 2 ) ) { if ( $mod =~ m/^-(\w+)$/ ) { my $option = lc($1); $default = $arg if ( $option eq 'default' ); $conflict = $arg if ( $option eq 'conflict' ); @tests = @{$arg} if ( $option eq 'tests' ); @skiptests = @{$arg} if ( $option eq 'skiptests' ); next; } printf( "- %-${maxlen}s ...", $mod ); if ( $arg and $arg =~ /^\D/ ) { unshift @$modules, $arg; $arg = 0; } # XXX: check for conflicts and uninstalls(!) them. my $cur = _version_of($mod); if (_version_cmp ($cur, $arg) >= 0) { print "loaded. ($cur" . ( $arg ? " >= $arg" : '' ) . ")\n"; push @Existing, $mod => $arg; $DisabledTests{$_} = 1 for map { glob($_) } @skiptests; } else { if (not defined $cur) # indeed missing { print "missing." . ( $arg ? " (would need $arg)" : '' ) . 
"\n"; } else { # no need to check $arg as _version_cmp ($cur, undef) would satisfy >= above print "too old. ($cur < $arg)\n"; } push @required, $mod => $arg; } } next unless @required; my $mandatory = ( $feature eq '-core' or $core_all ); if ( !$SkipInstall and ( $CheckOnly or ($mandatory and $UnderCPAN) or $AllDeps or $InstallDepsTarget or _prompt( qq{==> Auto-install the } . ( @required / 2 ) . ( $mandatory ? ' mandatory' : ' optional' ) . qq{ module(s) from CPAN?}, $default ? 'y' : 'n', ) =~ /^[Yy]/ ) ) { push( @Missing, @required ); $DisabledTests{$_} = 1 for map { glob($_) } @skiptests; } elsif ( !$SkipInstall and $default and $mandatory and _prompt( qq{==> The module(s) are mandatory! Really skip?}, 'n', ) =~ /^[Nn]/ ) { push( @Missing, @required ); $DisabledTests{$_} = 1 for map { glob($_) } @skiptests; } else { $DisabledTests{$_} = 1 for map { glob($_) } @tests; } } if ( @Missing and not( $CheckOnly or $UnderCPAN) ) { require Config; my $make = $Config::Config{make}; if ($InstallDepsTarget) { print "*** To install dependencies type '$make installdeps' or '$make installdeps_notest'.\n"; } else { print "*** Dependencies will be installed the next time you type '$make'.\n"; } # make an educated guess of whether we'll need root permission. print " (You may need to do that as the 'root' user.)\n" if eval '$>'; } print "*** $class configuration finished.\n"; chdir $cwd; # import to main:: no strict 'refs'; *{'main::WriteMakefile'} = \&Write if caller(0) eq 'main'; return (@Existing, @Missing); } sub _running_under { my $thing = shift; print <<"END_MESSAGE"; *** Since we're running under ${thing}, I'll just let it take care of the dependency's installation later. 
END_MESSAGE return 1; } # Check to see if we are currently running under CPAN.pm and/or CPANPLUS; # if we are, then we simply let it taking care of our dependencies sub _check_lock { return unless @Missing or @_; if ($ENV{PERL5_CPANM_IS_RUNNING}) { return _running_under('cpanminus'); } my $cpan_env = $ENV{PERL5_CPAN_IS_RUNNING}; if ($ENV{PERL5_CPANPLUS_IS_RUNNING}) { return _running_under($cpan_env ? 'CPAN' : 'CPANPLUS'); } require CPAN; if ($CPAN::VERSION > '1.89') { if ($cpan_env) { return _running_under('CPAN'); } return; # CPAN.pm new enough, don't need to check further } # last ditch attempt, this -will- configure CPAN, very sorry _load_cpan(1); # force initialize even though it's already loaded # Find the CPAN lock-file my $lock = MM->catfile( $CPAN::Config->{cpan_home}, ".lock" ); return unless -f $lock; # Check the lock local *LOCK; return unless open(LOCK, $lock); if ( ( $^O eq 'MSWin32' ? _under_cpan() : == getppid() ) and ( $CPAN::Config->{prerequisites_policy} || '' ) ne 'ignore' ) { print <<'END_MESSAGE'; *** Since we're running under CPAN, I'll just let it take care of the dependency's installation later. 
END_MESSAGE return 1; } close LOCK; return; } sub install { my $class = shift; my $i; # used below to strip leading '-' from config keys my @config = ( map { s/^-// if ++$i; $_ } @{ +shift } ); my ( @modules, @installed ); while ( my ( $pkg, $ver ) = splice( @_, 0, 2 ) ) { # grep out those already installed if ( _version_cmp( _version_of($pkg), $ver ) >= 0 ) { push @installed, $pkg; } else { push @modules, $pkg, $ver; } } if ($UpgradeDeps) { push @modules, @installed; @installed = (); } return @installed unless @modules; # nothing to do return @installed if _check_lock(); # defer to the CPAN shell print "*** Installing dependencies...\n"; return unless _connected_to('cpan.org'); my %args = @config; my %failed; local *FAILED; if ( $args{do_once} and open( FAILED, '.#autoinstall.failed' ) ) { while () { chomp; $failed{$_}++ } close FAILED; my @newmod; while ( my ( $k, $v ) = splice( @modules, 0, 2 ) ) { push @newmod, ( $k => $v ) unless $failed{$k}; } @modules = @newmod; } if ( _has_cpanplus() and not $ENV{PERL_AUTOINSTALL_PREFER_CPAN} ) { _install_cpanplus( \@modules, \@config ); } else { _install_cpan( \@modules, \@config ); } print "*** $class installation finished.\n"; # see if we have successfully installed them while ( my ( $pkg, $ver ) = splice( @modules, 0, 2 ) ) { if ( _version_cmp( _version_of($pkg), $ver ) >= 0 ) { push @installed, $pkg; } elsif ( $args{do_once} and open( FAILED, '>> .#autoinstall.failed' ) ) { print FAILED "$pkg\n"; } } close FAILED if $args{do_once}; return @installed; } sub _install_cpanplus { my @modules = @{ +shift }; my @config = _cpanplus_config( @{ +shift } ); my $installed = 0; require CPANPLUS::Backend; my $cp = CPANPLUS::Backend->new; my $conf = $cp->configure_object; return unless $conf->can('conf') # 0.05x+ with "sudo" support or _can_write($conf->_get_build('base')); # 0.04x # if we're root, set UNINST=1 to avoid trouble unless user asked for it. 
my $makeflags = $conf->get_conf('makeflags') || ''; if ( UNIVERSAL::isa( $makeflags, 'HASH' ) ) { # 0.03+ uses a hashref here $makeflags->{UNINST} = 1 unless exists $makeflags->{UNINST}; } else { # 0.02 and below uses a scalar $makeflags = join( ' ', split( ' ', $makeflags ), 'UNINST=1' ) if ( $makeflags !~ /\bUNINST\b/ and eval qq{ $> eq '0' } ); } $conf->set_conf( makeflags => $makeflags ); $conf->set_conf( prereqs => 1 ); while ( my ( $key, $val ) = splice( @config, 0, 2 ) ) { $conf->set_conf( $key, $val ); } my $modtree = $cp->module_tree; while ( my ( $pkg, $ver ) = splice( @modules, 0, 2 ) ) { print "*** Installing $pkg...\n"; MY::preinstall( $pkg, $ver ) or next if defined &MY::preinstall; my $success; my $obj = $modtree->{$pkg}; if ( $obj and _version_cmp( $obj->{version}, $ver ) >= 0 ) { my $pathname = $pkg; $pathname =~ s/::/\\W/; foreach my $inc ( grep { m/$pathname.pm/i } keys(%INC) ) { delete $INC{$inc}; } my $rv = $cp->install( modules => [ $obj->{module} ] ); if ( $rv and ( $rv->{ $obj->{module} } or $rv->{ok} ) ) { print "*** $pkg successfully installed.\n"; $success = 1; } else { print "*** $pkg installation cancelled.\n"; $success = 0; } $installed += $success; } else { print << "."; *** Could not find a version $ver or above for $pkg; skipping. . 
} MY::postinstall( $pkg, $ver, $success ) if defined &MY::postinstall; } return $installed; } sub _cpanplus_config { my @config = (); while ( @_ ) { my ($key, $value) = (shift(), shift()); if ( $key eq 'prerequisites_policy' ) { if ( $value eq 'follow' ) { $value = CPANPLUS::Internals::Constants::PREREQ_INSTALL(); } elsif ( $value eq 'ask' ) { $value = CPANPLUS::Internals::Constants::PREREQ_ASK(); } elsif ( $value eq 'ignore' ) { $value = CPANPLUS::Internals::Constants::PREREQ_IGNORE(); } else { die "*** Cannot convert option $key = '$value' to CPANPLUS version.\n"; } push @config, 'prereqs', $value; } elsif ( $key eq 'force' ) { push @config, $key, $value; } elsif ( $key eq 'notest' ) { push @config, 'skiptest', $value; } else { die "*** Cannot convert option $key to CPANPLUS version.\n"; } } return @config; } sub _install_cpan { my @modules = @{ +shift }; my @config = @{ +shift }; my $installed = 0; my %args; _load_cpan(); require Config; if (CPAN->VERSION < 1.80) { # no "sudo" support, probe for writableness return unless _can_write( MM->catfile( $CPAN::Config->{cpan_home}, 'sources' ) ) and _can_write( $Config::Config{sitelib} ); } # if we're root, set UNINST=1 to avoid trouble unless user asked for it. 
my $makeflags = $CPAN::Config->{make_install_arg} || ''; $CPAN::Config->{make_install_arg} = join( ' ', split( ' ', $makeflags ), 'UNINST=1' ) if ( $makeflags !~ /\bUNINST\b/ and eval qq{ $> eq '0' } ); # don't show start-up info $CPAN::Config->{inhibit_startup_message} = 1; # set additional options while ( my ( $opt, $arg ) = splice( @config, 0, 2 ) ) { ( $args{$opt} = $arg, next ) if $opt =~ /^(?:force|notest)$/; # pseudo-option $CPAN::Config->{$opt} = $arg; } if ($args{notest} && (not CPAN::Shell->can('notest'))) { die "Your version of CPAN is too old to support the 'notest' pragma"; } local $CPAN::Config->{prerequisites_policy} = 'follow'; while ( my ( $pkg, $ver ) = splice( @modules, 0, 2 ) ) { MY::preinstall( $pkg, $ver ) or next if defined &MY::preinstall; print "*** Installing $pkg...\n"; my $obj = CPAN::Shell->expand( Module => $pkg ); my $success = 0; if ( $obj and _version_cmp( $obj->cpan_version, $ver ) >= 0 ) { my $pathname = $pkg; $pathname =~ s/::/\\W/; foreach my $inc ( grep { m/$pathname.pm/i } keys(%INC) ) { delete $INC{$inc}; } my $rv = do { if ($args{force}) { CPAN::Shell->force( install => $pkg ) } elsif ($args{notest}) { CPAN::Shell->notest( install => $pkg ) } else { CPAN::Shell->install($pkg) } }; $rv ||= eval { $CPAN::META->instance( 'CPAN::Distribution', $obj->cpan_file, ) ->{install} if $CPAN::META; }; if ( $rv eq 'YES' ) { print "*** $pkg successfully installed.\n"; $success = 1; } else { print "*** $pkg installation failed.\n"; $success = 0; } $installed += $success; } else { print << "."; *** Could not find a version $ver or above for $pkg; skipping. . 
} MY::postinstall( $pkg, $ver, $success ) if defined &MY::postinstall; } return $installed; } sub _has_cpanplus { return ( $HasCPANPLUS = ( $INC{'CPANPLUS/Config.pm'} or _load('CPANPLUS::Shell::Default') ) ); } # make guesses on whether we're under the CPAN installation directory sub _under_cpan { require Cwd; require File::Spec; my $cwd = File::Spec->canonpath( Cwd::cwd() ); my $cpan = File::Spec->canonpath( $CPAN::Config->{cpan_home} ); return ( index( $cwd, $cpan ) > -1 ); } sub _update_to { my $class = __PACKAGE__; my $ver = shift; return if _version_cmp( _version_of($class), $ver ) >= 0; # no need to upgrade if ( _prompt( "==> A newer version of $class ($ver) is required. Install?", 'y' ) =~ /^[Nn]/ ) { die "*** Please install $class $ver manually.\n"; } print << "."; *** Trying to fetch it from CPAN... . # install ourselves _load($class) and return $class->import(@_) if $class->install( [], $class, $ver ); print << '.'; exit 1; *** Cannot bootstrap myself. :-( Installation terminated. . } # check if we're connected to some host, using inet_aton sub _connected_to { my $site = shift; return ( ( _load('Socket') and Socket::inet_aton($site) ) or _prompt( qq( *** Your host cannot resolve the domain name '$site', which probably means the Internet connections are unavailable. ==> Should we try to install the required module(s) anyway?), 'n' ) =~ /^[Yy]/ ); } # check if a directory is writable; may create it on demand sub _can_write { my $path = shift; mkdir( $path, 0755 ) unless -e $path; return 1 if -w $path; print << "."; *** You are not allowed to write to the directory '$path'; the installation may fail due to insufficient permissions. . if ( eval '$>' and lc(`sudo -V`) =~ /version/ and _prompt( qq( ==> Should we try to re-execute the autoinstall process with 'sudo'?), ((-t STDIN) ? 'y' : 'n') ) =~ /^[Yy]/ ) { # try to bootstrap ourselves from sudo print << "."; *** Trying to re-execute the autoinstall process with 'sudo'... . 
my $missing = join( ',', @Missing ); my $config = join( ',', UNIVERSAL::isa( $Config, 'HASH' ) ? %{$Config} : @{$Config} ) if $Config; return unless system( 'sudo', $^X, $0, "--config=$config", "--installdeps=$missing" ); print << "."; *** The 'sudo' command exited with error! Resuming... . } return _prompt( qq( ==> Should we try to install the required module(s) anyway?), 'n' ) =~ /^[Yy]/; } # load a module and return the version it reports sub _load { my $mod = pop; # method/function doesn't matter my $file = $mod; $file =~ s|::|/|g; $file .= '.pm'; local $@; return eval { require $file; $mod->VERSION } || ( $@ ? undef: 0 ); } # report version without loading a module sub _version_of { my $mod = pop; # method/function doesn't matter my $file = $mod; $file =~ s|::|/|g; $file .= '.pm'; foreach my $dir ( @INC ) { next if ref $dir; my $path = File::Spec->catfile($dir, $file); next unless -e $path; require ExtUtils::MM_Unix; return ExtUtils::MM_Unix->parse_version($path); } return undef; } # Load CPAN.pm and it's configuration sub _load_cpan { return if $CPAN::VERSION and $CPAN::Config and not @_; require CPAN; # CPAN-1.82+ adds CPAN::Config::AUTOLOAD to redirect to # CPAN::HandleConfig->load. CPAN reports that the redirection # is deprecated in a warning printed at the user. # CPAN-1.81 expects CPAN::HandleConfig->load, does not have # $CPAN::HandleConfig::VERSION but cannot handle # CPAN::Config->load # Which "versions expect CPAN::Config->load? 
if ( $CPAN::HandleConfig::VERSION || CPAN::HandleConfig->can('load') ) { # Newer versions of CPAN have a HandleConfig module CPAN::HandleConfig->load; } else { # Older versions had the load method in Config directly CPAN::Config->load; } } # compare two versions, either use Sort::Versions or plain comparison # return values same as <=> sub _version_cmp { my ( $cur, $min ) = @_; return -1 unless defined $cur; # if 0 keep comparing return 1 unless $min; $cur =~ s/\s+$//; # check for version numbers that are not in decimal format if ( ref($cur) or ref($min) or $cur =~ /v|\..*\./ or $min =~ /v|\..*\./ ) { if ( ( $version::VERSION or defined( _load('version') )) and version->can('new') ) { # use version.pm if it is installed. return version->new($cur) <=> version->new($min); } elsif ( $Sort::Versions::VERSION or defined( _load('Sort::Versions') ) ) { # use Sort::Versions as the sorting algorithm for a.b.c versions return Sort::Versions::versioncmp( $cur, $min ); } warn "Cannot reliably compare non-decimal formatted versions.\n" . "Please install version.pm or Sort::Versions.\n"; } # plain comparison local $^W = 0; # shuts off 'not numeric' bugs return $cur <=> $min; } # nothing; this usage is deprecated. sub main::PREREQ_PM { return {}; } sub _make_args { my %args = @_; $args{PREREQ_PM} = { %{ $args{PREREQ_PM} || {} }, @Existing, @Missing } if $UnderCPAN or $TestOnly; if ( $args{EXE_FILES} and -e 'MANIFEST' ) { require ExtUtils::Manifest; my $manifest = ExtUtils::Manifest::maniread('MANIFEST'); $args{EXE_FILES} = [ grep { exists $manifest->{$_} } @{ $args{EXE_FILES} } ]; } $args{test}{TESTS} ||= 't/*.t'; $args{test}{TESTS} = join( ' ', grep { !exists( $DisabledTests{$_} ) } map { glob($_) } split( /\s+/, $args{test}{TESTS} ) ); my $missing = join( ',', @Missing ); my $config = join( ',', UNIVERSAL::isa( $Config, 'HASH' ) ? %{$Config} : @{$Config} ) if $Config; $PostambleActions = ( ($missing and not $UnderCPAN) ? 
"\$(PERL) $0 --config=$config --installdeps=$missing" : "\$(NOECHO) \$(NOOP)" ); my $deps_list = join( ',', @Missing, @Existing ); $PostambleActionsUpgradeDeps = "\$(PERL) $0 --config=$config --upgradedeps=$deps_list"; my $config_notest = join( ',', (UNIVERSAL::isa( $Config, 'HASH' ) ? %{$Config} : @{$Config}), 'notest', 1 ) if $Config; $PostambleActionsNoTest = ( ($missing and not $UnderCPAN) ? "\$(PERL) $0 --config=$config_notest --installdeps=$missing" : "\$(NOECHO) \$(NOOP)" ); $PostambleActionsUpgradeDepsNoTest = "\$(PERL) $0 --config=$config_notest --upgradedeps=$deps_list"; $PostambleActionsListDeps = '@$(PERL) -le "print for @ARGV" ' . join(' ', map $Missing[$_], grep $_ % 2 == 0, 0..$#Missing); my @all = (@Missing, @Existing); $PostambleActionsListAllDeps = '@$(PERL) -le "print for @ARGV" ' . join(' ', map $all[$_], grep $_ % 2 == 0, 0..$#all); return %args; } # a wrapper to ExtUtils::MakeMaker::WriteMakefile sub Write { require Carp; Carp::croak "WriteMakefile: Need even number of args" if @_ % 2; if ($CheckOnly) { print << "."; *** Makefile not written in check-only mode. . return; } my %args = _make_args(@_); no strict 'refs'; $PostambleUsed = 0; local *MY::postamble = \&postamble unless defined &MY::postamble; ExtUtils::MakeMaker::WriteMakefile(%args); print << "." unless $PostambleUsed; *** WARNING: Makefile written with customized MY::postamble() without including contents from Module::AutoInstall::postamble() -- auto installation features disabled. Please contact the author. . 
return 1; } sub postamble { $PostambleUsed = 1; my $fragment; $fragment .= <<"AUTO_INSTALL" if !$InstallDepsTarget; config :: installdeps \t\$(NOECHO) \$(NOOP) AUTO_INSTALL $fragment .= <<"END_MAKE"; checkdeps :: \t\$(PERL) $0 --checkdeps installdeps :: \t$PostambleActions installdeps_notest :: \t$PostambleActionsNoTest upgradedeps :: \t$PostambleActionsUpgradeDeps upgradedeps_notest :: \t$PostambleActionsUpgradeDepsNoTest listdeps :: \t$PostambleActionsListDeps listalldeps :: \t$PostambleActionsListAllDeps END_MAKE return $fragment; } 1; __END__ #line 1193 HTML-HTML5-Parser-0.301/inc/Module/Package.pm0000644000175000017500000000311412166545203016350 0ustar taitai#line 1 ## # name: Module::Package # abstract: Postmodern Perl Module Packaging # author: Ingy döt Net # license: perl # copyright: 2011 # see: # - Module::Package::Plugin # - Module::Install::Package # - Module::Package::Tutorial package Module::Package; use 5.005; use strict; BEGIN { $Module::Package::VERSION = '0.30'; $inc::Module::Package::VERSION ||= $Module::Package::VERSION; @inc::Module::Package::ISA = __PACKAGE__; } sub import { my $class = shift; $INC{'inc/Module/Install.pm'} = __FILE__; unshift @INC, 'inc' unless $INC[0] eq 'inc'; eval "use Module::Install 1.01 (); 1" or $class->error($@); package main; Module::Install->import(); eval { module_package_internals_version_check($Module::Package::VERSION); module_package_internals_init(@_); }; if ($@) { $Module::Package::ERROR = $@; die $@; } } # XXX Remove this when things are stable. sub error { my ($class, $error) = @_; if (-e 'inc' and not -e 'inc/.author') { require Data::Dumper; $Data::Dumper::Sortkeys = 1; my $dump1 = Data::Dumper::Dumper(\%INC); my $dump2 = Data::Dumper::Dumper(\@INC); die <<"..."; This should not have happened. 
Hopefully this dump will explain the problem: inc::Module::Package: $inc::Module::Package::VERSION Module::Package: $Module::Package::VERSION inc::Module::Install: $inc::Module::Install::VERSION Module::Install: $Module::Install::VERSION Error: $error %INC: $dump1 \@INC: $dump2 ... } else { die $error; } } 1; HTML-HTML5-Parser-0.301/inc/Module/Install/0000755000175000017500000000000012166545247016076 5ustar taitaiHTML-HTML5-Parser-0.301/inc/Module/Install/Fetch.pm0000644000175000017500000000462712166545201017464 0ustar taitai#line 1 package Module::Install::Fetch; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub get_file { my ($self, %args) = @_; my ($scheme, $host, $path, $file) = $args{url} =~ m|^(\w+)://([^/]+)(.+)/(.+)| or return; if ( $scheme eq 'http' and ! eval { require LWP::Simple; 1 } ) { $args{url} = $args{ftp_url} or (warn("LWP support unavailable!\n"), return); ($scheme, $host, $path, $file) = $args{url} =~ m|^(\w+)://([^/]+)(.+)/(.+)| or return; } $|++; print "Fetching '$file' from $host... 
"; unless (eval { require Socket; Socket::inet_aton($host) }) { warn "'$host' resolve failed!\n"; return; } return unless $scheme eq 'ftp' or $scheme eq 'http'; require Cwd; my $dir = Cwd::getcwd(); chdir $args{local_dir} or return if exists $args{local_dir}; if (eval { require LWP::Simple; 1 }) { LWP::Simple::mirror($args{url}, $file); } elsif (eval { require Net::FTP; 1 }) { eval { # use Net::FTP to get past firewall my $ftp = Net::FTP->new($host, Passive => 1, Timeout => 600); $ftp->login("anonymous", 'anonymous@example.com'); $ftp->cwd($path); $ftp->binary; $ftp->get($file) or (warn("$!\n"), return); $ftp->quit; } } elsif (my $ftp = $self->can_run('ftp')) { eval { # no Net::FTP, fallback to ftp.exe require FileHandle; my $fh = FileHandle->new; local $SIG{CHLD} = 'IGNORE'; unless ($fh->open("|$ftp -n")) { warn "Couldn't open ftp: $!\n"; chdir $dir; return; } my @dialog = split(/\n/, <<"END_FTP"); open $host user anonymous anonymous\@example.com cd $path binary get $file $file quit END_FTP foreach (@dialog) { $fh->print("$_\n") } $fh->close; } } else { warn "No working 'ftp' program available!\n"; chdir $dir; return; } unless (-f $file) { warn "Fetching failed: $@\n"; chdir $dir; return; } return if exists $args{size} and -s $file != $args{size}; system($args{run}) if exists $args{run}; unlink($file) if $args{remove}; print(((!exists $args{check_for} or -e $args{check_for}) ? "done!" : "failed! 
($!)"), "\n"); chdir $dir; return !$?; } 1; HTML-HTML5-Parser-0.301/inc/Module/Install/AutoInstall.pm0000644000175000017500000000416212166545200020663 0ustar taitai#line 1 package Module::Install::AutoInstall; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub AutoInstall { $_[0] } sub run { my $self = shift; $self->auto_install_now(@_); } sub write { my $self = shift; $self->auto_install(@_); } sub auto_install { my $self = shift; return if $self->{done}++; # Flatten array of arrays into a single array my @core = map @$_, map @$_, grep ref, $self->build_requires, $self->requires; my @config = @_; # We'll need Module::AutoInstall $self->include('Module::AutoInstall'); require Module::AutoInstall; my @features_require = Module::AutoInstall->import( (@config ? (-config => \@config) : ()), (@core ? (-core => \@core) : ()), $self->features, ); my %seen; my @requires = map @$_, map @$_, grep ref, $self->requires; while (my ($mod, $ver) = splice(@requires, 0, 2)) { $seen{$mod}{$ver}++; } my @build_requires = map @$_, map @$_, grep ref, $self->build_requires; while (my ($mod, $ver) = splice(@build_requires, 0, 2)) { $seen{$mod}{$ver}++; } my @configure_requires = map @$_, map @$_, grep ref, $self->configure_requires; while (my ($mod, $ver) = splice(@configure_requires, 0, 2)) { $seen{$mod}{$ver}++; } my @deduped; while (my ($mod, $ver) = splice(@features_require, 0, 2)) { push @deduped, $mod => $ver unless $seen{$mod}{$ver}++; } $self->requires(@deduped); $self->makemaker_args( Module::AutoInstall::_make_args() ); my $class = ref($self); $self->postamble( "# --- $class section:\n" . 
Module::AutoInstall::postamble() ); } sub installdeps_target { my ($self, @args) = @_; $self->include('Module::AutoInstall'); require Module::AutoInstall; Module::AutoInstall::_installdeps_target(1); $self->auto_install(@args); } sub auto_install_now { my $self = shift; $self->auto_install(@_); Module::AutoInstall::do_install(); } 1; HTML-HTML5-Parser-0.301/inc/Module/Install/Package.pm0000644000175000017500000002340512166545146017771 0ustar taitai#line 1 ## # name: Module::Install::Package # abstract: Module::Install support for Module::Package # author: Ingy döt Net # license: perl # copyright: 2011 # see: # - Module::Package # This module contains the Module::Package logic that must be available to # both the Author and the End User. Author-only logic goes in a # Module::Package::Plugin subclass. package Module::Install::Package; use strict; use Module::Install::Base; use vars qw'@ISA $VERSION'; @ISA = 'Module::Install::Base'; $VERSION = '0.30'; #-----------------------------------------------------------------------------# # XXX BOOTBUGHACK # This is here to try to get us out of Module-Package-0.11 cpantesters hell... # Remove this when the situation has blown over. sub pkg { *inc::Module::Package::VERSION = sub { $VERSION }; my $self = shift; $self->module_package_internals_init($@); } #-----------------------------------------------------------------------------# # We allow the author to specify key/value options after the plugin. These # options need to be available both at author time and install time. #-----------------------------------------------------------------------------# # OO accessor for command line options: sub package_options { @_>1?($_[0]->{package_options}=$_[1]):$_[0]->{package_options}} my $default_options = { deps_list => 1, install_bin => 1, install_share => 1, manifest_skip => 1, requires_from => 1, }; #-----------------------------------------------------------------------------# # Module::Install plugin directives. 
Use long, ugly names to not pollute the # Module::Install plugin namespace. These are only intended to be called from # Module::Package. #-----------------------------------------------------------------------------# # Module::Package starts off life as a normal call to this Module::Install # plugin directive: my $module_install_plugin; my $module_package_plugin; my $module_package_dist_plugin; # XXX ARGVHACK This @argv thing is a temporary fix for an ugly bug somewhere in the # Wikitext module usage. my @argv; sub module_package_internals_init { my $self = $module_install_plugin = shift; my ($plugin_spec, %options) = @_; $self->package_options({%$default_options, %options}); if ($module_install_plugin->is_admin) { $module_package_plugin = $self->_load_plugin($plugin_spec); $module_package_plugin->mi($module_install_plugin); $module_package_plugin->version_check($VERSION); } else { $module_package_dist_plugin = $self->_load_dist_plugin($plugin_spec); $module_package_dist_plugin->mi($module_install_plugin) if ref $module_package_dist_plugin; } # NOTE - This is the point in time where the body of Makefile.PL runs... return; sub INIT { return unless $module_install_plugin; return if $Module::Package::ERROR; eval { if ($module_install_plugin->is_admin) { $module_package_plugin->initial(); $module_package_plugin->main(); } else { $module_install_plugin->_initial(); $module_package_dist_plugin->_initial() if ref $module_package_dist_plugin; $module_install_plugin->_main(); $module_package_dist_plugin->_main() if ref $module_package_dist_plugin; } }; if ($@) { $Module::Package::ERROR = $@; die $@; } @argv = @ARGV; # XXX ARGVHACK } # If this Module::Install plugin was used (by Module::Package) then wrap # up any loose ends. This will get called after Makefile.PL has completed. sub END { @ARGV = @argv; # XXX ARGVHACK return unless $module_install_plugin; return if $Module::Package::ERROR; $module_package_plugin ? 
do { $module_package_plugin->final; $module_package_plugin->replicate_module_package; } : do { $module_install_plugin->_final; $module_package_dist_plugin->_final() if ref $module_package_dist_plugin; } } } # Module::Package, Module::Install::Package and Module::Package::Plugin # must all have the same version. Seems wise. sub module_package_internals_version_check { my ($self, $version) = @_; return if $version < 0.1800001; # XXX BOOTBUGHACK!! die <<"..." unless $version == $VERSION; Error! Something has gone awry: Module::Package version=$version is using Module::Install::Package version=$VERSION If you are the author of this module, try upgrading Module::Package. Otherwise, please notify the author of this error. ... } # Find and load the author side plugin: sub _load_plugin { my ($self, $spec, $namespace) = @_; $spec ||= ''; $namespace ||= 'Module::Package'; my $version = ''; $Module::Package::plugin_version = 0; if ($spec =~ s/\s+(\S+)\s*//) { $version = $1; $Module::Package::plugin_version = $version; } my ($module, $plugin) = not($spec) ? ('Plugin', "Plugin::basic") : ($spec =~ /^\w(\w|::)*$/) ? ($spec, $spec) : ($spec =~ /^:(\w+)$/) ? ('Plugin', "Plugin::$1") : ($spec =~ /^(\S*\w):(\w+)$/) ? ($1, "$1::$2") : die "$spec is invalid"; $module = "${namespace}::${module}"; $plugin = "${namespace}::${plugin}"; eval "use $module $version (); 1" or die $@; return $plugin->new(); } # Find and load the user side plugin: sub _load_dist_plugin { my ($self, $spec, $namespace) = @_; $spec ||= ''; $namespace ||= 'Module::Package::Dist'; my $r = eval { $self->_load_plugin($spec, $namespace); }; return $r if ref $r; return; } #-----------------------------------------------------------------------------# # These are the user side analogs to the author side plugin API calls. # Prefix with '_' to not pollute Module::Install plugin space. 
#-----------------------------------------------------------------------------# sub _initial { my ($self) = @_; } sub _main { my ($self) = @_; } # NOTE These must match Module::Package::Plugin::final. sub _final { my ($self) = @_; $self->_all_from; $self->_requires_from; $self->_install_bin; $self->_install_share; $self->_WriteAll; } #-----------------------------------------------------------------------------# # This section is where all the useful code bits go. These bits are needed by # both Author and User side runs. #-----------------------------------------------------------------------------# my $all_from = 0; sub _all_from { my $self = shift; return if $all_from++; return if $self->name; my $file = shift || "$main::PM" or die "all_from has no file"; $self->all_from($file); } my $requires_from = 0; sub _requires_from { my $self = shift; return if $requires_from++; return unless $self->package_options->{requires_from}; my $file = shift || "$main::PM" or die "requires_from has no file"; $self->requires_from($main::PM) } my $install_bin = 0; sub _install_bin { my $self = shift; return if $install_bin++; return unless $self->package_options->{install_bin}; return unless -d 'bin'; my @bin; File::Find::find(sub { return unless -f $_; push @bin, $File::Find::name; }, 'bin'); $self->install_script($_) for @bin; } my $install_share = 0; sub _install_share { my $self = shift; return if $install_share++; return unless $self->package_options->{install_share}; return unless -d 'share'; $self->install_share; } my $WriteAll = 0; sub _WriteAll { my $self = shift; return if $WriteAll++; $self->WriteAll(@_); } # Base package for Module::Package plugin distributed components. package Module::Package::Dist; sub new { my ($class, %args) = @_; bless \%args, $class; } sub mi { @_ > 1 ? 
($_[0]->{mi}=$_[1]) : $_[0]->{mi}; } sub _initial { my ($self) = @_; } sub _main { my ($self) = @_; } sub _final { my ($self) = @_; } 1; #-----------------------------------------------------------------------------# # Take a guess at the primary .pm and .pod files for 'all_from', and friends. # Put them in global magical vars in the main:: namespace. #-----------------------------------------------------------------------------# package Module::Package::PM; use overload '""' => sub { $_[0]->guess_pm unless @{$_[0]}; return $_[0]->[0]; }; sub set { $_[0]->[0] = $_[1] } sub guess_pm { my $pm = ''; my $self = shift; if (-e 'META.yml') { open META, 'META.yml' or die "Can't open 'META.yml' for input:\n$!"; my $meta = do { local $/; }; close META; $meta =~ /^module_name: (\S+)$/m or die "Can't get module_name from META.yml"; $pm = $1; $pm =~ s!::!/!g; $pm = "lib/$pm.pm"; } else { require File::Find; my @array = (); File::Find::find(sub { return unless /\.pm$/; my $name = $File::Find::name; my $num = ($name =~ s!/+!/!g); my $ary = $array[$num] ||= []; push @$ary, $name; }, 'lib'); shift @array while @array and not defined $array[0]; die "Can't guess main module" unless @array; (($pm) = sort @{$array[0]}) or die "Can't guess main module"; } my $pmc = $pm . 'c'; $pm = $pmc if -e $pmc; $self->set($pm); } $main::PM = bless [$main::PM ? ($main::PM) : ()], __PACKAGE__; package Module::Package::POD; use overload '""' => sub { return $_[0]->[0] if @{$_[0]}; (my $pod = "$main::PM") =~ s/\.pm/.pod/ or die "Module::Package's \$main::PM value should end in '.pm'"; return -e $pod ? $pod : ''; }; sub set { $_[0][0] = $_[1] } $main::POD = bless [$main::POD ? 
($main::POD) : ()], __PACKAGE__; 1; HTML-HTML5-Parser-0.301/inc/Module/Install/Contributors.pm0000644000175000017500000000055412166545170021130 0ustar taitai#line 1 package Module::Install::Contributors; use 5.006; use strict; use warnings; BEGIN { $Module::Install::Contributors::AUTHORITY = 'cpan:TOBYINK'; $Module::Install::Contributors::VERSION = '0.001'; } use base qw(Module::Install::Base); sub contributors { my $self = shift; push @{ $self->Meta->{values}{x_contributors} ||= [] }, @_; } 1; __END__ HTML-HTML5-Parser-0.301/inc/Module/Install/Win32.pm0000644000175000017500000000340312166545201017324 0ustar taitai#line 1 package Module::Install::Win32; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } # determine if the user needs nmake, and download it if needed sub check_nmake { my $self = shift; $self->load('can_run'); $self->load('get_file'); require Config; return unless ( $^O eq 'MSWin32' and $Config::Config{make} and $Config::Config{make} =~ /^nmake\b/i and ! $self->can_run('nmake') ); print "The required 'nmake' executable not found, fetching it...\n"; require File::Basename; my $rv = $self->get_file( url => 'http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe', ftp_url => 'ftp://ftp.microsoft.com/Softlib/MSLFILES/Nmake15.exe', local_dir => File::Basename::dirname($^X), size => 51928, run => 'Nmake15.exe /o > nul', check_for => 'Nmake.exe', remove => 1, ); die <<'END_MESSAGE' unless $rv; ------------------------------------------------------------------------------- Since you are using Microsoft Windows, you will need the 'nmake' utility before installation. It's available at: http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/Nmake15.exe or ftp://ftp.microsoft.com/Softlib/MSLFILES/Nmake15.exe Please download the file manually, save it to a directory in %PATH% (e.g. 
C:\WINDOWS\COMMAND\), then launch the MS-DOS command line shell, "cd" to that directory, and run "Nmake15.exe" from there; that will create the 'nmake.exe' file needed by this module. You may then resume the installation process described in README. ------------------------------------------------------------------------------- END_MESSAGE } 1; HTML-HTML5-Parser-0.301/inc/Module/Install/Makefile.pm0000644000175000017500000002743712166545151020160 0ustar taitai#line 1 package Module::Install::Makefile; use strict 'vars'; use ExtUtils::MakeMaker (); use Module::Install::Base (); use Fcntl qw/:flock :seek/; use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub Makefile { $_[0] } my %seen = (); sub prompt { shift; # Infinite loop protection my @c = caller(); if ( ++$seen{"$c[1]|$c[2]|$_[0]"} > 3 ) { die "Caught an potential prompt infinite loop ($c[1]|$c[2]|$_[0])"; } # In automated testing or non-interactive session, always use defaults if ( ($ENV{AUTOMATED_TESTING} or -! -t STDIN) and ! $ENV{PERL_MM_USE_DEFAULT} ) { local $ENV{PERL_MM_USE_DEFAULT} = 1; goto &ExtUtils::MakeMaker::prompt; } else { goto &ExtUtils::MakeMaker::prompt; } } # Store a cleaned up version of the MakeMaker version, # since we need to behave differently in a variety of # ways based on the MM version. my $makemaker = eval $ExtUtils::MakeMaker::VERSION; # If we are passed a param, do a "newer than" comparison. # Otherwise, just return the MakeMaker version. sub makemaker { ( @_ < 2 or $makemaker >= eval($_[1]) ) ? $makemaker : 0 } # Ripped from ExtUtils::MakeMaker 6.56, and slightly modified # as we only need to know here whether the attribute is an array # or a hash or something else (which may or may not be appendable). 
my %makemaker_argtype = ( C => 'ARRAY', CONFIG => 'ARRAY', # CONFIGURE => 'CODE', # ignore DIR => 'ARRAY', DL_FUNCS => 'HASH', DL_VARS => 'ARRAY', EXCLUDE_EXT => 'ARRAY', EXE_FILES => 'ARRAY', FUNCLIST => 'ARRAY', H => 'ARRAY', IMPORTS => 'HASH', INCLUDE_EXT => 'ARRAY', LIBS => 'ARRAY', # ignore '' MAN1PODS => 'HASH', MAN3PODS => 'HASH', META_ADD => 'HASH', META_MERGE => 'HASH', PL_FILES => 'HASH', PM => 'HASH', PMLIBDIRS => 'ARRAY', PMLIBPARENTDIRS => 'ARRAY', PREREQ_PM => 'HASH', CONFIGURE_REQUIRES => 'HASH', SKIP => 'ARRAY', TYPEMAPS => 'ARRAY', XS => 'HASH', # VERSION => ['version',''], # ignore # _KEEP_AFTER_FLUSH => '', clean => 'HASH', depend => 'HASH', dist => 'HASH', dynamic_lib=> 'HASH', linkext => 'HASH', macro => 'HASH', postamble => 'HASH', realclean => 'HASH', test => 'HASH', tool_autosplit => 'HASH', # special cases where you can use makemaker_append CCFLAGS => 'APPENDABLE', DEFINE => 'APPENDABLE', INC => 'APPENDABLE', LDDLFLAGS => 'APPENDABLE', LDFROM => 'APPENDABLE', ); sub makemaker_args { my ($self, %new_args) = @_; my $args = ( $self->{makemaker_args} ||= {} ); foreach my $key (keys %new_args) { if ($makemaker_argtype{$key}) { if ($makemaker_argtype{$key} eq 'ARRAY') { $args->{$key} = [] unless defined $args->{$key}; unless (ref $args->{$key} eq 'ARRAY') { $args->{$key} = [$args->{$key}] } push @{$args->{$key}}, ref $new_args{$key} eq 'ARRAY' ? 
@{$new_args{$key}} : $new_args{$key}; } elsif ($makemaker_argtype{$key} eq 'HASH') { $args->{$key} = {} unless defined $args->{$key}; foreach my $skey (keys %{ $new_args{$key} }) { $args->{$key}{$skey} = $new_args{$key}{$skey}; } } elsif ($makemaker_argtype{$key} eq 'APPENDABLE') { $self->makemaker_append($key => $new_args{$key}); } } else { if (defined $args->{$key}) { warn qq{MakeMaker attribute "$key" is overriden; use "makemaker_append" to append values\n}; } $args->{$key} = $new_args{$key}; } } return $args; } # For mm args that take multiple space-seperated args, # append an argument to the current list. sub makemaker_append { my $self = shift; my $name = shift; my $args = $self->makemaker_args; $args->{$name} = defined $args->{$name} ? join( ' ', $args->{$name}, @_ ) : join( ' ', @_ ); } sub build_subdirs { my $self = shift; my $subdirs = $self->makemaker_args->{DIR} ||= []; for my $subdir (@_) { push @$subdirs, $subdir; } } sub clean_files { my $self = shift; my $clean = $self->makemaker_args->{clean} ||= {}; %$clean = ( %$clean, FILES => join ' ', grep { length $_ } ($clean->{FILES} || (), @_), ); } sub realclean_files { my $self = shift; my $realclean = $self->makemaker_args->{realclean} ||= {}; %$realclean = ( %$realclean, FILES => join ' ', grep { length $_ } ($realclean->{FILES} || (), @_), ); } sub libs { my $self = shift; my $libs = ref $_[0] ? 
shift : [ shift ]; $self->makemaker_args( LIBS => $libs ); } sub inc { my $self = shift; $self->makemaker_args( INC => shift ); } sub _wanted_t { } sub tests_recursive { my $self = shift; my $dir = shift || 't'; unless ( -d $dir ) { die "tests_recursive dir '$dir' does not exist"; } my %tests = map { $_ => 1 } split / /, ($self->tests || ''); require File::Find; File::Find::find( sub { /\.t$/ and -f $_ and $tests{"$File::Find::dir/*.t"} = 1 }, $dir ); $self->tests( join ' ', sort keys %tests ); } sub write { my $self = shift; die "&Makefile->write() takes no arguments\n" if @_; # Check the current Perl version my $perl_version = $self->perl_version; if ( $perl_version ) { eval "use $perl_version; 1" or die "ERROR: perl: Version $] is installed, " . "but we need version >= $perl_version"; } # Make sure we have a new enough MakeMaker require ExtUtils::MakeMaker; if ( $perl_version and $self->_cmp($perl_version, '5.006') >= 0 ) { # This previous attempted to inherit the version of # ExtUtils::MakeMaker in use by the module author, but this # was found to be untenable as some authors build releases # using future dev versions of EU:MM that nobody else has. # Instead, #toolchain suggests we use 6.59 which is the most # stable version on CPAN at time of writing and is, to quote # ribasushi, "not terminally fucked, > and tested enough". # TODO: We will now need to maintain this over time to push # the version up as new versions are released. $self->build_requires( 'ExtUtils::MakeMaker' => 6.59 ); $self->configure_requires( 'ExtUtils::MakeMaker' => 6.59 ); } else { # Allow legacy-compatibility with 5.005 by depending on the # most recent EU:MM that supported 5.005. 
$self->build_requires( 'ExtUtils::MakeMaker' => 6.36 ); $self->configure_requires( 'ExtUtils::MakeMaker' => 6.36 ); } # Generate the MakeMaker params my $args = $self->makemaker_args; $args->{DISTNAME} = $self->name; $args->{NAME} = $self->module_name || $self->name; $args->{NAME} =~ s/-/::/g; $args->{VERSION} = $self->version or die <<'EOT'; ERROR: Can't determine distribution version. Please specify it explicitly via 'version' in Makefile.PL, or set a valid $VERSION in a module, and provide its file path via 'version_from' (or 'all_from' if you prefer) in Makefile.PL. EOT if ( $self->tests ) { my @tests = split ' ', $self->tests; my %seen; $args->{test} = { TESTS => (join ' ', grep {!$seen{$_}++} @tests), }; } elsif ( $Module::Install::ExtraTests::use_extratests ) { # Module::Install::ExtraTests doesn't set $self->tests and does its own tests via harness. # So, just ignore our xt tests here. } elsif ( -d 'xt' and ($Module::Install::AUTHOR or $ENV{RELEASE_TESTING}) ) { $args->{test} = { TESTS => join( ' ', map { "$_/*.t" } grep { -d $_ } qw{ t xt } ), }; } if ( $] >= 5.005 ) { $args->{ABSTRACT} = $self->abstract; $args->{AUTHOR} = join ', ', @{$self->author || []}; } if ( $self->makemaker(6.10) ) { $args->{NO_META} = 1; #$args->{NO_MYMETA} = 1; } if ( $self->makemaker(6.17) and $self->sign ) { $args->{SIGN} = 1; } unless ( $self->is_admin ) { delete $args->{SIGN}; } if ( $self->makemaker(6.31) and $self->license ) { $args->{LICENSE} = $self->license; } my $prereq = ($args->{PREREQ_PM} ||= {}); %$prereq = ( %$prereq, map { @$_ } # flatten [module => version] map { @$_ } grep $_, ($self->requires) ); # Remove any reference to perl, PREREQ_PM doesn't support it delete $args->{PREREQ_PM}->{perl}; # Merge both kinds of requires into BUILD_REQUIRES my $build_prereq = ($args->{BUILD_REQUIRES} ||= {}); %$build_prereq = ( %$build_prereq, map { @$_ } # flatten [module => version] map { @$_ } grep $_, ($self->configure_requires, $self->build_requires) ); # Remove any 
reference to perl, BUILD_REQUIRES doesn't support it delete $args->{BUILD_REQUIRES}->{perl}; # Delete bundled dists from prereq_pm, add it to Makefile DIR my $subdirs = ($args->{DIR} || []); if ($self->bundles) { my %processed; foreach my $bundle (@{ $self->bundles }) { my ($mod_name, $dist_dir) = @$bundle; delete $prereq->{$mod_name}; $dist_dir = File::Basename::basename($dist_dir); # dir for building this module if (not exists $processed{$dist_dir}) { if (-d $dist_dir) { # List as sub-directory to be processed by make push @$subdirs, $dist_dir; } # Else do nothing: the module is already present on the system $processed{$dist_dir} = undef; } } } unless ( $self->makemaker('6.55_03') ) { %$prereq = (%$prereq,%$build_prereq); delete $args->{BUILD_REQUIRES}; } if ( my $perl_version = $self->perl_version ) { eval "use $perl_version; 1" or die "ERROR: perl: Version $] is installed, " . "but we need version >= $perl_version"; if ( $self->makemaker(6.48) ) { $args->{MIN_PERL_VERSION} = $perl_version; } } if ($self->installdirs) { warn qq{old INSTALLDIRS (probably set by makemaker_args) is overriden by installdirs\n} if $args->{INSTALLDIRS}; $args->{INSTALLDIRS} = $self->installdirs; } my %args = map { ( $_ => $args->{$_} ) } grep {defined($args->{$_} ) } keys %$args; my $user_preop = delete $args{dist}->{PREOP}; if ( my $preop = $self->admin->preop($user_preop) ) { foreach my $key ( keys %$preop ) { $args{dist}->{$key} = $preop->{$key}; } } my $mm = ExtUtils::MakeMaker::WriteMakefile(%args); $self->fix_up_makefile($mm->{FIRST_MAKEFILE} || 'Makefile'); } sub fix_up_makefile { my $self = shift; my $makefile_name = shift; my $top_class = ref($self->_top) || ''; my $top_version = $self->_top->VERSION || ''; my $preamble = $self->preamble ? "# Preamble by $top_class $top_version\n" . $self->preamble : ''; my $postamble = "# Postamble by $top_class $top_version\n" . 
($self->postamble || ''); local *MAKEFILE; open MAKEFILE, "+< $makefile_name" or die "fix_up_makefile: Couldn't open $makefile_name: $!"; eval { flock MAKEFILE, LOCK_EX }; my $makefile = do { local $/; }; $makefile =~ s/\b(test_harness\(\$\(TEST_VERBOSE\), )/$1'inc', /; $makefile =~ s/( -I\$\(INST_ARCHLIB\))/ -Iinc$1/g; $makefile =~ s/( "-I\$\(INST_LIB\)")/ "-Iinc"$1/g; $makefile =~ s/^(FULLPERL = .*)/$1 "-Iinc"/m; $makefile =~ s/^(PERL = .*)/$1 "-Iinc"/m; # Module::Install will never be used to build the Core Perl # Sometimes PERL_LIB and PERL_ARCHLIB get written anyway, which breaks # PREFIX/PERL5LIB, and thus, install_share. Blank them if they exist $makefile =~ s/^PERL_LIB = .+/PERL_LIB =/m; #$makefile =~ s/^PERL_ARCHLIB = .+/PERL_ARCHLIB =/m; # Perl 5.005 mentions PERL_LIB explicitly, so we have to remove that as well. $makefile =~ s/(\"?)-I\$\(PERL_LIB\)\1//g; # XXX - This is currently unused; not sure if it breaks other MM-users # $makefile =~ s/^pm_to_blib\s+:\s+/pm_to_blib :: /mg; seek MAKEFILE, 0, SEEK_SET; truncate MAKEFILE, 0; print MAKEFILE "$preamble$makefile$postamble" or die $!; close MAKEFILE or die $!; 1; } sub preamble { my ($self, $text) = @_; $self->{preamble} = $text . 
$self->{preamble} if defined $text; $self->{preamble}; } sub postamble { my ($self, $text) = @_; $self->{postamble} ||= $self->admin->postamble; $self->{postamble} .= $text if defined $text; $self->{postamble} } 1; __END__ #line 544 HTML-HTML5-Parser-0.301/inc/Module/Install/Can.pm0000644000175000017500000000615712166545201017134 0ustar taitai#line 1 package Module::Install::Can; use strict; use Config (); use ExtUtils::MakeMaker (); use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } # check if we can load some module ### Upgrade this to not have to load the module if possible sub can_use { my ($self, $mod, $ver) = @_; $mod =~ s{::|\\}{/}g; $mod .= '.pm' unless $mod =~ /\.pm$/i; my $pkg = $mod; $pkg =~ s{/}{::}g; $pkg =~ s{\.pm$}{}i; local $@; eval { require $mod; $pkg->VERSION($ver || 0); 1 }; } # Check if we can run some command sub can_run { my ($self, $cmd) = @_; my $_cmd = $cmd; return $_cmd if (-x $_cmd or $_cmd = MM->maybe_command($_cmd)); for my $dir ((split /$Config::Config{path_sep}/, $ENV{PATH}), '.') { next if $dir eq ''; require File::Spec; my $abs = File::Spec->catfile($dir, $cmd); return $abs if (-x $abs or $abs = MM->maybe_command($abs)); } return; } # Can our C compiler environment build XS files sub can_xs { my $self = shift; # Ensure we have the CBuilder module $self->configure_requires( 'ExtUtils::CBuilder' => 0.27 ); # Do we have the configure_requires checker? local $@; eval "require ExtUtils::CBuilder;"; if ( $@ ) { # They don't obey configure_requires, so it is # someone old and delicate. Try to avoid hurting # them by falling back to an older simpler test. 
return $self->can_cc(); } # Do we have a working C compiler my $builder = ExtUtils::CBuilder->new( quiet => 1, ); unless ( $builder->have_compiler ) { # No working C compiler return 0; } # Write a C file representative of what XS becomes require File::Temp; my ( $FH, $tmpfile ) = File::Temp::tempfile( "compilexs-XXXXX", SUFFIX => '.c', ); binmode $FH; print $FH <<'END_C'; #include "EXTERN.h" #include "perl.h" #include "XSUB.h" int main(int argc, char **argv) { return 0; } int boot_sanexs() { return 1; } END_C close $FH; # Can the C compiler access the same headers XS does my @libs = (); my $object = undef; eval { local $^W = 0; $object = $builder->compile( source => $tmpfile, ); @libs = $builder->link( objects => $object, module_name => 'sanexs', ); }; my $result = $@ ? 0 : 1; # Clean up all the build files foreach ( $tmpfile, $object, @libs ) { next unless defined $_; 1 while unlink; } return $result; } # Can we locate a (the) C compiler sub can_cc { my $self = shift; my @chunks = split(/ /, $Config::Config{cc}) or return; # $Config{cc} may contain args; try to find out the program part while (@chunks) { return $self->can_run("@chunks") || (pop(@chunks), next); } return; } # Fix Cygwin bug on maybe_command(); if ( $^O eq 'cygwin' ) { require ExtUtils::MM_Cygwin; require ExtUtils::MM_Win32; if ( ! 
defined(&ExtUtils::MM_Cygwin::maybe_command) ) { *ExtUtils::MM_Cygwin::maybe_command = sub { my ($self, $file) = @_; if ($file =~ m{^/cygdrive/}i and ExtUtils::MM_Win32->can('maybe_command')) { ExtUtils::MM_Win32->maybe_command($file); } else { ExtUtils::MM_Unix->maybe_command($file); } } } } 1; __END__ #line 236 HTML-HTML5-Parser-0.301/inc/Module/Install/Base.pm0000644000175000017500000000214712166545146017310 0ustar taitai#line 1 package Module::Install::Base; use strict 'vars'; use vars qw{$VERSION}; BEGIN { $VERSION = '1.06'; } # Suspend handler for "redefined" warnings BEGIN { my $w = $SIG{__WARN__}; $SIG{__WARN__} = sub { $w }; } #line 42 sub new { my $class = shift; unless ( defined &{"${class}::call"} ) { *{"${class}::call"} = sub { shift->_top->call(@_) }; } unless ( defined &{"${class}::load"} ) { *{"${class}::load"} = sub { shift->_top->load(@_) }; } bless { @_ }, $class; } #line 61 sub AUTOLOAD { local $@; my $func = eval { shift->_top->autoload } or return; goto &$func; } #line 75 sub _top { $_[0]->{_top}; } #line 90 sub admin { $_[0]->_top->{admin} or Module::Install::Base::FakeAdmin->new; } #line 106 sub is_admin { ! 
$_[0]->admin->isa('Module::Install::Base::FakeAdmin'); } sub DESTROY {} package Module::Install::Base::FakeAdmin; use vars qw{$VERSION}; BEGIN { $VERSION = $Module::Install::Base::VERSION; } my $fake; sub new { $fake ||= bless(\@_, $_[0]); } sub AUTOLOAD {} sub DESTROY {} # Restore warning handler BEGIN { $SIG{__WARN__} = $SIG{__WARN__}->(); } 1; #line 159 HTML-HTML5-Parser-0.301/inc/Module/Install/WriteAll.pm0000644000175000017500000000237612166545201020155 0ustar taitai#line 1 package Module::Install::WriteAll; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = qw{Module::Install::Base}; $ISCORE = 1; } sub WriteAll { my $self = shift; my %args = ( meta => 1, sign => 0, inline => 0, check_nmake => 1, @_, ); $self->sign(1) if $args{sign}; $self->admin->WriteAll(%args) if $self->is_admin; $self->check_nmake if $args{check_nmake}; unless ( $self->makemaker_args->{PL_FILES} ) { # XXX: This still may be a bit over-defensive... unless ($self->makemaker(6.25)) { $self->makemaker_args( PL_FILES => {} ) if -f 'Build.PL'; } } # Until ExtUtils::MakeMaker support MYMETA.yml, make sure # we clean it up properly ourself. $self->realclean_files('MYMETA.yml'); if ( $args{inline} ) { $self->Inline->write; } else { $self->Makefile->write; } # The Makefile write process adds a couple of dependencies, # so write the META.yml files after the Makefile. 
if ( $args{meta} ) { $self->Meta->write; } # Experimental support for MYMETA if ( $ENV{X_MYMETA} ) { if ( $ENV{X_MYMETA} eq 'JSON' ) { $self->Meta->write_mymeta_json; } else { $self->Meta->write_mymeta_yaml; } } return 1; } 1; HTML-HTML5-Parser-0.301/inc/Module/Install/Include.pm0000644000175000017500000000101512166545147020013 0ustar taitai#line 1 package Module::Install::Include; use strict; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub include { shift()->admin->include(@_); } sub include_deps { shift()->admin->include_deps(@_); } sub auto_include { shift()->admin->auto_include(@_); } sub auto_include_deps { shift()->admin->auto_include_deps(@_); } sub auto_include_dependent_dists { shift()->admin->auto_include_dependent_dists(@_); } 1; HTML-HTML5-Parser-0.301/inc/Module/Install/Metadata.pm0000644000175000017500000004327712166545146020167 0ustar taitai#line 1 package Module::Install::Metadata; use strict 'vars'; use Module::Install::Base (); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } my @boolean_keys = qw{ sign }; my @scalar_keys = qw{ name module_name abstract version distribution_type tests installdirs }; my @tuple_keys = qw{ configure_requires build_requires requires recommends bundles resources }; my @resource_keys = qw{ homepage bugtracker repository }; my @array_keys = qw{ keywords author }; *authors = \&author; sub Meta { shift } sub Meta_BooleanKeys { @boolean_keys } sub Meta_ScalarKeys { @scalar_keys } sub Meta_TupleKeys { @tuple_keys } sub Meta_ResourceKeys { @resource_keys } sub Meta_ArrayKeys { @array_keys } foreach my $key ( @boolean_keys ) { *$key = sub { my $self = shift; if ( defined wantarray and not @_ ) { return $self->{values}->{$key}; } $self->{values}->{$key} = ( @_ ? 
$_[0] : 1 ); return $self; }; } foreach my $key ( @scalar_keys ) { *$key = sub { my $self = shift; return $self->{values}->{$key} if defined wantarray and !@_; $self->{values}->{$key} = shift; return $self; }; } foreach my $key ( @array_keys ) { *$key = sub { my $self = shift; return $self->{values}->{$key} if defined wantarray and !@_; $self->{values}->{$key} ||= []; push @{$self->{values}->{$key}}, @_; return $self; }; } foreach my $key ( @resource_keys ) { *$key = sub { my $self = shift; unless ( @_ ) { return () unless $self->{values}->{resources}; return map { $_->[1] } grep { $_->[0] eq $key } @{ $self->{values}->{resources} }; } return $self->{values}->{resources}->{$key} unless @_; my $uri = shift or die( "Did not provide a value to $key()" ); $self->resources( $key => $uri ); return 1; }; } foreach my $key ( grep { $_ ne "resources" } @tuple_keys) { *$key = sub { my $self = shift; return $self->{values}->{$key} unless @_; my @added; while ( @_ ) { my $module = shift or last; my $version = shift || 0; push @added, [ $module, $version ]; } push @{ $self->{values}->{$key} }, @added; return map {@$_} @added; }; } # Resource handling my %lc_resource = map { $_ => 1 } qw{ homepage license bugtracker repository }; sub resources { my $self = shift; while ( @_ ) { my $name = shift or last; my $value = shift or next; if ( $name eq lc $name and ! $lc_resource{$name} ) { die("Unsupported reserved lowercase resource '$name'"); } $self->{values}->{resources} ||= []; push @{ $self->{values}->{resources} }, [ $name, $value ]; } $self->{values}->{resources}; } # Aliases for build_requires that will have alternative # meanings in some future version of META.yml. 
sub test_requires { shift->build_requires(@_) } sub install_requires { shift->build_requires(@_) } # Aliases for installdirs options sub install_as_core { $_[0]->installdirs('perl') } sub install_as_cpan { $_[0]->installdirs('site') } sub install_as_site { $_[0]->installdirs('site') } sub install_as_vendor { $_[0]->installdirs('vendor') } sub dynamic_config { my $self = shift; my $value = @_ ? shift : 1; if ( $self->{values}->{dynamic_config} ) { # Once dynamic we never change to static, for safety return 0; } $self->{values}->{dynamic_config} = $value ? 1 : 0; return 1; } # Convenience command sub static_config { shift->dynamic_config(0); } sub perl_version { my $self = shift; return $self->{values}->{perl_version} unless @_; my $version = shift or die( "Did not provide a value to perl_version()" ); # Normalize the version $version = $self->_perl_version($version); # We don't support the really old versions unless ( $version >= 5.005 ) { die "Module::Install only supports 5.005 or newer (use ExtUtils::MakeMaker)\n"; } $self->{values}->{perl_version} = $version; } sub all_from { my ( $self, $file ) = @_; unless ( defined($file) ) { my $name = $self->name or die( "all_from called with no args without setting name() first" ); $file = join('/', 'lib', split(/-/, $name)) . '.pm'; $file =~ s{.*/}{} unless -e $file; unless ( -e $file ) { die("all_from cannot find $file from $name"); } } unless ( -f $file ) { die("The path '$file' does not exist, or is not a file"); } $self->{values}{all_from} = $file; # Some methods pull from POD instead of code. 
# If there is a matching .pod, use that instead my $pod = $file; $pod =~ s/\.pm$/.pod/i; $pod = $file unless -e $pod; # Pull the different values $self->name_from($file) unless $self->name; $self->version_from($file) unless $self->version; $self->perl_version_from($file) unless $self->perl_version; $self->author_from($pod) unless @{$self->author || []}; $self->license_from($pod) unless $self->license; $self->abstract_from($pod) unless $self->abstract; return 1; } sub provides { my $self = shift; my $provides = ( $self->{values}->{provides} ||= {} ); %$provides = (%$provides, @_) if @_; return $provides; } sub auto_provides { my $self = shift; return $self unless $self->is_admin; unless (-e 'MANIFEST') { warn "Cannot deduce auto_provides without a MANIFEST, skipping\n"; return $self; } # Avoid spurious warnings as we are not checking manifest here. local $SIG{__WARN__} = sub {1}; require ExtUtils::Manifest; local *ExtUtils::Manifest::manicheck = sub { return }; require Module::Build; my $build = Module::Build->new( dist_name => $self->name, dist_version => $self->version, license => $self->license, ); $self->provides( %{ $build->find_dist_packages || {} } ); } sub feature { my $self = shift; my $name = shift; my $features = ( $self->{values}->{features} ||= [] ); my $mods; if ( @_ == 1 and ref( $_[0] ) ) { # The user used ->feature like ->features by passing in the second # argument as a reference. Accomodate for that. $mods = $_[0]; } else { $mods = \@_; } my $count = 0; push @$features, ( $name => [ map { ref($_) ? ( ref($_) eq 'HASH' ) ? %$_ : @$_ : $_ } @$mods ] ); return @$features; } sub features { my $self = shift; while ( my ( $name, $mods ) = splice( @_, 0, 2 ) ) { $self->feature( $name, @$mods ); } return $self->{values}->{features} ? 
@{ $self->{values}->{features} } : (); } sub no_index { my $self = shift; my $type = shift; push @{ $self->{values}->{no_index}->{$type} }, @_ if $type; return $self->{values}->{no_index}; } sub read { my $self = shift; $self->include_deps( 'YAML::Tiny', 0 ); require YAML::Tiny; my $data = YAML::Tiny::LoadFile('META.yml'); # Call methods explicitly in case user has already set some values. while ( my ( $key, $value ) = each %$data ) { next unless $self->can($key); if ( ref $value eq 'HASH' ) { while ( my ( $module, $version ) = each %$value ) { $self->can($key)->($self, $module => $version ); } } else { $self->can($key)->($self, $value); } } return $self; } sub write { my $self = shift; return $self unless $self->is_admin; $self->admin->write_meta; return $self; } sub version_from { require ExtUtils::MM_Unix; my ( $self, $file ) = @_; $self->version( ExtUtils::MM_Unix->parse_version($file) ); # for version integrity check $self->makemaker_args( VERSION_FROM => $file ); } sub abstract_from { require ExtUtils::MM_Unix; my ( $self, $file ) = @_; $self->abstract( bless( { DISTNAME => $self->name }, 'ExtUtils::MM_Unix' )->parse_abstract($file) ); } # Add both distribution and module name sub name_from { my ($self, $file) = @_; if ( Module::Install::_read($file) =~ m/ ^ \s* package \s* ([\w:]+) \s* ; /ixms ) { my ($name, $module_name) = ($1, $1); $name =~ s{::}{-}g; $self->name($name); unless ( $self->module_name ) { $self->module_name($module_name); } } else { die("Cannot determine name from $file\n"); } } sub _extract_perl_version { if ( $_[0] =~ m/ ^\s* (?:use|require) \s* v? 
([\d_\.]+) \s* ; /ixms ) { my $perl_version = $1; $perl_version =~ s{_}{}g; return $perl_version; } else { return; } } sub perl_version_from { my $self = shift; my $perl_version=_extract_perl_version(Module::Install::_read($_[0])); if ($perl_version) { $self->perl_version($perl_version); } else { warn "Cannot determine perl version info from $_[0]\n"; return; } } sub author_from { my $self = shift; my $content = Module::Install::_read($_[0]); if ($content =~ m/ =head \d \s+ (?:authors?)\b \s* ([^\n]*) | =head \d \s+ (?:licen[cs]e|licensing|copyright|legal)\b \s* .*? copyright .*? \d\d\d[\d.]+ \s* (?:\bby\b)? \s* ([^\n]*) /ixms) { my $author = $1 || $2; # XXX: ugly but should work anyway... if (eval "require Pod::Escapes; 1") { # Pod::Escapes has a mapping table. # It's in core of perl >= 5.9.3, and should be installed # as one of the Pod::Simple's prereqs, which is a prereq # of Pod::Text 3.x (see also below). $author =~ s{ E<( (\d+) | ([A-Za-z]+) )> } { defined $2 ? chr($2) : defined $Pod::Escapes::Name2character_number{$1} ? chr($Pod::Escapes::Name2character_number{$1}) : do { warn "Unknown escape: E<$1>"; "E<$1>"; }; }gex; } elsif (eval "require Pod::Text; 1" && $Pod::Text::VERSION < 3) { # Pod::Text < 3.0 has yet another mapping table, # though the table name of 2.x and 1.x are different. # (1.x is in core of Perl < 5.6, 2.x is in core of # Perl < 5.9.3) my $mapping = ($Pod::Text::VERSION < 2) ? \%Pod::Text::HTML_Escapes : \%Pod::Text::ESCAPES; $author =~ s{ E<( (\d+) | ([A-Za-z]+) )> } { defined $2 ? chr($2) : defined $mapping->{$1} ? 
$mapping->{$1} : do { warn "Unknown escape: E<$1>"; "E<$1>"; }; }gex; } else { $author =~ s{E}{<}g; $author =~ s{E}{>}g; } $self->author($author); } else { warn "Cannot determine author info from $_[0]\n"; } } #Stolen from M::B my %license_urls = ( perl => 'http://dev.perl.org/licenses/', apache => 'http://apache.org/licenses/LICENSE-2.0', apache_1_1 => 'http://apache.org/licenses/LICENSE-1.1', artistic => 'http://opensource.org/licenses/artistic-license.php', artistic_2 => 'http://opensource.org/licenses/artistic-license-2.0.php', lgpl => 'http://opensource.org/licenses/lgpl-license.php', lgpl2 => 'http://opensource.org/licenses/lgpl-2.1.php', lgpl3 => 'http://opensource.org/licenses/lgpl-3.0.html', bsd => 'http://opensource.org/licenses/bsd-license.php', gpl => 'http://opensource.org/licenses/gpl-license.php', gpl2 => 'http://opensource.org/licenses/gpl-2.0.php', gpl3 => 'http://opensource.org/licenses/gpl-3.0.html', mit => 'http://opensource.org/licenses/mit-license.php', mozilla => 'http://opensource.org/licenses/mozilla1.1.php', open_source => undef, unrestricted => undef, restrictive => undef, unknown => undef, ); sub license { my $self = shift; return $self->{values}->{license} unless @_; my $license = shift or die( 'Did not provide a value to license()' ); $license = __extract_license($license) || lc $license; $self->{values}->{license} = $license; # Automatically fill in license URLs if ( $license_urls{$license} ) { $self->resources( license => $license_urls{$license} ); } return 1; } sub _extract_license { my $pod = shift; my $matched; return __extract_license( ($matched) = $pod =~ m/ (=head \d \s+ L(?i:ICEN[CS]E|ICENSING)\b.*?) (=head \d.*|=cut.*|)\z /xms ) || __extract_license( ($matched) = $pod =~ m/ (=head \d \s+ (?:C(?i:OPYRIGHTS?)|L(?i:EGAL))\b.*?) 
(=head \d.*|=cut.*|)\z /xms ); } sub __extract_license { my $license_text = shift or return; my @phrases = ( '(?:under )?the same (?:terms|license) as (?:perl|the perl (?:\d )?programming language)' => 'perl', 1, '(?:under )?the terms of (?:perl|the perl programming language) itself' => 'perl', 1, 'Artistic and GPL' => 'perl', 1, 'GNU general public license' => 'gpl', 1, 'GNU public license' => 'gpl', 1, 'GNU lesser general public license' => 'lgpl', 1, 'GNU lesser public license' => 'lgpl', 1, 'GNU library general public license' => 'lgpl', 1, 'GNU library public license' => 'lgpl', 1, 'GNU Free Documentation license' => 'unrestricted', 1, 'GNU Affero General Public License' => 'open_source', 1, '(?:Free)?BSD license' => 'bsd', 1, 'Artistic license 2\.0' => 'artistic_2', 1, 'Artistic license' => 'artistic', 1, 'Apache (?:Software )?license' => 'apache', 1, 'GPL' => 'gpl', 1, 'LGPL' => 'lgpl', 1, 'BSD' => 'bsd', 1, 'Artistic' => 'artistic', 1, 'MIT' => 'mit', 1, 'Mozilla Public License' => 'mozilla', 1, 'Q Public License' => 'open_source', 1, 'OpenSSL License' => 'unrestricted', 1, 'SSLeay License' => 'unrestricted', 1, 'zlib License' => 'open_source', 1, 'proprietary' => 'proprietary', 0, ); while ( my ($pattern, $license, $osi) = splice(@phrases, 0, 3) ) { $pattern =~ s#\s+#\\s+#gs; if ( $license_text =~ /\b$pattern\b/i ) { return $license; } } return ''; } sub license_from { my $self = shift; if (my $license=_extract_license(Module::Install::_read($_[0]))) { $self->license($license); } else { warn "Cannot determine license info from $_[0]\n"; return 'unknown'; } } sub _extract_bugtracker { my @links = $_[0] =~ m#L<( https?\Q://rt.cpan.org/\E[^>]+| https?\Q://github.com/\E[\w_]+/[\w_]+/issues| https?\Q://code.google.com/p/\E[\w_\-]+/issues/list )>#gx; my %links; @links{@links}=(); @links=keys %links; return @links; } sub bugtracker_from { my $self = shift; my $content = Module::Install::_read($_[0]); my @links = _extract_bugtracker($content); unless ( @links ) { 
warn "Cannot determine bugtracker info from $_[0]\n"; return 0; } if ( @links > 1 ) { warn "Found more than one bugtracker link in $_[0]\n"; return 0; } # Set the bugtracker bugtracker( $links[0] ); return 1; } sub requires_from { my $self = shift; my $content = Module::Install::_readperl($_[0]); my @requires = $content =~ m/^use\s+([^\W\d]\w*(?:::\w+)*)\s+(v?[\d\.]+)/mg; while ( @requires ) { my $module = shift @requires; my $version = shift @requires; $self->requires( $module => $version ); } } sub test_requires_from { my $self = shift; my $content = Module::Install::_readperl($_[0]); my @requires = $content =~ m/^use\s+([^\W\d]\w*(?:::\w+)*)\s+([\d\.]+)/mg; while ( @requires ) { my $module = shift @requires; my $version = shift @requires; $self->test_requires( $module => $version ); } } # Convert triple-part versions (eg, 5.6.1 or 5.8.9) to # numbers (eg, 5.006001 or 5.008009). # Also, convert double-part versions (eg, 5.8) sub _perl_version { my $v = $_[-1]; $v =~ s/^([1-9])\.([1-9]\d?\d?)$/sprintf("%d.%03d",$1,$2)/e; $v =~ s/^([1-9])\.([1-9]\d?\d?)\.(0|[1-9]\d?\d?)$/sprintf("%d.%03d%03d",$1,$2,$3 || 0)/e; $v =~ s/(\.\d\d\d)000$/$1/; $v =~ s/_.+$//; if ( ref($v) ) { # Numify $v = $v + 0; } return $v; } sub add_metadata { my $self = shift; my %hash = @_; for my $key (keys %hash) { warn "add_metadata: $key is not prefixed with 'x_'.\n" . 
"Use appopriate function to add non-private metadata.\n" unless $key =~ /^x_/; $self->{values}->{$key} = $hash{$key}; } } ###################################################################### # MYMETA Support sub WriteMyMeta { die "WriteMyMeta has been deprecated"; } sub write_mymeta_yaml { my $self = shift; # We need YAML::Tiny to write the MYMETA.yml file unless ( eval { require YAML::Tiny; 1; } ) { return 1; } # Generate the data my $meta = $self->_write_mymeta_data or return 1; # Save as the MYMETA.yml file print "Writing MYMETA.yml\n"; YAML::Tiny::DumpFile('MYMETA.yml', $meta); } sub write_mymeta_json { my $self = shift; # We need JSON to write the MYMETA.json file unless ( eval { require JSON; 1; } ) { return 1; } # Generate the data my $meta = $self->_write_mymeta_data or return 1; # Save as the MYMETA.yml file print "Writing MYMETA.json\n"; Module::Install::_write( 'MYMETA.json', JSON->new->pretty(1)->canonical->encode($meta), ); } sub _write_mymeta_data { my $self = shift; # If there's no existing META.yml there is nothing we can do return undef unless -f 'META.yml'; # We need Parse::CPAN::Meta to load the file unless ( eval { require Parse::CPAN::Meta; 1; } ) { return undef; } # Merge the perl version into the dependencies my $val = $self->Meta->{values}; my $perl = delete $val->{perl_version}; if ( $perl ) { $val->{requires} ||= []; my $requires = $val->{requires}; # Canonize to three-dot version after Perl 5.6 if ( $perl >= 5.006 ) { $perl =~ s{^(\d+)\.(\d\d\d)(\d*)}{join('.', $1, int($2||0), int($3||0))}e } unshift @$requires, [ perl => $perl ]; } # Load the advisory META.yml file my @yaml = Parse::CPAN::Meta::LoadFile('META.yml'); my $meta = $yaml[0]; # Overwrite the non-configure dependency hashs delete $meta->{requires}; delete $meta->{build_requires}; delete $meta->{recommends}; if ( exists $val->{requires} ) { $meta->{requires} = { map { @$_ } @{ $val->{requires} } }; } if ( exists $val->{build_requires} ) { $meta->{build_requires} = { map { @$_ 
} @{ $val->{build_requires} } }; } return $meta; } 1; HTML-HTML5-Parser-0.301/inc/Module/Install/AutoManifest.pm0000644000175000017500000000125712166545200021025 0ustar taitai#line 1 use strict; use warnings; package Module::Install::AutoManifest; use Module::Install::Base; BEGIN { our $VERSION = '0.003'; our $ISCORE = 1; our @ISA = qw(Module::Install::Base); } sub auto_manifest { my ($self) = @_; return unless $Module::Install::AUTHOR; die "auto_manifest requested, but no MANIFEST.SKIP exists\n" unless -e "MANIFEST.SKIP"; if (-e "MANIFEST") { unlink('MANIFEST') or die "Can't remove MANIFEST: $!"; } $self->postamble(<<"END"); create_distdir: manifest_clean manifest distclean :: manifest_clean manifest_clean: \t\$(RM_F) MANIFEST END } 1; __END__ #line 48 #line 131 1; # End of Module::Install::AutoManifest HTML-HTML5-Parser-0.301/inc/Module/Install/TrustMetaYml.pm0000644000175000017500000000161512166545146021047 0ustar taitai#line 1 package Module::Install::TrustMetaYml; use 5.005; use strict; BEGIN { $Module::Install::TrustMetaYml::AUTHORITY = 'cpan:TOBYINK'; $Module::Install::TrustMetaYml::VERSION = '0.003'; } use base qw(Module::Install::Base); sub trust_meta_yml { my ($self, $where) = @_; $where ||= 'META.yml'; $self->perl_version('5.005') unless defined $self->perl_version; $self->include('YAML::Tiny', 0); return $self if $self->is_admin; require YAML::Tiny; my $data = YAML::Tiny::LoadFile($where); $self->perl_version($data->{requires}{perl} || '5.005'); KEY: foreach my $key (qw(requires recommends build_requires)) { next KEY unless ref $data->{$key} eq 'HASH'; my %deps = %{$data->{$key}}; DEP: while (my ($pkg, $ver) = each %deps) { next if $pkg eq 'perl'; $self->$key($pkg, $ver); } } return $self; } *trust_meta_yaml = \&trust_meta_yml; 1; __END__ =encoding utf8 HTML-HTML5-Parser-0.301/inc/Module/Install/Scripts.pm0000644000175000017500000000101112166545151020046 0ustar taitai#line 1 package Module::Install::Scripts; use strict 'vars'; use Module::Install::Base 
(); use vars qw{$VERSION @ISA $ISCORE}; BEGIN { $VERSION = '1.06'; @ISA = 'Module::Install::Base'; $ISCORE = 1; } sub install_script { my $self = shift; my $args = $self->makemaker_args; my $exe = $args->{EXE_FILES} ||= []; foreach ( @_ ) { if ( -f $_ ) { push @$exe, $_; } elsif ( -d 'script' and -f "script/$_" ) { push @$exe, "script/$_"; } else { die("Cannot find script '$_'"); } } } 1; HTML-HTML5-Parser-0.301/inc/Module/Install.pm0000644000175000017500000003013512166545141016427 0ustar taitai#line 1 package Module::Install; # For any maintainers: # The load order for Module::Install is a bit magic. # It goes something like this... # # IF ( host has Module::Install installed, creating author mode ) { # 1. Makefile.PL calls "use inc::Module::Install" # 2. $INC{inc/Module/Install.pm} set to installed version of inc::Module::Install # 3. The installed version of inc::Module::Install loads # 4. inc::Module::Install calls "require Module::Install" # 5. The ./inc/ version of Module::Install loads # } ELSE { # 1. Makefile.PL calls "use inc::Module::Install" # 2. $INC{inc/Module/Install.pm} set to ./inc/ version of Module::Install # 3. The ./inc/ version of Module::Install loads # } use 5.005; use strict 'vars'; use Cwd (); use File::Find (); use File::Path (); use vars qw{$VERSION $MAIN}; BEGIN { # All Module::Install core packages now require synchronised versions. # This will be used to ensure we don't accidentally load old or # different versions of modules. # This is not enforced yet, but will be some time in the next few # releases once we can make sure it won't clash with custom # Module::Install extensions. 
$VERSION = '1.06'; # Storage for the pseudo-singleton $MAIN = undef; *inc::Module::Install::VERSION = *VERSION; @inc::Module::Install::ISA = __PACKAGE__; } sub import { my $class = shift; my $self = $class->new(@_); my $who = $self->_caller; #------------------------------------------------------------- # all of the following checks should be included in import(), # to allow "eval 'require Module::Install; 1' to test # installation of Module::Install. (RT #51267) #------------------------------------------------------------- # Whether or not inc::Module::Install is actually loaded, the # $INC{inc/Module/Install.pm} is what will still get set as long as # the caller loaded module this in the documented manner. # If not set, the caller may NOT have loaded the bundled version, and thus # they may not have a MI version that works with the Makefile.PL. This would # result in false errors or unexpected behaviour. And we don't want that. my $file = join( '/', 'inc', split /::/, __PACKAGE__ ) . '.pm'; unless ( $INC{$file} ) { die <<"END_DIE" } Please invoke ${\__PACKAGE__} with: use inc::${\__PACKAGE__}; not: use ${\__PACKAGE__}; END_DIE # This reportedly fixes a rare Win32 UTC file time issue, but # as this is a non-cross-platform XS module not in the core, # we shouldn't really depend on it. See RT #24194 for detail. # (Also, this module only supports Perl 5.6 and above). eval "use Win32::UTCFileTime" if $^O eq 'MSWin32' && $] >= 5.006; # If the script that is loading Module::Install is from the future, # then make will detect this and cause it to re-run over and over # again. This is bad. Rather than taking action to touch it (which # is unreliable on some platforms and requires write permissions) # for now we should catch this and refuse to run. if ( -f $0 ) { my $s = (stat($0))[9]; # If the modification time is only slightly in the future, # sleep briefly to remove the problem. 
my $a = $s - time; if ( $a > 0 and $a < 5 ) { sleep 5 } # Too far in the future, throw an error. my $t = time; if ( $s > $t ) { die <<"END_DIE" } Your installer $0 has a modification time in the future ($s > $t). This is known to create infinite loops in make. Please correct this, then run $0 again. END_DIE } # Build.PL was formerly supported, but no longer is due to excessive # difficulty in implementing every single feature twice. if ( $0 =~ /Build.PL$/i ) { die <<"END_DIE" } Module::Install no longer supports Build.PL. It was impossible to maintain duel backends, and has been deprecated. Please remove all Build.PL files and only use the Makefile.PL installer. END_DIE #------------------------------------------------------------- # To save some more typing in Module::Install installers, every... # use inc::Module::Install # ...also acts as an implicit use strict. $^H |= strict::bits(qw(refs subs vars)); #------------------------------------------------------------- unless ( -f $self->{file} ) { foreach my $key (keys %INC) { delete $INC{$key} if $key =~ /Module\/Install/; } local $^W; require "$self->{path}/$self->{dispatch}.pm"; File::Path::mkpath("$self->{prefix}/$self->{author}"); $self->{admin} = "$self->{name}::$self->{dispatch}"->new( _top => $self ); $self->{admin}->init; @_ = ($class, _self => $self); goto &{"$self->{name}::import"}; } local $^W; *{"${who}::AUTOLOAD"} = $self->autoload; $self->preload; # Unregister loader and worker packages so subdirs can use them again delete $INC{'inc/Module/Install.pm'}; delete $INC{'Module/Install.pm'}; # Save to the singleton $MAIN = $self; return 1; } sub autoload { my $self = shift; my $who = $self->_caller; my $cwd = Cwd::cwd(); my $sym = "${who}::AUTOLOAD"; $sym->{$cwd} = sub { my $pwd = Cwd::cwd(); if ( my $code = $sym->{$pwd} ) { # Delegate back to parent dirs goto &$code unless $cwd eq $pwd; } unless ($$sym =~ s/([^:]+)$//) { # XXX: it looks like we can't retrieve the missing function # via $$sym (usually 
$main::AUTOLOAD) in this case. # I'm still wondering if we should slurp Makefile.PL to # get some context or not ... my ($package, $file, $line) = caller; die <<"EOT"; Unknown function is found at $file line $line. Execution of $file aborted due to runtime errors. If you're a contributor to a project, you may need to install some Module::Install extensions from CPAN (or other repository). If you're a user of a module, please contact the author. EOT } my $method = $1; if ( uc($method) eq $method ) { # Do nothing return; } elsif ( $method =~ /^_/ and $self->can($method) ) { # Dispatch to the root M:I class return $self->$method(@_); } # Dispatch to the appropriate plugin unshift @_, ( $self, $1 ); goto &{$self->can('call')}; }; } sub preload { my $self = shift; unless ( $self->{extensions} ) { $self->load_extensions( "$self->{prefix}/$self->{path}", $self ); } my @exts = @{$self->{extensions}}; unless ( @exts ) { @exts = $self->{admin}->load_all_extensions; } my %seen; foreach my $obj ( @exts ) { while (my ($method, $glob) = each %{ref($obj) . '::'}) { next unless $obj->can($method); next if $method =~ /^_/; next if $method eq uc($method); $seen{$method}++; } } my $who = $self->_caller; foreach my $name ( sort keys %seen ) { local $^W; *{"${who}::$name"} = sub { ${"${who}::AUTOLOAD"} = "${who}::$name"; goto &{"${who}::AUTOLOAD"}; }; } } sub new { my ($class, %args) = @_; delete $INC{'FindBin.pm'}; { # to suppress the redefine warning local $SIG{__WARN__} = sub {}; require FindBin; } # ignore the prefix on extension modules built from top level. my $base_path = Cwd::abs_path($FindBin::Bin); unless ( Cwd::abs_path(Cwd::cwd()) eq $base_path ) { delete $args{prefix}; } return $args{_self} if $args{_self}; $args{dispatch} ||= 'Admin'; $args{prefix} ||= 'inc'; $args{author} ||= ($^O eq 'VMS' ? 
'_author' : '.author'); $args{bundle} ||= 'inc/BUNDLES'; $args{base} ||= $base_path; $class =~ s/^\Q$args{prefix}\E:://; $args{name} ||= $class; $args{version} ||= $class->VERSION; unless ( $args{path} ) { $args{path} = $args{name}; $args{path} =~ s!::!/!g; } $args{file} ||= "$args{base}/$args{prefix}/$args{path}.pm"; $args{wrote} = 0; bless( \%args, $class ); } sub call { my ($self, $method) = @_; my $obj = $self->load($method) or return; splice(@_, 0, 2, $obj); goto &{$obj->can($method)}; } sub load { my ($self, $method) = @_; $self->load_extensions( "$self->{prefix}/$self->{path}", $self ) unless $self->{extensions}; foreach my $obj (@{$self->{extensions}}) { return $obj if $obj->can($method); } my $admin = $self->{admin} or die <<"END_DIE"; The '$method' method does not exist in the '$self->{prefix}' path! Please remove the '$self->{prefix}' directory and run $0 again to load it. END_DIE my $obj = $admin->load($method, 1); push @{$self->{extensions}}, $obj; $obj; } sub load_extensions { my ($self, $path, $top) = @_; my $should_reload = 0; unless ( grep { ! ref $_ and lc $_ eq lc $self->{prefix} } @INC ) { unshift @INC, $self->{prefix}; $should_reload = 1; } foreach my $rv ( $self->find_extensions($path) ) { my ($file, $pkg) = @{$rv}; next if $self->{pathnames}{$pkg}; local $@; my $new = eval { local $^W; require $file; $pkg->can('new') }; unless ( $new ) { warn $@ if $@; next; } $self->{pathnames}{$pkg} = $should_reload ? delete $INC{$file} : $INC{$file}; push @{$self->{extensions}}, &{$new}($pkg, _top => $top ); } $self->{extensions} ||= []; } sub find_extensions { my ($self, $path) = @_; my @found; File::Find::find( sub { my $file = $File::Find::name; return unless $file =~ m!^\Q$path\E/(.+)\.pm\Z!is; my $subpath = $1; return if lc($subpath) eq lc($self->{dispatch}); $file = "$self->{path}/$subpath.pm"; my $pkg = "$self->{name}::$subpath"; $pkg =~ s!/!::!g; # If we have a mixed-case package name, assume case has been preserved # correctly. 
Otherwise, root through the file to locate the case-preserved # version of the package name. if ( $subpath eq lc($subpath) || $subpath eq uc($subpath) ) { my $content = Module::Install::_read($subpath . '.pm'); my $in_pod = 0; foreach ( split //, $content ) { $in_pod = 1 if /^=\w/; $in_pod = 0 if /^=cut/; next if ($in_pod || /^=cut/); # skip pod text next if /^\s*#/; # and comments if ( m/^\s*package\s+($pkg)\s*;/i ) { $pkg = $1; last; } } } push @found, [ $file, $pkg ]; }, $path ) if -d $path; @found; } ##################################################################### # Common Utility Functions sub _caller { my $depth = 0; my $call = caller($depth); while ( $call eq __PACKAGE__ ) { $depth++; $call = caller($depth); } return $call; } # Done in evals to avoid confusing Perl::MinimumVersion eval( $] >= 5.006 ? <<'END_NEW' : <<'END_OLD' ); die $@ if $@; sub _read { local *FH; open( FH, '<', $_[0] ) or die "open($_[0]): $!"; my $string = do { local $/; }; close FH or die "close($_[0]): $!"; return $string; } END_NEW sub _read { local *FH; open( FH, "< $_[0]" ) or die "open($_[0]): $!"; my $string = do { local $/; }; close FH or die "close($_[0]): $!"; return $string; } END_OLD sub _readperl { my $string = Module::Install::_read($_[0]); $string =~ s/(?:\015{1,2}\012|\015|\012)/\n/sg; $string =~ s/(\n)\n*__(?:DATA|END)__\b.*\z/$1/s; $string =~ s/\n\n=\w+.+?\n\n=cut\b.+?\n+/\n\n/sg; return $string; } sub _readpod { my $string = Module::Install::_read($_[0]); $string =~ s/(?:\015{1,2}\012|\015|\012)/\n/sg; return $string if $_[0] =~ /\.pod\z/; $string =~ s/(^|\n=cut\b.+?\n+)[^=\s].+?\n(\n=\w+|\z)/$1$2/sg; $string =~ s/\n*=pod\b[^\n]*\n+/\n\n/sg; $string =~ s/\n*=cut\b[^\n]*\n+/\n\n/sg; $string =~ s/^\n+//s; return $string; } # Done in evals to avoid confusing Perl::MinimumVersion eval( $] >= 5.006 ? <<'END_NEW' : <<'END_OLD' ); die $@ if $@; sub _write { local *FH; open( FH, '>', $_[0] ) or die "open($_[0]): $!"; foreach ( 1 .. 
$#_ ) { print FH $_[$_] or die "print($_[0]): $!"; } close FH or die "close($_[0]): $!"; } END_NEW sub _write { local *FH; open( FH, "> $_[0]" ) or die "open($_[0]): $!"; foreach ( 1 .. $#_ ) { print FH $_[$_] or die "print($_[0]): $!"; } close FH or die "close($_[0]): $!"; } END_OLD # _version is for processing module versions (eg, 1.03_05) not # Perl versions (eg, 5.8.1). sub _version ($) { my $s = shift || 0; my $d =()= $s =~ /(\.)/g; if ( $d >= 2 ) { # Normalise multipart versions $s =~ s/(\.)(\d{1,3})/sprintf("$1%03d",$2)/eg; } $s =~ s/^(\d+)\.?//; my $l = $1 || 0; my @v = map { $_ . '0' x (3 - length $_) } $s =~ /(\d{1,3})\D?/g; $l = $l . '.' . join '', @v if @v; return $l + 0; } sub _cmp ($$) { _version($_[1]) <=> _version($_[2]); } # Cloned from Params::Util::_CLASS sub _CLASS ($) { ( defined $_[0] and ! ref $_[0] and $_[0] =~ m/^[^\W\d]\w*(?:::\w+)*\z/s ) ? $_[0] : undef; } 1; # Copyright 2008 - 2012 Adam Kennedy. HTML-HTML5-Parser-0.301/inc/Module/Package/0000755000175000017500000000000012166545247016023 5ustar taitaiHTML-HTML5-Parser-0.301/inc/Module/Package/Dist/0000755000175000017500000000000012166545247016726 5ustar taitaiHTML-HTML5-Parser-0.301/inc/Module/Package/Dist/RDF.pm0000644000175000017500000000204712166545152017675 0ustar taitai#line 1 package Module::Package::Dist::RDF; my $explanation = q< This is the component of Module::Package::RDF which gets bundled with the distribution. 
>; use 5.005; use strict; BEGIN { $Module::Package::Dist::RDF::AUTHORITY = 'cpan:TOBYINK'; $Module::Package::Dist::RDF::VERSION = '0.012'; @Module::Package::Dist::RDF::ISA = 'Module::Package::Dist'; } sub _main { my ($self) = @_; $self->mi->trust_meta_yml; $self->mi->auto_install; } { package Module::Package::Dist::RDF::standard; use 5.005; use strict; BEGIN { $Module::Package::Dist::RDF::standard::AUTHORITY = 'cpan:TOBYINK'; $Module::Package::Dist::RDF::standard::VERSION = '0.012'; @Module::Package::Dist::RDF::standard::ISA = 'Module::Package::Dist::RDF'; } } { package Module::Package::Dist::RDF::tobyink; use 5.005; use strict; BEGIN { $Module::Package::Dist::RDF::tobyink::AUTHORITY = 'cpan:TOBYINK'; $Module::Package::Dist::RDF::tobyink::VERSION = '0.012'; @Module::Package::Dist::RDF::tobyink::ISA = 'Module::Package::Dist::RDF'; } } 1; HTML-HTML5-Parser-0.301/bin/0000755000175000017500000000000012166545247013242 5ustar taitaiHTML-HTML5-Parser-0.301/bin/html2xhtml0000755000175000017500000000064112165627447015276 0ustar taitai#!/usr/bin/perl ## skip Test::Tabs use 5.010; use HTML::HTML5::Parser; my $input = shift // '-'; my $output = shift // '-'; my $parser = HTML::HTML5::Parser->new; my $dom = ($input eq '-') ? 
$parser->parse_string(do { local $/ = }) : $parser->parse_file($input); if ($output eq '-') { print $dom->toString; } else { open my($fh), '>:encoding(UTF-8)', $output; print $fh $dom->toString; close $fh; }HTML-HTML5-Parser-0.301/bin/html5debug0000755000175000017500000000307012165627436015230 0ustar taitai#!/usr/bin/perl ## skip Test::Tabs use Getopt::Long; use HTML::HTML5::Parser; my $output = $ENV{HTML_OUTPUT} || 'debug:json'; my $help; GetOptions( 'output|o=s' => \$output, 'help|usage|h' => \$help, ); if ($help) { my $name = $0; print <new; my $h = join '', <>; my $hash; if ($output =~ /debug/i) { load('XML::LibXML::Debugging'); $hash = $p->parse_string($h)->toDebuggingHash; } elsif ($output =~ /clark/i) { load('XML::LibXML::Debugging'); print $p->parse_string($h)->toClarkML; } elsif ($output =~ /html/i) { load('HTML::HTML5::Writer'); print HTML::HTML5::Writer->new->document($p->parse_string($h)); } elsif ($output =~ /parser/i) { $p->parse_string($h); $hash = $p; } elsif ($output =~ /err/i) { $p->parse_string($h); print "$_\n" foreach $p->errors; } else { print $p->parse_string($h)->toString; } if (defined $hash and $output =~ /json/) { load('JSON'); print to_json($hash, {pretty=>1,canonical=>1}); } elsif (defined $hash) { load('Data::Dumper'); print Dumper($hash); } HTML-HTML5-Parser-0.301/COPYRIGHT0000644000175000017500000001070712166545200013757 0ustar taitaiFormat: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: HTML-HTML5-Parser Upstream-Contact: Toby Inkster Source: https://metacpan.org/release/HTML-HTML5-Parser Files: CREDITS Changes LICENSE MANIFEST.SKIP META.ttl Makefile.PL NEWS README TODO bin/html2xhtml bin/html5debug examples/charsets.pl examples/html/iso-8859-15.html examples/html/utf-16.html examples/html/utf-8.html lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm lib/HTML/HTML5/Parser/Charset/Info.pm lib/HTML/HTML5/Parser/Charset/UnicodeChecker.pm lib/HTML/HTML5/Parser/Charset/UniversalCharDet.pm 
lib/HTML/HTML5/Parser/Charset/WebLatin1.pm lib/HTML/HTML5/Parser/Charset/WebThai.pm lib/HTML/HTML5/Parser/TagSoupParser.pm lib/HTML/HTML5/Parser/Tokenizer.pm meta/changes.ttl meta/doap.ttl meta/makefile.ttl meta/rt-bugs.ttl t/html5lib-fail/domjs-unsafe.dat t/html5lib-fail/entities02.dat t/html5lib-fail/plain-text-unsafe.dat t/html5lib-fail/tests1.dat t/html5lib-fail/tests11.dat t/html5lib-fail/tests14.dat t/html5lib-fail/tests16.dat t/html5lib-fail/tests21.dat t/html5lib-fail/tests22.dat t/html5lib-fail/tests23.dat t/html5lib-fail/tests5.dat t/html5lib-fail/tests6.dat t/html5lib-fail/tests9.dat t/html5lib-fail/tests_innerHTML_1.dat t/html5lib-pass/adoption01.dat t/html5lib-pass/adoption02.dat t/html5lib-pass/comments01.dat t/html5lib-pass/doctype01.dat t/html5lib-pass/entities01.dat t/html5lib-pass/html5test-com.dat t/html5lib-pass/inbody01.dat t/html5lib-pass/isindex.dat t/html5lib-pass/pending-spec-changes-plain-text-unsafe.dat t/html5lib-pass/pending-spec-changes.dat t/html5lib-pass/scriptdata01.dat t/html5lib-pass/scripted/adoption01.dat t/html5lib-pass/scripted/ark.dat t/html5lib-pass/scripted/webkit01.dat t/html5lib-pass/tables01.dat t/html5lib-pass/tests10.dat t/html5lib-pass/tests12.dat t/html5lib-pass/tests15.dat t/html5lib-pass/tests17.dat t/html5lib-pass/tests18.dat t/html5lib-pass/tests19.dat t/html5lib-pass/tests2.dat t/html5lib-pass/tests20.dat t/html5lib-pass/tests24.dat t/html5lib-pass/tests25.dat t/html5lib-pass/tests26.dat t/html5lib-pass/tests3.dat t/html5lib-pass/tests4.dat t/html5lib-pass/tests7.dat t/html5lib-pass/tests8.dat t/html5lib-pass/tricky01.dat t/html5lib-pass/webkit01.dat t/html5lib-pass/webkit02.dat t/lib/Test/HTTP/Server.pm Copyright: Unknown License: Unknown Files: lib/HTML/HTML5/Parser/UA.pm t/02html5.t t/03html4.t t/04fragments.t t/05origins.t t/06xlxdsln.t t/07ua.t t/08ua-lwp.t t/99html5lib.t t/rt-79019.t Copyright: Copyright (C) 2012 by Toby Inkster. 
License: GPL-1.0+ or Artistic-1.0 Files: inc/Module/Install.pm inc/Module/Install/Include.pm inc/Module/Install/Metadata.pm inc/Module/Install/Scripts.pm Copyright: Copyright 2002 - 2012 Brian Ingerson, Audrey Tang and Adam Kennedy. License: GPL-1.0+ or Artistic-1.0 Files: inc/Module/Install/TrustMetaYml.pm inc/Module/Package/Dist/RDF.pm Copyright: This software is copyright (c) 2011-2012 by Toby Inkster. License: GPL-1.0+ or Artistic-1.0 Files: inc/Module/Install/Package.pm Copyright: Copyright (c) 2011. Ingy doet Net. License: GPL-1.0+ or Artistic-1.0 Files: COPYRIGHT Copyright: None License: public-domain Comment: This file! Automatically generated. Files: inc/Module/Install/Contributors.pm Copyright: This software is copyright (c) 2013 by Toby Inkster. License: GPL-1.0+ or Artistic-1.0 Files: inc/Module/Install/Base.pm Copyright: Copyright 2003, 2004 by Audrey Tang . License: GPL-1.0+ or Artistic-1.0 Files: inc/Module/Install/Makefile.pm Copyright: Copyright 2002, 2003, 2004 Audrey Tang and Brian Ingerson. License: GPL-1.0+ or Artistic-1.0 Files: t/01basic.t Copyright: Copyright (C) 2010-2012 by Toby Inkster. License: GPL-1.0+ or Artistic-1.0 Files: lib/HTML/HTML5/Parser/Error.pm Copyright: Copyright (C) 2011-2012 by Toby Inkster. License: GPL-1.0+ or Artistic-1.0 Files: lib/HTML/HTML5/Parser.pm Copyright: Copyright (C) 2007-2011 by Wakaba. Copyright (C) 2009-2012 by Toby Inkster. License: GPL-1.0+ or Artistic-1.0 Files: inc/YAML/Tiny.pm Copyright: Copyright 2006 - 2012 Adam Kennedy. License: GPL-1.0+ or Artistic-1.0 License: Artistic-1.0 This software is Copyright (c) 2013 by the copyright holder(s). This is free software, licensed under: The Artistic License 1.0 License: GPL-1.0 This software is Copyright (c) 2013 by the copyright holder(s). 
This is free software, licensed under: The GNU General Public License, Version 1, February 1989 HTML-HTML5-Parser-0.301/NEWS0000644000175000017500000000051112165630111013146 0ustar taitai2013-07-05 HTML-HTML5-Parser-0.300 ================================== Fixed (DORIAN++) some major memory leaks at the expense of some minor API changes. 2012-03-15 HTML-HTML5-Parser-0.109 ================================== We now bundle the tree construction parts of the html5lib test suite, and pass more than half of it. HTML-HTML5-Parser-0.301/t/0000755000175000017500000000000012166545247012735 5ustar taitaiHTML-HTML5-Parser-0.301/t/05origins.t0000644000175000017500000000624012165411225014726 0ustar taitai## skip Test::Tabs use Test::More tests => 29; use HTML::HTML5::Parser; my $dom = HTML::HTML5::Parser->load_html(string => <<'HTML'); Test 5: Origins

This is a test! HTML can_ok 'HTML::HTML5::Parser' => 'source_line' or BAIL_OUT('No "source_line" method!!'); my @root = HTML::HTML5::Parser->source_line($dom->documentElement); is($root[0], 2, 'root element has correct line number'); is($root[1], 1, 'root element has correct col number'); ok(!$root[2], 'root element explicit'); my @head = HTML::HTML5::Parser->source_line($dom->getElementsByTagName('head')->get_node(1)); ok(defined $head[0], 'head element has a line number'); ok(defined $head[1], 'head element has a col number'); ok($head[2], 'head element implicit'); my @title_text = HTML::HTML5::Parser->source_line($dom->getElementsByTagName('title')->get_node(1)->childNodes->get_node(1)); is($title_text[0], 3, 'text node in title element has a line number'); is($title_text[1], 10, 'text node in title element has a col number'); ok(!$title_text[2], 'text node in title element explicit'); my @para = HTML::HTML5::Parser->source_line($dom->getElementsByTagName('p')->get_node(1)); is($para[0], 4, 'p element has correct line number'); is($para[1], 3, 'p element has correct col number'); ok(!$para[2], 'para element explicit'); my $para = HTML::HTML5::Parser->source_line($dom->getElementsByTagName('p')->get_node(1)); is($para, 4, 'p element has correct line number (scalar context)'); my @b = HTML::HTML5::Parser->source_line($dom->getElementsByTagName('b')->get_node(1)); is($b[0], 5, 'b element has correct line number'); is($b[1], 5, 'b element has correct col number'); ok(!$b[2], 'b element explicit'); my @i = HTML::HTML5::Parser->source_line($dom->getElementsByTagName('i')->get_node(1)); is($i[0], 5, 'i element has correct line number'); is($i[1], 17, 'i element has correct col number'); ok(!$i[2], 'i element explicit'); my @a = HTML::HTML5::Parser->source_line($dom->getElementsByTagName('a')->get_node(1)); is($a[0], 6, 'a element has correct line number'); is($a[1], 5, 'a element has correct col number'); ok(!$a[2], 'a element explicit'); my @href = 
HTML::HTML5::Parser->source_line($dom->getElementsByTagName('a')->get_node(1)->getAttributeNode('href')); is($href[0], 6, 'href attribute has correct line number'); is($href[1], 8, 'href attribute has correct col number'); ok(!$href[2], 'href attribute explicit'); # It's not easy to actually find comments in the DOM! my $comment = $dom->getElementsByTagName('p')->[0]->childNodes->[-2]; my @comment = HTML::HTML5::Parser->source_line($comment); is($comment[0], 8, 'comment has correct line number') or diag($comment->toString); is($comment[1], 5, 'comment has correct col number'); ok(!$comment[2], 'comment is explicit'); =head1 PURPOSE Check that line/column numbers are reported. =head1 AUTHOR Toby Inkster, Etobyink@cpan.orgE =head1 COPYRIGHT AND LICENCE Copyright (C) 2012 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. HTML-HTML5-Parser-0.301/t/01basic.t0000644000175000017500000000133712165411240014330 0ustar taitai## skip Test::Tabs use Test::More tests => 3; BEGIN { use_ok('HTML::HTML5::Parser') }; my $parser = new_ok 'HTML::HTML5::Parser'; can_ok $parser, qw/ parse_file parse_html_file parse_fh parse_html_fh parse_string parse_html_string parse_balanced_chunk load_xml load_html error_handler errors compat_mode dtd_public_id dtd_system_id dtd_element source_line /; =head1 PURPOSE Test that L can be loaded and instantiated, and that the object has the expected methods. =head1 AUTHOR Toby Inkster, Etobyink@cpan.orgE =head1 COPYRIGHT AND LICENCE Copyright (C) 2010-2012 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. 
HTML-HTML5-Parser-0.301/t/html5lib-pass/0000755000175000017500000000000012166545247015421 5ustar taitaiHTML-HTML5-Parser-0.301/t/html5lib-pass/scriptdata01.dat0000644000175000017500000001030111726072634020401 0ustar taitai#data FOOBAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | BAR #errors #document | | | | "FOO" | QUX #errors #document | | | | "FOO" | #errors Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. Line: 1 Col: 21 Unexpected start tag (script) that can be in head. Moved. #document | | | #errors Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. Line: 1 Col: 28 Unexpected start tag (style) that can be in head. Moved. #document | | | #errors Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. #document | | | | | "x" | #errors #document | | | | | |
#errors #document | | | | |
| | #errors #document | | | | | | "X" | <meta> | name="z" | <link> | rel="foo" | <style> | " x { content:"</style" } " #data <!DOCTYPE html><select><optgroup></optgroup></select> #errors #document | <!DOCTYPE html> | <html> | <head> | <body> | <select> | <optgroup> #data #errors Line: 2 Col: 1 Unexpected End of file. Expected DOCTYPE. #document | <html> | <head> | <body> #data <!DOCTYPE html> <html> #errors #document | <!DOCTYPE html> | <html> | <head> | <body> #data <!DOCTYPE html><script> </script> <title>x #errors #document | | | | abc #errors #document | | | | | "abc" |
| | | abc #errors #document | | | | | "abc" |
| | | abc #errors #document | | | | |
|
| abc #errors #document | | | | | | | |
| abc #errors #document | | | | | | abc #errors #document | | | | | #errors #document | | | | |
| | #data #errors #document | | | | | | #data #errors #document | | | | | | #data #errors #document | | | | | | #data #errors #document | | | | | | #data #errors #document | | | | | | #data #errors #document | | | | | | #data #errors #document | | | | | | #data #errors #document | | | | | | #data #errors #document | | | | | | #data #errors #document | | | | | | #data #errors #document | | | | | | | #data
#errors #document | | | | | | | |
| | | | | #data #errors #document | | | | | | | | | | | | | | ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/t/html5lib-pass/tables01.dat������������������������������������������������0000644�0001750�0001750�00000005322�11726072634�017524� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#data
#errors #document | | | | | | |
#data
#errors #document | | | | | | |
#data #errors #document | | | |
| | | foo="bar" #data
foo #errors #document | | | | "foo" |
| #data

foo #errors #document | | | | |

| "foo" #data

#errors #document | | | | | | |
#data
#errors #document | | | | #data
#errors #document | | | | |
#data
#errors #document | | | | #data
B
#errors #document | | | | | | |
| "B" #data
foo #errors #document | | | | | | |
| "foo" #data
A
B #errors #document | | | | | | |
| "A" | "B" #data
#errors #document | | | | | | |
#data
foo #errors #document | | | | | | |
| "foo" #data #errors #document | | | |
| | | #data
|
#errors #document | | | | | | |
| #data
#errors #document | | | | | | |
| | | ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/t/html5lib-pass/tests17.dat�������������������������������������������������0000644�0001750�0001750�00000003721�11726072634�017424� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#data #errors #document | | | | |
| | #data
#errors #document | | | | | | | |
#data
#errors #document | | | | | | | |
| #data
#errors #document | | | | | | | |
| #data #errors #document | | | | |
| | #data #errors #document | | | | | #errors #document | | | | | #errors #document | | | | | #errors #document | | | | |
|
#errors #document | | | | | #errors #document | | | | |
#errors #document | | | | |
a #errors #document | | | | | | | | "a" HTML-HTML5-Parser-0.301/t/html5lib-pass/tests26.dat0000644000175000017500000001231211726072634017420 0ustar taitai#data 1
2
3 #errors #document | | | | | | href="#1" | | "1" | | |
|
| href="#2" | | href="#2" | | "2" | | |
|
| href="#3" | | href="#3" | | "3" | #data 123 #errors #document | | | | | | | "1" | | | | | | "2" | | | "3" #data 1
23 #errors #document | | | | | | | "1" | | | | | "2" | | | "3" |
#data 1
23 #errors #document | | | | | | | "1" | | | |
| | | | | "2" | | | "3" #data 1
23 #errors #document | | | | | | | "1" |
| | | | | | | | "2" | | | "3" #data 1
23 #errors #document | | | | | | | "1" | |
| | | | | "2" | | | "3" #data 1 #errors #document | | | | | | | "1" | | | | | | #data 12 #errors #document | | | | | | | "1" | | | | | "2" #data 12 #errors #document | | | | | | "1" | | | | | | "2" #data

#errors #document | | | |

| | code="" | x<="" | | code="" | x<="" | " " #data

a #errors 45: End tag “p” seen, but there were open elements. 41: Unclosed element “i”. 46: End of file seen and there were open elements. 35: Unclosed element “foreignObject”. 20: Unclosed element “svg”. #document | | | | | | |

| | | "a" #data

a #errors 56: End tag “p” seen, but there were open elements. 52: Unclosed element “i”. 57: End of file seen and there were open elements. 46: Unclosed element “foreignObject”. 31: Unclosed element “svg”. 22: Unclosed element “table”. #document | | | | | | | |
| | |

| | | "a" #data

a #errors 38: End tag “p” seen, but there were open elements. 34: Unclosed element “i”. 39: End of file in a foreign namespace context. #document | | | | | | |

| | | "a" #data

a #errors 53: End tag “p” seen, but there were open elements. 49: Unclosed element “i”. 54: End of file in a foreign namespace context. #document | | | | | | | | | |
| | |

| | | "a" #data

a #errors 29: Bogus comment. 34: End of file seen and there were open elements. 26: Unclosed element “div”. #document | | | | |
| | "a" HTML-HTML5-Parser-0.301/t/html5lib-pass/isindex.dat0000644000175000017500000000107411726072634017554 0ustar taitai#data #errors #document | | | |
|
|
#data
A #errors #document | | | | | | |
| "A" #data
A #errors #document | | | | | | |
| "A" #data

#errors Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. Line: 1 Col: 20 Unexpected end tag (strong) in table context caused voodoo mode. Line: 1 Col: 20 End tag (strong) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 24 Unexpected end tag (b) in table context caused voodoo mode. Line: 1 Col: 24 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 29 Unexpected end tag (em) in table context caused voodoo mode. Line: 1 Col: 29 End tag (em) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 33 Unexpected end tag (i) in table context caused voodoo mode. Line: 1 Col: 33 End tag (i) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 37 Unexpected end tag (u) in table context caused voodoo mode. Line: 1 Col: 37 End tag (u) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 46 Unexpected end tag (strike) in table context caused voodoo mode. Line: 1 Col: 46 End tag (strike) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 50 Unexpected end tag (s) in table context caused voodoo mode. Line: 1 Col: 50 End tag (s) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 58 Unexpected end tag (blink) in table context caused voodoo mode. Line: 1 Col: 58 Unexpected end tag (blink). Ignored. Line: 1 Col: 63 Unexpected end tag (tt) in table context caused voodoo mode. Line: 1 Col: 63 End tag (tt) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 69 Unexpected end tag (pre) in table context caused voodoo mode. Line: 1 Col: 69 End tag (pre) seen too early. Expected other end tag. Line: 1 Col: 75 Unexpected end tag (big) in table context caused voodoo mode. Line: 1 Col: 75 End tag (big) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 83 Unexpected end tag (small) in table context caused voodoo mode. 
Line: 1 Col: 83 End tag (small) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 90 Unexpected end tag (font) in table context caused voodoo mode. Line: 1 Col: 90 End tag (font) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 99 Unexpected end tag (select) in table context caused voodoo mode. Line: 1 Col: 99 Unexpected end tag (select). Ignored. Line: 1 Col: 104 Unexpected end tag (h1) in table context caused voodoo mode. Line: 1 Col: 104 End tag (h1) seen too early. Expected other end tag. Line: 1 Col: 109 Unexpected end tag (h2) in table context caused voodoo mode. Line: 1 Col: 109 End tag (h2) seen too early. Expected other end tag. Line: 1 Col: 114 Unexpected end tag (h3) in table context caused voodoo mode. Line: 1 Col: 114 End tag (h3) seen too early. Expected other end tag. Line: 1 Col: 119 Unexpected end tag (h4) in table context caused voodoo mode. Line: 1 Col: 119 End tag (h4) seen too early. Expected other end tag. Line: 1 Col: 124 Unexpected end tag (h5) in table context caused voodoo mode. Line: 1 Col: 124 End tag (h5) seen too early. Expected other end tag. Line: 1 Col: 129 Unexpected end tag (h6) in table context caused voodoo mode. Line: 1 Col: 129 End tag (h6) seen too early. Expected other end tag. Line: 1 Col: 136 Unexpected end tag (body) in the table row phase. Ignored. Line: 1 Col: 141 Unexpected end tag (br) in table context caused voodoo mode. Line: 1 Col: 141 Unexpected end tag (br). Treated as br element. Line: 1 Col: 145 Unexpected end tag (a) in table context caused voodoo mode. Line: 1 Col: 145 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 151 Unexpected end tag (img) in table context caused voodoo mode. Line: 1 Col: 151 This element (img) has no end tag. Line: 1 Col: 159 Unexpected end tag (title) in table context caused voodoo mode. Line: 1 Col: 159 Unexpected end tag (title). Ignored. 
Line: 1 Col: 166 Unexpected end tag (span) in table context caused voodoo mode. Line: 1 Col: 166 Unexpected end tag (span). Ignored. Line: 1 Col: 174 Unexpected end tag (style) in table context caused voodoo mode. Line: 1 Col: 174 Unexpected end tag (style). Ignored. Line: 1 Col: 183 Unexpected end tag (script) in table context caused voodoo mode. Line: 1 Col: 183 Unexpected end tag (script). Ignored. Line: 1 Col: 196 Unexpected end tag (th). Ignored. Line: 1 Col: 201 Unexpected end tag (td). Ignored. Line: 1 Col: 206 Unexpected end tag (tr). Ignored. Line: 1 Col: 214 This element (frame) has no end tag. Line: 1 Col: 221 This element (area) has no end tag. Line: 1 Col: 228 Unexpected end tag (link). Ignored. Line: 1 Col: 236 This element (param) has no end tag. Line: 1 Col: 241 This element (hr) has no end tag. Line: 1 Col: 249 This element (input) has no end tag. Line: 1 Col: 255 Unexpected end tag (col). Ignored. Line: 1 Col: 262 Unexpected end tag (base). Ignored. Line: 1 Col: 269 Unexpected end tag (meta). Ignored. Line: 1 Col: 280 This element (basefont) has no end tag. Line: 1 Col: 290 This element (bgsound) has no end tag. Line: 1 Col: 298 This element (embed) has no end tag. Line: 1 Col: 307 This element (spacer) has no end tag. Line: 1 Col: 311 Unexpected end tag (p). Ignored. Line: 1 Col: 316 End tag (dd) seen too early. Expected other end tag. Line: 1 Col: 321 End tag (dt) seen too early. Expected other end tag. Line: 1 Col: 331 Unexpected end tag (caption). Ignored. Line: 1 Col: 342 Unexpected end tag (colgroup). Ignored. Line: 1 Col: 350 Unexpected end tag (tbody). Ignored. Line: 1 Col: 358 Unexpected end tag (tfoot). Ignored. Line: 1 Col: 366 Unexpected end tag (thead). Ignored. Line: 1 Col: 376 End tag (address) seen too early. Expected other end tag. Line: 1 Col: 389 End tag (blockquote) seen too early. Expected other end tag. Line: 1 Col: 398 End tag (center) seen too early. Expected other end tag. Line: 1 Col: 404 Unexpected end tag (dir). 
Ignored. Line: 1 Col: 410 End tag (div) seen too early. Expected other end tag. Line: 1 Col: 415 End tag (dl) seen too early. Expected other end tag. Line: 1 Col: 426 End tag (fieldset) seen too early. Expected other end tag. Line: 1 Col: 436 End tag (listing) seen too early. Expected other end tag. Line: 1 Col: 443 End tag (menu) seen too early. Expected other end tag. Line: 1 Col: 448 End tag (ol) seen too early. Expected other end tag. Line: 1 Col: 453 End tag (ul) seen too early. Expected other end tag. Line: 1 Col: 458 End tag (li) seen too early. Expected other end tag. Line: 1 Col: 465 End tag (nobr) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 471 This element (wbr) has no end tag. Line: 1 Col: 487 End tag (button) seen too early. Expected other end tag. Line: 1 Col: 497 End tag (marquee) seen too early. Expected other end tag. Line: 1 Col: 506 End tag (object) seen too early. Expected other end tag. Line: 1 Col: 524 Unexpected end tag (html). Ignored. Line: 1 Col: 524 Unexpected end tag (frameset). Ignored. Line: 1 Col: 531 Unexpected end tag (head). Ignored. Line: 1 Col: 540 Unexpected end tag (iframe). Ignored. Line: 1 Col: 548 This element (image) has no end tag. Line: 1 Col: 558 This element (isindex) has no end tag. Line: 1 Col: 568 Unexpected end tag (noembed). Ignored. Line: 1 Col: 579 Unexpected end tag (noframes). Ignored. Line: 1 Col: 590 Unexpected end tag (noscript). Ignored. Line: 1 Col: 601 Unexpected end tag (optgroup). Ignored. Line: 1 Col: 610 Unexpected end tag (option). Ignored. Line: 1 Col: 622 Unexpected end tag (plaintext). Ignored. Line: 1 Col: 633 Unexpected end tag (textarea). Ignored. #document | | | |
|
A #errors #document | | | | | | | World"; my $NS = 'xmlns="http://www.w3.org/1999/xhtml"'; can_ok $parser => 'parse_balanced_chunk'; is( $parser->parse_balanced_chunk($input, {within=>'div'})->toString, "HelloWorld", 'within div', ); is( $parser->parse_balanced_chunk($input, {within=>'td'})->toString, "WorldHello ", 'within td', ); is( $parser->parse_balanced_chunk($input, {force_within=>'td'})->toString, "Hello", 'force within td', ); my $list = $parser->parse_balanced_chunk($input, {mark_outliers=>1, within=>'td', as=>'list'}); ok( $list->get_node(1)->hasAttribute('data-perl-html-html5-parser-outlier'), 'mark outliers', ); =head1 PURPOSE Test C method. =head1 AUTHOR Toby Inkster, Etobyink@cpan.orgE =head1 COPYRIGHT AND LICENCE Copyright (C) 2012 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. HTML-HTML5-Parser-0.301/t/html5lib-fail/0000755000175000017500000000000012166545247015366 5ustar taitaiHTML-HTML5-Parser-0.301/t/html5lib-fail/tests22.dat0000644000175000017500000000634111726072634017366 0ustar taitai#data
X #errors #document | | | | | | | | | | | |
| | "X" #data
A #errors #document | | | | | | |
| id="1" | |
| id="2" | |
| id="3" | |
| id="4" | |
| id="5" | |
| id="6" | |
| id="7" | |
| id="8" | | "A" #data
A #errors #document | | | | | | |
| id="1" | |
| id="2" | |
| id="3" | |
| id="4" | |
| id="5" | |
| id="6" | |
| id="7" | |
| id="8" | |
| id="9" | "A" #data
A #errors #document | | | | | | |
| id="1" | |
| id="2" | |
| id="3" | |
| id="4" | |
| id="5" | |
| id="6" | |
| id="7" | |
| id="8" | |
| id="9" |
| id="10" | "A" #data
XTEST #errors Line: 1 Col: 6 Unexpected start tag (cite). Expected DOCTYPE. Line: 1 Col: 46 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 50 Expected closing tag. Unexpected end of file. #document | | | | | | | | | | | | | |
| | "X" | "TEST" HTML-HTML5-Parser-0.301/t/html5lib-fail/tests11.dat0000644000175000017500000004137511726072634017372 0ustar taitai#data #errors #document | | | | | | attributeName="" | attributeType="" | baseFrequency="" | baseProfile="" | calcMode="" | clipPathUnits="" | contentScriptType="" | contentStyleType="" | diffuseConstant="" | edgeMode="" | externalResourcesRequired="" | filterRes="" | filterUnits="" | glyphRef="" | gradientTransform="" | gradientUnits="" | kernelMatrix="" | kernelUnitLength="" | keyPoints="" | keySplines="" | keyTimes="" | lengthAdjust="" | limitingConeAngle="" | markerHeight="" | markerUnits="" | markerWidth="" | maskContentUnits="" | maskUnits="" | numOctaves="" | pathLength="" | patternContentUnits="" | patternTransform="" | patternUnits="" | pointsAtX="" | pointsAtY="" | pointsAtZ="" | preserveAlpha="" | preserveAspectRatio="" | primitiveUnits="" | refX="" | refY="" | repeatCount="" | repeatDur="" | requiredExtensions="" | requiredFeatures="" | specularConstant="" | specularExponent="" | spreadMethod="" | startOffset="" | stdDeviation="" | stitchTiles="" | surfaceScale="" | systemLanguage="" | tableValues="" | targetX="" | targetY="" | textLength="" | viewBox="" | viewTarget="" | xChannelSelector="" | yChannelSelector="" | zoomAndPan="" #data #errors #document | | | | | | attributeName="" | attributeType="" | baseFrequency="" | baseProfile="" | calcMode="" | clipPathUnits="" | contentScriptType="" | contentStyleType="" | diffuseConstant="" | edgeMode="" | externalResourcesRequired="" | filterRes="" | filterUnits="" | glyphRef="" | gradientTransform="" | gradientUnits="" | kernelMatrix="" | kernelUnitLength="" | keyPoints="" | keySplines="" | keyTimes="" | lengthAdjust="" | limitingConeAngle="" | markerHeight="" | markerUnits="" | markerWidth="" | maskContentUnits="" | maskUnits="" | numOctaves="" | pathLength="" | patternContentUnits="" | patternTransform="" | patternUnits="" | pointsAtX="" | pointsAtY="" | pointsAtZ="" | 
preserveAlpha="" | preserveAspectRatio="" | primitiveUnits="" | refX="" | refY="" | repeatCount="" | repeatDur="" | requiredExtensions="" | requiredFeatures="" | specularConstant="" | specularExponent="" | spreadMethod="" | startOffset="" | stdDeviation="" | stitchTiles="" | surfaceScale="" | systemLanguage="" | tableValues="" | targetX="" | targetY="" | textLength="" | viewBox="" | viewTarget="" | xChannelSelector="" | yChannelSelector="" | zoomAndPan="" #data #errors #document | | | | | | attributeName="" | attributeType="" | baseFrequency="" | baseProfile="" | calcMode="" | clipPathUnits="" | contentScriptType="" | contentStyleType="" | diffuseConstant="" | edgeMode="" | externalResourcesRequired="" | filterRes="" | filterUnits="" | glyphRef="" | gradientTransform="" | gradientUnits="" | kernelMatrix="" | kernelUnitLength="" | keyPoints="" | keySplines="" | keyTimes="" | lengthAdjust="" | limitingConeAngle="" | markerHeight="" | markerUnits="" | markerWidth="" | maskContentUnits="" | maskUnits="" | numOctaves="" | pathLength="" | patternContentUnits="" | patternTransform="" | patternUnits="" | pointsAtX="" | pointsAtY="" | pointsAtZ="" | preserveAlpha="" | preserveAspectRatio="" | primitiveUnits="" | refX="" | refY="" | repeatCount="" | repeatDur="" | requiredExtensions="" | requiredFeatures="" | specularConstant="" | specularExponent="" | spreadMethod="" | startOffset="" | stdDeviation="" | stitchTiles="" | surfaceScale="" | systemLanguage="" | tableValues="" | targetX="" | targetY="" | textLength="" | viewBox="" | viewTarget="" | xChannelSelector="" | yChannelSelector="" | zoomAndPan="" #data #errors #document | | | | | | attributename="" | attributetype="" | basefrequency="" | baseprofile="" | calcmode="" | clippathunits="" | contentscripttype="" | contentstyletype="" | diffuseconstant="" | edgemode="" | externalresourcesrequired="" | filterres="" | filterunits="" | glyphref="" | gradienttransform="" | gradientunits="" | kernelmatrix="" | kernelunitlength="" 
| keypoints="" | keysplines="" | keytimes="" | lengthadjust="" | limitingconeangle="" | markerheight="" | markerunits="" | markerwidth="" | maskcontentunits="" | maskunits="" | numoctaves="" | pathlength="" | patterncontentunits="" | patterntransform="" | patternunits="" | pointsatx="" | pointsaty="" | pointsatz="" | preservealpha="" | preserveaspectratio="" | primitiveunits="" | refx="" | refy="" | repeatcount="" | repeatdur="" | requiredextensions="" | requiredfeatures="" | specularconstant="" | specularexponent="" | spreadmethod="" | startoffset="" | stddeviation="" | stitchtiles="" | surfacescale="" | systemlanguage="" | tablevalues="" | targetx="" | targety="" | textlength="" | viewbox="" | viewtarget="" | xchannelselector="" | ychannelselector="" | zoomandpan="" #data #errors #document | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #data #errors #document | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #data #errors #document | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #data #errors #document | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | #data #errors #document | | | | | | �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/t/html5lib-fail/domjs-unsafe.dat��������������������������������������������0000644�0001750�0001750�00000014757�11726072634�020465� 0����������������������������������������������������������������������������������������������������ustar 
�tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#data foo bar #errors #document | | | | | "foo bar" #data foo bar #errors #document | | | | | "foo bar" #data foo bar #errors #document | | | | | "foo bar" #data #errors #document | | | #errors #document | | | #errors #document | | | #errors #document | | | #errors #document | | | #errors #document | | | #errors #document | | | #errors #document | | | #errors #document | | | #errors #document | | | --> x #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | | | | | | | | | | | | | | | | | | | | | | | | #errors #document | | | | | | | | | | | | | #errors #document | | | | #errors #document | | | | #errors #document | | | | #errors #document | | | | #errors #document | | | | #errors #document | | | | #errors #document | | | | #errors #document | | | | #errors #document | | | | #errors Line: 1 Col: 59 Unexpected end tag (script). #document | | | | #errors #document | | | | | "<body><script><!--...</script></body>" | <body> #data <!doctype html><textarea><!--<textarea></textarea>--></textarea> #errors Line: 1 Col: 64 Unexpected end tag (textarea). #document | <!DOCTYPE html> | <html> | <head> | <body> | <textarea> | "<!--<textarea>" | "-->" #data <!doctype html><textarea>&lt;/textarea></textarea> #errors #document | <!DOCTYPE html> | <html> | <head> | <body> | <textarea> | "</textarea>" #data <!doctype html><textarea>&lt;</textarea> #errors #document | <!DOCTYPE html> | <html> | <head> | <body> | <textarea> | "<" #data <!doctype html><textarea>a&lt;b</textarea> #errors #document | <!DOCTYPE html> | <html> | <head> | <body> | <textarea> | "a<b" #data <!doctype html><iframe><!--<iframe></iframe>--></iframe> #errors Line: 1 Col: 56 Unexpected end tag (iframe). 
#document | <!DOCTYPE html> | <html> | <head> | <body> | <iframe> | "<!--<iframe>" | "-->" #data <!doctype html><iframe>...<!--X->...<!--/X->...</iframe> #errors #document | <!DOCTYPE html> | <html> | <head> | <body> | <iframe> | "...<!--X->...<!--/X->..." #data <!doctype html><xmp><!--<xmp></xmp>--></xmp> #errors Line: 1 Col: 44 Unexpected end tag (xmp). #document | <!DOCTYPE html> | <html> | <head> | <body> | <xmp> | "<!--<xmp>" | "-->" #data <!doctype html><noembed><!--<noembed></noembed>--></noembed> #errors Line: 1 Col: 60 Unexpected end tag (noembed). #document | <!DOCTYPE html> | <html> | <head> | <body> | <noembed> | "<!--<noembed>" | "-->" #data <script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 8 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | <body> #data <script>a #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 9 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "a" | <body> #data <script>< #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 9 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<" | <body> #data <script></ #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 10 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</" | <body> #data <script></S #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 11 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</S" | <body> #data <script></SC #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 12 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</SC" | <body> #data <script></SCR #errors Line: 1 Col: 8 Unexpected start tag (script). 
Expected DOCTYPE. Line: 1 Col: 13 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</SCR" | <body> #data <script></SCRI #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</SCRI" | <body> #data <script></SCRIP #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 15 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</SCRIP" | <body> #data <script></SCRIPT #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 16 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</SCRIPT" | <body> #data <script></SCRIPT #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 17 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | <body> #data <script></s #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 11 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</s" | <body> #data <script></sc #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 12 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</sc" | <body> #data <script></scr #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 13 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</scr" | <body> #data <script></scri #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</scri" | <body> #data <script></scrip #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. 
Line: 1 Col: 15 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</scrip" | <body> #data <script></script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 16 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "</script" | <body> #data <script></script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 17 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | <body> #data <script><! #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 10 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!" | <body> #data <script><!a #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 11 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!a" | <body> #data <script><!- #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 11 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!-" | <body> #data <script><!-a #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 12 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!-a" | <body> #data <script><!-- #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 12 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--" | <body> #data <script><!--a #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 13 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--a" | <body> #data <script><!--< #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 13 Unexpected end of file. 
Expected end tag (script). #document | <html> | <head> | <script> | "<!--<" | <body> #data <script><!--<a #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<a" | <body> #data <script><!--</ #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--</" | <body> #data <script><!--</script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 20 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--</script" | <body> #data <script><!--</script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 21 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--" | <body> #data <script><!--<s #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 14 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<s" | <body> #data <script><!--<script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 19 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script" | <body> #data <script><!--<script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 20 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script " | <body> #data <script><!--<script < #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 21 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script <" | <body> #data <script><!--<script <a #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. 
Line: 1 Col: 22 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script <a" | <body> #data <script><!--<script </ #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 22 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </" | <body> #data <script><!--<script </s #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 23 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </s" | <body> #data <script><!--<script </script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 28 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script" | <body> #data <script><!--<script </scripta #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </scripta" | <body> #data <script><!--<script </script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script " | <body> #data <script><!--<script </script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script>" | <body> #data <script><!--<script </script/ #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 29 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script/" | <body> #data <script><!--<script </script < #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 30 Unexpected end of file. 
Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script <" | <body> #data <script><!--<script </script <a #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 31 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script <a" | <body> #data <script><!--<script </script </ #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 31 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script </" | <body> #data <script><!--<script </script </script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script </script" | <body> #data <script><!--<script </script </script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script " | <body> #data <script><!--<script </script </script/ #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 38 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script </script " | <body> #data <script><!--<script </script </script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | <html> | <head> | <script> | "<!--<script </script " | <body> #data <script><!--<script - #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 21 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script -" | <body> #data <script><!--<script -a #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 22 Unexpected end of file. Expected end tag (script). 
#document | <html> | <head> | <script> | "<!--<script -a" | <body> #data <script><!--<script -- #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 22 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script --" | <body> #data <script><!--<script --a #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 23 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script --a" | <body> #data <script><!--<script --> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 23 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script -->" | <body> #data <script><!--<script -->< #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 24 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script --><" | <body> #data <script><!--<script --></ #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 25 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script --></" | <body> #data <script><!--<script --></script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 31 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script --></script" | <body> #data <script><!--<script --></script #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 32 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script -->" | <body> #data <script><!--<script --></script/ #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 32 Unexpected end of file. Expected end tag (script). 
#document | <html> | <head> | <script> | "<!--<script -->" | <body> #data <script><!--<script --></script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | <html> | <head> | <script> | "<!--<script -->" | <body> #data <script><!--<script><\/script>--></script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | <html> | <head> | <script> | "<!--<script><\/script>-->" | <body> #data <script><!--<script></scr'+'ipt>--></script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | <html> | <head> | <script> | "<!--<script></scr'+'ipt>-->" | <body> #data <script><!--<script></script><script></script></script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | <html> | <head> | <script> | "<!--<script></script><script></script>" | <body> #data <script><!--<script></script><script></script>--><!--</script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | <html> | <head> | <script> | "<!--<script></script><script></script>--><!--" | <body> #data <script><!--<script></script><script></script>-- ></script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | <html> | <head> | <script> | "<!--<script></script><script></script>-- >" | <body> #data <script><!--<script></script><script></script>- -></script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | <html> | <head> | <script> | "<!--<script></script><script></script>- ->" | <body> #data <script><!--<script></script><script></script>- - ></script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. #document | <html> | <head> | <script> | "<!--<script></script><script></script>- - >" | <body> #data <script><!--<script></script><script></script>-></script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. 
#document | <html> | <head> | <script> | "<!--<script></script><script></script>->" | <body> #data <script><!--<script>--!></script>X #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 34 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script>--!></script>X" | <body> #data <script><!--<scr'+'ipt></script>--></script> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 44 Unexpected end tag (script). #document | <html> | <head> | <script> | "<!--<scr'+'ipt>" | <body> | "-->" #data <script><!--<script></scr'+'ipt></script>X #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 42 Unexpected end of file. Expected end tag (script). #document | <html> | <head> | <script> | "<!--<script></scr'+'ipt></script>X" | <body> #data <style><!--<style></style>--></style> #errors Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. Line: 1 Col: 37 Unexpected end tag (style). #document | <html> | <head> | <style> | "<!--<style>" | <body> | "-->" #data <style><!--</style>X #errors Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. #document | <html> | <head> | <style> | "<!--" | <body> | "X" #data <style><!--...</style>...--></style> #errors Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. Line: 1 Col: 36 Unexpected end tag (style). #document | <html> | <head> | <style> | "<!--..." | <body> | "...-->" #data <style><!--<br><html xmlns:v="urn:schemas-microsoft-com:vml"><!--[if !mso]><style></style>X #errors Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. #document | <html> | <head> | <style> | "<!--<br><html xmlns:v="urn:schemas-microsoft-com:vml"><!--[if !mso]><style>" | <body> | "X" #data <style><!--...<style><!--...--!></style>--></style> #errors Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. Line: 1 Col: 51 Unexpected end tag (style). 
#document | <html> | <head> | <style> | "<!--...<style><!--...--!>" | <body> | "-->" #data <style><!--...</style><!-- --><style>@import ...</style> #errors Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. #document | <html> | <head> | <style> | "<!--..." | <!-- --> | <style> | "@import ..." | <body> #data <style>...<style><!--...</style><!-- --></style> #errors Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. Line: 1 Col: 48 Unexpected end tag (style). #document | <html> | <head> | <style> | "...<style><!--..." | <!-- --> | <body> #data <style>...<!--[if IE]><style>...</style>X #errors Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. #document | <html> | <head> | <style> | "...<!--[if IE]><style>..." | <body> | "X" #data <title><!--<title></title>--></title> #errors Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE. Line: 1 Col: 37 Unexpected end tag (title). #document | <html> | <head> | <title> | "<!--<title>" | <body> | "-->" #data <title>&lt;/title></title> #errors Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE. #document | <html> | <head> | <title> | "</title>" | <body> #data <title>foo/title><link></head><body>X #errors Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE. Line: 1 Col: 37 Unexpected end of file. Expected end tag (title). #document | <html> | <head> | <title> | "foo/title><link></head><body>X" | <body> #data <noscript><!--<noscript></noscript>--></noscript> #errors Line: 1 Col: 10 Unexpected start tag (noscript). Expected DOCTYPE. Line: 1 Col: 49 Unexpected end tag (noscript). #document | <html> | <head> | <noscript> | "<!--<noscript>" | <body> | "-->" #data <noscript><!--</noscript>X<noscript>--></noscript> #errors Line: 1 Col: 10 Unexpected start tag (noscript). Expected DOCTYPE. #document | <html> | <head> | <noscript> | "<!--" | <body> | "X" | <noscript> | "-->" #data <noscript><iframe></noscript>X #errors Line: 1 Col: 10 Unexpected start tag (noscript). 
Expected DOCTYPE. #document | <html> | <head> | <noscript> | "<iframe>" | <body> | "X" #data <noframes><!--<noframes>--> #errors Line: 1 Col: 10 Unexpected start tag (noframes). Expected DOCTYPE. Line: 1 Col: 49 Unexpected end tag (noframes). #document | | | | "<!--<noframes>" | <body> | "-->" #data <noframes><body><script><!--...</script></body> #errors Line: 1 Col: 10 Unexpected start tag (noframes). Expected DOCTYPE. #document | | | | "<body><script><!--...</script></body>" | <body> #data <textarea><!--<textarea></textarea>--></textarea> #errors Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE. Line: 1 Col: 49 Unexpected end tag (textarea). #document | <html> | <head> | <body> | <textarea> | "<!--<textarea>" | "-->" #data <textarea>&lt;/textarea></textarea> #errors Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE. #document | <html> | <head> | <body> | <textarea> | "</textarea>" #data <iframe><!--<iframe></iframe>--></iframe> #errors Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE. Line: 1 Col: 41 Unexpected end tag (iframe). #document | <html> | <head> | <body> | <iframe> | "<!--<iframe>" | "-->" #data <iframe>...<!--X->...<!--/X->...</iframe> #errors Line: 1 Col: 8 Unexpected start tag (iframe). Expected DOCTYPE. #document | <html> | <head> | <body> | <iframe> | "...<!--X->...<!--/X->..." #data <xmp><!--<xmp></xmp>--></xmp> #errors Line: 1 Col: 5 Unexpected start tag (xmp). Expected DOCTYPE. Line: 1 Col: 29 Unexpected end tag (xmp). #document | <html> | <head> | <body> | <xmp> | "<!--<xmp>" | "-->" #data <noembed><!--<noembed></noembed>--></noembed> #errors Line: 1 Col: 9 Unexpected start tag (noembed). Expected DOCTYPE. Line: 1 Col: 45 Unexpected end tag (noembed). #document | <html> | <head> | <body> | <noembed> | "<!--<noembed>" | "-->" #data <!doctype html><table> #errors Line 2 Col 0 Unexpected end of file. Expected table content. 
#document | <!DOCTYPE html> | <html> | <head> | <body> | <table> | " " #data <!doctype html><table><td><span><font></span><span> #errors Line 1 Col 26 Unexpected table cell start tag (td) in the table body phase. Line 1 Col 45 Unexpected end tag (span). Line 1 Col 51 Expected closing tag. Unexpected end of file. #document | <!DOCTYPE html> | <html> | <head> | <body> | <table> | <tbody> | <tr> | <td> | <span> | <font> | <font> | <span> #data <!doctype html><form><table></form><form></table></form> #errors 35: Stray end tag “form”. 41: Start tag “form” seen in “table”. #document | <!DOCTYPE html> | <html> | <head> | <body> | <form> | <table> | <form> �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/t/html5lib-fail/tests1.dat��������������������������������������������������0000644�0001750�0001750�00000152434�11726072634�017310� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#data Test #errors Line: 1 Col: 4 Unexpected non-space characters. Expected DOCTYPE. #document | <html> | <head> | <body> | "Test" #data <p>One<p>Two #errors Line: 1 Col: 3 Unexpected start tag (p). Expected DOCTYPE. #document | <html> | <head> | <body> | <p> | "One" | <p> | "Two" #data Line1<br>Line2<br>Line3<br>Line4 #errors Line: 1 Col: 5 Unexpected non-space characters. Expected DOCTYPE. 
#document | <html> | <head> | <body> | "Line1" | <br> | "Line2" | <br> | "Line3" | <br> | "Line4" #data <html> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. #document | <html> | <head> | <body> #data <head> #errors Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. #document | <html> | <head> | <body> #data <body> #errors Line: 1 Col: 6 Unexpected start tag (body). Expected DOCTYPE. #document | <html> | <head> | <body> #data <html><head> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. #document | <html> | <head> | <body> #data <html><head></head> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. #document | <html> | <head> | <body> #data <html><head></head><body> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. #document | <html> | <head> | <body> #data <html><head></head><body></body> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. #document | <html> | <head> | <body> #data <html><head><body></body></html> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. #document | <html> | <head> | <body> #data <html><head></body></html> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. Line: 1 Col: 19 Unexpected end tag (body). Line: 1 Col: 26 Unexpected end tag (html). #document | <html> | <head> | <body> #data <html><head><body></html> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. #document | <html> | <head> | <body> #data <html><body></html> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. #document | <html> | <head> | <body> #data <body></html> #errors Line: 1 Col: 6 Unexpected start tag (body). Expected DOCTYPE. #document | <html> | <head> | <body> #data <head></html> #errors Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. Line: 1 Col: 13 Unexpected end tag (html). Ignored. 
#document | <html> | <head> | <body> #data </head> #errors Line: 1 Col: 7 Unexpected end tag (head). Expected DOCTYPE. #document | <html> | <head> | <body> #data </body> #errors Line: 1 Col: 7 Unexpected end tag (body). Expected DOCTYPE. Line: 1 Col: 7 Unexpected end tag (body) after the (implied) root element. #document | <html> | <head> | <body> #data </html> #errors Line: 1 Col: 7 Unexpected end tag (html). Expected DOCTYPE. Line: 1 Col: 7 Unexpected end tag (html) after the (implied) root element. #document | <html> | <head> | <body> #data <b><table><td><i></table> #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 14 Unexpected table cell start tag (td) in the table body phase. Line: 1 Col: 25 Got table cell end tag (td) while required end tags are missing. Line: 1 Col: 25 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <b> | <table> | <tbody> | <tr> | <td> | <i> #data <b><table><td></b><i></table>X #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 14 Unexpected table cell start tag (td) in the table body phase. Line: 1 Col: 18 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 29 Got table cell end tag (td) while required end tags are missing. Line: 1 Col: 30 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <b> | <table> | <tbody> | <tr> | <td> | <i> | "X" #data <h1>Hello<h2>World #errors 4: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”. 13: Heading cannot be a child of another heading. 18: End of file seen and there were open elements. #document | <html> | <head> | <body> | <h1> | "Hello" | <h2> | "World" #data <a><p>X<a>Y</a>Z</p></a> #errors Line: 1 Col: 3 Unexpected start tag (a). Expected DOCTYPE. Line: 1 Col: 10 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 10 End tag (a) violates step 1, paragraph 3 of the adoption agency algorithm. 
Line: 1 Col: 24 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. #document | <html> | <head> | <body> | <a> | <p> | <a> | "X" | <a> | "Y" | "Z" #data <b><button>foo</b>bar #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 15 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. #document | <html> | <head> | <body> | <b> | <button> | <b> | "foo" | "bar" #data <!DOCTYPE html><span><button>foo</span>bar #errors 39: End tag “span” seen but there were unclosed elements. #document | <!DOCTYPE html> | <html> | <head> | <body> | <span> | <button> | "foobar" #data <p><b><div><marquee></p></b></div>X #errors Line: 1 Col: 3 Unexpected start tag (p). Expected DOCTYPE. Line: 1 Col: 11 Unexpected end tag (p). Ignored. Line: 1 Col: 24 Unexpected end tag (p). Ignored. Line: 1 Col: 28 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 34 End tag (div) seen too early. Expected other end tag. Line: 1 Col: 35 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <p> | <b> | <div> | <b> | <marquee> | <p> | "X" #data <script><div></script></div><title><p></title><p><p> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 28 Unexpected end tag (div). Ignored. #document | <html> | <head> | <script> | "<div>" | <title> | "<p>" | <body> | <p> | <p> #data <!--><div>--<!--> #errors Line: 1 Col: 5 Incorrect comment. Line: 1 Col: 10 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 17 Incorrect comment. Line: 1 Col: 17 Expected closing tag. Unexpected end of file. #document | <!-- --> | <html> | <head> | <body> | <div> | "--" | <!-- --> #data <p><hr></p> #errors Line: 1 Col: 3 Unexpected start tag (p). Expected DOCTYPE. Line: 1 Col: 11 Unexpected end tag (p). Ignored. 
#document | <html> | <head> | <body> | <p> | <hr> | <p> #data <select><b><option><select><option></b></select>X #errors Line: 1 Col: 8 Unexpected start tag (select). Expected DOCTYPE. Line: 1 Col: 11 Unexpected start tag token (b) in the select phase. Ignored. Line: 1 Col: 27 Unexpected select start tag in the select phase treated as select end tag. Line: 1 Col: 39 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 48 Unexpected end tag (select). Ignored. Line: 1 Col: 49 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <select> | <option> | <option> | "X" #data <a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y #errors Line: 1 Col: 3 Unexpected start tag (a). Expected DOCTYPE. Line: 1 Col: 14 Unexpected table cell start tag (td) in the table body phase. Line: 1 Col: 35 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 40 Got table cell end tag (td) while required end tags are missing. Line: 1 Col: 43 Unexpected start tag (a) in table context caused voodoo mode. Line: 1 Col: 43 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 43 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 51 Unexpected implied end tag (a) in the table phase. Line: 1 Col: 63 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 64 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <a> | <a> | <table> | <tbody> | <tr> | <td> | <a> | <table> | <a> | <a> | <b> | "X" | "C" | <a> | "Y" #data <a X>0<b>1<a Y>2 #errors Line: 1 Col: 5 Unexpected start tag (a). Expected DOCTYPE. Line: 1 Col: 15 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 15 End tag (a) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 16 Expected closing tag. Unexpected end of file. 
#document | <html> | <head> | <body> | <a> | x="" | "0" | <b> | "1" | <b> | <a> | y="" | "2" #data <!-----><font><div>hello<table>excite!<b>me!<th><i>please!</tr><!--X--> #errors Line: 1 Col: 7 Unexpected '-' after '--' found in comment. Line: 1 Col: 14 Unexpected start tag (font). Expected DOCTYPE. Line: 1 Col: 38 Unexpected non-space characters in table context caused voodoo mode. Line: 1 Col: 41 Unexpected start tag (b) in table context caused voodoo mode. Line: 1 Col: 48 Unexpected implied end tag (b) in the table phase. Line: 1 Col: 48 Unexpected table cell start tag (th) in the table body phase. Line: 1 Col: 63 Got table cell end tag (th) while required end tags are missing. Line: 1 Col: 71 Unexpected end of file. Expected table content. #document | <!-- - --> | <html> | <head> | <body> | <font> | <div> | "helloexcite!" | <b> | "me!" | <table> | <tbody> | <tr> | <th> | <i> | "please!" | <!-- X --> #data <!DOCTYPE html><li>hello<li>world<ul>how<li>do</ul>you</body><!--do--> #errors Line: 1 Col: 61 Unexpected end tag (li). Missing end tag (body). #document | <!DOCTYPE html> | <html> | <head> | <body> | <li> | "hello" | <li> | "world" | <ul> | "how" | <li> | "do" | "you" | <!-- do --> #data <!DOCTYPE html>A<option>B<optgroup>C<select>D</option>E #errors Line: 1 Col: 54 Unexpected end tag (option) in the select phase. Ignored. Line: 1 Col: 55 Expected closing tag. Unexpected end of file. #document | <!DOCTYPE html> | <html> | <head> | <body> | "A" | <option> | "B" | <optgroup> | "C" | <select> | "DE" #data < #errors Line: 1 Col: 1 Expected tag name. Got something else instead Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE. #document | <html> | <head> | <body> | "<" #data <# #errors Line: 1 Col: 1 Expected tag name. Got something else instead Line: 1 Col: 1 Unexpected non-space characters. Expected DOCTYPE. #document | <html> | <head> | <body> | "<#" #data </ #errors Line: 1 Col: 2 Expected closing tag. Unexpected end of file. 
Line: 1 Col: 2 Unexpected non-space characters. Expected DOCTYPE. #document | <html> | <head> | <body> | "</" #data </# #errors Line: 1 Col: 2 Expected closing tag. Unexpected character '#' found. Line: 1 Col: 3 Unexpected End of file. Expected DOCTYPE. #document | <!-- # --> | <html> | <head> | <body> #data <? #errors Line: 1 Col: 1 Expected tag name. Got '?' instead. (HTML doesn't support processing instructions.) Line: 1 Col: 2 Unexpected End of file. Expected DOCTYPE. #document | <!-- ? --> | <html> | <head> | <body> #data <?# #errors Line: 1 Col: 1 Expected tag name. Got '?' instead. (HTML doesn't support processing instructions.) Line: 1 Col: 3 Unexpected End of file. Expected DOCTYPE. #document | <!-- ?# --> | <html> | <head> | <body> #data <! #errors Line: 1 Col: 2 Expected '--' or 'DOCTYPE'. Not found. Line: 1 Col: 2 Unexpected End of file. Expected DOCTYPE. #document | <!-- --> | <html> | <head> | <body> #data <!# #errors Line: 1 Col: 3 Expected '--' or 'DOCTYPE'. Not found. Line: 1 Col: 3 Unexpected End of file. Expected DOCTYPE. #document | <!-- # --> | <html> | <head> | <body> #data <?COMMENT?> #errors Line: 1 Col: 1 Expected tag name. Got '?' instead. (HTML doesn't support processing instructions.) Line: 1 Col: 11 Unexpected End of file. Expected DOCTYPE. #document | <!-- ?COMMENT? --> | <html> | <head> | <body> #data <!COMMENT> #errors Line: 1 Col: 2 Expected '--' or 'DOCTYPE'. Not found. Line: 1 Col: 10 Unexpected End of file. Expected DOCTYPE. #document | <!-- COMMENT --> | <html> | <head> | <body> #data </ COMMENT > #errors Line: 1 Col: 2 Expected closing tag. Unexpected character ' ' found. Line: 1 Col: 12 Unexpected End of file. Expected DOCTYPE. #document | <!-- COMMENT --> | <html> | <head> | <body> #data <?COM--MENT?> #errors Line: 1 Col: 1 Expected tag name. Got '?' instead. (HTML doesn't support processing instructions.) Line: 1 Col: 13 Unexpected End of file. Expected DOCTYPE. #document | <!-- ?COM--MENT? 
--> | <html> | <head> | <body> #data <!COM--MENT> #errors Line: 1 Col: 2 Expected '--' or 'DOCTYPE'. Not found. Line: 1 Col: 12 Unexpected End of file. Expected DOCTYPE. #document | <!-- COM--MENT --> | <html> | <head> | <body> #data </ COM--MENT > #errors Line: 1 Col: 2 Expected closing tag. Unexpected character ' ' found. Line: 1 Col: 14 Unexpected End of file. Expected DOCTYPE. #document | <!-- COM--MENT --> | <html> | <head> | <body> #data <!DOCTYPE html><style> EOF #errors Line: 1 Col: 26 Unexpected end of file. Expected end tag (style). #document | <!DOCTYPE html> | <html> | <head> | <style> | " EOF" | <body> #data <!DOCTYPE html><script> <!-- </script> --> </script> EOF #errors #document | <!DOCTYPE html> | <html> | <head> | <script> | " <!-- " | " " | <body> | "--> EOF" #data <b><p></b>TEST #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 10 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. #document | <html> | <head> | <body> | <b> | <p> | <b> | "TEST" #data <p id=a><b><p id=b></b>TEST #errors Line: 1 Col: 8 Unexpected start tag (p). Expected DOCTYPE. Line: 1 Col: 19 Unexpected end tag (p). Ignored. Line: 1 Col: 23 End tag (b) violates step 1, paragraph 2 of the adoption agency algorithm. #document | <html> | <head> | <body> | <p> | id="a" | <b> | <p> | id="b" | "TEST" #data <b id=a><p><b id=b></p></b>TEST #errors Line: 1 Col: 8 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 23 Unexpected end tag (p). Ignored. Line: 1 Col: 27 End tag (b) violates step 1, paragraph 2 of the adoption agency algorithm. Line: 1 Col: 31 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <b> | id="a" | <p> | <b> | id="b" | "TEST" #data <!DOCTYPE html><title>U-test</title><body><div><p>Test<u></p></div></body> #errors Line: 1 Col: 61 Unexpected end tag (p). Ignored. 
#document | <!DOCTYPE html> | <html> | <head> | <title> | "U-test" | <body> | <div> | <p> | "Test" | <u> #data <!DOCTYPE html><font><table></font></table></font> #errors Line: 1 Col: 35 Unexpected end tag (font) in table context caused voodoo mode. Line: 1 Col: 35 End tag (font) violates step 1, paragraph 1 of the adoption agency algorithm. #document | <!DOCTYPE html> | <html> | <head> | <body> | <font> | <table> #data <font><p>hello<b>cruel</font>world #errors Line: 1 Col: 6 Unexpected start tag (font). Expected DOCTYPE. Line: 1 Col: 29 End tag (font) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 29 End tag (font) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 34 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <font> | <p> | <font> | "hello" | <b> | "cruel" | <b> | "world" #data <b>Test</i>Test #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 11 End tag (i) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 15 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <b> | "TestTest" #data <b>A<cite>B<div>C #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 17 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <b> | "A" | <cite> | "B" | <div> | "C" #data <b>A<cite>B<div>C</cite>D #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 24 Unexpected end tag (cite). Ignored. Line: 1 Col: 25 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <b> | "A" | <cite> | "B" | <div> | "CD" #data <b>A<cite>B<div>C</b>D #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 21 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 22 Expected closing tag. Unexpected end of file. 
#document | <html> | <head> | <body> | <b> | "A" | <cite> | "B" | <div> | <b> | "C" | "D" #data #errors Line: 1 Col: 0 Unexpected End of file. Expected DOCTYPE. #document | <html> | <head> | <body> #data <DIV> #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 5 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> #data <DIV> abc #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 9 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc" #data <DIV> abc <B> #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 13 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc " | <b> #data <DIV> abc <B> def #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 17 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc " | <b> | " def" #data <DIV> abc <B> def <I> #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 21 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> #data <DIV> abc <B> def <I> ghi #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 25 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> | " ghi" #data <DIV> abc <B> def <I> ghi <P> #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 29 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> | " ghi " | <p> #data <DIV> abc <B> def <I> ghi <P> jkl #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 33 Expected closing tag. Unexpected end of file. 
#document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> | " ghi " | <p> | " jkl" #data <DIV> abc <B> def <I> ghi <P> jkl </B> #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 38 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 38 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> | " ghi " | <i> | <p> | <b> | " jkl " #data <DIV> abc <B> def <I> ghi <P> jkl </B> mno #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 38 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 42 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> | " ghi " | <i> | <p> | <b> | " jkl " | " mno" #data <DIV> abc <B> def <I> ghi <P> jkl </B> mno </I> #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 38 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 47 End tag (i) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 47 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> | " ghi " | <i> | <p> | <i> | <b> | " jkl " | " mno " #data <DIV> abc <B> def <I> ghi <P> jkl </B> mno </I> pqr #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 38 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 47 End tag (i) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 51 Expected closing tag. Unexpected end of file. 
#document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> | " ghi " | <i> | <p> | <i> | <b> | " jkl " | " mno " | " pqr" #data <DIV> abc <B> def <I> ghi <P> jkl </B> mno </I> pqr </P> #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 38 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 47 End tag (i) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 56 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> | " ghi " | <i> | <p> | <i> | <b> | " jkl " | " mno " | " pqr " #data <DIV> abc <B> def <I> ghi <P> jkl </B> mno </I> pqr </P> stu #errors Line: 1 Col: 5 Unexpected start tag (div). Expected DOCTYPE. Line: 1 Col: 38 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 47 End tag (i) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 60 Expected closing tag. Unexpected end of file. 
#document | <html> | <head> | <body> | <div> | " abc " | <b> | " def " | <i> | " ghi " | <i> | <p> | <i> | <b> | " jkl " | " mno " | " pqr " | " stu" #data <test attribute----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------> #errors Line: 1 Col: 1040 Unexpected start tag (test). Expected DOCTYPE. Line: 1 Col: 1040 Expected closing tag. Unexpected end of file. 
#document | <html> | <head> | <body> | <test> | attribute----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------="" #data <a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe #errors Line: 1 Col: 15 Unexpected start tag (a). Expected DOCTYPE. Line: 1 Col: 39 Unexpected start tag (a) in table context caused voodoo mode. Line: 1 Col: 39 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 39 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 45 Unexpected implied end tag (a) in the table phase. Line: 1 Col: 68 Unexpected implied end tag (a) in the table phase. Line: 1 Col: 71 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <a> | href="blah" | "aba" | <a> | href="foo" | "br" | <a> | href="foo" | "x" | <table> | <tbody> | <tr> | <td> | <a> | href="foo" | "aoe" #data <a href="blah">aba<table><tr><td><a href="foo">br</td></tr>x</table>aoe #errors Line: 1 Col: 15 Unexpected start tag (a). Expected DOCTYPE. 
Line: 1 Col: 54 Got table cell end tag (td) while required end tags are missing. Line: 1 Col: 60 Unexpected non-space characters in table context caused voodoo mode. Line: 1 Col: 71 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <a> | href="blah" | "abax" | <table> | <tbody> | <tr> | <td> | <a> | href="foo" | "br" | "aoe" #data <table><a href="blah">aba<tr><td><a href="foo">br</td></tr>x</table>aoe #errors Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. Line: 1 Col: 22 Unexpected start tag (a) in table context caused voodoo mode. Line: 1 Col: 29 Unexpected implied end tag (a) in the table phase. Line: 1 Col: 54 Got table cell end tag (td) while required end tags are missing. Line: 1 Col: 68 Unexpected implied end tag (a) in the table phase. Line: 1 Col: 71 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <a> | href="blah" | "aba" | <a> | href="blah" | "x" | <table> | <tbody> | <tr> | <td> | <a> | href="foo" | "br" | <a> | href="blah" | "aoe" #data <a href=a>aa<marquee>aa<a href=b>bb</marquee>aa #errors Line: 1 Col: 10 Unexpected start tag (a). Expected DOCTYPE. Line: 1 Col: 45 End tag (marquee) seen too early. Expected other end tag. Line: 1 Col: 47 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <a> | href="a" | "aa" | <marquee> | "aa" | <a> | href="b" | "bb" | "aa" #data <wbr><strike><code></strike><code><strike></code> #errors Line: 1 Col: 5 Unexpected start tag (wbr). Expected DOCTYPE. Line: 1 Col: 28 End tag (strike) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 49 Unexpected end tag (code). Ignored. #document | <html> | <head> | <body> | <wbr> | <strike> | <code> | <code> | <code> | <strike> #data <!DOCTYPE html><spacer>foo #errors 26: End of file seen and there were open elements. 
#document | <!DOCTYPE html> | <html> | <head> | <body> | <spacer> | "foo" #data <title><meta></title><link><title><meta></title> #errors Line: 1 Col: 7 Unexpected start tag (title). Expected DOCTYPE. #document | <html> | <head> | <title> | "<meta>" | <link> | <title> | "<meta>" | <body> #data <style><!--</style><meta><script>--><link></script> #errors Line: 1 Col: 7 Unexpected start tag (style). Expected DOCTYPE. Line: 1 Col: 51 Unexpected end of file. Expected end tag (style). #document | <html> | <head> | <style> | "<!--" | <meta> | <script> | "--><link>" | <body> #data <head><meta></head><link> #errors Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. Line: 1 Col: 25 Unexpected start tag (link) that can be in head. Moved. #document | <html> | <head> | <meta> | <link> | <body> #data <table><tr><tr><td><td><span><th><span>X</table> #errors Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. Line: 1 Col: 33 Got table cell end tag (td) while required end tags are missing. Line: 1 Col: 48 Got table cell end tag (th) while required end tags are missing. #document | <html> | <head> | <body> | <table> | <tbody> | <tr> | <tr> | <td> | <td> | <span> | <th> | <span> | "X" #data <body><body><base><link><meta><title><p></title><body><p></body> #errors Line: 1 Col: 6 Unexpected start tag (body). Expected DOCTYPE. Line: 1 Col: 12 Unexpected start tag (body). Line: 1 Col: 54 Unexpected start tag (body). Line: 1 Col: 64 Unexpected end tag (p). Missing end tag (body). #document | <html> | <head> | <body> | <base> | <link> | <meta> | <title> | "<p>" | <p> #data <textarea><p></textarea> #errors Line: 1 Col: 10 Unexpected start tag (textarea). Expected DOCTYPE. #document | <html> | <head> | <body> | <textarea> | "<p>" #data <p><image></p> #errors Line: 1 Col: 3 Unexpected start tag (p). Expected DOCTYPE. Line: 1 Col: 10 Unexpected start tag (image). Treated as img. 
#document | <html> | <head> | <body> | <p> | <img> #data <a><table><a></table><p><a><div><a> #errors Line: 1 Col: 3 Unexpected start tag (a). Expected DOCTYPE. Line: 1 Col: 13 Unexpected start tag (a) in table context caused voodoo mode. Line: 1 Col: 13 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 13 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 21 Unexpected end tag (table). Expected end tag (a). Line: 1 Col: 27 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 27 End tag (a) violates step 1, paragraph 2 of the adoption agency algorithm. Line: 1 Col: 32 Unexpected end tag (p). Ignored. Line: 1 Col: 35 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 35 End tag (a) violates step 1, paragraph 2 of the adoption agency algorithm. Line: 1 Col: 35 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <a> | <a> | <table> | <p> | <a> | <div> | <a> #data <head></p><meta><p> #errors Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. Line: 1 Col: 10 Unexpected end tag (p). Ignored. #document | <html> | <head> | <meta> | <body> | <p> #data <head></html><meta><p> #errors Line: 1 Col: 6 Unexpected start tag (head). Expected DOCTYPE. Line: 1 Col: 19 Unexpected start tag (meta). #document | <html> | <head> | <body> | <meta> | <p> #data <b><table><td><i></table> #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 14 Unexpected table cell start tag (td) in the table body phase. Line: 1 Col: 25 Got table cell end tag (td) while required end tags are missing. Line: 1 Col: 25 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <b> | <table> | <tbody> | <tr> | <td> | <i> #data <b><table><td></b><i></table> #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 14 Unexpected table cell start tag (td) in the table body phase. 
Line: 1 Col: 18 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 29 Got table cell end tag (td) while required end tags are missing. Line: 1 Col: 29 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <b> | <table> | <tbody> | <tr> | <td> | <i> #data <h1><h2> #errors 4: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”. 8: Heading cannot be a child of another heading. 8: End of file seen and there were open elements. #document | <html> | <head> | <body> | <h1> | <h2> #data <a><p><a></a></p></a> #errors Line: 1 Col: 3 Unexpected start tag (a). Expected DOCTYPE. Line: 1 Col: 9 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 9 End tag (a) violates step 1, paragraph 3 of the adoption agency algorithm. Line: 1 Col: 21 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. #document | <html> | <head> | <body> | <a> | <p> | <a> | <a> #data <b><button></b></button></b> #errors Line: 1 Col: 3 Unexpected start tag (b). Expected DOCTYPE. Line: 1 Col: 15 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. #document | <html> | <head> | <body> | <b> | <button> | <b> #data <p><b><div><marquee></p></b></div> #errors Line: 1 Col: 3 Unexpected start tag (p). Expected DOCTYPE. Line: 1 Col: 11 Unexpected end tag (p). Ignored. Line: 1 Col: 24 Unexpected end tag (p). Ignored. Line: 1 Col: 28 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 34 End tag (div) seen too early. Expected other end tag. Line: 1 Col: 34 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <p> | <b> | <div> | <b> | <marquee> | <p> #data <script></script></div><title></title><p><p> #errors Line: 1 Col: 8 Unexpected start tag (script). Expected DOCTYPE. Line: 1 Col: 23 Unexpected end tag (div). Ignored. 
#document | <html> | <head> | <script> | <title> | <body> | <p> | <p> #data <p><hr></p> #errors Line: 1 Col: 3 Unexpected start tag (p). Expected DOCTYPE. Line: 1 Col: 11 Unexpected end tag (p). Ignored. #document | <html> | <head> | <body> | <p> | <hr> | <p> #data <select><b><option><select><option></b></select> #errors Line: 1 Col: 8 Unexpected start tag (select). Expected DOCTYPE. Line: 1 Col: 11 Unexpected start tag token (b) in the select phase. Ignored. Line: 1 Col: 27 Unexpected select start tag in the select phase treated as select end tag. Line: 1 Col: 39 End tag (b) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 48 Unexpected end tag (select). Ignored. Line: 1 Col: 48 Expected closing tag. Unexpected end of file. #document | <html> | <head> | <body> | <select> | <option> | <option> #data <html><head><title></title><body></body></html> #errors Line: 1 Col: 6 Unexpected start tag (html). Expected DOCTYPE. #document | <html> | <head> | <title> | <body> #data <a><table><td><a><table></table><a></tr><a></table><a> #errors Line: 1 Col: 3 Unexpected start tag (a). Expected DOCTYPE. Line: 1 Col: 14 Unexpected table cell start tag (td) in the table body phase. Line: 1 Col: 35 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 40 Got table cell end tag (td) while required end tags are missing. Line: 1 Col: 43 Unexpected start tag (a) in table context caused voodoo mode. Line: 1 Col: 43 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 43 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 51 Unexpected implied end tag (a) in the table phase. Line: 1 Col: 54 Unexpected start tag (a) implies end tag (a). Line: 1 Col: 54 End tag (a) violates step 1, paragraph 2 of the adoption agency algorithm. Line: 1 Col: 54 Expected closing tag. Unexpected end of file. 
#document | <html> | <head> | <body> | <a> | <a> | <table> | <tbody> | <tr> | <td> | <a> | <table> | <a> | <a> #data <ul><li></li><div><li></div><li><li><div><li><address><li><b><em></b><li></ul> #errors Line: 1 Col: 4 Unexpected start tag (ul). Expected DOCTYPE. Line: 1 Col: 45 Missing end tag (div, li). Line: 1 Col: 58 Missing end tag (address, li). Line: 1 Col: 69 End tag (b) violates step 1, paragraph 3 of the adoption agency algorithm. #document | <html> | <head> | <body> | <ul> | <li> | <div> | <li> | <li> | <li> | <div> | <li> | <address> | <li> | <b> | <em> | <li> #data <ul><li><ul></li><li>a</li></ul></li></ul> #errors XXX: fix me #document | <html> | <head> | <body> | <ul> | <li> | <ul> | <li> | "a" #data <frameset><frame><frameset><frame></frameset><noframes> #errors Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE. #document | | | | | | | #data <h1><table><td><h3></table><h3></h1> #errors 4: Start tag seen without seeing a doctype first. Expected “<!DOCTYPE html>”. 15: “td” start tag in table body. 27: Unclosed elements. 31: Heading cannot be a child of another heading. 36: End tag “h1” seen but there were unclosed elements. #document | <html> | <head> | <body> | <h1> | <table> | <tbody> | <tr> | <td> | <h3> | <h3> #data <table><colgroup><col><colgroup><col><col><col><colgroup><col><col><thead><tr><td></table> #errors Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. #document | <html> | <head> | <body> | <table> | <colgroup> | <col> | <colgroup> | <col> | <col> | <col> | <colgroup> | <col> | <col> | <thead> | <tr> | <td> #data <table><col><tbody><col><tr><col><td><col></table><col> #errors Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. Line: 1 Col: 37 Unexpected table cell start tag (td) in the table body phase. Line: 1 Col: 55 Unexpected start tag col. Ignored. 
#document | <html> | <head> | <body> | <table> | <colgroup> | <col> | <tbody> | <colgroup> | <col> | <tbody> | <tr> | <colgroup> | <col> | <tbody> | <tr> | <td> | <colgroup> | <col> #data <table><colgroup><tbody><colgroup><tr><colgroup><td><colgroup></table><colgroup> #errors Line: 1 Col: 7 Unexpected start tag (table). Expected DOCTYPE. Line: 1 Col: 52 Unexpected table cell start tag (td) in the table body phase. Line: 1 Col: 80 Unexpected start tag colgroup. Ignored. #document | <html> | <head> | <body> | <table> | <colgroup> | <tbody> | <colgroup> | <tbody> | <tr> | <colgroup> | <tbody> | <tr> | <td> | <colgroup> #data </strong></b></em></i></u></strike></s></blink></tt></pre></big></small></font></select></h1></h2></h3></h4></h5></h6></body></br></a></img></title></span></style></script></table></th></td></tr></frame></area></link></param></hr></input></col></base></meta></basefont></bgsound></embed></spacer></p></dd></dt></caption></colgroup></tbody></tfoot></thead></address></blockquote></center></dir></div></dl></fieldset></listing></menu></ol></ul></li></nobr></wbr></form></button></marquee></object></html></frameset></head></iframe></image></isindex></noembed> #errors Line: 1 Col: 9 Unexpected end tag (strong). Expected DOCTYPE. Line: 1 Col: 9 Unexpected end tag (strong) after the (implied) root element. Line: 1 Col: 13 Unexpected end tag (b) after the (implied) root element. Line: 1 Col: 18 Unexpected end tag (em) after the (implied) root element. Line: 1 Col: 22 Unexpected end tag (i) after the (implied) root element. Line: 1 Col: 26 Unexpected end tag (u) after the (implied) root element. Line: 1 Col: 35 Unexpected end tag (strike) after the (implied) root element. Line: 1 Col: 39 Unexpected end tag (s) after the (implied) root element. Line: 1 Col: 47 Unexpected end tag (blink) after the (implied) root element. Line: 1 Col: 52 Unexpected end tag (tt) after the (implied) root element. 
Line: 1 Col: 58 Unexpected end tag (pre) after the (implied) root element. Line: 1 Col: 64 Unexpected end tag (big) after the (implied) root element. Line: 1 Col: 72 Unexpected end tag (small) after the (implied) root element. Line: 1 Col: 79 Unexpected end tag (font) after the (implied) root element. Line: 1 Col: 88 Unexpected end tag (select) after the (implied) root element. Line: 1 Col: 93 Unexpected end tag (h1) after the (implied) root element. Line: 1 Col: 98 Unexpected end tag (h2) after the (implied) root element. Line: 1 Col: 103 Unexpected end tag (h3) after the (implied) root element. Line: 1 Col: 108 Unexpected end tag (h4) after the (implied) root element. Line: 1 Col: 113 Unexpected end tag (h5) after the (implied) root element. Line: 1 Col: 118 Unexpected end tag (h6) after the (implied) root element. Line: 1 Col: 125 Unexpected end tag (body) after the (implied) root element. Line: 1 Col: 130 Unexpected end tag (br). Treated as br element. Line: 1 Col: 134 End tag (a) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 140 This element (img) has no end tag. Line: 1 Col: 148 Unexpected end tag (title). Ignored. Line: 1 Col: 155 Unexpected end tag (span). Ignored. Line: 1 Col: 163 Unexpected end tag (style). Ignored. Line: 1 Col: 172 Unexpected end tag (script). Ignored. Line: 1 Col: 180 Unexpected end tag (table). Ignored. Line: 1 Col: 185 Unexpected end tag (th). Ignored. Line: 1 Col: 190 Unexpected end tag (td). Ignored. Line: 1 Col: 195 Unexpected end tag (tr). Ignored. Line: 1 Col: 203 This element (frame) has no end tag. Line: 1 Col: 210 This element (area) has no end tag. Line: 1 Col: 217 Unexpected end tag (link). Ignored. Line: 1 Col: 225 This element (param) has no end tag. Line: 1 Col: 230 This element (hr) has no end tag. Line: 1 Col: 238 This element (input) has no end tag. Line: 1 Col: 244 Unexpected end tag (col). Ignored. Line: 1 Col: 251 Unexpected end tag (base). Ignored. 
Line: 1 Col: 258 Unexpected end tag (meta). Ignored. Line: 1 Col: 269 This element (basefont) has no end tag. Line: 1 Col: 279 This element (bgsound) has no end tag. Line: 1 Col: 287 This element (embed) has no end tag. Line: 1 Col: 296 This element (spacer) has no end tag. Line: 1 Col: 300 Unexpected end tag (p). Ignored. Line: 1 Col: 305 End tag (dd) seen too early. Expected other end tag. Line: 1 Col: 310 End tag (dt) seen too early. Expected other end tag. Line: 1 Col: 320 Unexpected end tag (caption). Ignored. Line: 1 Col: 331 Unexpected end tag (colgroup). Ignored. Line: 1 Col: 339 Unexpected end tag (tbody). Ignored. Line: 1 Col: 347 Unexpected end tag (tfoot). Ignored. Line: 1 Col: 355 Unexpected end tag (thead). Ignored. Line: 1 Col: 365 End tag (address) seen too early. Expected other end tag. Line: 1 Col: 378 End tag (blockquote) seen too early. Expected other end tag. Line: 1 Col: 387 End tag (center) seen too early. Expected other end tag. Line: 1 Col: 393 Unexpected end tag (dir). Ignored. Line: 1 Col: 399 End tag (div) seen too early. Expected other end tag. Line: 1 Col: 404 End tag (dl) seen too early. Expected other end tag. Line: 1 Col: 415 End tag (fieldset) seen too early. Expected other end tag. Line: 1 Col: 425 End tag (listing) seen too early. Expected other end tag. Line: 1 Col: 432 End tag (menu) seen too early. Expected other end tag. Line: 1 Col: 437 End tag (ol) seen too early. Expected other end tag. Line: 1 Col: 442 End tag (ul) seen too early. Expected other end tag. Line: 1 Col: 447 End tag (li) seen too early. Expected other end tag. Line: 1 Col: 454 End tag (nobr) violates step 1, paragraph 1 of the adoption agency algorithm. Line: 1 Col: 460 This element (wbr) has no end tag. Line: 1 Col: 476 End tag (button) seen too early. Expected other end tag. Line: 1 Col: 486 End tag (marquee) seen too early. Expected other end tag. Line: 1 Col: 495 End tag (object) seen too early. Expected other end tag. 
Line: 1 Col: 513 Unexpected end tag (html). Ignored. Line: 1 Col: 513 Unexpected end tag (frameset). Ignored. Line: 1 Col: 520 Unexpected end tag (head). Ignored. Line: 1 Col: 529 Unexpected end tag (iframe). Ignored. Line: 1 Col: 537 This element (image) has no end tag. Line: 1 Col: 547 This element (isindex) has no end tag. Line: 1 Col: 557 Unexpected end tag (noembed). Ignored. Line: 1 Col: 568 Unexpected end tag (noframes). Ignored. Line: 1 Col: 579 Unexpected end tag (noscript). Ignored. Line: 1 Col: 590 Unexpected end tag (optgroup). Ignored. Line: 1 Col: 599 Unexpected end tag (option). Ignored. Line: 1 Col: 611 Unexpected end tag (plaintext). Ignored. Line: 1 Col: 622 Unexpected end tag (textarea). Ignored. #document | | | |
|

#data

| "A" HTML-HTML5-Parser-0.301/t/html5lib-pass/inbody01.dat0000644000175000017500000000066511726072634017543 0ustar taitai#data

| | |

#data #errors Line: 1 Col: 10 Unexpected start tag (frameset). Expected DOCTYPE. Line: 1 Col: 10 Expected closing tag. Unexpected end of file. #document | | | HTML-HTML5-Parser-0.301/t/html5lib-fail/tests21.dat0000644000175000017500000000473111726072634017366 0ustar taitai#data foo #errors #document | | | | | "foo" #data foo #errors #document | | | | | "foo" #data

#errors #document | | | |
| #data foo #errors #document | <html> | <head> | <body> | <svg svg> | "foo" #data <svg><![CDATA[foo #errors #document | <html> | <head> | <body> | <svg svg> | "foo" #data <svg><![CDATA[ #errors #document | <html> | <head> | <body> | <svg svg> #data <svg><![CDATA[ #errors #document | | | | #data ]] > #errors #document | | | | | "]] >" #data ]] > #errors #document | | | | | "]] >" #data ]] #errors #document | <html> | <head> | <body> | <svg svg> | "]]" #data <svg><![CDATA[] #errors #document | <html> | <head> | <body> | <svg svg> | "]" #data <svg><![CDATA[]>a #errors #document | <html> | <head> | <body> | <svg svg> | "]>a" #data <svg><foreignObject><div><![CDATA[foo #errors #document | | | | | |
| #data <svg> #errors #document | | | | | "" #data </svg>a #errors #document | | | | | "a" #data <svg>a #errors #document | <html> | <head> | <body> | <svg svg> | "<svg>a" #data <svg><![CDATA[</svg>a #errors #document | <html> | <head> | <body> | <svg svg> | "</svg>a" #data <svg><![CDATA[<svg> #errors #document | | | | | "" | #data <svg> #errors #document | | | | | "" #data <svg> #errors #document | | | | | "" | #data <svg>path #errors #document | | | | | "path" #data <!--svg--> #errors #document | | | | | "" ���������������������������������������HTML-HTML5-Parser-0.301/t/html5lib-fail/tests23.dat�������������������������������������������������0000644�0001750�0001750�00000005762�11726072634�017375� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#data

X #errors 3: Start tag seen without seeing a doctype first. Expected “”. 116: Unclosed elements. 117: End of file seen and there were open elements. #document | | | |

| | size="4" | | color="red" | | size="4" | | size="4" | | size="4" | | size="4" | | size="4" | | color="red" |

| | color="red" | | size="4" | | size="4" | | size="4" | | color="red" | "X" #data

X #errors #document | | | |

| | size="4" | | size="4" | | size="4" | | size="4" |

| | size="4" | | size="4" | | size="4" | "X" #data

X #errors #document | | | |

| | size="4" | | size="4" | | size="4" | | size="5" | | size="4" |

| | size="4" | | size="4" | | size="5" | | size="4" | "X" #data

X #errors #document | | | |

| | id="a" | size="4" | | id="b" | size="4" | | size="4" | | size="4" |

| | id="a" | size="4" | | id="b" | size="4" | | size="4" | | size="4" | "X" #data

X

Y #errors #document | | | |

| | id="a" | | id="a" | | id="a" | | | | id="a" | | id="a" | "X" |

| | id="a" | | id="a" | | id="a" | | "Y" HTML-HTML5-Parser-0.301/t/html5lib-fail/tests9.dat0000644000175000017500000002701011726072634017307 0ustar taitai#data #errors #document | | | | | #data #errors #document | | | | | #data #errors 25: End of file in a foreign namespace context. #document | | | | | | #data #errors 45: HTML start tag “u” in a foreign namespace context. 45: End of file seen and there were open elements. #document | | | | | | | | #data #errors Line: 1 Col: 35 Unexpected start tag token (math) in the select phase. Ignored. Line: 1 Col: 42 Unexpected end tag (math) in the select phase. Ignored. #document | | | | | #errors Line: 1 Col: 43 Unexpected start tag token (math) in the select phase. Ignored. Line: 1 Col: 50 Unexpected end tag (math) in the select phase. Ignored. #document | | | | |

#errors Line: 1 Col: 34 Unexpected start tag (math) in table context caused voodoo mode. Line: 1 Col: 41 Unexpected end tag (math) in table context caused voodoo mode. #document | | | | | | #data
foo
#errors Line: 1 Col: 34 Unexpected start tag (math) in table context caused voodoo mode. Line: 1 Col: 46 Unexpected end tag (mi) in table context caused voodoo mode. Line: 1 Col: 53 Unexpected end tag (math) in table context caused voodoo mode. #document | | | | | | | "foo" | #data
foobar
#errors Line: 1 Col: 34 Unexpected start tag (math) in table context caused voodoo mode. Line: 1 Col: 46 Unexpected end tag (mi) in table context caused voodoo mode. Line: 1 Col: 58 Unexpected end tag (mi) in table context caused voodoo mode. Line: 1 Col: 65 Unexpected end tag (math) in table context caused voodoo mode. #document | | | | | | | "foo" | | "bar" | #data
foobar
#errors Line: 1 Col: 41 Unexpected start tag (math) in table context caused voodoo mode. Line: 1 Col: 53 Unexpected end tag (mi) in table context caused voodoo mode. Line: 1 Col: 65 Unexpected end tag (mi) in table context caused voodoo mode. Line: 1 Col: 72 Unexpected end tag (math) in table context caused voodoo mode. #document | | | | | | | "foo" | | "bar" | | #data
foobar
#errors Line: 1 Col: 45 Unexpected start tag (math) in table context caused voodoo mode. Line: 1 Col: 57 Unexpected end tag (mi) in table context caused voodoo mode. Line: 1 Col: 69 Unexpected end tag (mi) in table context caused voodoo mode. Line: 1 Col: 76 Unexpected end tag (math) in table context caused voodoo mode. #document | | | | | | | "foo" | | "bar" | | | #data
foobar
#errors #document | | | | | | | |
| | | "foo" | | "bar" #data
foobar

baz

#errors #document | | | | | | | |
| | | "foo" | | "bar" |

| "baz" #data
foobar

baz

#errors #document | | | | | |
| | | "foo" | | "bar" |

| "baz" #data
foobar

baz

quux #errors Line: 1 Col: 70 HTML start tag "p" in a foreign namespace context. Line: 1 Col: 81 Unexpected end table tag in caption. Generates implied end caption. #document | | | | | |
| | | "foo" | | "bar" |

| "baz" |

| "quux" #data
foobarbaz

quux #errors Line: 1 Col: 78 Unexpected end table tag in caption. Generates implied end caption. Line: 1 Col: 78 Unexpected end tag (caption). Missing end tag (math). #document | | | | | |
| | | "foo" | | "bar" | "baz" |

| "quux" #data foobar

baz

quux #errors Line: 1 Col: 44 Unexpected start tag (math) in table context caused voodoo mode. Line: 1 Col: 56 Unexpected end tag (mi) in table context caused voodoo mode. Line: 1 Col: 68 Unexpected end tag (mi) in table context caused voodoo mode. Line: 1 Col: 71 HTML start tag "p" in a foreign namespace context. Line: 1 Col: 71 Unexpected start tag (p) in table context caused voodoo mode. #document | | | | | | | "foo" | | "bar" |

| "baz" | | |

| "quux" #data

quux #errors Line: 1 Col: 50 Unexpected start tag token (math) in the select phase. Ignored. Line: 1 Col: 54 Unexpected start tag token (mi) in the select phase. Ignored. Line: 1 Col: 62 Unexpected end tag (mi) in the select phase. Ignored. Line: 1 Col: 66 Unexpected start tag token (mi) in the select phase. Ignored. Line: 1 Col: 74 Unexpected end tag (mi) in the select phase. Ignored. Line: 1 Col: 77 Unexpected start tag token (p) in the select phase. Ignored. Line: 1 Col: 88 Unexpected table element end tag (tables) in the select in table phase. #document | | | | | | | |
|

quux #errors Line: 1 Col: 36 Unexpected start tag (select) in table context caused voodoo mode. Line: 1 Col: 42 Unexpected start tag token (math) in the select phase. Ignored. Line: 1 Col: 46 Unexpected start tag token (mi) in the select phase. Ignored. Line: 1 Col: 54 Unexpected end tag (mi) in the select phase. Ignored. Line: 1 Col: 58 Unexpected start tag token (mi) in the select phase. Ignored. Line: 1 Col: 66 Unexpected end tag (mi) in the select phase. Ignored. Line: 1 Col: 69 Unexpected start tag token (p) in the select phase. Ignored. Line: 1 Col: 80 Unexpected table element end tag (tables) in the select in table phase. #document | | | | | |

| "quux" #data foobar

baz #errors Line: 1 Col: 41 Unexpected start tag (math). Line: 1 Col: 68 HTML start tag "p" in a foreign namespace context. #document | | | | | | | "foo" | | "bar" |

| "baz" #data foobar

baz #errors Line: 1 Col: 34 Unexpected start tag token (math) in the after body phase. Line: 1 Col: 61 HTML start tag "p" in a foreign namespace context. #document | | | | | | | "foo" | | "bar" |

| "baz" #data

#errors Line: 1 Col: 31 Unexpected start tag token (math) in the frameset phase. Ignored. Line: 1 Col: 35 Unexpected start tag token (mi) in the frameset phase. Ignored. Line: 1 Col: 40 Unexpected end tag token (mi) in the frameset phase. Ignored. Line: 1 Col: 44 Unexpected start tag token (mi) in the frameset phase. Ignored. Line: 1 Col: 49 Unexpected end tag token (mi) in the frameset phase. Ignored. Line: 1 Col: 52 Unexpected start tag token (p) in the frameset phase. Ignored. Line: 1 Col: 58 Unexpected start tag token (span) in the frameset phase. Ignored. Line: 1 Col: 58 Expected closing tag. Unexpected end of file. #document | | | | #data

#errors Line: 1 Col: 42 Unexpected start tag (math) in the after frameset phase. Ignored. Line: 1 Col: 46 Unexpected start tag (mi) in the after frameset phase. Ignored. Line: 1 Col: 51 Unexpected end tag (mi) in the after frameset phase. Ignored. Line: 1 Col: 55 Unexpected start tag (mi) in the after frameset phase. Ignored. Line: 1 Col: 60 Unexpected end tag (mi) in the after frameset phase. Ignored. Line: 1 Col: 63 Unexpected start tag (p) in the after frameset phase. Ignored. Line: 1 Col: 69 Unexpected start tag (span) in the after frameset phase. Ignored. #document | | | | #data #errors #document | | | | | xlink:href="foo" | | xlink href="foo" #data #errors #document | | | | | xlink:href="foo" | xml:lang="en" | | | xlink href="foo" | xml lang="en" #data #errors #document | | | | | xlink:href="foo" | xml:lang="en" | | | xlink href="foo" | xml lang="en" #data bar #errors #document | | | | | xlink:href="foo" | xml:lang="en" | | | xlink href="foo" | xml lang="en" | "bar" ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/t/07ua.t��������������������������������������������������������������������0000644�0001750�0001750�00000002764�12165411220�013665� 0����������������������������������������������������������������������������������������������������ustar 
�tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������## skip Test::Tabs use 5.008; use strict; use lib 'lib'; use lib 't/lib'; use Test::More; use HTML::HTML5::Parser; use HTML::HTML5::Parser::UA; use URI::file; { package Test::HTTP::Server::Request; sub doc1 { shift->{out_headers}{content_type} = 'text/html'; q{ Test!

Test!

}; } } eval { require Test::HTTP::Server; 1; } or plan skip_all => "Could not use Test::HTTP::Server: $@"; plan skip_all => "Test::HTTP::Server 0.03 fails on Win32" if $^O =~ /win/i && Test::HTTP::Server->VERSION lt '0.04'; plan tests => 3; my $server = Test::HTTP::Server->new(); my $baseuri = $server->uri; $HTML::HTML5::Parser::UA::NO_LWP = 1 if $HTML::HTML5::Parser::UA::NO_LWP eq '0'; my $file_response = HTML::HTML5::Parser::UA->get(URI::file->new_abs("t/01basic.t")); is( $file_response->{status}, 200, "simple file response - status 200", ); my $http_response = HTML::HTML5::Parser::UA->get($baseuri . 'doc1'); is( $file_response->{status}, 200, "simple HTTP response - status 200", ); my $dom = HTML::HTML5::Parser->load_html(location => $baseuri.'doc1'); is( $dom->getElementsByTagName('title')->shift->textContent, 'Test!', 'UA usage by parser', ); =head1 PURPOSE Check that L works with L. =head1 AUTHOR Toby Inkster, Etobyink@cpan.orgE =head1 COPYRIGHT AND LICENCE Copyright (C) 2012 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. HTML-HTML5-Parser-0.301/t/03html4.t0000644000175000017500000000163212165411232014300 0ustar taitai## skip Test::Tabs use Test::More tests => 2; use HTML::HTML5::Parser; my $parser = HTML::HTML5::Parser->new; my $html = <foo

foo

HTML my $dom_4 = $parser->parse_string(''.$html); my $dom_5 = $parser->parse_string(''.$html); my ($object_4) = $dom_4->getElementsByTagName('object'); my ($object_5) = $dom_5->getElementsByTagName('object'); is($object_4->parentNode->tagName, 'head', 'HTML 4 allows in .'); is($object_5->parentNode->tagName, 'body', 'HTML 5 disallows in .'); =head1 PURPOSE Test that HTML 4 allows C<< >> elements in C<< >>. =head1 AUTHOR Toby Inkster, Etobyink@cpan.orgE =head1 COPYRIGHT AND LICENCE Copyright (C) 2012 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. HTML-HTML5-Parser-0.301/README0000644000175000017500000003061612166545151013352 0ustar taitaiNAME HTML::HTML5::Parser - parse HTML reliably SYNOPSIS use HTML::HTML5::Parser; my $parser = HTML::HTML5::Parser->new; my $doc = $parser->parse_string(<<'EOT'); Foo

Foo bar.

Baz
Quux. EOT my $fdoc = $parser->parse_file( $html_file_name ); my $fhdoc = $parser->parse_fh( $html_file_handle ); DESCRIPTION This library is substantially the same as the non-CPAN module Whatpm::HTML. Changes include: * Provides an XML::LibXML-like DOM interface. If you usually use XML::LibXML's DOM parser, this should be a drop-in solution for tag soup HTML. * Constructs an XML::LibXML::Document as the result of parsing. * Via bundling and modifications, removed external dependencies on non-CPAN packages. Constructor "new" $parser = HTML::HTML5::Parser->new; # or $parser = HTML::HTML5::Parser->new(no_cache => 1); The constructor does nothing interesting besides take one flag argument, "no_cache => 1", to disable the global element metadata cache. Disabling the cache is handy for conserving memory if you parse a large number of documents, however, class methods such as "/source_line" will not work, and must be run from an instance of this parser. XML::LibXML-Compatible Methods "parse_file", "parse_html_file" $doc = $parser->parse_file( $html_file_name [,\%opts] ); This function parses an HTML document from a file or network; $html_file_name can be either a filename or an URL. Options include 'encoding' to indicate file encoding (e.g. 'utf-8') and 'user_agent' which should be a blessed "LWP::UserAgent" (or HTTP::Tiny) object to be used when retrieving URLs. If requesting a URL and the response Content-Type header indicates an XML-based media type (such as XHTML), XML::LibXML::Parser will be used automatically (instead of the tag soup parser). The XML parser can be told to use a DTD catalogue by setting the option 'xml_catalogue' to the filename of the catalogue. HTML (tag soup) parsing can be forced using the option 'force_html', even when an XML media type is returned. 
If an options hashref was passed, parse_file will set $options->{'parser_used'} to the name of the class used to parse the URL, to allow the calling code to double-check which parser was used afterwards. If an options hashref was passed, parse_file will set $options->{'response'} to the HTTP::Response object obtained by retrieving the URI. "parse_fh", "parse_html_fh" $doc = $parser->parse_fh( $io_fh [,\%opts] ); "parse_fh()" parses a IOREF or a subclass of "IO::Handle". Options include 'encoding' to indicate file encoding (e.g. 'utf-8'). "parse_string", "parse_html_string" $doc = $parser->parse_string( $html_string [,\%opts] ); This function is similar to "parse_fh()", but it parses an HTML document that is available as a single string in memory. Options include 'encoding' to indicate file encoding (e.g. 'utf-8'). "load_xml", "load_html" Wrappers for the parse_* functions. These should be roughly compatible with the equivalently named functions in XML::LibXML. Note that "load_xml" first attempts to parse as real XML, falling back to HTML5 parsing; "load_html" just goes straight for HTML5. "parse_balanced_chunk" $fragment = $parser->parse_balanced_chunk( $string [,\%opts] ); This method is roughly equivalent to XML::LibXML's method of the same name, but unlike XML::LibXML, and despite its name it does not require the chunk to be "balanced". This method is somewhat black magic, but should work, and do the proper thing in most cases. Of course, the proper thing might not be what you'd expect! I'll try to keep this explanation as brief as possible... Consider the following string: Hello World What is the proper way to parse that? If it were found in a document like this: X

Hello World
Then the document would end up equivalent to the following XHTML: X
Hello World
The superfluous "" is simply ignored. However, if it were found in a document like this: X
World
Hello
Then the result would be: X World
Hello
Yes, "World" gets hoisted up before the "". This is weird, I know, but it's how browsers do it in real life. So what should: $string = q{HelloWorld}; $fragment = $parser->parse_balanced_chunk($string); actually return? Well, you can choose... $string = q{HelloWorld}; $frag1 = $parser->parse_balanced_chunk($string, {within=>'div'}); say $frag1->toString; # HelloWorld $frag2 = $parser->parse_balanced_chunk($string, {within=>'td'}); say $frag2->toString; # WorldHello If you don't pass a "within" option, then the chunk is parsed as if it were within a "
" element. This is often the most sensible option. If you pass something like "{ within => "foobar" }" where "foobar" is not a real HTML element name (as found in the HTML5 spec), then this method will croak; if you pass the name of a void element (e.g. "br" or "meta") then this method will croak; there are a handful of other unsupported elements which will croak (namely: "noscript", "noembed", "noframes"). Note that the second time around, although we parsed the string "as if it were within a "
" element", the "Hello" bit did not strictly end up within the "" element (not even within the "" element!) yet it still gets returned. We'll call things such as this "outliers". There is a "force_within" option which tells parse_balanced_chunk to ignore outliers: $frag3 = $parser->parse_balanced_chunk($string, {force_within=>'td'}); say $frag3->toString; # Hello There is a boolean option "mark_outliers" which marks each outlier with an attribute ("data-perl-html-html5-parser-outlier") to indicate its outlier status. Clearly, this is ignored when you use "force_within" because no outliers are returned. Some outliers may be XML::LibXML::Text elements; text nodes don't have attributes, so these will not be marked with an attribute. A last note is to mention what gets returned by this method. Normally it's an XML::LibXML::DocumentFragment object, but if you call the method in list context, a list of the individual node elements is returned. Alternatively you can request the data to be returned as an XML::LibXML::NodeList object: # Get an XML::LibXML::NodeList my $list = $parser->parse_balanced_chunk($str, {as=>'list'}); The exact implementation of this method may change from version to version, but the long-term goal will be to approach how common desktop browsers parse HTML fragments when implementing the setter for DOM's "innerHTML" attribute. The push parser and SAX-based parser are not supported. Trying to change an option (such as recover_silently) will make HTML::HTML5::Parser carp a warning. (But you can inspect the options.) Error Handling Error handling is obviously different to XML::LibXML, as errors are (bugs notwithstanding) non-fatal. "error_handler" Get/set an error handling function. Must be set to a coderef or undef. The error handling function will be called with a single parameter, a HTML::HTML5::Parser::Error object. "errors" Returns a list of errors that occurred during the last parse. See HTML::HTML5::Parser::Error. 
Additional Methods The module provides a few methods to obtain additional, non-DOM data from DOM nodes. "dtd_public_id" $pubid = $parser->dtd_public_id( $doc ); For an XML::LibXML::Document which has been returned by HTML::HTML5::Parser, using this method will tell you the Public Identifier of the DTD used (if any). "dtd_system_id" $sysid = $parser->dtd_system_id( $doc ); For an XML::LibXML::Document which has been returned by HTML::HTML5::Parser, using this method will tell you the System Identifier of the DTD used (if any). "dtd_element" $element = $parser->dtd_element( $doc ); For an XML::LibXML::Document which has been returned by HTML::HTML5::Parser, using this method will tell you the root element declared in the DTD used (if any). That is, if the document has this doctype: ... it will return "html". This may return the empty string if a DTD was present but did not contain a root element; or undef if no DTD was present. "compat_mode" $mode = $parser->compat_mode( $doc ); Returns 'quirks', 'limited quirks' or undef (standards mode). "charset" $charset = $parser->charset( $doc ); The character set apparently used by the document. "source_line" ($line, $col) = $parser->source_line( $node ); $line = $parser->source_line( $node ); In scalar context, "source_line" returns the line number of the source code that started a particular node (element, attribute or comment). In list context, returns a tuple: $line, $column, $implicitness. Tab characters count as one column, not eight. $implicitness indicates that the node was not explicitly marked up in the source code, but its existence was inferred by the parser. For example, in the following markup, the HTML, TITLE and P elements are explicit, but the HEAD and BODY elements are implicit. I have an implicit head

And an implicit body too!

(Note that implicit elements do still have a line number and column number.) The implictness indicator is a new feature, and I'd appreciate any bug reports where it gets things wrong. XML::LibXML::Node has a "line_number" method. In general this will always return 0 and HTML::HTML5::Parser has no way of influencing it. However, if you install XML::LibXML::Devel::SetLineNumber on your system, the "line_number" method will start working (at least for elements). SEE ALSO . HTML::HTML5::Writer, HTML::HTML5::Builder, XML::LibXML, XML::LibXML::PrettyPrint, XML::LibXML::Devel::SetLineNumber. AUTHOR Toby Inkster, COPYRIGHT AND LICENCE Copyright (C) 2007-2011 by Wakaba Copyright (C) 2009-2012 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. DISCLAIMER OF WARRANTIES THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. HTML-HTML5-Parser-0.301/lib/0000755000175000017500000000000012166545247013240 5ustar taitaiHTML-HTML5-Parser-0.301/lib/HTML/0000755000175000017500000000000012166545247014004 5ustar taitaiHTML-HTML5-Parser-0.301/lib/HTML/HTML5/0000755000175000017500000000000012166545247014635 5ustar taitaiHTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/0000755000175000017500000000000012166545247016071 5ustar taitaiHTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/TagSoupParser.pm0000644000175000017500000100672712166544311021172 0ustar taitaipackage HTML::HTML5::Parser::TagSoupParser; ## skip Test::Tabs # This is a port of the Whatpm::HTML package away from dependencies # on manakai, and towards CPAN and XML::LibXML. 
# http://suika.fam.cx/gate/git/wi/manakai.git/history/HEAD:/lib/Whatpm/HTML.pm
# CAUGHT UP TO d81fcb920a1a3c351149cd66a64bf1b8ae14a172 (2011-08-21)

use 5.008001;
use strict;
no warnings;

our $VERSION = '0.301';

use IO::Handle;
use HTML::HTML5::Parser::Tokenizer;
use Scalar::Util qw(blessed);
use Try::Tiny;
use XML::LibXML ':libxml';
use XML::LibXML::Devel;

## HAS_XLXDSLN is a compile-time constant: true when the optional
## XML::LibXML::Devel::SetLineNumber module could be loaded.
BEGIN {
  if (eval { require XML::LibXML::Devel::SetLineNumber; 1 }) {
    *HAS_XLXDSLN = sub () { 1 };
  } else {
    *HAS_XLXDSLN = sub () { 0 };
  }
}

## Method injected into XML::LibXML::Element.
##
##   $element->appendTextFromUnicode ([$parser,] $text [, $token])
##
## Appends $text (UTF-8 encoded before insertion) to $element.  If the
## last child is already a text node the text is merged into it and
## nothing is returned; otherwise a new text node is created, tagged
## with the source line/column of $token (when given), appended, and
## the result of |appendChild| is returned.  $parser is optional and is
## recognised by being a reference.
*XML::LibXML::Element::appendTextFromUnicode = sub {
  my $element = shift;
  ## NOTE: Must not be written as |my $parser = shift if ...|; the
  ## behaviour of |my| combined with a statement modifier is undefined
  ## (see perlsyn) and can leak a value from a previous call.
  my $parser = ref $_[0] ? shift : undef;
  my $text = shift;
  utf8::encode($text);
  my $token = shift;

  # This prevents adjacent text nodes.
  if (defined $element->lastChild
      and $element->lastChild->nodeType == XML_TEXT_NODE) {
    $element->appendText($text);
    return;
  }

  my $textnode = XML::LibXML::Text->new($text);

  if ($token) {
    $parser->_data($textnode, manakai_source_line => $token->{line})
        if $parser and defined $token->{line};
    $parser->_data($textnode, manakai_source_column => $token->{column})
        if $parser and defined $token->{column};

    ## Only positive integer line numbers are forwarded to libxml.
    if (HAS_XLXDSLN
        and exists $token->{line}
        and int($token->{line})
        and int($token->{line}) eq $token->{line}) {
      $textnode->XML::LibXML::Devel::SetLineNumber::set_line_number($token->{line});
    }
  }

  return $element->appendChild($textnode);
};

## Package-wide per-node annotation store, used when |_data| is called
## without an object (see |DATA|) and shared by parser objects created
## without the |no_cache| option (see |new|).
our $DATA = {};

## Function form of |_data| that always uses the package-wide store.
sub DATA {
  _data(undef, @_);
}

## Per-node annotation accessor.
##
##   $self->_data ($node)            returns the annotation hash reference
##   $self->_data ($node, $k)        returns the value stored under $k
##   $self->_data ($node, $k, $v)    stores $v under $k, returns the hash
##
## If $node is not an XML::LibXML::Node, returns a fresh empty hash
## reference in the one-argument form and nothing otherwise.
sub _data {
  my $self = shift;
  my ($object, $k, $v) = @_;
  my $argc = @_;

  # This method doesn't work for non XLxN things. Fail silently.
  unless (blessed($object) and $object->isa('XML::LibXML::Node')) {
    return {} if $argc==1;
    return;
  }

  # This seems to work much better as a unique identifier for a
  # node than refaddr does. However, it's not a supported use
  # for XML::LibXML::Devel, so it might cause failures. We'll see.
  my $oaddr = XML::LibXML::Devel::node_from_perl($object);

  my $data;
  if (ref $self) {
    $data = $self->{_debug_cache}{$oaddr} ||= {};
  } else {
    $data = $DATA->{$oaddr} ||= {};
  }

  if (HAS_XLXDSLN
      and defined $k
      and $k eq 'manakai_source_line'
      and defined $v
      and int($v)
      and int($v) eq $v
      and $object->nodeType == XML_ELEMENT_NODE) # does not work well for attrs
  {
    $object->XML::LibXML::Devel::SetLineNumber::set_line_number($v);
  }

  $data->{$k} = $v if $argc==3;
  return $data->{$k} if $argc==2;
  return $data;
}

## NOTE: This module doesn't check all HTML5 parse errors; character
## encoding related parse errors are expected to be handled by relevant
## modules.
## Parse errors for control characters that are not allowed in HTML5
## documents, for surrogate code points, and for noncharacter code
## points, as well as U+FFFD substitutions for characters whose code
## point is higher than U+10FFFF may be detected by combining the parser
## with the checker implemented by
## HTML::HTML5::Parser::Charset::UnicodeChecker (for its
## usage example, see |t/HTML-tree.t| in the Whatpm package or the
## WebHACC::Language::HTML module in the WebHACC package).
## ISSUE:
## var doc = implementation.createDocument (null, null, null);
## doc.write ('');
## alert (doc.compatMode);

## Namespace URLs
## NOTE: The |q<...>| bodies of these constants had been stripped
## (leaving a bare |q|, which does not compile); they are restored to
## the standard namespace URIs.

sub HTML_NS () { q<http://www.w3.org/1999/xhtml> }
sub MML_NS () { q<http://www.w3.org/1998/Math/MathML> }
sub SVG_NS () { q<http://www.w3.org/2000/svg> }
sub XLINK_NS () { q<http://www.w3.org/1999/xlink> }
sub XML_NS () { q<http://www.w3.org/XML/1998/namespace> }
sub XMLNS_NS () { q<http://www.w3.org/2000/xmlns/> }

## Element categories
##
## Each element category is a bit field: the high bits are class flags
## that can be tested with |&|, the low three bits distinguish members
## of the same class.

## Bits 14-18
sub BUTTON_SCOPING_EL () { 0b1_000000000000000000 } ## Special
sub SPECIAL_EL () { 0b1_00000000000000000 } ## Special
sub SCOPING_EL () { 0b1_0000000000000000 } ## Special
sub FORMATTING_EL () { 0b1_000000000000000 } ## Formatting
sub PHRASING_EL () { 0b1_00000000000000 } ## Ordinary

## Bits 10-13
sub SVG_EL () { 0b1_0000000000000 }
sub MML_EL () { 0b1_000000000000 }
#sub FOREIGN_EL () { 0b1_00000000000 } # see HTML::HTML5::Parser::Tokenizer
sub FOREIGN_FLOW_CONTENT_EL () { 0b1_0000000000 }

## Bits 6-9
sub TABLE_SCOPING_EL () { 0b1_000000000 }
sub TABLE_ROWS_SCOPING_EL () { 0b1_00000000 }
sub TABLE_ROW_SCOPING_EL () { 0b1_0000000 }
sub TABLE_ROWS_EL () { 0b1_000000 }

## Bit 5
sub ADDRESS_DIV_P_EL () { 0b1_00000 }

## NOTE: Used in and EOF algorithms.
## (A tag name appears to be missing before "and" in the note above --
## presumably an end tag; TODO confirm against upstream.)

## Bit 4
sub ALL_END_TAG_OPTIONAL_EL () { 0b1_0000 }

## NOTE: Used in "generate implied end tags" algorithm.
## NOTE: There is a code where a modified version of
## END_TAG_OPTIONAL_EL is used in "generate implied end tags"
## implementation (search for the algorithm name).

## Bit 3
sub END_TAG_OPTIONAL_EL () { 0b1_000 }

## Bits 0-2

sub MISC_SPECIAL_EL () { SPECIAL_EL | 0b000 }
sub FORM_EL () { SPECIAL_EL | 0b001 }
sub FRAMESET_EL () { SPECIAL_EL | 0b010 }
sub HEADING_EL () { SPECIAL_EL | 0b011 }
sub SELECT_EL () { SPECIAL_EL | 0b100 }
sub SCRIPT_EL () { SPECIAL_EL | 0b101 }
sub BUTTON_EL () { SPECIAL_EL | BUTTON_SCOPING_EL | 0b110 }

sub ADDRESS_DIV_EL () { SPECIAL_EL | ADDRESS_DIV_P_EL | 0b001 }
sub BODY_EL () { SPECIAL_EL | ALL_END_TAG_OPTIONAL_EL | 0b001 }

sub DTDD_EL () {
  SPECIAL_EL |
  END_TAG_OPTIONAL_EL |
  ALL_END_TAG_OPTIONAL_EL |
  0b010
}
sub LI_EL () {
  SPECIAL_EL |
  END_TAG_OPTIONAL_EL |
  ALL_END_TAG_OPTIONAL_EL |
  0b100
}
sub P_EL () {
  SPECIAL_EL |
  ADDRESS_DIV_P_EL |
  END_TAG_OPTIONAL_EL |
  ALL_END_TAG_OPTIONAL_EL |
  0b001
}

sub TABLE_ROW_EL () {
  SPECIAL_EL |
  TABLE_ROWS_EL |
  TABLE_ROW_SCOPING_EL |
  ALL_END_TAG_OPTIONAL_EL |
  0b001
}
sub TABLE_ROW_GROUP_EL () {
  SPECIAL_EL |
  TABLE_ROWS_EL |
  TABLE_ROWS_SCOPING_EL |
  ALL_END_TAG_OPTIONAL_EL |
  0b001
}

sub MISC_SCOPING_EL () { SCOPING_EL | BUTTON_SCOPING_EL | 0b000 }
sub CAPTION_EL () { SCOPING_EL | BUTTON_SCOPING_EL | 0b010 }
sub HTML_EL () {
  SCOPING_EL |
  BUTTON_SCOPING_EL |
  TABLE_SCOPING_EL |
  TABLE_ROWS_SCOPING_EL |
  TABLE_ROW_SCOPING_EL |
  ALL_END_TAG_OPTIONAL_EL |
  0b001
}
sub TABLE_EL () {
  SCOPING_EL |
  BUTTON_SCOPING_EL |
  TABLE_ROWS_EL |
  TABLE_SCOPING_EL |
  0b001
}
sub TABLE_CELL_EL () {
  SCOPING_EL |
  BUTTON_SCOPING_EL |
  ALL_END_TAG_OPTIONAL_EL |
  0b001
}

sub MISC_FORMATTING_EL () { FORMATTING_EL | 0b000 }
sub A_EL () { FORMATTING_EL | 0b001 }
sub NOBR_EL () { FORMATTING_EL | 0b010 }

sub RUBY_EL () { PHRASING_EL | 0b001 }

## NOTE: These elements are not included in |ALL_END_TAG_OPTIONAL_EL|.
sub OPTGROUP_EL () { PHRASING_EL | END_TAG_OPTIONAL_EL | 0b001 }
sub OPTION_EL () { PHRASING_EL | END_TAG_OPTIONAL_EL | 0b010 }
sub RUBY_COMPONENT_EL () { PHRASING_EL | END_TAG_OPTIONAL_EL | 0b100 }

## "MathML text integration point" elements.
sub MML_TEXT_INTEGRATION_EL () {
  MML_EL |
  SCOPING_EL |
  BUTTON_SCOPING_EL |
  FOREIGN_EL |
  FOREIGN_FLOW_CONTENT_EL
} # MML_TEXT_INTEGRATION_EL

sub MML_AXML_EL () {
  MML_EL |
  SCOPING_EL |
  BUTTON_SCOPING_EL |
  FOREIGN_EL |
  0b001
} # MML_AXML_EL

## "HTML integration point" elements in SVG namespace.
sub SVG_INTEGRATION_EL () {
  SVG_EL |
  SCOPING_EL |
  BUTTON_SCOPING_EL |
  FOREIGN_EL |
  FOREIGN_FLOW_CONTENT_EL
} # SVG_INTEGRATION_EL

sub SVG_SCRIPT_EL () {
  SVG_EL |
  FOREIGN_EL |
  0b101
} # SVG_SCRIPT_EL

## HTML element local name -> element category bit field.
my $el_category = {
  a => A_EL,
  address => ADDRESS_DIV_EL,
  applet => MISC_SCOPING_EL,
  area => MISC_SPECIAL_EL,
  article => MISC_SPECIAL_EL,
  aside => MISC_SPECIAL_EL,
  b => FORMATTING_EL,
  base => MISC_SPECIAL_EL,
  basefont => MISC_SPECIAL_EL,
  bgsound => MISC_SPECIAL_EL,
  big => FORMATTING_EL,
  blockquote => MISC_SPECIAL_EL,
  body => BODY_EL,
  br => MISC_SPECIAL_EL,
  button => BUTTON_EL,
  caption => CAPTION_EL,
  center => MISC_SPECIAL_EL,
  code => FORMATTING_EL,
  col => MISC_SPECIAL_EL,
  colgroup => MISC_SPECIAL_EL,
  command => MISC_SPECIAL_EL,
  #datagrid => MISC_SPECIAL_EL,
  dd => DTDD_EL,
  details => MISC_SPECIAL_EL,
  dir => MISC_SPECIAL_EL,
  div => ADDRESS_DIV_EL,
  dl => MISC_SPECIAL_EL,
  dt => DTDD_EL,
  em => FORMATTING_EL,
  embed => MISC_SPECIAL_EL,
  fieldset => MISC_SPECIAL_EL,
  figure => MISC_SPECIAL_EL,
  figcaption => MISC_SPECIAL_EL,
  font => FORMATTING_EL,
  footer => MISC_SPECIAL_EL,
  form => FORM_EL,
  frame => MISC_SPECIAL_EL,
  frameset => FRAMESET_EL,
  h1 => HEADING_EL,
  h2 => HEADING_EL,
  h3 => HEADING_EL,
  h4 => HEADING_EL,
  h5 => HEADING_EL,
  h6 => HEADING_EL,
  head => MISC_SPECIAL_EL,
  header => MISC_SPECIAL_EL,
  hgroup => MISC_SPECIAL_EL,
  hr => MISC_SPECIAL_EL,
  html => HTML_EL,
  i => FORMATTING_EL,
  iframe => MISC_SPECIAL_EL,
  img => MISC_SPECIAL_EL,
  #image => MISC_SPECIAL_EL, ## NOTE: Commented out in the spec.
  input => MISC_SPECIAL_EL,
  isindex => MISC_SPECIAL_EL,
  ## XXX keygen? (Whether a void element is in Special or not does not
  ## affect to the processing, however.)
  li => LI_EL,
  link => MISC_SPECIAL_EL,
  listing => MISC_SPECIAL_EL,
  marquee => MISC_SCOPING_EL,
  menu => MISC_SPECIAL_EL,
  meta => MISC_SPECIAL_EL,
  nav => MISC_SPECIAL_EL,
  nobr => NOBR_EL,
  noembed => MISC_SPECIAL_EL,
  noframes => MISC_SPECIAL_EL,
  noscript => MISC_SPECIAL_EL,
  object => MISC_SCOPING_EL,
  ol => MISC_SPECIAL_EL,
  optgroup => OPTGROUP_EL,
  option => OPTION_EL,
  p => P_EL,
  param => MISC_SPECIAL_EL,
  plaintext => MISC_SPECIAL_EL,
  pre => MISC_SPECIAL_EL,
  rp => RUBY_COMPONENT_EL,
  rt => RUBY_COMPONENT_EL,
  ruby => RUBY_EL,
  s => FORMATTING_EL,
  script => MISC_SPECIAL_EL,
  select => SELECT_EL,
  section => MISC_SPECIAL_EL,
  small => FORMATTING_EL,
  strike => FORMATTING_EL,
  strong => FORMATTING_EL,
  style => MISC_SPECIAL_EL,
  summary => MISC_SPECIAL_EL,
  table => TABLE_EL,
  tbody => TABLE_ROW_GROUP_EL,
  td => TABLE_CELL_EL,
  textarea => MISC_SPECIAL_EL,
  tfoot => TABLE_ROW_GROUP_EL,
  th => TABLE_CELL_EL,
  thead => TABLE_ROW_GROUP_EL,
  title => MISC_SPECIAL_EL,
  tr => TABLE_ROW_EL,
  tt => FORMATTING_EL,
  u => FORMATTING_EL,
  ul => MISC_SPECIAL_EL,
  wbr => MISC_SPECIAL_EL,
  xmp => MISC_SPECIAL_EL,
};

## Namespace URL -> { foreign element local name -> category }.
my $el_category_f = {
  (MML_NS) => {
    'annotation-xml' => MML_AXML_EL,
    mi => MML_TEXT_INTEGRATION_EL,
    mo => MML_TEXT_INTEGRATION_EL,
    mn => MML_TEXT_INTEGRATION_EL,
    ms => MML_TEXT_INTEGRATION_EL,
    mtext => MML_TEXT_INTEGRATION_EL,
  },
  (SVG_NS) => {
    foreignObject => SVG_INTEGRATION_EL,
    desc => SVG_INTEGRATION_EL,
    title => SVG_INTEGRATION_EL,
    script => SVG_SCRIPT_EL,
  },
  ## NOTE: In addition, FOREIGN_EL is set to non-HTML elements, MML_EL
  ## is set to MathML elements, and SVG_EL is set to SVG elements.
};

## Lowercased SVG attribute name -> mixed-case spelling.
my $svg_attr_name = {
  attributename => 'attributeName',
  attributetype => 'attributeType',
  basefrequency => 'baseFrequency',
  baseprofile => 'baseProfile',
  calcmode => 'calcMode',
  clippathunits => 'clipPathUnits',
  contentscripttype => 'contentScriptType',
  contentstyletype => 'contentStyleType',
  diffuseconstant => 'diffuseConstant',
  edgemode => 'edgeMode',
  externalresourcesrequired => 'externalResourcesRequired',
  filterres => 'filterRes',
  filterunits => 'filterUnits',
  glyphref => 'glyphRef',
  gradienttransform => 'gradientTransform',
  gradientunits => 'gradientUnits',
  kernelmatrix => 'kernelMatrix',
  kernelunitlength => 'kernelUnitLength',
  keypoints => 'keyPoints',
  keysplines => 'keySplines',
  keytimes => 'keyTimes',
  lengthadjust => 'lengthAdjust',
  limitingconeangle => 'limitingConeAngle',
  markerheight => 'markerHeight',
  markerunits => 'markerUnits',
  markerwidth => 'markerWidth',
  maskcontentunits => 'maskContentUnits',
  maskunits => 'maskUnits',
  numoctaves => 'numOctaves',
  pathlength => 'pathLength',
  patterncontentunits => 'patternContentUnits',
  patterntransform => 'patternTransform',
  patternunits => 'patternUnits',
  pointsatx => 'pointsAtX',
  pointsaty => 'pointsAtY',
  pointsatz => 'pointsAtZ',
  preservealpha => 'preserveAlpha',
  preserveaspectratio => 'preserveAspectRatio',
  primitiveunits => 'primitiveUnits',
  refx => 'refX',
  refy => 'refY',
  repeatcount => 'repeatCount',
  repeatdur => 'repeatDur',
  requiredextensions => 'requiredExtensions',
  requiredfeatures => 'requiredFeatures',
  specularconstant => 'specularConstant',
  specularexponent => 'specularExponent',
  spreadmethod => 'spreadMethod',
  startoffset => 'startOffset',
  stddeviation => 'stdDeviation',
  stitchtiles => 'stitchTiles',
  surfacescale => 'surfaceScale',
  systemlanguage => 'systemLanguage',
  tablevalues => 'tableValues',
  targetx => 'targetX',
  targety => 'targetY',
  textlength => 'textLength',
  viewbox => 'viewBox',
  viewtarget => 'viewTarget',
  xchannelselector => 'xChannelSelector',
  ychannelselector => 'yChannelSelector',
  zoomandpan => 'zoomAndPan',
};

## Qualified attribute name -> [namespace URL, [prefix, local name]].
my $foreign_attr_xname = {
  'xlink:actuate' => [(XLINK_NS), ['xlink', 'actuate']],
  'xlink:arcrole' => [(XLINK_NS), ['xlink', 'arcrole']],
  'xlink:href' => [(XLINK_NS), ['xlink', 'href']],
  'xlink:role' => [(XLINK_NS), ['xlink', 'role']],
  'xlink:show' => [(XLINK_NS), ['xlink', 'show']],
  'xlink:title' => [(XLINK_NS), ['xlink', 'title']],
  'xlink:type' => [(XLINK_NS), ['xlink', 'type']],
  'xml:base' => [(XML_NS), ['xml', 'base']],
  'xml:lang' => [(XML_NS), ['xml', 'lang']],
  'xml:space' => [(XML_NS), ['xml', 'space']],
  'xmlns' => [(XMLNS_NS), [undef, 'xmlns']],
  'xmlns:xlink' => [(XMLNS_NS), ['xmlns', 'xlink']],
};

## TODO: Invoke the reset algorithm when a resettable element is
## created (cf. HTML5 revision 2259).

## Parse a string of bytes.
##
##   $self->parse_byte_string ($charset_name, $bytes_or_ref, $doc,
##                             $onerror, $get_wrapper)
##
## Opens the byte string (or the scalar it references) as an in-memory
## file handle and delegates to |parse_byte_stream|.  Dies if the
## in-memory handle cannot be opened.
sub parse_byte_string ($$$$;$) {
  my $self = shift;
  my $charset_name = shift;
  ## NOTE: The result of |open| used to be ignored, so a failure only
  ## surfaced later as a method call on an undefined handle.
  open my $input, '<', ref $_[0] ? $_[0] : \($_[0])
      or die "Cannot open byte string as an in-memory file: $!";
  return $self->parse_byte_stream ($charset_name, $input, @_[1..$#_]);
} # parse_byte_string

## Parse a byte stream.
##
## Determines the character encoding (explicit $charset_name, BOM,
## UniversalCharDet, then the |windows-1252| default), wraps the byte
## stream in a decoding handle and delegates to |parse_char_stream|.
## If the |change_encoding| callback throws a RestartParser exception
## during the parse, the document is parsed again with the new decoding
## handle.  Returns whatever |parse_char_stream| returns and records
## the charset name on it via |_data|.
sub parse_byte_stream ($$$$;$$) {
  # my ($self, $charset_name, $byte_stream, $doc, $onerror, $get_wrapper) = @_;
  my $self = ref $_[0] ? shift : shift->new;
  my $charset_name = shift;
  my $byte_stream = $_[0];

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    warn "Parse error ($opt{type})\n";
  };
  $self->{parse_error} = $onerror; # updated later by parse_char_string

  my $get_wrapper = $_[3] || sub ($) {
    return $_[0]; # $_[0] = byte stream handle, returned = arg to char handle
  };

  ## HTML5 encoding sniffing algorithm
  require HTML::HTML5::Parser::Charset::Info;
  my $charset;
  my $buffer;
  my ($char_stream, $e_status);

  SNIFFING: {
    ## NOTE: By setting |allow_fallback| option true when the
    ## |get_decode_handle| method is invoked, we ignore what the HTML5
    ## spec requires, i.e. unsupported encoding should be ignored.
    ## TODO: We should not do this unless the parser is invoked
    ## in the conformance checking mode, in which this behavior
    ## would be useful.

    ## Step 1
    if (defined $charset_name) {
      $charset = HTML::HTML5::Parser::Charset::Info->get_by_html_name ($charset_name);
      ## TODO: Is this ok? Transfer protocol's parameter should be
      ## interpreted in its semantics?

      ($char_stream, $e_status) = $charset->get_decode_handle
          ($byte_stream, allow_error_reporting => 1,
           allow_fallback => 1);
      if ($char_stream) {
        $self->{confident} = 1;
        last SNIFFING;
      } else {
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'charset:not supported',
                               layer => 'encode',
                               line => 1, column => 1,
                               value => $charset_name,
                               level => $self->{level}->{uncertain});
      }
    }

    ## Step 2
    my $byte_buffer = '';
    for (1..1024) {
      my $char = $byte_stream->getc;
      last unless defined $char;
      $byte_buffer .= $char;
    } ## TODO: timeout

    ## Step 3
    if ($byte_buffer =~ /^\xFE\xFF/) {
      $charset = HTML::HTML5::Parser::Charset::Info->get_by_html_name ('utf-16be');
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($byte_stream, allow_error_reporting => 1,
           allow_fallback => 1, byte_buffer => \$byte_buffer);
      $self->{confident} = 1;
      last SNIFFING;
    } elsif ($byte_buffer =~ /^\xFF\xFE/) {
      $charset = HTML::HTML5::Parser::Charset::Info->get_by_html_name ('utf-16le');
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($byte_stream, allow_error_reporting => 1,
           allow_fallback => 1, byte_buffer => \$byte_buffer);
      $self->{confident} = 1;
      last SNIFFING;
    } elsif ($byte_buffer =~ /^\xEF\xBB\xBF/) {
      $charset = HTML::HTML5::Parser::Charset::Info->get_by_html_name ('utf-8');
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($byte_stream, allow_error_reporting => 1,
           allow_fallback => 1, byte_buffer => \$byte_buffer);
      $self->{confident} = 1;
      last SNIFFING;
    }

    ## Step 4
    ## TODO:

    ## Step 5
    ## TODO: from history

    ## Step 6
    require HTML::HTML5::Parser::Charset::UniversalCharDet;
    $charset_name = HTML::HTML5::Parser::Charset::UniversalCharDet->detect_byte_string($byte_buffer)
        if $byte_buffer;
    if (defined $charset_name) {
      $charset = HTML::HTML5::Parser::Charset::Info->get_by_html_name ($charset_name);

      require HTML::HTML5::Parser::Charset::DecodeHandle;
      $buffer = HTML::HTML5::Parser::Charset::DecodeHandle::ByteBuffer->new
          ($byte_stream);
      ($char_stream, $e_status) = $charset->get_decode_handle
          ($buffer, allow_error_reporting => 1,
           allow_fallback => 1, byte_buffer => \$byte_buffer);
      if ($char_stream) {
        $buffer->{buffer} = $byte_buffer;
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'sniffing:chardet',
                               text => $charset_name,
                               level => $self->{level}->{info},
                               layer => 'encode',
                               line => 1, column => 1);
        $self->{confident} = 0;
        last SNIFFING;
      }
    }

    ## Step 7: default
    ## TODO: Make this configurable.
    $charset = HTML::HTML5::Parser::Charset::Info->get_by_html_name ('windows-1252');
    ## NOTE: We choose |windows-1252| here, since |utf-8| should be
    ## detectable in the step 6.
    require HTML::HTML5::Parser::Charset::DecodeHandle;
    $buffer = HTML::HTML5::Parser::Charset::DecodeHandle::ByteBuffer->new
        ($byte_stream);
    ($char_stream, $e_status)
        = $charset->get_decode_handle ($buffer,
                                       allow_error_reporting => 1,
                                       allow_fallback => 1,
                                       byte_buffer => \$byte_buffer);
    $buffer->{buffer} = $byte_buffer;
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'sniffing:default',
                           text => 'windows-1252',
                           level => $self->{level}->{info},
                           line => 1, column => 1,
                           layer => 'encode');
    $self->{confident} = 0;
  } # SNIFFING

  if ($e_status & HTML::HTML5::Parser::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
    $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:fallback',
                           #text => $self->{input_encoding},
                           level => $self->{level}->{uncertain},
                           line => 1, column => 1,
                           layer => 'encode');
  } elsif (not ($e_status &
                HTML::HTML5::Parser::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
    $self->{input_encoding} = $charset->get_iana_name;
    $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:no error',
                           text => $self->{input_encoding},
                           level => $self->{level}->{uncertain},
                           line => 1, column => 1,
                           layer => 'encode');
  } else {
    $self->{input_encoding} = $charset->get_iana_name;
  }

  ## Callback used to switch to another encoding while parsing.
  $self->{change_encoding} = sub {
    my $self = shift;
    $charset_name = shift;
    my $token = shift;

    my $orig_char_stream = $char_stream;

    $charset = HTML::HTML5::Parser::Charset::Info->get_by_html_name ($charset_name);
    ($char_stream, $e_status) = $charset->get_decode_handle
        ($byte_stream, allow_error_reporting => 1, allow_fallback => 1,
         byte_buffer => \ $buffer->{buffer});

    if ($char_stream) { # if supported
      if ($charset->{category} &
          HTML::HTML5::Parser::Charset::Info::CHARSET_CATEGORY_ASCII_COMPAT () or
          $charset->{category} &
          HTML::HTML5::Parser::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
        #
      } else {
        return;
      }

      ## "Change the encoding" algorithm:

      ## Step 1
      if (defined $self->{input_encoding} and
          $self->{input_encoding} eq $charset_name) {
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'charset label:matching',
                               text => $charset_name,
                               level => $self->{level}->{info});
        $self->{confident} = 1;
        return;
      }

      ## Step 2 (HTML5 revision 3205)
      if (defined $self->{input_encoding} and
          HTML::HTML5::Parser::Charset::Info->get_by_html_name ($self->{input_encoding})
          ->{category} & HTML::HTML5::Parser::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
        $self->{confident} = 1;
        return;
      }

      ## Step 3
      if ($charset->{category} &
          HTML::HTML5::Parser::Charset::Info::CHARSET_CATEGORY_UTF16 ()) {
        $charset = HTML::HTML5::Parser::Charset::Info->get_by_html_name ('utf-8');
        ($char_stream, $e_status) = $charset->get_decode_handle
            ($byte_stream, allow_error_reporting => 1,
             byte_buffer => \ $buffer->{buffer});
      }
      $charset_name = $charset->get_iana_name;

      $self->{parse_error}->(level => $self->{level}->{must}, type => 'charset label detected',
                             text => $self->{input_encoding},
                             value => $charset_name,
                             level => $self->{level}->{warn},
                             token => $token);

      ## Step 4
      # if (can) {
        ## change the encoding on the fly.
        #$self->{confident} = 1;
        #return;
      # }

      ## Step 5
      HTML::HTML5::Parser::TagSoupParser::RestartParser->throw;
    } else {
      $char_stream = $orig_char_stream;
    }
  }; # $self->{change_encoding}

  # XXX IF YOU PUT $SELF IN HERE YOU GET HUGE FAT MEMORY LEAKS
  my %x = (
    level => $self->{level}{must},
    layer => 'encode',
    line => $self->{line},
    column => $self->{column} + 1,
    error => $self->{parse_error},
  );
  my $char_onerror = sub {
    my (undef, $type, %opt) = @_;
    $x{error}->(
      level => $x{level}, layer => $x{layer},
      line => $x{line}, column => $x{column},
      %opt, type => $type);
    if ($opt{octets}) {
      ${$opt{octets}} = "\x{FFFD}"; # replacement character
    }
  };

  my $wrapped_char_stream = $get_wrapper->($char_stream);
  $wrapped_char_stream->onerror ($char_onerror);

  my @args = ($_[1], $_[2]); # $doc, $onerror - $get_wrapper = undef;
  my $return;
  try {
    $return = $self->parse_char_stream ($wrapped_char_stream, @args);
  }
  ## NOTE: Invoked after {change_encoding}.
  catch {
    unless (blessed($_)
            and $_->isa('HTML::HTML5::Parser::TagSoupParser::RestartParser')) {
      die $_;
    }

    if ($e_status & HTML::HTML5::Parser::Charset::Info::FALLBACK_ENCODING_IMPL ()) {
      $self->{input_encoding} = $charset->get_iana_name; ## TODO: Should we set actual charset decoder's encoding name?
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:fallback',
                             level => $self->{level}->{uncertain},
                             #text => $self->{input_encoding},
                             line => 1, column => 1,
                             layer => 'encode');
    } elsif (not ($e_status &
                  HTML::HTML5::Parser::Charset::Info::ERROR_REPORTING_ENCODING_IMPL ())) {
      $self->{input_encoding} = $charset->get_iana_name;
      $self->{parse_error}->(level => $self->{level}->{must}, type => 'chardecode:no error',
                             text => $self->{input_encoding},
                             level => $self->{level}->{uncertain},
                             line => 1, column => 1,
                             layer => 'encode');
    } else {
      $self->{input_encoding} = $charset->get_iana_name;
    }
    $self->{confident} = 1;

    $wrapped_char_stream = $get_wrapper->($char_stream);
    $wrapped_char_stream->onerror ($char_onerror);

    $return = $self->parse_char_stream ($wrapped_char_stream, @args);
  };

  $self->_data($return, charset => $charset_name);
  return $return;
} # parse_byte_stream

## NOTE: HTML5 spec says that the encoding layer MUST NOT strip BOM
## and the HTML layer MUST ignore it. However, we do strip BOM in
## the encoding layer and the HTML layer does not ignore any U+FEFF,
## because the core part of our HTML parser expects a string of character,
## not a string of bytes or code units or anything which might contain a BOM.
## Therefore, any parser interface that accepts a string of bytes,
## such as |parse_byte_string| in this module, must ensure that it does
## strip the BOM and never strip any ZWNBSP.

## Parse a string of characters (or a reference to one) by wrapping it
## in a CharString decode handle and delegating to |parse_char_stream|.
sub parse_char_string ($$$;$$) {
  #my ($self, $s, $doc, $onerror, $get_wrapper) = @_;
  my $self = shift;
  my $s = ref $_[0] ? $_[0] : \($_[0]);
  require HTML::HTML5::Parser::Charset::DecodeHandle;
  my $input = HTML::HTML5::Parser::Charset::DecodeHandle::CharString->new ($s);
  return $self->parse_char_stream ($input, @_[1..$#_]);
} # parse_char_string
*parse_string = \&parse_char_string; ## NOTE: Alias for backward compatibility.

## Parse a character stream into $doc.
##
##   $self->parse_char_stream ($input, $doc, $onerror, $get_wrapper)
##
## Empties $doc, installs the |set_nc|, |read_until| and |parse_error|
## callbacks, runs the tokenizer and tree constructor, removes the
## callbacks again and returns $doc.
sub parse_char_stream ($$$;$$) {
  my $self = ref $_[0] ? shift : shift->new;
  my $input = $_[0];
  my $doc = $self->{document} = $_[1];
  $self->{document}->removeChildNodes;

  ## NOTE: |set_inner_html| copies most of this method's code

  ## Confidence: irrelevant.
  $self->{confident} = 1 unless exists $self->{confident};

  $self->{document}->setEncoding($self->{input_encoding})
      if defined $self->{input_encoding};
  ## TODO: |{input_encoding}| is needless?

  $self->{line_prev} = $self->{line} = 1;
  $self->{column_prev} = -1;
  $self->{column} = 0;

  ## Reads the next input character into |{nc}| (-1 at end of input),
  ## normalising CR and CRLF to LF and maintaining line/column counters.
  $self->{set_nc} = sub {
    my $self = shift;

    my $char = '';
    if (defined $self->{next_nc}) {
      $char = $self->{next_nc};
      delete $self->{next_nc};
      $self->{nc} = ord $char;
    } else {
      $self->{char_buffer} = '';
      $self->{char_buffer_pos} = 0;

      my $count = $input->manakai_read_until
          ($self->{char_buffer}, qr/[^\x0A\x0D]/, $self->{char_buffer_pos});
      if ($count) {
        $self->{line_prev} = $self->{line};
        $self->{column_prev} = $self->{column};
        $self->{column}++;
        $self->{nc}
            = ord substr ($self->{char_buffer},
                          $self->{char_buffer_pos}++, 1);
        return;
      }

      if ($input->read ($char, 1)) {
        $self->{nc} = ord $char;
      } else {
        $self->{nc} = -1;
        return;
      }
    }

    ($self->{line_prev}, $self->{column_prev})
        = ($self->{line}, $self->{column});
    $self->{column}++;

    if ($self->{nc} == 0x000A) { # LF
      $self->{line}++;
      $self->{column} = 0;
    } elsif ($self->{nc} == 0x000D) { # CR
      ## TODO: support for abort/streaming
      my $next = '';
      if ($input->read ($next, 1) and $next ne "\x0A") {
        $self->{next_nc} = $next;
      }
      $self->{nc} = 0x000A; # LF # MUST
      $self->{line}++;
      $self->{column} = 0;
    }
  };

  ## Appends to $_[0] (at offset $_[2]) a run of characters that are
  ## neither in the range $_[1] nor CR/LF; returns the number read.
  $self->{read_until} = sub {
    #my ($scalar, $specials_range, $offset) = @_;
    return 0 if defined $self->{next_nc};

    my $pattern = qr/[^$_[1]\x0A\x0D]/;
    my $offset = $_[2] || 0;

    if ($self->{char_buffer_pos} < length $self->{char_buffer}) {
      pos ($self->{char_buffer}) = $self->{char_buffer_pos};
      if ($self->{char_buffer} =~ /\G(?>$pattern)+/) {
        substr ($_[0], $offset)
            = substr ($self->{char_buffer}, $-[0], $+[0] - $-[0]);
        my $count = $+[0] - $-[0];
        if ($count) {
          $self->{column} += $count;
          $self->{char_buffer_pos} += $count;
          $self->{line_prev} = $self->{line};
          $self->{column_prev} = $self->{column} - 1;
          $self->{nc} = -1;
        }
        return $count;
      } else {
        return 0;
      }
    } else {
      my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]);
      if ($count) {
        $self->{column} += $count;
        $self->{line_prev} = $self->{line};
        $self->{column_prev} = $self->{column} - 1;
        $self->{nc} = -1;
      }
      return $count;
    }
  }; # $self->{read_until}

  my $onerror = $_[2] || sub {
    my (%opt) = @_;
    my $line = $opt{token} ? $opt{token}->{line} : $opt{line};
    my $column = $opt{token} ? $opt{token}->{column} : $opt{column};
    warn "Parse error ($opt{type}) at line $line column $column\n";
  };
  $self->{parse_error} = sub {
    $onerror->(line => $self->{line}, column => $self->{column}, @_);
  };

  my $char_onerror = sub {
    my (undef, $type, %opt) = @_;
    $self->{parse_error}->(level => $self->{level}->{must}, layer => 'encode',
                           line => $self->{line}, column => $self->{column} + 1,
                           %opt, type => $type);
  }; # $char_onerror

  if ($_[3]) {
    $input = $_[3]->($input);
    $input->onerror ($char_onerror);
  } else {
    $input->onerror ($char_onerror) unless defined $input->onerror;
  }

  $self->_initialize_tokenizer;
  $self->_initialize_tree_constructor;
  $self->_construct_tree;
  $self->_terminate_tree_constructor;

  ## Remove self-references
  delete $self->{set_nc};
  delete $self->{read_until};
  delete $self->{parse_error};
  delete $self->{document};

  return $doc;
} # parse_char_stream

## Constructor.  Accepts a list of name/value options; with a true
## |no_cache| option the object gets a private annotation store instead
## of the package-wide |$DATA|.
sub new ($;@) {
  my $class = shift;
  my %p = @_;
  my $self = bless {
    level => {
      must => 'm',
      should => 's',
      obsconforming => 's',
      warn => 'w',
      info => 'i',
      uncertain => 'u',
    },
    _debug_cache => $p{no_cache} ? {} : $DATA,
  }, $class;
  $self->{set_nc} = sub {
    $self->{nc} = -1;
  };
  $self->{parse_error} = sub {
    #
  };
  $self->{change_encoding} = sub {
    # if ($_[0] is a supported encoding) {
    #   run "change the encoding" algorithm;
    #   throw Whatpm::HTML::RestartParser (charset => $new_encoding);
    # }
  };
  $self->{application_cache_selection} = sub {
    #
  };
  return $self;
} # new

## Insertion modes

sub AFTER_HTML_IMS () { 0b100 }
sub HEAD_IMS () { 0b1000 }
sub BODY_IMS () { 0b10000 }
sub BODY_TABLE_IMS () { 0b100000 }
sub TABLE_IMS () { 0b1000000 }
sub ROW_IMS () { 0b10000000 }
sub BODY_AFTER_IMS () { 0b100000000 }
sub FRAME_IMS () { 0b1000000000 }
sub SELECT_IMS () { 0b10000000000 }
sub IN_CDATA_RCDATA_IM () { 0b1000000000000 }
    ## NOTE: "in CDATA/RCDATA" insertion mode is also special; it is
    ## combined with the original insertion mode. In this parser,
    ## they are stored together in the bit-or'ed form.

sub IM_MASK () { 0b11111111111 }

## NOTE: "initial" and "before html" insertion modes have no constants.

## NOTE: "after after body" insertion mode.
sub AFTER_HTML_BODY_IM () { AFTER_HTML_IMS | BODY_AFTER_IMS }

## NOTE: "after after frameset" insertion mode.
sub AFTER_HTML_FRAMESET_IM () { AFTER_HTML_IMS | FRAME_IMS }

sub IN_HEAD_IM () { HEAD_IMS | 0b00 }
sub IN_HEAD_NOSCRIPT_IM () { HEAD_IMS | 0b01 }
sub AFTER_HEAD_IM () { HEAD_IMS | 0b10 }
sub BEFORE_HEAD_IM () { HEAD_IMS | 0b11 }
sub IN_BODY_IM () { BODY_IMS }
sub IN_CELL_IM () { BODY_IMS | BODY_TABLE_IMS | 0b01 }
sub IN_CAPTION_IM () { BODY_IMS | BODY_TABLE_IMS | 0b10 }
sub IN_ROW_IM () { TABLE_IMS | ROW_IMS | 0b01 }
sub IN_TABLE_BODY_IM () { TABLE_IMS | ROW_IMS | 0b10 }
sub IN_TABLE_IM () { TABLE_IMS }
sub AFTER_BODY_IM () { BODY_AFTER_IMS }
sub IN_FRAMESET_IM () { FRAME_IMS | 0b01 }
sub AFTER_FRAMESET_IM () { FRAME_IMS | 0b10 }
sub IN_SELECT_IM () { SELECT_IMS | 0b01 }
sub IN_SELECT_IN_TABLE_IM () { SELECT_IMS | 0b10 }
sub IN_COLUMN_GROUP_IM () { 0b10 }

## Prepares the document annotations and parser state before tree
## construction starts.
sub _initialize_tree_constructor ($) {
  my $self = shift;
  ## NOTE: $self->{document} MUST be specified before this method is called
  $self->_data($self->{document})->{strict_error_checking} = 0;
  ## TODO: Turn mutation events off # MUST
  ## TODO: Turn loose Document option (manakai extension) on
  $self->_data($self->{document})->{manakai_is_html} = 1; # MUST
  $self->_data($self->{document})->{manakai_source_line} = 1;
  $self->_data($self->{document})->{manakai_source_column} = 1;

  $self->{frameset_ok} = 1;
} # _initialize_tree_constructor

## Re-enables the |strict_error_checking| annotation once tree
## construction has finished.
sub _terminate_tree_constructor ($) {
  my $self = shift;
  $self->_data($self->{document}, strict_error_checking => 1);
  ## TODO: Turn mutation events on
} # _terminate_tree_constructor

## ISSUE: Should appendChild (for example) in script executed in tree construction stage fire mutation events?

{ # tree construction stage
  my $token;

## Drives tree construction: resets per-parse state, fetches the first
## token and runs the "initial", "before html" and remaining insertion
## modes in turn.
sub _construct_tree ($) {
  my ($self) = @_;

  ## When an interactive UA render the $self->{document} available
  ## to the user, or when it begin accepting user input, are
  ## not defined.

  $self->{insertion_mode} = 0; # dummy

  $token = $self->_get_next_token;

  undef $self->{form_element};
  undef $self->{head_element};
  $self->{open_elements} = [];
  undef $self->{inner_html_node};
  undef $self->{ignore_newline};

  ## NOTE: The "initial" insertion mode.
  $self->_tree_construction_initial; # MUST

  ## NOTE: The "before html" insertion mode.
  $self->_tree_construction_root_element;
  $self->{insertion_mode} = BEFORE_HEAD_IM;

  ## NOTE: The "before head" insertion mode and so on.
  $self->_tree_construction_main;
} # _construct_tree

## The "initial" insertion mode.
##
## Consumes leading whitespace and comments, then handles the DOCTYPE
## (or its absence): reports conformance errors, records the DOCTYPE
## details on the document via |_data| and sets the
## |manakai_compat_mode| annotation to 'quirks' or 'limited quirks'
## where applicable.  Returns when the "before html" insertion mode
## should take over.
sub _tree_construction_initial ($) {
  my $self = shift;

  ## NOTE: "initial" insertion mode

  INITIAL: {
    if ($token->{type} == DOCTYPE_TOKEN) {
      ## NOTE: Conformance checkers MAY, instead of reporting "not
      ## HTML5" error, switch to a conformance checking mode for
      ## another language. (We don't support such mode switchings; it
      ## is nonsense to do anything different from what browsers do.)
      my $doctype_name = $token->{name};
      $doctype_name = '' unless defined $doctype_name;
      if ($doctype_name ne 'html') {
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'not HTML5', token => $token);
      } elsif (defined $token->{pubid}) {
        ## Obsolete permitted DOCTYPEs (case-sensitive)
        my $xsysid = {
          '-//W3C//DTD HTML 4.0//EN' => 'http://www.w3.org/TR/REC-html40/strict.dtd',
          '-//W3C//DTD HTML 4.01//EN' => 'http://www.w3.org/TR/html4/strict.dtd',
          '-//W3C//DTD XHTML 1.0 Strict//EN' => 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd',
          '-//W3C//DTD XHTML 1.1//EN' => 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd',
        }->{$token->{pubid}};
        if (defined $xsysid and
            (not defined $token->{sysid} or $token->{sysid} eq $xsysid)) {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'obs DOCTYPE', token => $token,
                                 level => $self->{level}->{obsconforming});
        } else {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'not HTML5', token => $token);
        }
      } elsif (defined $token->{sysid}) {
        if ($token->{sysid} eq 'about:legacy-compat') {
          ##
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'XSLT-compat', token => $token,
                                 level => $self->{level}->{should});
        } else {
          $self->{parse_error}->(level => $self->{level}->{must}, type => 'not HTML5', token => $token);
        }
      } else {
        ##
        #
      }

      $self->_data($self->{'document'}, 'DTD_PUBLIC_ID', $token->{pubid});
      $self->_data($self->{'document'}, 'DTD_SYSTEM_ID', $token->{sysid});
      $self->_data($self->{'document'}, 'DTD_ELEMENT', (defined $token->{name}?$token->{name}:''));
      $self->_data($self->{'document'}, 'DTD_COLUMN', $token->{column});
      $self->_data($self->{'document'}, 'DTD_LINE', $token->{line});

      # TOBYINK
      $self->_data($self->{'document'}, isHTML4 => 1)
          if (($token->{pubid}||'') =~ /html 4/i or ($token->{sysid}||'') =~ /html4/i);

      if ($token->{quirks} or $doctype_name ne 'html') {
        $self->_data($self->{document})->{'manakai_compat_mode'} = 'quirks';
      } elsif (defined $token->{pubid}) {
        my $pubid = $token->{pubid};
        $pubid =~ tr/a-z/A-Z/; ## ASCII case-insensitive.
        ## Public identifier prefixes (uppercased) that trigger quirks
        ## mode.
        my $prefix = [
          "+//SILMARIL//DTD HTML PRO V0R11 19970101//",
          "-//ADVASOFT LTD//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//AS//DTD HTML 3.0 ASWEDIT + EXTENSIONS//",
          "-//IETF//DTD HTML 2.0 LEVEL 1//",
          "-//IETF//DTD HTML 2.0 LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 1//",
          "-//IETF//DTD HTML 2.0 STRICT LEVEL 2//",
          "-//IETF//DTD HTML 2.0 STRICT//",
          "-//IETF//DTD HTML 2.0//",
          "-//IETF//DTD HTML 2.1E//",
          "-//IETF//DTD HTML 3.0//",
          "-//IETF//DTD HTML 3.2 FINAL//",
          "-//IETF//DTD HTML 3.2//",
          "-//IETF//DTD HTML 3//",
          "-//IETF//DTD HTML LEVEL 0//",
          "-//IETF//DTD HTML LEVEL 1//",
          "-//IETF//DTD HTML LEVEL 2//",
          "-//IETF//DTD HTML LEVEL 3//",
          "-//IETF//DTD HTML STRICT LEVEL 0//",
          "-//IETF//DTD HTML STRICT LEVEL 1//",
          "-//IETF//DTD HTML STRICT LEVEL 2//",
          "-//IETF//DTD HTML STRICT LEVEL 3//",
          "-//IETF//DTD HTML STRICT//",
          "-//IETF//DTD HTML//",
          "-//METRIUS//DTD METRIUS PRESENTATIONAL//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 2.0 TABLES//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML STRICT//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 HTML//",
          "-//MICROSOFT//DTD INTERNET EXPLORER 3.0 TABLES//",
          "-//NETSCAPE COMM. CORP.//DTD HTML//",
          "-//NETSCAPE COMM. CORP.//DTD STRICT HTML//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML 2.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED 1.0//",
          "-//O'REILLY AND ASSOCIATES//DTD HTML EXTENDED RELAXED 1.0//",
          "-//SOFTQUAD SOFTWARE//DTD HOTMETAL PRO 6.0::19990601::EXTENSIONS TO HTML 4.0//",
          "-//SOFTQUAD//DTD HOTMETAL PRO 4.0::19971010::EXTENSIONS TO HTML 4.0//",
          "-//SPYGLASS//DTD HTML 2.0 EXTENDED//",
          "-//SQ//DTD HTML 2.0 HOTMETAL + EXTENSIONS//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA HTML//",
          "-//SUN MICROSYSTEMS CORP.//DTD HOTJAVA STRICT HTML//",
          "-//W3C//DTD HTML 3 1995-03-24//",
          "-//W3C//DTD HTML 3.2 DRAFT//",
          "-//W3C//DTD HTML 3.2 FINAL//",
          "-//W3C//DTD HTML 3.2//",
          "-//W3C//DTD HTML 3.2S DRAFT//",
          "-//W3C//DTD HTML 4.0 FRAMESET//",
          "-//W3C//DTD HTML 4.0 TRANSITIONAL//",
          ## NOTE: This entry used to be misspelled "EXPERIMETNAL" and
          ## could therefore never match.
          "-//W3C//DTD HTML EXPERIMENTAL 19960712//",
          "-//W3C//DTD HTML EXPERIMENTAL 970421//",
          "-//W3C//DTD W3 HTML//",
          "-//W3O//DTD W3 HTML 3.0//",
          "-//WEBTECHS//DTD MOZILLA HTML 2.0//",
          "-//WEBTECHS//DTD MOZILLA HTML//",
        ]; # $prefix
        my $match;
        for (@$prefix) {
          ## NOTE: Compare against the public identifier.  This used to
          ## test |$prefix| (the array reference itself), so no prefix
          ## ever matched.
          if (substr ($pubid, 0, length $_) eq $_) {
            $match = 1;
            last;
          }
        }
        if ($match or
            $pubid eq "-//W3O//DTD W3 HTML STRICT 3.0//EN//" or
            $pubid eq "-/W3C/DTD HTML 4.0 TRANSITIONAL/EN" or
            $pubid eq "HTML") {
          $self->_data($self->{document})->{'manakai_compat_mode'} = 'quirks';
        } elsif ($pubid =~ m[^-//W3C//DTD HTML 4.01 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD HTML 4.01 TRANSITIONAL//]) {
          ## NOTE: Per the HTML spec's "initial" insertion mode, these
          ## public identifiers select limited-quirks mode when a
          ## system identifier is present and quirks mode when it is
          ## missing.  The two branches used to be the other way round.
          if (defined $token->{sysid}) {
            $self->_data($self->{document})->{'manakai_compat_mode'} = 'limited quirks';
          } else {
            $self->_data($self->{document})->{'manakai_compat_mode'} = 'quirks';
          }
        } elsif ($pubid =~ m[^-//W3C//DTD XHTML 1.0 FRAMESET//] or
                 $pubid =~ m[^-//W3C//DTD XHTML 1.0 TRANSITIONAL//]) {
          $self->_data($self->{document})->{'manakai_compat_mode'} = 'limited quirks';
        } else {
        }
      } else {
      }
      if (defined $token->{sysid}) {
        my $sysid = $token->{sysid};
        $sysid =~ tr/A-Z/a-z/; ## ASCII case-insensitive.
        if ($sysid eq "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd") {
          ## NOTE: Ensure that |PUBLIC "(limited quirks)" "(quirks)"|
          ## is signaled as in quirks mode!
          $self->_data($self->{document})->{'manakai_compat_mode'} = 'quirks';
        } else {
        }
      } else {
      }

      ## Go to the "before html" insertion mode.
      $token = $self->_get_next_token;
      return;
    } elsif ({
              START_TAG_TOKEN, 1,
              END_TAG_TOKEN, 1,
              END_OF_FILE_TOKEN, 1,
             }->{$token->{type}}) {
      unless ($self->_data($self->{'document'}, 'manakai_is_srcdoc')) {
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE', token => $token);
        $self->_data($self->{document})->{'manakai_compat_mode'} = 'quirks';
      }
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == CHARACTER_TOKEN) {
      if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
        ## Ignore the token

        unless (length $token->{data}) {
          ## Stay in the insertion mode.
          $token = $self->_get_next_token;
          redo INITIAL;
        } else {
        }
      } else {
      }

      $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE', token => $token);
      $self->_data($self->{document})->{'manakai_compat_mode'} = 'quirks';
      ## Go to the "before html" insertion mode.
      ## reprocess
      return;
    } elsif ($token->{type} == COMMENT_TOKEN) {
      my $comment = $self->{document}->createComment($token->{data});
      $self->_data($comment, manakai_source_line => $token->{line})
          if defined $token->{line};
      $self->_data($comment, manakai_source_column => $token->{column})
          if defined $token->{column};
      $self->{document}->appendChild($comment);

      ## Stay in the insertion mode.
      $token = $self->_get_next_token;
      redo INITIAL;
    } else {
      die "$0: $token->{type}: Unknown token type";
    }
  } # INITIAL

  die "$0: _tree_construction_initial: This should be never reached";
} # _tree_construction_initial

sub _tree_construction_root_element ($) {
  my $self = shift;

  ## NOTE: The "before html" insertion mode.

  B: {
      if ($token->{type} == DOCTYPE_TOKEN) {
        $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#DOCTYPE', token => $token);
        ## Ignore the token
        $token = $self->_get_next_token;
        redo B;
      } elsif ($token->{type} == COMMENT_TOKEN) {
        my $comment = $self->{document}->createComment($token->{data});
        $self->_data($comment, manakai_source_line => $token->{line})
            if defined $token->{line};
        $self->_data($comment, manakai_source_column => $token->{column})
            if defined $token->{column};
        $self->{document}->appendChild($comment);
        ## Stay in the insertion mode.
        $token = $self->_get_next_token;
        redo B;
      } elsif ($token->{type} == CHARACTER_TOKEN) {
        if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) {
          ## Ignore the token.

          unless (length $token->{data}) {
            ## Stay in the insertion mode.
$token = $self->_get_next_token; redo B; } else { } } else { } $self->{application_cache_selection}->(undef); # } elsif ($token->{type} == START_TAG_TOKEN) { if ($token->{tag_name} eq 'html') { my $root_element; $root_element = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); next unless $attr; $attr->setValue ($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $root_element->setAttributeNodeNS($attr); } $self->_data($root_element, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($root_element, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{document}->setDocumentElement($root_element); push @{$self->{open_elements}}, [$root_element, $el_category->{html}]; if ($token->{attributes}->{manifest}) { ## XXX resolve URL and drop fragment ## ## $self->{application_cache_selection} ->($token->{attributes}->{manifest}->{value}); } else { $self->{application_cache_selection}->(undef); } $token = $self->_get_next_token; return; ## Go to the "before head" insertion mode. } else { # } } elsif ($token->{type} == END_TAG_TOKEN) { if ({ head => 1, body => 1, html => 1, br => 1, }->{$token->{tag_name}}) { # } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token. 
$token = $self->_get_next_token; redo B; } } elsif ($token->{type} == END_OF_FILE_TOKEN) { # } else { die "$0: $token->{type}: Unknown token type"; } my $root_element; $root_element = $self->{document}->createElementNS((HTML_NS), 'html'); $self->_data($root_element, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($root_element, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($root_element, implied => __LINE__); $self->{document}->setDocumentElement($root_element); push @{$self->{open_elements}}, [$root_element, $el_category->{html}]; $self->{application_cache_selection}->(undef); ## NOTE: Reprocess the token. return; ## Go to the "before head" insertion mode. } # B die "$0: _tree_construction_root_element: This should never be reached"; } # _tree_construction_root_element sub _reset_insertion_mode ($) { my $self = shift; ## Step 1 my $last; ## Step 2 my $i = -1; my $node = $self->{open_elements}->[$i]; ## LOOP: Step 3 LOOP: { if ($self->{open_elements}->[0]->[0] eq $node->[0]) { $last = 1; if (defined $self->{inner_html_node}) { $node = $self->{inner_html_node}; } else { die "_reset_insertion_mode: t27"; } } ## Step 4..13 my $new_mode; if ($node->[1] == TABLE_CELL_EL) { if ($last) { # } else { $new_mode = IN_CELL_IM; } } elsif ($node->[1] & FOREIGN_EL) { # } else { $new_mode = { select => IN_SELECT_IM, ## NOTE: |option| and |optgroup| do not set ## insertion mode to "in select" by themselves. tr => IN_ROW_IM, tbody => IN_TABLE_BODY_IM, thead => IN_TABLE_BODY_IM, tfoot => IN_TABLE_BODY_IM, caption => IN_CAPTION_IM, colgroup => IN_COLUMN_GROUP_IM, table => IN_TABLE_IM, head => IN_BODY_IM, # not in head! body => IN_BODY_IM, frameset => IN_FRAMESET_IM, }->{$node->[0]->tagName}; } $self->{insertion_mode} = $new_mode and last LOOP if defined $new_mode; ## Step 14 if ($node->[1] == HTML_EL) { ## NOTE: Commented out in the spec (HTML5 revision 3894). 
#unless (defined $self->{head_element}) { $self->{insertion_mode} = BEFORE_HEAD_IM; #} else { ## ISSUE: Can this state be reached? # $self->{insertion_mode} = AFTER_HEAD_IM; #} last LOOP; } else { } ## Step 15 if ($last) { $self->{insertion_mode} = IN_BODY_IM; last LOOP; } ## Step 16 $i--; $node = $self->{open_elements}->[$i]; ## Step 17 redo LOOP; } # LOOP ## END } # _reset_insertion_mode my $parse_rcdata = sub ($$$$) { my ($self, $insert, $open_tables, $parse_refs) = @_; ## Step 1 my $start_tag_name = $token->{tag_name}; { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue ($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } ## Step 2 if ($parse_refs) { $self->{state} = RCDATA_STATE; } else { $self->{state} = RAWTEXT_STATE; } delete $self->{escape}; # MUST ## Step 3, 4 $self->{insertion_mode} |= IN_CDATA_RCDATA_IM; $token = $self->_get_next_token; }; # $parse_rcdata my $script_start_tag = sub ($$$) { my ($self, $insert, $open_tables) = @_; ## Step 1 my $script_el; $script_el = $self->{document}->createElementNS((HTML_NS), 'script'); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => 
$attr_t->{column}); $script_el->setAttributeNodeNS($attr); } $self->_data($script_el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($script_el, manakai_source_column => $token->{column}) if defined $token->{column}; ## Step 2 ## TODO: mark as "parser-inserted" ## Step 3 ## TODO: Mark as "already executed", if ... ## Step 4 (HTML5 revision 2702) $insert->($self, $script_el, $open_tables); push @{$self->{open_elements}}, [$script_el, $el_category->{script}]; ## Step 5 $self->{state} = SCRIPT_DATA_STATE; delete $self->{escape}; # MUST ## Step 6-7 $self->{insertion_mode} |= IN_CDATA_RCDATA_IM; $token = $self->_get_next_token; }; # $script_start_tag sub push_afe ($$) { my ($item => $afes) = @_; my $item_token = $item->[2]; my $depth = 0; OUTER: for my $i (reverse 0..$#$afes) { my $afe = $afes->[$i]; if ($afe->[0] eq '#marker') { last OUTER; } else { my $token = $afe->[2]; ## Both |$token| and |$item_token| should be start tag tokens. if ($token->{tag_name} eq $item_token->{tag_name}) { if ((keys %{$token->{attributes}}) != (keys %{$item_token->{attributes}})) { next OUTER; } for my $attr_name (keys %{$item_token->{attributes}}) { next OUTER unless $token->{attributes}->{$attr_name}; next OUTER unless $token->{attributes}->{$attr_name}->{value} eq $item_token->{attributes}->{$attr_name}->{value}; } $depth++; if ($depth == 3) { splice @$afes, $i, 1 => (); last OUTER; } } ## We don't have to check namespaces of elements and attributes, ## nevertheless the spec requires it, because |$afes| could ## never contain a non-HTML element at the time of writing. In ## addition, scripted changes would never change the original ## start tag token. } } # OUTER push @$afes, $item; } # push_afe my $formatting_end_tag = sub { my ($self, $active_formatting_elements, $open_tables, $end_tag_token) = @_; my $tag_name = $end_tag_token->{tag_name}; ## NOTE: The adoption agency algorithm (AAA). 
## Step 1 my $outer_loop_counter = 0; OUTER: { if ($outer_loop_counter >= 8) { $token = $self->_get_next_token; last OUTER; } ## Step 3 $outer_loop_counter++; ## Step 4 my $formatting_element; my $formatting_element_i_in_active; AFE: for (reverse 0..$#$active_formatting_elements) { if ($active_formatting_elements->[$_]->[0] eq '#marker') { last AFE; } elsif ($active_formatting_elements->[$_]->[0]->tagName eq $tag_name) { $formatting_element = $active_formatting_elements->[$_]; $formatting_element_i_in_active = $_; last AFE; } } # AFE unless (defined $formatting_element) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $tag_name, token => $end_tag_token); ## Ignore the token $token = $self->_get_next_token; return; } ## has an element in scope my $in_scope = 1; my $formatting_element_i_in_open; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[0] eq $formatting_element->[0]) { if ($in_scope) { $formatting_element_i_in_open = $_; last INSCOPE; } else { # in open elements but not in scope $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $end_tag_token); ## Ignore the token $token = $self->_get_next_token; return; } } elsif ($node->[1] & SCOPING_EL) { $in_scope = 0; } } # INSCOPE unless (defined $formatting_element_i_in_open) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $end_tag_token); pop @$active_formatting_elements; # $formatting_element $token = $self->_get_next_token; ## TODO: ok? 
return; } if (not $self->{open_elements}->[-1]->[0] eq $formatting_element->[0]) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] ->tagName, token => $end_tag_token); } ## Step 5 my $furthest_block; my $furthest_block_i_in_open; OE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] & SPECIAL_EL or $node->[1] & SCOPING_EL) { ## "Special" $furthest_block = $node; $furthest_block_i_in_open = $_; ## NOTE: The topmost (eldest) node. } elsif ($node->[0] eq $formatting_element->[0]) { last OE; } } # OE ## Step 6 unless (defined $furthest_block) { # MUST splice @{$self->{open_elements}}, $formatting_element_i_in_open; splice @$active_formatting_elements, $formatting_element_i_in_active, 1; $token = $self->_get_next_token; return; } ## Step 7 my $common_ancestor_node = $self->{open_elements}->[$formatting_element_i_in_open - 1]; ## Step 8 my $bookmark_prev_el = $active_formatting_elements->[$formatting_element_i_in_active - 1] ->[0]; ## Step 9 my $node = $furthest_block; my $node_i_in_open = $furthest_block_i_in_open; my $last_node = $furthest_block; ## Step 9.1 my $inner_loop_counter = 0; INNER: { ## Step 9.2 if ($inner_loop_counter >= 3) { $token = $self->_get_next_token; last OUTER; } ## Step 9.3 $inner_loop_counter++; ## Step 9.4 $node_i_in_open--; $node = $self->{open_elements}->[$node_i_in_open]; ## Step 9.5 my $node_i_in_active; my $node_token; S7S2: { for (reverse 0..$#$active_formatting_elements) { if ($active_formatting_elements->[$_]->[0] eq $node->[0]) { $node_i_in_active = $_; $node_token = $active_formatting_elements->[$_]->[2]; last S7S2; } } splice @{$self->{open_elements}}, $node_i_in_open, 1; redo INNER; } # S7S2 ## Step 9.6 last INNER if $node->[0] eq $formatting_element->[0]; ## Step 9.7 if ($node->[0]->hasChildNodes ()) { my $new_element = []; $new_element->[0] = $self->{document}->createElementNS((HTML_NS), 
$node_token->{tag_name}); for my $attr_name (keys %{ $node_token->{attributes}}) { my $attr_t = $node_token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue ($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $new_element->[0]->setAttributeNodeNS($attr); } $self->_data($new_element->[0], manakai_source_line => $node_token->{line}) if defined $node_token->{line}; $self->_data($new_element->[0], manakai_source_column => $node_token->{column}) if defined $node_token->{column}; $new_element->[1] = $node->[1]; $new_element->[2] = $node_token; $active_formatting_elements->[$node_i_in_active] = $new_element; $self->{open_elements}->[$node_i_in_open] = $new_element; $node = $new_element; } ## Step 9.8 if ($last_node->[0] eq $furthest_block->[0]) { $bookmark_prev_el = $node->[0]; } ## Step 9.9 $node->[0]->appendChild ($last_node->[0]); ## Step 9.10 $last_node = $node; ## Step 9.11 redo INNER; } # INNER ## Step 10 if ($common_ancestor_node->[1] & TABLE_ROWS_EL) { ## Foster parenting. 
my $foster_parent_element; my $next_sibling; OE: for (reverse 0..$#{$self->{open_elements}}) { if ($self->{open_elements}->[$_]->[1] == TABLE_EL) { $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0]; $next_sibling = $self->{open_elements}->[$_]->[0]; undef $next_sibling unless $next_sibling->parentNode eq $foster_parent_element; last OE; } } # OE $foster_parent_element ||= $self->{open_elements}->[0]->[0]; $foster_parent_element->insertBefore ($last_node->[0], $next_sibling); $open_tables->[-1]->[1] = 1; # tainted } else { $common_ancestor_node->[0]->appendChild ($last_node->[0]); } ## Step 11 my $new_element = []; $new_element->[0] = $self->{document}->createElementNS((HTML_NS), $formatting_element->[2]->{tag_name}); for my $attr_name (keys %{ $formatting_element->[2]->{attributes}}) { my $attr_t = $formatting_element->[2]->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue ($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $new_element->[0]->setAttributeNodeNS($attr); } $self->_data($new_element->[0], manakai_source_line => $formatting_element->[2]->{line}) if defined $formatting_element->[2]->{line}; $self->_data($new_element->[0], manakai_source_column => $formatting_element->[2]->{column}) if defined $formatting_element->[2]->{column}; $new_element->[1] = $formatting_element->[1]; $new_element->[2] = $formatting_element->[2]; ## Step 12 my @cn = $furthest_block->[0]->childNodes; $new_element->[0]->appendChild($_) for @cn; ## Step 13 $furthest_block->[0]->appendChild ($new_element->[0]); ## Step 14 my $i; AFE: for (reverse 0..$#$active_formatting_elements) { if ($active_formatting_elements->[$_]->[0] eq $formatting_element->[0]) { splice @$active_formatting_elements, $_, 1; $i-- and last AFE if defined $i; } elsif ($active_formatting_elements->[$_]->[0] eq $bookmark_prev_el) { $i = $_; } } # AFE 
splice @$active_formatting_elements, (defined $i ? $i : 0) + 1, 0, $new_element; ## Step 15 undef $i; OE: for (reverse 0..$#{$self->{open_elements}}) { if ($self->{open_elements}->[$_]->[0] eq $formatting_element->[0]) { splice @{$self->{open_elements}}, $_, 1; $i-- and last OE if defined $i; } elsif ($self->{open_elements}->[$_]->[0] eq $furthest_block->[0]) { $i = $_; } } # OE splice @{$self->{open_elements}}, $i + 1, 0, $new_element; ## Step 16 redo OUTER; } # OUTER }; # $formatting_end_tag my $reconstruct_active_formatting_elements = sub ($$$$) { # MUST my ($self, $insert, $active_formatting_elements, $open_tables) = @_; ## Step 1 return unless @$active_formatting_elements; ## Step 3 my $i = -1; my $entry = $active_formatting_elements->[$i]; ## Step 2 return if $entry->[0] eq '#marker'; for (@{$self->{open_elements}}) { if ($entry->[0] eq $_->[0]) { return; } } S4: { ## Step 4 last S4 if $active_formatting_elements->[0]->[0] eq $entry->[0]; ## Step 5 $i--; $entry = $active_formatting_elements->[$i]; ## Step 6 if ($entry->[0] eq '#marker') { # } else { my $in_open_elements; OE: for (@{$self->{open_elements}}) { if ($entry->[0] eq $_->[0]) { $in_open_elements = 1; last OE; } } if ($in_open_elements) { # } else { ## NOTE:

X redo S4; } } ## Step 7 $i++; $entry = $active_formatting_elements->[$i]; } # S4 S7: { ## Step 8 my $clone = [$entry->[0]->cloneNode(0), $entry->[1], $entry->[2]]; ## Step 9 $insert->($self, $clone->[0], $open_tables); push @{$self->{open_elements}}, $clone; ## Step 10 $active_formatting_elements->[$i] = $self->{open_elements}->[-1]; ## Step 11 unless ($clone->[0] eq $active_formatting_elements->[-1]->[0]) { ## Step 7' $i++; $entry = $active_formatting_elements->[$i]; redo S7; } } # S7 }; # $reconstruct_active_formatting_elements my $clear_up_to_marker = sub ($) { my $active_formatting_elements = $_[0]; for (reverse 0..$#$active_formatting_elements) { if ($active_formatting_elements->[$_]->[0] eq '#marker') { splice @$active_formatting_elements, $_; return; } } }; # $clear_up_to_marker my $insert_to_current = sub { #my ($self, $child, $open_tables) = @_; $_[0]->{open_elements}->[-1]->[0]->appendChild ($_[1]); }; # insert_to_current ## Foster parenting. Note that there are three "foster parenting" ## code in the parser: for elements (this one), for texts, and for ## elements in the AAA code. 
my $insert_to_foster = sub { my ($self, $child, $open_tables) = @_; if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) { # MUST my $foster_parent_element; my $next_sibling; OE: for (reverse 0..$#{$self->{open_elements}}) { if ($self->{open_elements}->[$_]->[1] == TABLE_EL) { $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0]; $next_sibling = $self->{open_elements}->[$_]->[0]; undef $next_sibling unless $next_sibling->parentNode eq $foster_parent_element; last OE; } } # OE $foster_parent_element ||= $self->{open_elements}->[0]->[0]; # This conditional bit is by TOBY if ($next_sibling) { $foster_parent_element->insertBefore ($child, $next_sibling); } else { $foster_parent_element->appendChild($child); } $open_tables->[-1]->[1] = 1; # tainted } else { $self->{open_elements}->[-1]->[0]->appendChild ($child); } }; # $insert_to_foster sub _tree_construction_main ($) { my $self = shift; ## "List of active formatting elements". Each item in this array is ## an array reference, which contains: [0] - the element node; [1] - ## the local name of the element; [2] - the token that is used to ## create [0]. my $active_formatting_elements = []; my $insert; ## NOTE: $open_tables->[-1]->[0] is the "current table" element node. ## NOTE: $open_tables->[-1]->[1] is the "tainted" flag (OBSOLETE; unused). ## NOTE: $open_tables->[-1]->[2] is set false when non-Text node inserted. my $open_tables = [[$self->{open_elements}->[0]->[0]]]; $insert = $insert_to_current; ## NOTE: Insert a character (MUST): When a character is inserted, if ## the last node that was inserted by the parser is a Text node and ## the character has to be inserted after that node, then the ## character is appended to the Text node. However, if any other ## node is inserted by the parser, then a new Text node is created ## and the character is appended as that Text node. If I'm not ## wrong, for a parser with scripting disabled, there are only two ## cases where this occurs. 
It is the case where an element or ## comment is inserted into the |table| subtree while foster ## parenting happens. This is covered by using the [2] flag of the ## |$open_tables| structure. All other cases are handled simply by ## calling |manakai_append_text| method. B: while (1) { if ($token->{n}++ == 100) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'parser impl error', # XXXtest token => $token); require Data::Dumper; warn "====== HTML Parser Error ======\n"; warn join (' ', map { $_->[0]->tagName } @{$self->{open_elements}}) . ' #' . $self->{insertion_mode} . "\n"; warn Data::Dumper::Dumper ($token); $token = $self->_get_next_token; next B; } ## if ( (not @{$self->{open_elements}}) or (not $self->{open_elements}->[-1]->[1] & FOREIGN_EL) or ## HTML element ($self->{open_elements}->[-1]->[1] == MML_TEXT_INTEGRATION_EL and (($token->{type} == START_TAG_TOKEN and $token->{tag_name} ne 'mglyph' and $token->{tag_name} ne 'malignmark') or $token->{type} == CHARACTER_TOKEN)) or ($self->{open_elements}->[-1]->[1] & MML_AXML_EL and $token->{type} == START_TAG_TOKEN and $token->{tag_name} eq 'svg') or ( ## If the current node is an HTML integration point (other ## than |annotation-xml|). $self->{open_elements}->[-1]->[1] == SVG_INTEGRATION_EL and ($token->{type} == START_TAG_TOKEN or $token->{type} == CHARACTER_TOKEN)) or ( ## If the current node is an |annotation-xml| whose |encoding| ## is |text/html| or |application/xhtml+xml| (HTML integration ## point). $self->{open_elements}->[-1]->[1] == MML_AXML_EL and ($token->{type} == START_TAG_TOKEN or $token->{type} == CHARACTER_TOKEN) and do { my $encoding = $self->{open_elements}->[-1]->[0]->getAttributeNS(undef, 'encoding') || ''; $encoding =~ tr/A-Z/a-z/; ## ASCII case-insensitive. if ($encoding eq 'text/html' or $encoding eq 'application/xhtml+xml') { 1; } else { 0; } }) or ($token->{type} == END_OF_FILE_TOKEN)) { ## Use the rules for the current insertion mode in HTML content. 
# } else { ## Use the rules for the foreign content. if ($token->{type} == CHARACTER_TOKEN) { ## "In foreign content", character tokens. my $data = $token->{data}; while ($data =~ s/\x00/\x{FFFD}/) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL', token => $token); } $self->{open_elements}->[-1]->[0]->appendTextFromUnicode($self, $data, $token); if ($data =~ /[^\x09\x0A\x0C\x0D\x20]/) { delete $self->{frameset_ok}; } $token = $self->_get_next_token; next B; } elsif ($token->{type} == START_TAG_TOKEN) { ## "In foreign content", start tag token. if ( { b => 1, big => 1, blockquote => 1, body => 1, br => 1, center => 1, code => 1, dd => 1, div => 1, dl => 1, dt => 1, em => 1, embed => 1, h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1, head => 1, hr => 1, i => 1, img => 1, li => 1, listing => 1, menu => 1, meta => 1, nobr => 1, ol => 1, p => 1, pre => 1, ruby => 1, s => 1, small => 1, span => 1, strong => 1, strike => 1, sub => 1, sup => 1, table => 1, tt => 1, u => 1, ul => 1, var => 1, }->{$token->{tag_name}} or ($token->{tag_name} eq 'font' and ($token->{attributes}->{color} or $token->{attributes}->{face} or $token->{attributes}->{size})) ) { ## "In foreign content", HTML-only start tag. $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] ->localname, token => $token); pop @{$self->{open_elements}}; V: { my $current_node = $self->{open_elements}->[-1]; if ( ## An HTML element. not $current_node->[1] & FOREIGN_EL or ## An MathML text integration point. $current_node->[1] == MML_TEXT_INTEGRATION_EL or ## An HTML integration point. $current_node->[1] == SVG_INTEGRATION_EL or ($current_node->[1] == MML_AXML_EL and do { my $encoding = $current_node->[0]->getAttributeNS(undef, 'encoding') || ''; $encoding =~ tr/A-Z/a-z/; ## ASCII case-insensitive. 
($encoding eq 'text/html' or $encoding eq 'application/xhtml+xml'); }) ) { last V; } pop @{$self->{open_elements}}; redo V; } ## Reprocess the token. next B; } else { ## "In foreign content", foreign start tag. my $nsuri = $self->{open_elements}->[-1]->[0]->namespaceURI; my $tag_name = $token->{tag_name}; if ($nsuri eq (SVG_NS)) { $tag_name = { altglyph => 'altGlyph', altglyphdef => 'altGlyphDef', altglyphitem => 'altGlyphItem', animatecolor => 'animateColor', animatemotion => 'animateMotion', animatetransform => 'animateTransform', clippath => 'clipPath', feblend => 'feBlend', fecolormatrix => 'feColorMatrix', fecomponenttransfer => 'feComponentTransfer', fecomposite => 'feComposite', feconvolvematrix => 'feConvolveMatrix', fediffuselighting => 'feDiffuseLighting', fedisplacementmap => 'feDisplacementMap', fedistantlight => 'feDistantLight', feflood => 'feFlood', fefunca => 'feFuncA', fefuncb => 'feFuncB', fefuncg => 'feFuncG', fefuncr => 'feFuncR', fegaussianblur => 'feGaussianBlur', feimage => 'feImage', femerge => 'feMerge', femergenode => 'feMergeNode', femorphology => 'feMorphology', feoffset => 'feOffset', fepointlight => 'fePointLight', fespecularlighting => 'feSpecularLighting', fespotlight => 'feSpotLight', fetile => 'feTile', feturbulence => 'feTurbulence', foreignobject => 'foreignObject', glyphref => 'glyphRef', lineargradient => 'linearGradient', radialgradient => 'radialGradient', #solidcolor => 'solidColor', ## NOTE: Commented in spec (SVG1.2) textpath => 'textPath', }->{$tag_name} || $tag_name; } ## "adjust SVG attributes" (SVG only) - done in insert-element-f ## "adjust foreign attributes" - done in insert-element-f { my $el; $el = $self->{document}->createElementNS($nsuri, $tag_name); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr; if (defined $foreign_attr_xname->{ $attr_name }) { my $xmlnsuri = $foreign_attr_xname->{ $attr_name }->[0]; my $qname = join ':', @{$foreign_attr_xname->{ 
$attr_name }->[1]}; $qname =~ s/(^:)|(:$)//; $attr = $self->{document}->createAttributeNS($xmlnsuri, $qname); } elsif ($nsuri eq (MML_NS) && $attr_name eq 'definitionurl') { $attr = $self->{document}->createAttributeNS((MML_NS), 'math:definitionURL'); } elsif ($nsuri eq (MML_NS) ) { $attr = $self->{document}->createAttributeNS((MML_NS), "math:$attr_name"); } elsif ($nsuri eq (SVG_NS) ) { $attr = $self->{document}->createAttributeNS( (SVG_NS), "svg:".($svg_attr_name->{$attr_name} || $attr_name)); } unless (defined $attr) { $attr = $self->{document}->createAttributeNS($nsuri, $attr_name); } unless (defined $attr) { $attr = $self->{document}->createAttribute($attr_name); } if ($attr) { $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS($attr); } } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, ($el_category_f->{$nsuri}->{ $tag_name} || 0) | FOREIGN_EL | (($nsuri) eq SVG_NS ? SVG_EL : ($nsuri) eq MML_NS ? MML_EL : 0)]; if ( $token->{attributes}->{xmlns} and $token->{attributes}->{xmlns}->{value} ne ($nsuri)) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad namespace', token => $token); ## TODO: Error type documentation } if ( $token->{attributes}->{'xmlns:xlink'} and $token->{attributes}->{'xmlns:xlink'}->{value} ne q) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad namespace', token => $token); } } if ($self->{self_closing}) { pop @{$self->{open_elements}}; delete $self->{self_closing}; } else { } $token = $self->_get_next_token; next B; } } elsif ($token->{type} == END_TAG_TOKEN) { ## "In foreign content", end tag. 
if ($token->{tag_name} eq 'script' and $self->{open_elements}->[-1]->[1] == SVG_SCRIPT_EL) { ## "In foreign content", "script" end tag, if the current ## node is an SVG |script| element. pop @{$self->{open_elements}}; ## XXXscript: Execute script here. $token = $self->_get_next_token; next B; } else { ## "In foreign content", end tag. ## 1. my $i = -1; my $node = $self->{open_elements}->[$i]; ## 2. my $tag_name = $node->[0]->localname; $tag_name =~ tr/A-Z/a-z/; ## ASCII case-insensitive. if ($tag_name ne $token->{tag_name}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, level => $self->{level}->{must}); } ## 3. LOOP: { my $tag_name = $node->[0]->localname; $tag_name =~ tr/A-Z/a-z/; ## ASCII case-insensitive. if ($tag_name eq $token->{tag_name}) { splice @{$self->{open_elements}}, $i, -$i, (); $token = $self->_get_next_token; next B; } ## 4. $i--; $node = $self->{open_elements}->[$i]; ## 5. if ($node->[1] & FOREIGN_EL) { redo LOOP; } } # LOOP ## Step 6 (Use the current insertion mode in HTML content) # } } elsif ($token->{type} == COMMENT_TOKEN) { ## "In foreign content", comment token. my $comment = $self->{document}->createComment($token->{data}); $self->_data($comment, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($comment, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($comment); $token = $self->_get_next_token; next B; } elsif ($token->{type} == DOCTYPE_TOKEN) { ## "In foreign content", DOCTYPE token. $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#DOCTYPE', token => $token); ## Ignore the token. $token = $self->_get_next_token; next B; } else { die "$0: $token->{type}: Unknown token type"; } } # foreign ## The "in table text" insertion mode. 
if ($self->{insertion_mode} & TABLE_IMS and not $self->{insertion_mode} & IN_CDATA_RCDATA_IM) { C: { my $s; if ($token->{type} == CHARACTER_TOKEN) { $self->{pending_chars} ||= []; push @{$self->{pending_chars}}, $token; $token = $self->_get_next_token; next B; } else { ## There is an "insert pending chars" code clone. if ($self->{pending_chars}) { $s = join '', map { $_->{data} } @{$self->{pending_chars}}; delete $self->{pending_chars}; while ($s =~ s/\x00//) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL', token => $token); } if ($s eq '') { last C; } elsif ($s =~ /[^\x09\x0A\x0C\x0D\x20]/) { # } else { $self->{open_elements}->[-1]->[0]->appendTextFromUnicode($self, $s, $token); last C; } } else { last C; } } ## "in table" insertion mode, "Anything else". ## Foster parenting. $self->{parse_error}->(level => $self->{level}->{must}, type => 'in table:#text', token => $token); ## NOTE: As if in body, but insert into the foster parent element. $reconstruct_active_formatting_elements ->($self, $insert_to_foster, $active_formatting_elements, $open_tables); if ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_EL) { # MUST my $foster_parent_element; my $next_sibling; OE: for (reverse 0..$#{$self->{open_elements}}) { if ($self->{open_elements}->[$_]->[1] == TABLE_EL) { $foster_parent_element = $self->{open_elements}->[$_ - 1]->[0]; $next_sibling = $self->{open_elements}->[$_]->[0]; undef $next_sibling unless $next_sibling->parentNode eq $foster_parent_element; last OE; } } # OE $foster_parent_element ||= $self->{open_elements}->[0]->[0]; $foster_parent_element->insertBefore ($self->{document}->createTextNode($s), $next_sibling); $open_tables->[-1]->[1] = 1; # tainted $open_tables->[-1]->[2] = 1; # ~node inserted } else { ## NOTE: Fragment case or in a foster parent'ed element ## (e.g. |

a|). In fragment case, whether the ## character is appended to existing node or a new node is ## created is irrelevant, since the foster parent'ed nodes ## are discarded and fragment parsing does not invoke any ## script. $self->{open_elements}->[-1]->[0]->appendTextFromUnicode($self, $s); } } # C } # TABLE_IMS if ($token->{type} == DOCTYPE_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in html:#DOCTYPE', token => $token); ## Ignore the token ## Stay in the phase $token = $self->_get_next_token; next B; } elsif ($token->{type} == START_TAG_TOKEN and $token->{tag_name} eq 'html') { if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after html', text => 'html', token => $token); $self->{insertion_mode} = AFTER_BODY_IM; } elsif ($self->{insertion_mode} == AFTER_HTML_FRAMESET_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after html', text => 'html', token => $token); $self->{insertion_mode} = AFTER_FRAMESET_IM; } else { } $self->{parse_error}->(level => $self->{level}->{must}, type => 'not first start tag', token => $token); my $top_el = $self->{open_elements}->[0]->[0]; for my $attr_name (keys %{$token->{attributes}}) { unless ($top_el->hasAttribute($attr_name)) { $top_el->setAttribute ($attr_name, $token->{attributes}->{$attr_name}->{value}); } } $token = $self->_get_next_token; next B; } elsif ($token->{type} == COMMENT_TOKEN) { my $comment = $self->{document}->createComment ($token->{data}); $self->_data($comment, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($comment, manakai_source_column => $token->{column}) if defined $token->{column}; if ($self->{insertion_mode} & AFTER_HTML_IMS) { $self->{document}->appendChild ($comment); } elsif ($self->{insertion_mode} == AFTER_BODY_IM) { $self->{open_elements}->[0]->[0]->appendChild($comment); } else { $self->{open_elements}->[-1]->[0]->appendChild($comment); 
$open_tables->[-1]->[2] = 0 if @$open_tables; # ~node inserted } $token = $self->_get_next_token; next B; } elsif ($self->{insertion_mode} & IN_CDATA_RCDATA_IM) { if ($token->{type} == CHARACTER_TOKEN) { $token->{data} =~ s/^\x0A// if $self->{ignore_newline}; delete $self->{ignore_newline}; if (length $token->{data}) { ## NOTE: NULLs are replaced into U+FFFDs in tokenizer. $self->{open_elements}->[-1]->[0]->appendTextFromUnicode ($self, $token->{data}, $token); } else { } $token = $self->_get_next_token; next B; } elsif ($token->{type} == END_TAG_TOKEN) { delete $self->{ignore_newline}; if ($token->{tag_name} eq 'script') { ## Para 1-2 my $script = pop @{$self->{open_elements}}; ## Para 3 $self->{insertion_mode} &= ~ IN_CDATA_RCDATA_IM; ## Para 4 ## TODO: $old_insertion_point = $current_insertion_point; ## TODO: $current_insertion_point = just before $self->{nc}; ## Para 5 ## TODO: Run the $script->[0]. ## Para 6 ## TODO: $current_insertion_point = $old_insertion_point; ## Para 7 ## TODO: if ($pending_external_script) { ## TODO: ... ## TODO: } $token = $self->_get_next_token; next B; } else { pop @{$self->{open_elements}}; $self->{insertion_mode} &= ~ IN_CDATA_RCDATA_IM; $token = $self->_get_next_token; next B; } } elsif ($token->{type} == END_OF_FILE_TOKEN) { delete $self->{ignore_newline}; $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] ->localname, token => $token); #if ($self->{open_elements}->[-1]->[1] == SCRIPT_EL) { # ## TODO: Mark as "already executed" #} pop @{$self->{open_elements}}; $self->{insertion_mode} &= ~ IN_CDATA_RCDATA_IM; ## Reprocess. 
next B; } else { die "$0: $token->{type}: In CDATA/RCDATA: Unknown token type"; } } # insertion_mode # BEGIN:TOBYINK if ($self->{insertion_mode} == IN_HEAD_IM and ($token->{tag_name}||'') eq 'object' and $token->{type} == END_TAG_TOKEN and $self->_data($self->{'document'}, 'isHTML4')) { pop @{$self->{open_elements}} if $self->{open_elements}->[-1]->[0]->localname eq 'object'; } # END:TOBYINK if ($self->{insertion_mode} & HEAD_IMS) { if ($token->{type} == CHARACTER_TOKEN) { if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) { unless ($self->{insertion_mode} == BEFORE_HEAD_IM) { $self->{open_elements}->[-1]->[0]->appendTextFromUnicode($self, $1, $token); } else { ## Ignore the token. # } unless (length $token->{data}) { $token = $self->_get_next_token; next B; } ## TODO: set $token->{column} appropriately } if ($self->{insertion_mode} == BEFORE_HEAD_IM) { ## As if $self->{head_element} = $self->{document}->createElementNS((HTML_NS), 'head'); $self->_data($self->{head_element}, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($self->{head_element}, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($self->{head_element}, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($self->{head_element}); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; ## Reprocess in the "in head" insertion mode... pop @{$self->{open_elements}}; ## Reprocess in the "after head" insertion mode... } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { ## As if pop @{$self->{open_elements}}; $self->{parse_error}->(level => $self->{level}->{must}, type => 'in noscript:#text', token => $token); ## Reprocess in the "in head" insertion mode... ## As if pop @{$self->{open_elements}}; ## Reprocess in the "after head" insertion mode... } elsif ($self->{insertion_mode} == IN_HEAD_IM) { pop @{$self->{open_elements}}; ## Reprocess in the "after head" insertion mode... 
} else { } ## "after head" insertion mode ## As if { my $el; $el = $self->{document}->createElementNS((HTML_NS), 'body'); $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($el, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{'body'} || 0]; } $self->{insertion_mode} = IN_BODY_IM; ## The "frameset-ok" flag is left unchanged in this case. ## Reporcess the token. next B; } elsif ($token->{type} == START_TAG_TOKEN) { if ($token->{tag_name} eq 'head') { if ($self->{insertion_mode} == BEFORE_HEAD_IM) { $self->{head_element} = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue ($attr_t->{value}); $self->_data($attr,manakai_source_line => $attr_t->{line}); $self->_data($attr,manakai_source_column => $attr_t->{column}); $self->{head_element}->setAttributeNodeNS ($attr); } $self->_data($self->{head_element}, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($self->{head_element}, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild($self->{head_element}); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; $self->{insertion_mode} = IN_HEAD_IM; $token = $self->_get_next_token; next B; } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after head', text => 'head', token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in head:head', token => $token); # or in head noscript ## Ignore 
the token $token = $self->_get_next_token; next B; } } elsif ($self->{insertion_mode} == BEFORE_HEAD_IM) { ## As if $self->{head_element} = $self->{document}->createElementNS((HTML_NS), 'head'); $self->_data($self->{head_element}, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($self->{head_element}, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($self->{head_element}, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($self->{head_element}); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; $self->{insertion_mode} = IN_HEAD_IM; ## Reprocess in the "in head" insertion mode... } else { } if ($token->{tag_name} eq 'base') { if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { ## As if pop @{$self->{open_elements}}; $self->{parse_error}->(level => $self->{level}->{must}, type => 'in noscript', text => 'base', token => $token); $self->{insertion_mode} = IN_HEAD_IM; ## Reprocess in the "in head" insertion mode... } else { } ## NOTE: There is a "as if in head" code clone. 
if ($self->{insertion_mode} == AFTER_HEAD_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after head', text => $token->{tag_name}, token => $token); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; } else { } { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue ($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS($attr); } $self->_data($el,manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el,manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } pop @{$self->{open_elements}}; pop @{$self->{open_elements}} # if $self->{insertion_mode} == AFTER_HEAD_IM; $token = $self->_get_next_token; next B; } elsif ({ link => 1, basefont => 1, bgsound => 1, }->{$token->{tag_name}}) { ## NOTE: There is a "as if in head" code clone. 
if ($self->{insertion_mode} == AFTER_HEAD_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after head', text => $token->{tag_name}, token => $token); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; } else { } { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } pop @{$self->{open_elements}}; pop @{$self->{open_elements}} # if $self->{insertion_mode} == AFTER_HEAD_IM; delete $self->{self_closing}; $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'command') { if ($self->{insertion_mode} == IN_HEAD_IM) { ## NOTE: If the insertion mode at the time of the emission ## of the token was "before head", $self->{insertion_mode} ## is already changed to |IN_HEAD_IM|. ## NOTE: There is a "as if in head" code clone. 
{ my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } pop @{$self->{open_elements}}; pop @{$self->{open_elements}} # if $self->{insertion_mode} == AFTER_HEAD_IM; delete $self->{self_closing}; $token = $self->_get_next_token; next B; } else { ## NOTE: "in head noscript" or "after head" insertion mode ## - in these cases, these tags are treated as same as ## normal in-body tags. # } } elsif ($token->{tag_name} eq 'meta') { ## NOTE: There is a "as if in head" code clone. 
if ($self->{insertion_mode} == AFTER_HEAD_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after head', text => $token->{tag_name}, token => $token); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; } else { } { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue ($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } my $meta_el = pop @{$self->{open_elements}}; unless ($self->{confident}) { if ($token->{attributes}->{charset}) { ## NOTE: Whether the encoding is supported or not, ## an ASCII-compatible charset is not, is handled in ## the {change_encoding} callback. 
$self->{change_encoding} ->($self, $token->{attributes}->{charset}->{value}, $token); $self->_data($meta_el->[0]->getAttributeNodeNS (undef, 'charset'), manakai_has_reference => $token->{attributes}->{charset}->{has_reference}); } elsif ($token->{attributes}->{content} and $token->{attributes}->{'http-equiv'}) { if ($token->{attributes}->{'http-equiv'}->{value} =~ /\A[Cc][Oo][Nn][Tt][Ee][Nn][Tt]-[Tt][Yy][Pp][Ee]\z/ and $token->{attributes}->{content}->{value} =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt] [\x09\x0A\x0C\x0D\x20]*= [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'| ([^"'\x09\x0A\x0C\x0D\x20] [^\x09\x0A\x0C\x0D\x20\x3B]*))/x) { ## NOTE: Whether the encoding is supported or not, ## an ASCII-compatible charset is not, is handled ## in the {change_encoding} callback. $self->{change_encoding} ->($self, defined $1 ? $1 : defined $2 ? $2 : $3, $token); $self->_data($meta_el->[0]->getAttributeNodeNS (undef, 'content'), manakai_has_reference => $token->{attributes}->{content}->{has_reference}); } else { } } } else { if ($token->{attributes}->{charset}) { $self->_data($meta_el->[0]->getAttributeNodeNS(undef, 'charset'), manakai_has_reference => $token->{attributes}->{charset}->{has_reference}); } if ($token->{attributes}->{content}) { $self->_data($meta_el->[0]->getAttributeNodeNS(undef, 'content'), manakai_has_reference => $token->{attributes}->{content}->{has_reference}); } } pop @{$self->{open_elements}} # if $self->{insertion_mode} == AFTER_HEAD_IM; delete $self->{self_closing}; $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'title') { if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { ## As if pop @{$self->{open_elements}}; $self->{parse_error}->(level => $self->{level}->{must}, type => 'in noscript', text => 'title', token => $token); $self->{insertion_mode} = IN_HEAD_IM; ## Reprocess in the "in head" insertion mode... 
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after head', text => $token->{tag_name}, token => $token); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; } else { } ## NOTE: There is a "as if in head" code clone. $parse_rcdata->($self, $insert, $open_tables, 1); # RCDATA ## NOTE: At this point the stack of open elements contain ## the |head| element (index == -2) and the |script| element ## (index == -1). In the "after head" insertion mode the ## |head| element is inserted only for the purpose of ## providing the context for the |script| element, and ## therefore we can now and have to remove the element from ## the stack. splice @{$self->{open_elements}}, -2, 1, () # if ($self->{insertion_mode} & IM_MASK) == AFTER_HEAD_IM; next B; } elsif ($token->{tag_name} eq 'style' or $token->{tag_name} eq 'noframes') { ## NOTE: Or (scripting is enabled and tag_name eq 'noscript' and ## insertion mode IN_HEAD_IM) ## NOTE: There is a "as if in head" code clone. 
if ($self->{insertion_mode} == AFTER_HEAD_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after head', text => $token->{tag_name}, token => $token); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; } else { } $parse_rcdata->($self, $insert, $open_tables, 0); # RAWTEXT splice @{$self->{open_elements}}, -2, 1, () # if ($self->{insertion_mode} & IM_MASK) == AFTER_HEAD_IM; next B; } elsif ($token->{tag_name} eq 'noscript') { if ($self->{insertion_mode} == IN_HEAD_IM) { ## NOTE: and scripting is disalbed { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue ($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $self->{insertion_mode} = IN_HEAD_NOSCRIPT_IM; $token = $self->_get_next_token; next B; } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in noscript', text => 'noscript', token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { # } } elsif ($token->{tag_name} eq 'script') { if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { ## As if pop @{$self->{open_elements}}; $self->{parse_error}->(level => $self->{level}->{must}, type => 'in noscript', text => 'script', token => $token); $self->{insertion_mode} = IN_HEAD_IM; ## Reprocess in the "in head" insertion mode... 
} elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after head', text => $token->{tag_name}, token => $token); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; } else { } ## NOTE: There is a "as if in head" code clone. $script_start_tag->($self, $insert, $open_tables); ## ISSUE: A spec bug [Bug 6038] splice @{$self->{open_elements}}, -2, 1 # if ($self->{insertion_mode} & IM_MASK) == AFTER_HEAD_IM; next B; } elsif ($token->{tag_name} eq 'body' or $token->{tag_name} eq 'frameset') { if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { ## As if pop @{$self->{open_elements}}; $self->{parse_error}->(level => $self->{level}->{must}, type => 'in noscript', text => $token->{tag_name}, token => $token); ## Reprocess in the "in head" insertion mode... ## As if pop @{$self->{open_elements}}; ## Reprocess in the "after head" insertion mode... } elsif ($self->{insertion_mode} == IN_HEAD_IM) { pop @{$self->{open_elements}}; ## Reprocess in the "after head" insertion mode... 
} else { } ## "after head" insertion mode { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue ($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } if ($token->{tag_name} eq 'body') { delete $self->{frameset_ok}; $self->{insertion_mode} = IN_BODY_IM; } elsif ($token->{tag_name} eq 'frameset') { $self->{insertion_mode} = IN_FRAMESET_IM; } else { die "$0: tag name: $self->{tag_name}"; } $token = $self->_get_next_token; next B; # BEGIN:TOBYINK } elsif ($self->{insertion_mode} == IN_HEAD_IM and $token->{tag_name} =~ m'^(object|param)$' and $self->_data($self->{'document'}, 'isHTML4')) { { my $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } if 
($token->{tag_name} eq 'param') { pop @{$self->{open_elements}}; delete $self->{self_closing}; } $token = $self->_get_next_token; next B; # END:TOBYINK } else { # } if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { ## As if pop @{$self->{open_elements}}; $self->{parse_error}->(level => $self->{level}->{must}, type => 'in noscript:/', text => $token->{tag_name}, token => $token); ## Reprocess in the "in head" insertion mode... ## As if pop @{$self->{open_elements}}; ## Reprocess in the "after head" insertion mode... } elsif ($self->{insertion_mode} == IN_HEAD_IM) { ## As if pop @{$self->{open_elements}}; ## Reprocess in the "after head" insertion mode... } else { } ## "after head" insertion mode ## As if { my $el; $el = $self->{document}->createElementNS((HTML_NS), 'body'); $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($el, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{'body'} || 0]; } $self->{insertion_mode} = IN_BODY_IM; ## The "frameset-ok" flag is not changed in this case. ## Reprocess the token. next B; } elsif ($token->{type} == END_TAG_TOKEN) { ## "Before head", "in head", and "after head" insertion modes ## ignore most of end tags. Exceptions are "body", "html", ## and "br" end tags. "Before head" and "in head" insertion ## modes also recognize "head" end tag. "In head noscript" ## insertion modes ignore end tags except for "noscript" and ## "br". 
if ($token->{tag_name} eq 'head') { if ($self->{insertion_mode} == BEFORE_HEAD_IM) { ## As if $self->{head_element} = $self->{document}->createElementNS((HTML_NS), 'head'); $self->_data($self->{head_element}, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($self->{head_element}, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild($self->{head_element}); push @{$self->{open_elements}}, [$self->{head_element}, $el_category->{head}]; ## Reprocess in the "in head" insertion mode... pop @{$self->{open_elements}}; $self->{insertion_mode} = AFTER_HEAD_IM; $token = $self->_get_next_token; next B; } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { # } elsif ($self->{insertion_mode} == IN_HEAD_IM) { pop @{$self->{open_elements}}; $self->{insertion_mode} = AFTER_HEAD_IM; $token = $self->_get_next_token; next B; } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { # } else { die "$0: $self->{insertion_mode}: Unknown insertion mode"; } } elsif ($token->{tag_name} eq 'noscript') { if ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { pop @{$self->{open_elements}}; $self->{insertion_mode} = IN_HEAD_IM; $token = $self->_get_next_token; next B; } else { # } } elsif ({ body => ($self->{insertion_mode} != IN_HEAD_NOSCRIPT_IM), html => ($self->{insertion_mode} != IN_HEAD_NOSCRIPT_IM), br => 1, }->{$token->{tag_name}}) { if ($self->{insertion_mode} == BEFORE_HEAD_IM) { ## (before head) as if , (in head) as if $self->{head_element} = $self->{document}->createElementNS((HTML_NS), 'head'); $self->_data($self->{head_element}, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($self->{head_element}, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($self->{head_element}, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($self->{head_element}); $self->{insertion_mode} = AFTER_HEAD_IM; ## Reprocess in the 
"after head" insertion mode... } elsif ($self->{insertion_mode} == IN_HEAD_IM) { ## As if pop @{$self->{open_elements}}; $self->{insertion_mode} = AFTER_HEAD_IM; ## Reprocess in the "after head" insertion mode... } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { ## NOTE: Two parse errors for pop @{$self->{open_elements}}; $self->{insertion_mode} = IN_HEAD_IM; ## Reprocess in the "in head" insertion mode... ## As if pop @{$self->{open_elements}}; $self->{insertion_mode} = AFTER_HEAD_IM; ## Reprocess in the "after head" insertion mode... } elsif ($self->{insertion_mode} == AFTER_HEAD_IM) { # } else { die "$0: $self->{insertion_mode}: Unknown insertion mode"; } ## "after head" insertion mode ## As if { my $el; $el = $self->{document}->createElementNS((HTML_NS), 'body'); $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($el, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{'body'} || 0]; } $self->{insertion_mode} = IN_BODY_IM; ## The "frameset-ok" flag is left unchanged in this case. ## Reprocess the token. next B; } ## End tags are ignored by default. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token. 
$token = $self->_get_next_token; next B; } elsif ($token->{type} == END_OF_FILE_TOKEN) { if ($self->{insertion_mode} == BEFORE_HEAD_IM) { ## NOTE: As if $self->{head_element} = $self->{document}->createElementNS((HTML_NS), 'head'); $self->_data($self->{head_element}, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($self->{head_element}, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($self->{head_element}, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild($self->{head_element}); #push @{$self->{open_elements}}, # [$self->{head_element}, $el_category->{head}]; #$self->{insertion_mode} = IN_HEAD_IM; ## NOTE: Reprocess. ## NOTE: As if #pop @{$self->{open_elements}}; #$self->{insertion_mode} = IN_AFTER_HEAD_IM; ## NOTE: Reprocess. # } elsif ($self->{insertion_mode} == IN_HEAD_IM) { ## NOTE: As if pop @{$self->{open_elements}}; #$self->{insertion_mode} = IN_AFTER_HEAD_IM; ## NOTE: Reprocess. # } elsif ($self->{insertion_mode} == IN_HEAD_NOSCRIPT_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in noscript:#eof', token => $token); ## As if pop @{$self->{open_elements}}; #$self->{insertion_mode} = IN_HEAD_IM; ## NOTE: Reprocess. ## NOTE: As if pop @{$self->{open_elements}}; #$self->{insertion_mode} = IN_AFTER_HEAD_IM; ## NOTE: Reprocess. # } else { # } ## NOTE: As if { my $el; $el = $self->{document}->createElementNS((HTML_NS), 'body'); $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($el, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{'body'} || 0]; } $self->{insertion_mode} = IN_BODY_IM; ## The "frameset-ok" flag is left unchanged in this case. ## Reprocess the token. 
next B; } else { die "$0: $token->{type}: Unknown token type"; } } elsif ($self->{insertion_mode} & BODY_IMS) { if ($token->{type} == CHARACTER_TOKEN) { ## "In body" insertion mode, character token. It is also used ## for character tokens "in foreign content" insertion ## mode, for certain cases. while ($token->{data} =~ s/\x00//g) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL', token => $token); } if ($token->{data} eq '') { $token = $self->_get_next_token; next B; } $reconstruct_active_formatting_elements ->($self, $insert_to_current, $active_formatting_elements, $open_tables); $self->{open_elements}->[-1]->[0]->appendTextFromUnicode($self, $token->{data}, $token); if ($self->{frameset_ok} and $token->{data} =~ /[^\x09\x0A\x0C\x0D\x20]/) { delete $self->{frameset_ok}; } $token = $self->_get_next_token; next B; } elsif ($token->{type} == START_TAG_TOKEN) { if ({ caption => 1, col => 1, colgroup => 1, tbody => 1, td => 1, tfoot => 1, th => 1, thead => 1, tr => 1, }->{$token->{tag_name}}) { if (($self->{insertion_mode} & IM_MASK) == IN_CELL_IM) { ## have an element in table scope for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == TABLE_CELL_EL) { ## Close the cell $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # $token = {type => END_TAG_TOKEN, tag_name => $node->[0]->tagName, line => $token->{line}, column => $token->{column}}; next B; } elsif ($node->[1] & TABLE_SCOPING_EL) { ## ISSUE: This case can never be reached, maybe. last; } } $self->{parse_error}->(level => $self->{level}->{must}, type => 'start tag not allowed', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } elsif (($self->{insertion_mode} & IM_MASK) == IN_CAPTION_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => 'caption', token => $token); ## NOTE: As if . 
## have a table element in table scope my $i; INSCOPE: { for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == CAPTION_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last; } } $self->{parse_error}->(level => $self->{level}->{must}, type => 'start tag not allowed', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } # INSCOPE ## generate implied end tags while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) { pop @{$self->{open_elements}}; } unless ($self->{open_elements}->[-1]->[1] == CAPTION_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0]->tagName, token => $token); } else { } splice @{$self->{open_elements}}, $i; $clear_up_to_marker->($active_formatting_elements); $self->{insertion_mode} = IN_TABLE_IM; ## reprocess next B; } else { # } } else { # } } elsif ($token->{type} == END_TAG_TOKEN) { if ($token->{tag_name} eq 'td' or $token->{tag_name} eq 'th') { if (($self->{insertion_mode} & IM_MASK) == IN_CELL_IM) { ## have an element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[0]->tagName eq $token->{tag_name}) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## generate implied end tags while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) { pop @{$self->{open_elements}}; } if ($self->{open_elements}->[-1]->[0]->tagName ne $token->{tag_name}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0]->tagName, token => $token); } else { } 
splice @{$self->{open_elements}}, $i; $clear_up_to_marker->($active_formatting_elements); $self->{insertion_mode} = IN_ROW_IM; $token = $self->_get_next_token; next B; } elsif (($self->{insertion_mode} & IM_MASK) == IN_CAPTION_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { # } } elsif ($token->{tag_name} eq 'caption') { if (($self->{insertion_mode} & IM_MASK) == IN_CAPTION_IM) { ## have a table element in table scope my $i; INSCOPE: { for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == CAPTION_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last; } } $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } # INSCOPE ## generate implied end tags while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) { pop @{$self->{open_elements}}; } unless ($self->{open_elements}->[-1]->[1] == CAPTION_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0]->tagName, token => $token); } else { } splice @{$self->{open_elements}}, $i; $clear_up_to_marker->($active_formatting_elements); $self->{insertion_mode} = IN_TABLE_IM; $token = $self->_get_next_token; next B; } elsif (($self->{insertion_mode} & IM_MASK) == IN_CELL_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { # } } elsif ({ table => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1, }->{$token->{tag_name}} and ($self->{insertion_mode} & IM_MASK) == IN_CELL_IM) { ## have an element in table scope my $i; my $tn; INSCOPE: { for 
(reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[0]->localname eq $token->{tag_name}) { $i = $_; ## Close the cell $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # $token = {type => END_TAG_TOKEN, tag_name => $tn, line => $token->{line}, column => $token->{column}}; next B; } elsif ($node->[1] == TABLE_CELL_EL) { $tn = $node->[0]->tagName; ## NOTE: There is exactly one |td| or |th| element ## in scope in the stack of open elements by definition. } elsif ($node->[1] & TABLE_SCOPING_EL) { ## ISSUE: Can this be reached? last; } } $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } # INSCOPE } elsif ($token->{tag_name} eq 'table' and ($self->{insertion_mode} & IM_MASK) == IN_CAPTION_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => 'caption', token => $token); ## As if ## have a table element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == CAPTION_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { ## TODO: Wrong error type? 
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => 'caption', token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## generate implied end tags while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) { pop @{$self->{open_elements}}; } unless ($self->{open_elements}->[-1]->[1] == CAPTION_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] ->tagName, token => $token); } else { } splice @{$self->{open_elements}}, $i; $clear_up_to_marker->($active_formatting_elements); $self->{insertion_mode} = IN_TABLE_IM; ## reprocess next B; } elsif ({ body => 1, col => 1, colgroup => 1, html => 1, }->{$token->{tag_name}}) { if ($self->{insertion_mode} & BODY_TABLE_IMS) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { # } } elsif ({ tbody => 1, tfoot => 1, thead => 1, tr => 1, }->{$token->{tag_name}} and ($self->{insertion_mode} & IM_MASK) == IN_CAPTION_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { # } } elsif ($token->{type} == END_OF_FILE_TOKEN) { for my $entry (@{$self->{open_elements}}) { unless ($entry->[1] & ALL_END_TAG_OPTIONAL_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof', token => $token); last; } } ## Stop parsing. 
last B; } else { die "$0: $token->{type}: Unknown token type"; } $insert = $insert_to_current; # } elsif ($self->{insertion_mode} & TABLE_IMS) { if ($token->{type} == START_TAG_TOKEN) { if ({ tr => (($self->{insertion_mode} & IM_MASK) != IN_ROW_IM), th => 1, td => 1, }->{$token->{tag_name}}) { if (($self->{insertion_mode} & IM_MASK) == IN_TABLE_IM) { ## Clear back to table context while (not ($self->{open_elements}->[-1]->[1] & TABLE_SCOPING_EL)) { pop @{$self->{open_elements}}; } { my $el; $el = $self->{document}->createElementNS((HTML_NS), 'tbody'); $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($el, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{'tbody'} || 0]; } $self->{insertion_mode} = IN_TABLE_BODY_IM; ## reprocess in the "in table body" insertion mode... } if (($self->{insertion_mode} & IM_MASK) == IN_TABLE_BODY_IM) { unless ($token->{tag_name} eq 'tr') { $self->{parse_error}->(level => $self->{level}->{must}, type => 'missing start tag:tr', token => $token); } ## Clear back to table body context while (not ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_SCOPING_EL)) { ## ISSUE: Can this case be reached? 
pop @{$self->{open_elements}}; } $self->{insertion_mode} = IN_ROW_IM; if ($token->{tag_name} eq 'tr') { { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $open_tables->[-1]->[2] = 0 if @$open_tables; # ~node inserted $token = $self->_get_next_token; next B; } else { { my $el; $el = $self->{document}->createElementNS ((HTML_NS), 'tr'); $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($el, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{'tr'} || 0]; } ## reprocess in the "in row" insertion mode } } else { } ## Clear back to table row context while (not ($self->{open_elements}->[-1]->[1] & TABLE_ROW_SCOPING_EL)) { pop @{$self->{open_elements}}; } { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); 
$el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $open_tables->[-1]->[2] = 0 if @$open_tables; # ~node inserted $self->{insertion_mode} = IN_CELL_IM; push @$active_formatting_elements, ['#marker', '', undef]; $token = $self->_get_next_token; next B; } elsif ({ caption => 1, col => 1, colgroup => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1, # $self->{insertion_mode} == IN_ROW_IM }->{$token->{tag_name}}) { if (($self->{insertion_mode} & IM_MASK) == IN_ROW_IM) { ## As if ## have an element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == TABLE_ROW_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { ## TODO: This type is wrong. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmacthed end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## Clear back to table row context while (not ($self->{open_elements}->[-1]->[1] & TABLE_ROW_SCOPING_EL)) { ## ISSUE: Can this case be reached? pop @{$self->{open_elements}}; } pop @{$self->{open_elements}}; # tr $self->{insertion_mode} = IN_TABLE_BODY_IM; if ($token->{tag_name} eq 'tr') { ## reprocess next B; } else { ## reprocess in the "in table body" insertion mode... 
} } if (($self->{insertion_mode} & IM_MASK) == IN_TABLE_BODY_IM) { ## have an element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == TABLE_ROW_GROUP_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { ## TODO: This erorr type is wrong. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## Clear back to table body context while (not ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_SCOPING_EL)) { ## ISSUE: Can this state be reached? pop @{$self->{open_elements}}; } ## As if <{current node}> ## have an element in table scope ## true by definition ## Clear back to table body context ## nop by definition pop @{$self->{open_elements}}; $self->{insertion_mode} = IN_TABLE_IM; ## reprocess in "in table" insertion mode... } else { } if ($token->{tag_name} eq 'col') { ## Clear back to table context while (not ($self->{open_elements}->[-1]->[1] & TABLE_SCOPING_EL)) { ## ISSUE: Can this state be reached? 
pop @{$self->{open_elements}}; } { my $el; $el = $self->{document}->createElementNS((HTML_NS), 'colgroup'); $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($el, implied => __LINE__); $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{'colgroup'} || 0]; } $self->{insertion_mode} = IN_COLUMN_GROUP_IM; ## reprocess $open_tables->[-1]->[2] = 0 if @$open_tables; # ~node inserted next B; } elsif ({ caption => 1, colgroup => 1, tbody => 1, tfoot => 1, thead => 1, }->{$token->{tag_name}}) { ## Clear back to table context while (not ($self->{open_elements}->[-1]->[1] & TABLE_SCOPING_EL)) { ## ISSUE: Can this state be reached? pop @{$self->{open_elements}}; } push @$active_formatting_elements, ['#marker', '', undef] if $token->{tag_name} eq 'caption'; { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $open_tables->[-1]->[2] = 0 if @$open_tables; # ~node inserted $self->{insertion_mode} = { caption => IN_CAPTION_IM, colgroup => IN_COLUMN_GROUP_IM, tbody => IN_TABLE_BODY_IM, tfoot => IN_TABLE_BODY_IM, thead => IN_TABLE_BODY_IM, }->{$token->{tag_name}}; $token = $self->_get_next_token; next 
B; } else { die "$0: in table: <>: $token->{tag_name}"; } } elsif ($token->{tag_name} eq 'table') { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] ->tagName, token => $token); ## As if
## have a table element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == TABLE_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { ## TODO: The following is wrong, maybe. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => 'table', token => $token); ## Ignore tokens
$token = $self->_get_next_token; next B; } ## TODO: Followings are removed from the latest spec. ## generate implied end tags while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) { pop @{$self->{open_elements}}; } unless ($self->{open_elements}->[-1]->[1] == TABLE_EL) { ## NOTE: |
| $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] ->tagName, token => $token); } else { } splice @{$self->{open_elements}}, $i; pop @{$open_tables}; $self->_reset_insertion_mode; ## reprocess next B; } elsif ($token->{tag_name} eq 'style') { ## NOTE: This is a "as if in head" code clone. $parse_rcdata->($self, $insert, $open_tables, 0); # RAWTEXT $open_tables->[-1]->[2] = 0 if @$open_tables; # ~node inserted next B; } elsif ($token->{tag_name} eq 'script') { ## NOTE: This is a "as if in head" code clone. $script_start_tag->($self, $insert, $open_tables); $open_tables->[-1]->[2] = 0 if @$open_tables; # ~node inserted next B; } elsif ($token->{tag_name} eq 'input') { if ($token->{attributes}->{type}) { my $type = $token->{attributes}->{type}->{value}; $type =~ tr/A-Z/a-z/; ## ASCII case-insensitive. if ($type eq 'hidden') { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in table', text => $token->{tag_name}, token => $token); { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $open_tables->[-1]->[2] = 0 if @$open_tables; # ~node inserted ## TODO: form element pointer pop @{$self->{open_elements}}; $token = $self->_get_next_token; delete $self->{self_closing}; next B; } else { # } } 
else { # } } elsif ($token->{tag_name} eq 'form') { $self->{parse_error}->(level => $self->{level}->{must}, type => 'form in table', token => $token); # XXX documentation if ($self->{form_element}) { ## Ignore the token. $token = $self->_get_next_token; next B; } else { { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $self->{form_element} = $self->{open_elements}->[-1]->[0]; pop @{$self->{open_elements}}; $token = $self->_get_next_token; next B; } } else { # } $self->{parse_error}->(level => $self->{level}->{must}, type => 'in table', text => $token->{tag_name}, token => $token); $insert = $insert_to_foster; # } elsif ($token->{type} == END_TAG_TOKEN) { if ($token->{tag_name} eq 'tr' and ($self->{insertion_mode} & IM_MASK) == IN_ROW_IM) { ## have an element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == TABLE_ROW_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { } ## Clear back to table row context while (not 
($self->{open_elements}->[-1]->[1] & TABLE_ROW_SCOPING_EL)) { ## ISSUE: Can this state be reached? pop @{$self->{open_elements}}; } pop @{$self->{open_elements}}; # tr $self->{insertion_mode} = IN_TABLE_BODY_IM; $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'table') { if (($self->{insertion_mode} & IM_MASK) == IN_ROW_IM) { ## As if ## have an element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == TABLE_ROW_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { ## TODO: The following is wrong. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{type}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## Clear back to table row context while (not ($self->{open_elements}->[-1]->[1] & TABLE_ROW_SCOPING_EL)) { ## ISSUE: Can this state be reached? pop @{$self->{open_elements}}; } pop @{$self->{open_elements}}; # tr $self->{insertion_mode} = IN_TABLE_BODY_IM; ## reprocess in the "in table body" insertion mode... 
} if (($self->{insertion_mode} & IM_MASK) == IN_TABLE_BODY_IM) { ## have an element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == TABLE_ROW_GROUP_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## Clear back to table body context while (not ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_SCOPING_EL)) { pop @{$self->{open_elements}}; } ## As if <{current node}> ## have an element in table scope ## true by definition ## Clear back to table body context ## nop by definition pop @{$self->{open_elements}}; $self->{insertion_mode} = IN_TABLE_IM; ## reprocess in the "in table" insertion mode... } ## NOTE:
in the "in table" insertion mode. ## When you edit the code fragment below, please ensure that ## the code for in the "in table" insertion mode ## is synced with it. ## have a table element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == TABLE_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } splice @{$self->{open_elements}}, $i; pop @{$open_tables}; $self->_reset_insertion_mode; $token = $self->_get_next_token; next B; } elsif ({ tbody => 1, tfoot => 1, thead => 1, }->{$token->{tag_name}} and $self->{insertion_mode} & ROW_IMS) { if (($self->{insertion_mode} & IM_MASK) == IN_ROW_IM) { ## have an element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[0]->tagName eq $token->{tag_name}) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## As if ## have an element in table scope no warnings; my $i; use warnings; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == TABLE_ROW_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => 'tr', token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## Clear back to table row context while (not 
($self->{open_elements}->[-1]->[1] & TABLE_ROW_SCOPING_EL)) { ## ISSUE: Can this case be reached? pop @{$self->{open_elements}}; } pop @{$self->{open_elements}}; # tr $self->{insertion_mode} = IN_TABLE_BODY_IM; ## reprocess in the "in table body" insertion mode... } ## have an element in table scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[0]->tagName eq $token->{tag_name}) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## Clear back to table body context while (not ($self->{open_elements}->[-1]->[1] & TABLE_ROWS_SCOPING_EL)) { ## ISSUE: Can this case be reached? pop @{$self->{open_elements}}; } pop @{$self->{open_elements}}; $self->{insertion_mode} = IN_TABLE_IM; $token = $self->_get_next_token; next B; } elsif ({ body => 1, caption => 1, col => 1, colgroup => 1, html => 1, td => 1, th => 1, tr => 1, # $self->{insertion_mode} == IN_ROW_IM tbody => 1, tfoot => 1, thead => 1, # $self->{insertion_mode} == IN_TABLE_IM }->{$token->{tag_name}}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in table:/', text => $token->{tag_name}, token => $token); $insert = $insert_to_foster; # } } elsif ($token->{type} == END_OF_FILE_TOKEN) { unless ($self->{open_elements}->[-1]->[1] == HTML_EL and @{$self->{open_elements}} == 1) { # redundant, maybe $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof', token => $token); # } else { # } ## Stop parsing last B; } else { die "$0: $token->{type}: Unknown token 
type"; } } elsif (($self->{insertion_mode} & IM_MASK) == IN_COLUMN_GROUP_IM) { if ($token->{type} == CHARACTER_TOKEN) { if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) { $self->{open_elements}->[-1]->[0]->appendTextFromUnicode($self, $1, $token); unless (length $token->{data}) { $token = $self->_get_next_token; next B; } } # } elsif ($token->{type} == START_TAG_TOKEN) { if ($token->{tag_name} eq 'col') { { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } pop @{$self->{open_elements}}; delete $self->{self_closing}; $token = $self->_get_next_token; next B; } else { # } } elsif ($token->{type} == END_TAG_TOKEN) { if ($token->{tag_name} eq 'colgroup') { if ($self->{open_elements}->[-1]->[1] == HTML_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => 'colgroup', token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { pop @{$self->{open_elements}}; # colgroup $self->{insertion_mode} = IN_TABLE_IM; $token = $self->_get_next_token; next B; } } elsif ($token->{tag_name} eq 'col') { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => 'col', token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { # } } elsif ($token->{type} == 
END_OF_FILE_TOKEN) { if ($self->{open_elements}->[-1]->[1] == HTML_EL and @{$self->{open_elements}} == 1) { # redundant, maybe ## Stop parsing. last B; } else { ## NOTE: As if . pop @{$self->{open_elements}}; # colgroup $self->{insertion_mode} = IN_TABLE_IM; ## Reprocess. next B; } } else { die "$0: $token->{type}: Unknown token type"; } ## As if if ($self->{open_elements}->[-1]->[1] == HTML_EL) { ## TODO: Wrong error type? $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => 'colgroup', token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { pop @{$self->{open_elements}}; # colgroup $self->{insertion_mode} = IN_TABLE_IM; ## reprocess next B; } } elsif ($self->{insertion_mode} & SELECT_IMS) { if ($token->{type} == CHARACTER_TOKEN) { my $data = $token->{data}; while ($data =~ s/\x00//) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL', token => $token); } $self->{open_elements}->[-1]->[0]->appendTextFromUnicode($self, $data, $token) if $data ne ''; $token = $self->_get_next_token; next B; } elsif ($token->{type} == START_TAG_TOKEN) { if ($token->{tag_name} eq 'option') { if ($self->{open_elements}->[-1]->[1] == OPTION_EL) { ## As if pop @{$self->{open_elements}}; } else { } { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, 
[$el, $el_category->{$token->{tag_name}} || 0]; } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'optgroup') { if ($self->{open_elements}->[-1]->[1] == OPTION_EL) { ## As if pop @{$self->{open_elements}}; } else { } if ($self->{open_elements}->[-1]->[1] == OPTGROUP_EL) { ## As if pop @{$self->{open_elements}}; } else { } { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'select') { ## "In select" / "in select in table" insertion mode, ## "select" start tag. $self->{parse_error}->(level => $self->{level}->{must}, type => 'select in select', ## XXX: documentation token => $token); ## Act as if the token were . $token = {type => END_TAG_TOKEN, tag_name => 'select', line => $token->{line}, column => $token->{column}}; next B; } elsif ({ input => 1, textarea => 1, keygen => 1, }->{$token->{tag_name}}) { ## "In select" / "in select in table" insertion mode, ## "input", "keygen", "textarea" start tag. ## Parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => 'select', token => $token); ## If there "have an element in select scope" where element ## is a |select| element. 
my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == SELECT_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] == OPTGROUP_EL or $node->[1] == OPTION_EL) { # } else { last INSCOPE; } } # INSCOPE unless (defined $i) { ## Ignore the token. $token = $self->_get_next_token; next B; } ## Otherwise, act as if there were , then reprocess ## the token. $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; $token = {type => END_TAG_TOKEN, tag_name => 'select', line => $token->{line}, column => $token->{column}}; next B; } elsif ( ($self->{insertion_mode} & IM_MASK) == IN_SELECT_IN_TABLE_IM and { caption => 1, table => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1, td => 1, th => 1, }->{$token->{tag_name}} ) { ## "In select in table" insertion mode, table-related start ## tags. ## Parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => 'select', token => $token); ## Act as if there were , then reprocess the token. 
$token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; $token = {type => END_TAG_TOKEN, tag_name => 'select', line => $token->{line}, column => $token->{column}}; next B; } elsif ($token->{tag_name} eq 'script') { ## NOTE: This is an "as if in head" code clone $script_start_tag->($self, $insert, $open_tables); next B; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in select', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } } elsif ($token->{type} == END_TAG_TOKEN) { if ($token->{tag_name} eq 'optgroup') { if ($self->{open_elements}->[-1]->[1] == OPTION_EL and $self->{open_elements}->[-2]->[1] == OPTGROUP_EL) { ## As if splice @{$self->{open_elements}}, -2; } elsif ($self->{open_elements}->[-1]->[1] == OPTGROUP_EL) { pop @{$self->{open_elements}}; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'option') { if ($self->{open_elements}->[-1]->[1] == OPTION_EL) { pop @{$self->{open_elements}}; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'select') { ## "In select" / "in select in table" insertion mode, ## "select" end tag. ## There "have an element in select scope" where the element ## is |select|. 
my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == SELECT_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] == OPTION_EL or $node->[1] == OPTGROUP_EL) { # } else { last INSCOPE; } } # INSCOPE unless (defined $i) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token. $token = $self->_get_next_token; next B; } ## Otherwise, splice @{$self->{open_elements}}, $i; $self->_reset_insertion_mode; $token = $self->_get_next_token; next B; } elsif ( ($self->{insertion_mode} & IM_MASK) == IN_SELECT_IN_TABLE_IM and { caption => 1, table => 1, tbody => 1, tfoot => 1, thead => 1, tr => 1, td => 1, th => 1, }->{$token->{tag_name}} ) { ## "In select in table" insertion mode, table-related end ## tags. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## There "have an element in table scope" where the element ## is same tag name as |$token|. my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[0]->tagName eq $token->{tag_name}) { $i = $_; last INSCOPE; } elsif ($node->[1] & TABLE_SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { ## Ignore the token $token = $self->_get_next_token; next B; } ## Act as if there were , then reprocess the token. 
$token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; $token = {type => END_TAG_TOKEN, tag_name => 'select', line => $token->{line}, column => $token->{column}}; next B; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in select:/', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } } elsif ($token->{type} == END_OF_FILE_TOKEN) { unless ($self->{open_elements}->[-1]->[1] == HTML_EL and @{$self->{open_elements}} == 1) { # redundant, maybe $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof', token => $token); } else { } ## Stop parsing. last B; } else { die "$0: $token->{type}: Unknown token type"; } } elsif ($self->{insertion_mode} & BODY_AFTER_IMS) { if ($token->{type} == CHARACTER_TOKEN) { if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) { my $data = $1; ## As if in body $reconstruct_active_formatting_elements ->($self, $insert_to_current, $active_formatting_elements, $open_tables); $self->{open_elements}->[-1]->[0]->appendTextFromUnicode($self, $1, $token); unless (length $token->{data}) { $token = $self->_get_next_token; next B; } } if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after html:#text', token => $token); # } else { ## "after body" insertion mode $self->{parse_error}->(level => $self->{level}->{must}, type => 'after body:#text', token => $token); # } $self->{insertion_mode} = IN_BODY_IM; ## reprocess next B; } elsif ($token->{type} == START_TAG_TOKEN) { if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after html', text => $token->{tag_name}, token => $token); # } else { ## "after body" insertion mode $self->{parse_error}->(level => $self->{level}->{must}, type => 'after body', text => $token->{tag_name}, token => $token); # } 
$self->{insertion_mode} = IN_BODY_IM; ## reprocess next B; } elsif ($token->{type} == END_TAG_TOKEN) { if ($self->{insertion_mode} == AFTER_HTML_BODY_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after html:/', text => $token->{tag_name}, token => $token); $self->{insertion_mode} = IN_BODY_IM; ## Reprocess. next B; } else { } ## "after body" insertion mode if ($token->{tag_name} eq 'html') { if (defined $self->{inner_html_node}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => 'html', token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } else { $self->{insertion_mode} = AFTER_HTML_BODY_IM; $token = $self->_get_next_token; next B; } } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after body:/', text => $token->{tag_name}, token => $token); $self->{insertion_mode} = IN_BODY_IM; ## reprocess next B; } } elsif ($token->{type} == END_OF_FILE_TOKEN) { ## Stop parsing last B; } else { die "$0: $token->{type}: Unknown token type"; } } elsif ($self->{insertion_mode} & FRAME_IMS) { if ($token->{type} == CHARACTER_TOKEN) { if ($token->{data} =~ s/^([\x09\x0A\x0C\x20]+)//) { $self->{open_elements}->[-1]->[0]->appendTextFromUnicode($self, $1, $token); unless (length $token->{data}) { $token = $self->_get_next_token; next B; } } if ($token->{data} =~ s/^[^\x09\x0A\x0C\x20]+//) { if ($self->{insertion_mode} == IN_FRAMESET_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in frameset:#text', token => $token); } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after frameset:#text', token => $token); } else { # "after after frameset" $self->{parse_error}->(level => $self->{level}->{must}, type => 'after html:#text', token => $token); } ## Ignore the token. 
if (length $token->{data}) { ## reprocess the rest of characters } else { $token = $self->_get_next_token; } next B; } die qq[$0: Character "$token->{data}"]; } elsif ($token->{type} == START_TAG_TOKEN) { if ($token->{tag_name} eq 'frameset' and $self->{insertion_mode} == IN_FRAMESET_IM) { { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'frame' and $self->{insertion_mode} == IN_FRAMESET_IM) { { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->{open_elements}->[-1]->[0]->appendChild ($el); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } pop @{$self->{open_elements}}; delete $self->{self_closing}; 
$token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'noframes') { ## NOTE: As if in head. $parse_rcdata->($self, $insert, $open_tables, 0); # RAWTEXT next B; ## NOTE: || ## has no parse error. } else { if ($self->{insertion_mode} == IN_FRAMESET_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in frameset', text => $token->{tag_name}, token => $token); } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after frameset', text => $token->{tag_name}, token => $token); } else { # "after after frameset" $self->{parse_error}->(level => $self->{level}->{must}, type => 'after after frameset', text => $token->{tag_name}, token => $token); } ## Ignore the token $token = $self->_get_next_token; next B; } } elsif ($token->{type} == END_TAG_TOKEN) { if ($token->{tag_name} eq 'frameset' and $self->{insertion_mode} == IN_FRAMESET_IM) { if ($self->{open_elements}->[-1]->[1] == HTML_EL and @{$self->{open_elements}} == 1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; } else { pop @{$self->{open_elements}}; $token = $self->_get_next_token; } if (not defined $self->{inner_html_node} and not ($self->{open_elements}->[-1]->[1] == FRAMESET_EL)) { $self->{insertion_mode} = AFTER_FRAMESET_IM; } else { } next B; } elsif ($token->{tag_name} eq 'html' and $self->{insertion_mode} == AFTER_FRAMESET_IM) { $self->{insertion_mode} = AFTER_HTML_FRAMESET_IM; $token = $self->_get_next_token; next B; } else { if ($self->{insertion_mode} == IN_FRAMESET_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in frameset:/', text => $token->{tag_name}, token => $token); } elsif ($self->{insertion_mode} == AFTER_FRAMESET_IM) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after frameset:/', text => $token->{tag_name}, token 
=> $token); } else { # "after after html" $self->{parse_error}->(level => $self->{level}->{must}, type => 'after after frameset:/', text => $token->{tag_name}, token => $token); } ## Ignore the token $token = $self->_get_next_token; next B; } } elsif ($token->{type} == END_OF_FILE_TOKEN) { unless ($self->{open_elements}->[-1]->[1] == HTML_EL and @{$self->{open_elements}} == 1) { # redundant, maybe $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body:#eof', token => $token); } else { } ## Stop parsing last B; } else { die "$0: $token->{type}: Unknown token type"; } } else { die "$0: $self->{insertion_mode}: Unknown insertion mode"; } ## "in body" insertion mode if ($token->{type} == START_TAG_TOKEN) { if ($token->{tag_name} eq 'script') { ## NOTE: This is an "as if in head" code clone $script_start_tag->($self, $insert, $open_tables); next B; } elsif ($token->{tag_name} eq 'style') { ## NOTE: This is an "as if in head" code clone $parse_rcdata->($self, $insert, $open_tables, 0); # RAWTEXT next B; } elsif ({ base => 1, command => 1, link => 1, basefont => 1, bgsound => 1, }->{$token->{tag_name}}) { ## NOTE: This is an "as if in head" code clone, only "-t" differs { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } pop @{$self->{open_elements}}; delete 
$self->{self_closing}; $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'meta') { ## NOTE: This is an "as if in head" code clone, only "-t" differs { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } my $meta_el = pop @{$self->{open_elements}}; unless ($self->{confident}) { if ($token->{attributes}->{charset}) { ## NOTE: Whether the encoding is supported or not is ## handled in the {change_encoding} callback. $self->{change_encoding} ->($self, $token->{attributes}->{charset}->{value}, $token); $self->_data($meta_el->[0]->getAttributeNodeNS(undef, 'charset'), manakai_has_reference => $token->{attributes}->{charset}->{has_reference}); } elsif ($token->{attributes}->{content} and $token->{attributes}->{'http-equiv'}) { if ($token->{attributes}->{'http-equiv'}->{value} =~ /\A[Cc][Oo][Nn][Tt][Ee][Nn][Tt]-[Tt][Yy][Pp][Ee]\z/ and $token->{attributes}->{content}->{value} =~ /[Cc][Hh][Aa][Rr][Ss][Ee][Tt] [\x09\x0A\x0C\x0D\x20]*= [\x09\x0A\x0C\x0D\x20]*(?>"([^"]*)"|'([^']*)'| ([^"'\x09\x0A\x0C\x0D\x20][^\x09\x0A\x0C\x0D\x20\x3B]*)) /x) { ## NOTE: Whether the encoding is supported or not is handled ## in the {change_encoding} callback. $self->{change_encoding} ->($self, defined $1 ? $1 : defined $2 ? 
$2 : $3, $token); $self->_data($meta_el->[0]->getAttributeNodeNS(undef, 'content'), manakai_has_reference => $token->{attributes}->{content}->{has_reference}); } } } else { if ($token->{attributes}->{charset}) { $self->_data($meta_el->[0]->getAttributeNodeNS(undef, 'charset'), manakai_has_reference => $token->{attributes}->{charset}->{has_reference}); } if ($token->{attributes}->{content}) { $self->_data($meta_el->[0]->getAttributeNodeNS (undef, 'content'), manakai_has_reference => $token->{attributes}->{content}->{has_reference}); } } delete $self->{self_closing}; $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'title') { ## NOTE: This is an "as if in head" code clone $parse_rcdata->($self, $insert, $open_tables, 1); # RCDATA next B; } elsif ($token->{tag_name} eq 'body') { ## "In body" insertion mode, "body" start tag token. $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body', text => 'body', token => $token); if (@{$self->{open_elements}} == 1 or not ($self->{open_elements}->[1]->[1] == BODY_EL)) { ## Ignore the token } else { delete $self->{frameset_ok}; my $body_el = $self->{open_elements}->[1]->[0]; for my $attr_name (keys %{$token->{attributes}}) { unless ($body_el->hasAttribute($attr_name)) { $body_el->setAttribute($attr_name, $token->{attributes}->{$attr_name}->{value}); } } } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'frameset') { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body', text => $token->{tag_name}, token => $token); if (@{$self->{open_elements}} == 1 or not ($self->{open_elements}->[1]->[1] == BODY_EL)) { ## Ignore the token. } elsif (not $self->{frameset_ok}) { ## Ignore the token. } else { ## 1. Remove the second element. my $body = $self->{open_elements}->[1]->[0]; my $body_parent = $body->parentNode; $body_parent->removeChild ($body) if $body_parent; ## 2. Pop nodes. splice @{$self->{open_elements}}, 1; ## 3. Insert. 
{ my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } ## 4. Switch. $self->{insertion_mode} = IN_FRAMESET_IM; } $token = $self->_get_next_token; next B; } elsif ({ ## "In body" insertion mode, non-phrasing flow-content ## elements start tags. address => 1, article => 1, aside => 1, blockquote => 1, center => 1, details => 1, dir => 1, div => 1, dl => 1, fieldset => 1, figcaption => 1, figure => 1, footer => 1, header => 1, hgroup => 1, menu => 1, nav => 1, ol => 1, p => 1, section => 1, ul => 1, summary => 1, # datagrid => 1, ## Closing any heading element h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1, ## Ignoring any leading newline in content pre => 1, listing => 1, ## Form element pointer form => 1, ## A quirk & switching of insertion mode table => 1, ## Void element hr => 1, }->{$token->{tag_name}}) { ## 1. When there is an opening |form| element: if ($token->{tag_name} eq 'form' and defined $self->{form_element}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in form:form', token => $token); ## Ignore the token $token = $self->_get_next_token; next B; } ## 2. Close the |p| element, if any. 
if ($token->{tag_name} ne 'table' or # The Hixie Quirk ($self->_data($self->{document})->{'manakai_compat_mode'}||'') ne 'quirks') { ## "have a |p| element in button scope" INSCOPE: for (reverse @{$self->{open_elements}}) { if ($_->[1] == P_EL) { $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # $token = {type => END_TAG_TOKEN, tag_name => 'p', line => $token->{line}, column => $token->{column}}; next B; } elsif ($_->[1] & BUTTON_SCOPING_EL) { last INSCOPE; } } # INSCOPE } ## 3. Close the opening element, if any. if ({h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1}->{$token->{tag_name}}) { if ($self->{open_elements}->[-1]->[1] == HEADING_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0]->tagName, token => $token); pop @{$self->{open_elements}}; } } ## 4. Insertion. { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { $attr_name =~ s/[^A-Za-z0-9:_-]//g; my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } if ($token->{tag_name} eq 'pre' or $token->{tag_name} eq 'listing') { $token = $self->_get_next_token; if ($token->{type} == CHARACTER_TOKEN) { $token->{data} =~ s/^\x0A//; unless (length $token->{data}) { $token = $self->_get_next_token; } else { } } else { } delete $self->{frameset_ok}; } elsif 
($token->{tag_name} eq 'form') { $self->{form_element} = $self->{open_elements}->[-1]->[0]; $token = $self->_get_next_token; } elsif ($token->{tag_name} eq 'table') { push @{$open_tables}, [$self->{open_elements}->[-1]->[0]]; delete $self->{frameset_ok}; $self->{insertion_mode} = IN_TABLE_IM; $token = $self->_get_next_token; } elsif ($token->{tag_name} eq 'hr') { pop @{$self->{open_elements}}; delete $self->{self_closing}; delete $self->{frameset_ok}; $token = $self->_get_next_token; } else { $token = $self->_get_next_token; } next B; } elsif ($token->{tag_name} eq 'li') { ## "In body" insertion mode, "li" start tag. As normal, but ## imply when there's another
  • . ## NOTE: Special, Scope (
  • ==
  • ):: ## Interpreted as
  • (non-conforming): ## blockquote (O9.27), center (O), dd (Fx3, O, S3.1.2, IE7), ## dt (Fx, O, S, IE), dl (O), fieldset (O, S, IE), form (Fx, O, S), ## hn (O), pre (O), applet (O, S), button (O, S), marquee (Fx, O, S), ## object (Fx) ## Generate non-tree (non-conforming): ## basefont (IE7 (where basefont is non-void)), center (IE), ## form (IE), hn (IE) ## address, div, p (
  • ==
  • ):: ## Interpreted as
  • (non-conforming): ## div (Fx, S) ## 1. Frameset-ng delete $self->{frameset_ok}; my $non_optional; my $i = -1; ## 2. for my $node (reverse @{$self->{open_elements}}) { if ($node->[1] == LI_EL) { ## 3. (a) As if { ## If no - not applied # ## Otherwise ## 1. generate implied end tags, except for # ## 2. If current node != "li", parse error if ($non_optional) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $non_optional->[0]->tagName, token => $token); } else { } ## 3. Pop splice @{$self->{open_elements}}, $i; } last; ## 3. (b) goto 5. } elsif ( ## NOTE: "special" category ($node->[1] & SPECIAL_EL or $node->[1] & SCOPING_EL) and ## NOTE: "li", "dt", and "dd" are in |SPECIAL_EL|. (not $node->[1] & ADDRESS_DIV_P_EL) ) { ## 4. last; ## goto 6. } elsif ($node->[1] & END_TAG_OPTIONAL_EL) { # } else { $non_optional ||= $node; # } ## 5. ## goto 3. $i--; } ## 6. (a) "have a |p| element in button scope". INSCOPE: for (reverse @{$self->{open_elements}}) { if ($_->[1] == P_EL) { ## NOTE: |

  • |, for example. $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # $token = {type => END_TAG_TOKEN, tag_name => 'p', line => $token->{line}, column => $token->{column}}; next B; } elsif ($_->[1] & BUTTON_SCOPING_EL) { last INSCOPE; } } # INSCOPE ## 6. (b) insert { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'dt' or $token->{tag_name} eq 'dd') { ## "In body" insertion mode, "dt" or "dd" start tag. As ## normal, but imply or when there's antoher
    ## or
    . ## 1. Frameset-ng delete $self->{frameset_ok}; my $non_optional; my $i = -1; ## 2. for my $node (reverse @{$self->{open_elements}}) { if ($node->[1] == DTDD_EL) { ## 3. (a) As if
  • { ## If no - not applied # ## Otherwise ## 1. generate implied end tags, except for or # ## 2. If current node != "dt"|"dd", parse error if ($non_optional) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $non_optional->[0]->tagName, token => $token); } else { } ## 3. Pop splice @{$self->{open_elements}}, $i; } last; ## 3. (b) goto 5. } elsif ( ## NOTE: "special" category ($node->[1] & SPECIAL_EL or $node->[1] & SCOPING_EL) and ## NOTE: "li", "dt", and "dd" are in |SPECIAL_EL|. (not $node->[1] & ADDRESS_DIV_P_EL) ) { ## 4. last; ## goto 5. } elsif ($node->[1] & END_TAG_OPTIONAL_EL) { # } else { $non_optional ||= $node; # } ## 5. ## goto 3. $i--; } ## 6. (a) "have a |p| element in button scope". INSCOPE: for (reverse @{$self->{open_elements}}) { if ($_->[1] == P_EL) { $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # $token = {type => END_TAG_TOKEN, tag_name => 'p', line => $token->{line}, column => $token->{column}}; next B; } elsif ($_->[1] & BUTTON_SCOPING_EL) { last INSCOPE; } } # INSCOPE ## 6. 
(b) insert { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'plaintext') { ## "In body" insertion mode, "plaintext" start tag. As ## normal, but effectively ends parsing. ## has a p element in scope INSCOPE: for (reverse @{$self->{open_elements}}) { if ($_->[1] == P_EL) { $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # $token = {type => END_TAG_TOKEN, tag_name => 'p', line => $token->{line}, column => $token->{column}}; next B; } elsif ($_->[1] & BUTTON_SCOPING_EL) { last INSCOPE; } } # INSCOPE { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push 
@{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $self->{state} = PLAINTEXT_STATE; $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'a') { AFE: for my $i (reverse 0..$#$active_formatting_elements) { my $node = $active_formatting_elements->[$i]; no warnings; if ($node->[1] == A_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in a:a', token => $token); $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # <a> $token = {type => END_TAG_TOKEN, tag_name => 'a', line => $token->{line}, column => $token->{column}}; $formatting_end_tag->($self, $active_formatting_elements, $open_tables, $token); AFE2: for (reverse 0..$#$active_formatting_elements) { if ($active_formatting_elements->[$_]->[0] eq $node->[0]) { splice @$active_formatting_elements, $_, 1; last AFE2; } } # AFE2 OE: for (reverse 0..$#{$self->{open_elements}}) { if ($self->{open_elements}->[$_]->[0] eq $node->[0]) { splice @{$self->{open_elements}}, $_, 1; last OE; } } # OE last AFE; } elsif ($node->[0] eq '#marker') { last AFE; } } # AFE my $insert = $self->{insertion_mode} & TABLE_IMS ? 
$insert_to_foster : $insert_to_current; $reconstruct_active_formatting_elements ->($self, $insert, $active_formatting_elements, $open_tables); { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); if ($attr) { $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS($attr); } } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } push @$active_formatting_elements, [$self->{open_elements}->[-1]->[0], $self->{open_elements}->[-1]->[1], $token]; $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'nobr') { my $insert = $self->{insertion_mode} & TABLE_IMS ? 
$insert_to_foster : $insert_to_current; $reconstruct_active_formatting_elements ->($self, $insert, $active_formatting_elements, $open_tables); ## has a |nobr| element in scope INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == NOBR_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in nobr:nobr', token => $token); $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # <nobr> $token = {type => END_TAG_TOKEN, tag_name => 'nobr', line => $token->{line}, column => $token->{column}}; next B; } elsif ($node->[1] & SCOPING_EL) { last INSCOPE; } } # INSCOPE { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } push @$active_formatting_elements, [$self->{open_elements}->[-1]->[0], $self->{open_elements}->[-1]->[1], $token]; $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'button') { ## has a button element in scope INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == BUTTON_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in button:button', token => $token); $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete 
$self->{self_closing}; # <button> $token = {type => END_TAG_TOKEN, tag_name => 'button', line => $token->{line}, column => $token->{column}}; next B; } elsif ($node->[1] & SCOPING_EL) { last INSCOPE; } } # INSCOPE my $insert = $self->{insertion_mode} & TABLE_IMS ? $insert_to_foster : $insert_to_current; $reconstruct_active_formatting_elements ->($self, $insert, $active_formatting_elements, $open_tables); { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } ## TODO: associate with $self->{form_element} if defined delete $self->{frameset_ok}; $token = $self->_get_next_token; next B; } elsif ({ xmp => 1, iframe => 1, noembed => 1, noframes => 1, ## NOTE: This is an "as if in head" code clone. noscript => 0, ## TODO: 1 if scripting is enabled }->{$token->{tag_name}}) { if ($token->{tag_name} eq 'xmp') { ## "In body" insertion mode, "xmp" start tag. As normal ## flow-content element start tag, but CDATA parsing. ## "have a |p| element in button scope". 
INSCOPE: for (reverse @{$self->{open_elements}}) { if ($_->[1] == P_EL) { $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # <xmp> $token = {type => END_TAG_TOKEN, tag_name => 'p', line => $token->{line}, column => $token->{column}}; next B; } elsif ($_->[1] & BUTTON_SCOPING_EL) { last INSCOPE; } } # INSCOPE my $insert = $self->{insertion_mode} & TABLE_IMS ? $insert_to_foster : $insert_to_current; $reconstruct_active_formatting_elements ->($self, $insert, $active_formatting_elements, $open_tables); delete $self->{frameset_ok}; } elsif ($token->{tag_name} eq 'iframe') { delete $self->{frameset_ok}; } else { } ## NOTE: There is an "as if in body" code clone. $parse_rcdata->($self, $insert, $open_tables, 0); # RAWTEXT next B; } elsif ($token->{tag_name} eq 'isindex') { $self->{parse_error}->(level => $self->{level}->{must}, type => 'isindex', token => $token); if (defined $self->{form_element}) { ## Ignore the token ## NOTE: Not acknowledged. $token = $self->_get_next_token; next B; } else { delete $self->{self_closing}; my $at = $token->{attributes}; my $form_attrs; $form_attrs->{action} = $at->{action} if $at->{action}; my $prompt_attr = $at->{prompt}; $at->{name} = {name => 'name', value => 'isindex'}; delete $at->{action}; delete $at->{prompt}; my @tokens = ( {type => START_TAG_TOKEN, tag_name => 'form', attributes => $form_attrs, line => $token->{line}, column => $token->{column}}, {type => START_TAG_TOKEN, tag_name => 'hr', line => $token->{line}, column => $token->{column}}, {type => START_TAG_TOKEN, tag_name => 'label', line => $token->{line}, column => $token->{column}}, ); if ($prompt_attr) { push @tokens, {type => CHARACTER_TOKEN, data => $prompt_attr->{value}, #line => $token->{line}, column => $token->{column}, }; } else { push @tokens, {type => CHARACTER_TOKEN, data => 'This is a searchable index. 
Enter search keywords: ', #line => $token->{line}, column => $token->{column}, }; # SHOULD ## TODO: make this configurable } push @tokens, {type => START_TAG_TOKEN, tag_name => 'input', attributes => $at, line => $token->{line}, column => $token->{column}}, #{type => CHARACTER_TOKEN, data => ''}, # SHOULD {type => END_TAG_TOKEN, tag_name => 'label', line => $token->{line}, column => $token->{column}}, {type => START_TAG_TOKEN, tag_name => 'hr', line => $token->{line}, column => $token->{column}}, {type => END_TAG_TOKEN, tag_name => 'form', line => $token->{line}, column => $token->{column}}; unshift @{$self->{token}}, (@tokens); $token = $self->_get_next_token; next B; } } elsif ($token->{tag_name} eq 'textarea') { ## 1. Insert { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } ## Step 2 # XXX ## TODO: $self->{form_element} if defined ## 2. Drop U+000A LINE FEED $self->{ignore_newline} = 1; ## 3. RCDATA $self->{state} = RCDATA_STATE; delete $self->{escape}; # MUST ## 4., 6. Insertion mode $self->{insertion_mode} |= IN_CDATA_RCDATA_IM; ## 5. Frameset-ng. 
delete $self->{frameset_ok}; $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'optgroup' or $token->{tag_name} eq 'option') { if ($self->{open_elements}->[-1]->[1] == OPTION_EL) { ## NOTE: As if </option> $token->{self_closing} = $self->{self_closing}; unshift @{$self->{token}}, $token; delete $self->{self_closing}; # <option> or <optgroup> $token = {type => END_TAG_TOKEN, tag_name => 'option', line => $token->{line}, column => $token->{column}}; next B; } my $insert = $self->{insertion_mode} & TABLE_IMS ? $insert_to_foster : $insert_to_current; $reconstruct_active_formatting_elements ->($self, $insert, $active_formatting_elements, $open_tables); { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $token = $self->_get_next_token; redo B; } elsif ($token->{tag_name} eq 'rt' or $token->{tag_name} eq 'rp') { ## has a |ruby| element in scope INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == RUBY_EL) { ## generate implied end tags while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) { pop @{$self->{open_elements}}; } unless ($self->{open_elements}->[-1]->[1] == RUBY_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] 
->tagName, token => $token); } last INSCOPE; } elsif ($node->[1] & SCOPING_EL) { last INSCOPE; } } # INSCOPE { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } $token = $self->_get_next_token; redo B; } elsif ($token->{tag_name} eq 'math' or $token->{tag_name} eq 'svg') { my $insert = $self->{insertion_mode} & TABLE_IMS ? $insert_to_foster : $insert_to_current; $reconstruct_active_formatting_elements ->($self, $insert, $active_formatting_elements, $open_tables); ## "Adjust MathML attributes" ('math' only) - done in insert-element-f ## "adjust SVG attributes" ('svg' only) - done in insert-element-f ## "adjust foreign attributes" - done in insert-element-f { my $el; $el = $self->{document}->createElementNS ($token->{tag_name} eq 'math' ? 
(MML_NS) : (SVG_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr; if (defined $foreign_attr_xname->{ $attr_name }) { my $xmlnsuri = $foreign_attr_xname->{ $attr_name }->[0]; my $qname = join ':', @{$foreign_attr_xname->{ $attr_name }->[1]}; $qname =~ s/(^:)|(:$)//; $attr = $self->{document}->createAttributeNS($xmlnsuri, $qname); } elsif ($token->{tag_name} eq 'math' && $attr_name eq 'definitionurl') { $attr = $self->{document}->createAttributeNS((MML_NS), 'definitionURL'); } elsif ($token->{tag_name} eq 'math') { $attr = $self->{document}->createAttributeNS((MML_NS), $attr_name); } elsif ($token->{tag_name} eq 'svg') { $attr = $self->{document}->createAttributeNS( (SVG_NS), ($svg_attr_name->{$attr_name} || $attr_name)); } unless ($attr) { $attr = $self->{document}->createAttribute($attr_name); } if ($attr) { $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, ($el_category_f->{$token->{tag_name} eq 'math' ? MML_NS : SVG_NS}->{ $token->{tag_name}} || 0) | FOREIGN_EL | (($token->{tag_name} eq 'math' ? MML_NS : SVG_NS) eq SVG_NS ? SVG_EL : ($token->{tag_name} eq 'math' ? MML_NS : SVG_NS) eq MML_NS ? MML_EL : 0)]; if ( $token->{attributes}->{xmlns} and $token->{attributes}->{xmlns}->{value} ne ($token->{tag_name} eq 'math' ? 
(MML_NS) : (SVG_NS))) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad namespace', token => $token); ## TODO: Error type documentation } if ( $token->{attributes}->{'xmlns:xlink'} and $token->{attributes}->{'xmlns:xlink'}->{value} ne q<http://www.w3.org/1999/xlink>) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad namespace', token => $token); } } if ($self->{self_closing}) { pop @{$self->{open_elements}}; delete $self->{self_closing}; } else { } $token = $self->_get_next_token; next B; } elsif ({ caption => 1, col => 1, colgroup => 1, frame => 1, head => 1, tbody => 1, td => 1, tfoot => 1, th => 1, thead => 1, tr => 1, }->{$token->{tag_name}}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'in body', text => $token->{tag_name}, token => $token); ## Ignore the token ## NOTE: |<col/>| or |<frame/>| here is an error. $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'param' or $token->{tag_name} eq 'source' or $token->{tag_name} eq 'track') { { my $el; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS(undef, $attr_name); $attr->setValue($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } pop @{$self->{open_elements}}; delete $self->{self_closing}; $token = $self->_get_next_token; redo B; } else { if ($token->{tag_name} eq 'image') { $self->{parse_error}->(level => $self->{level}->{must}, type => 
'image', token => $token); $token->{tag_name} = 'img'; } else { } ## NOTE: There is an "as if <br>" code clone. my $insert = $self->{insertion_mode} & TABLE_IMS ? $insert_to_foster : $insert_to_current; $reconstruct_active_formatting_elements ->($self, $insert, $active_formatting_elements, $open_tables); { my $el; $token->{tag_name} =~ s/[^A-Za-z0-9:_-]//g; $el = $self->{document}->createElementNS((HTML_NS), $token->{tag_name}); ATR: for my $attr_name (keys %{ $token->{attributes}}) { my $attr_t = $token->{attributes}->{$attr_name}; my $attr = $self->{document}->createAttributeNS (undef, $attr_name); next ATR unless ref($attr); $attr->setValue ($attr_t->{value}); $self->_data($attr, manakai_source_line => $attr_t->{line}); $self->_data($attr, manakai_source_column => $attr_t->{column}); $el->setAttributeNodeNS ($attr); } $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); push @{$self->{open_elements}}, [$el, $el_category->{$token->{tag_name}} || 0]; } if ({ applet => 1, marquee => 1, object => 1, }->{$token->{tag_name}}) { push @$active_formatting_elements, ['#marker', '', undef]; delete $self->{frameset_ok}; } elsif ({ b => 1, big => 1, code=>1, em => 1, font => 1, i => 1, s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1, }->{$token->{tag_name}}) { push @$active_formatting_elements, [$self->{open_elements}->[-1]->[0], $self->{open_elements}->[-1]->[1], $token]; } elsif ($token->{tag_name} eq 'input') { ## TODO: associate with $self->{form_element} if defined pop @{$self->{open_elements}}; if ($token->{attributes}->{type}) { my $type = $token->{attributes}->{type}->{value}; $type =~ tr/A-Z/a-z/; ## ASCII case-insensitive. 
if ($type eq 'hidden') { # } else { delete $self->{frameset_ok}; } } else { delete $self->{frameset_ok}; } delete $self->{self_closing}; } elsif ({ area => 1, br => 1, embed => 1, img => 1, wbr => 1, keygen => 1, }->{$token->{tag_name}}) { pop @{$self->{open_elements}}; delete $self->{frameset_ok}; delete $self->{self_closing}; } elsif ($token->{tag_name} eq 'select') { ## TODO: associate with $self->{form_element} if defined delete $self->{frameset_ok}; if ($self->{insertion_mode} & TABLE_IMS or $self->{insertion_mode} & BODY_TABLE_IMS) { $self->{insertion_mode} = IN_SELECT_IN_TABLE_IM; } else { $self->{insertion_mode} = IN_SELECT_IM; } } else { } $token = $self->_get_next_token; next B; } } elsif ($token->{type} == END_TAG_TOKEN) { if ($token->{tag_name} eq 'body' or $token->{tag_name} eq 'html') { ## 1. If not "have an element in scope": ## "has a |body| element in scope" my $i; INSCOPE: { for (reverse @{$self->{open_elements}}) { if ($_->[1] == BODY_EL) { $i = $_; last INSCOPE; } elsif ($_->[1] & SCOPING_EL) { last; } } ## NOTE: |<marquee></body>|, |<svg><foreignobject></body>|, ## and fragment cases. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token. (</body> or </html>) $token = $self->_get_next_token; next B; } # INSCOPE ## 2. If unclosed elements: for (@{$self->{open_elements}}) { unless ($_->[1] & ALL_END_TAG_OPTIONAL_EL || $_->[1] == OPTGROUP_EL || $_->[1] == OPTION_EL || $_->[1] == RUBY_COMPONENT_EL) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $_->[0]->tagName, token => $token); last; } else { } } ## 3. Switch the insertion mode. $self->{insertion_mode} = AFTER_BODY_IM; if ($token->{tag_name} eq 'body') { $token = $self->_get_next_token; } else { # html ## Reprocess. } next B; } elsif ({ ## "In body" insertion mode, end tags for non-phrasing flow ## content elements. 
address => 1, article => 1, aside => 1, blockquote => 1, center => 1, #datagrid => 1, details => 1, dir => 1, div => 1, dl => 1, fieldset => 1, figure => 1, footer => 1, header => 1, hgroup => 1, listing => 1, menu => 1, nav => 1, ol => 1, pre => 1, section => 1, ul => 1, figcaption => 1, summary => 1, ## NOTE: As normal, but ... optional tags dd => 1, dt => 1, li => 1, applet => 1, button => 1, marquee => 1, object => 1, }->{$token->{tag_name}}) { ## NOTE: Code for <li> start tags includes "as if </li>" code. ## Code for <dt> or <dd> start tags includes "as if </dt> or ## </dd>" code. ## has an element in scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[0]->tagName eq $token->{tag_name}) { $i = $_; last INSCOPE; } elsif ($node->[1] & SCOPING_EL) { last INSCOPE; } elsif ($token->{tag_name} eq 'li' and {ul => 1, ol => 1}->{$node->[0]->localname}) { ## Has an element in list item scope last INSCOPE; } } # INSCOPE unless (defined $i) { # has an element in scope $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## NOTE: Ignore the token. } else { ## Step 1. generate implied end tags while ({ ## END_TAG_OPTIONAL_EL dd => ($token->{tag_name} ne 'dd'), dt => ($token->{tag_name} ne 'dt'), li => ($token->{tag_name} ne 'li'), option => 1, optgroup => 1, p => 1, rt => 1, rp => 1, }->{$self->{open_elements}->[-1]->[0]->tagName}) { pop @{$self->{open_elements}}; } ## Step 2. if ($self->{open_elements}->[-1]->[0]->tagName ne $token->{tag_name}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] ->tagName, token => $token); } else { } ## Step 3. splice @{$self->{open_elements}}, $i; ## Step 4. 
$clear_up_to_marker->($active_formatting_elements) if { applet => 1, marquee => 1, object => 1, }->{$token->{tag_name}}; } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'form') { ## NOTE: As normal, but interacts with the form element pointer undef $self->{form_element}; ## has an element in scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == FORM_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { # has an element in scope $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## NOTE: Ignore the token. } else { ## Step 1. generate implied end tags while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) { pop @{$self->{open_elements}}; } ## Step 2. if ($self->{open_elements}->[-1]->[0]->tagName ne $token->{tag_name}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] ->tagName, token => $token); } else { } ## Step 3. splice @{$self->{open_elements}}, $i; } $token = $self->_get_next_token; next B; } elsif ({ ## NOTE: As normal, except acts as a closer for any ... h1 => 1, h2 => 1, h3 => 1, h4 => 1, h5 => 1, h6 => 1, }->{$token->{tag_name}}) { ## has an element in scope my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == HEADING_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & SCOPING_EL) { last INSCOPE; } } # INSCOPE unless (defined $i) { # has an element in scope $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## NOTE: Ignore the token. } else { ## Step 1. generate implied end tags while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL) { pop @{$self->{open_elements}}; } ## Step 2. 
if ($self->{open_elements}->[-1]->[0]->tagName ne $token->{tag_name}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); } else { } ## Step 3. splice @{$self->{open_elements}}, $i; } $token = $self->_get_next_token; next B; } elsif ($token->{tag_name} eq 'p') { ## "In body" insertion mode, "p" start tag. As normal, except ## </p> implies <p> and ... ## "have an element in button scope". my $non_optional; my $i; INSCOPE: for (reverse 0..$#{$self->{open_elements}}) { my $node = $self->{open_elements}->[$_]; if ($node->[1] == P_EL) { $i = $_; last INSCOPE; } elsif ($node->[1] & BUTTON_SCOPING_EL) { last INSCOPE; } elsif ($node->[1] & END_TAG_OPTIONAL_EL) { ## NOTE: |END_TAG_OPTIONAL_EL| includes "p" # } else { $non_optional ||= $node; # } } # INSCOPE if (defined $i) { ## 1. Generate implied end tags # ## 2. If current node != "p", parse error if ($non_optional) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $non_optional->[0]->tagName, token => $token); } else { } ## 3. Pop splice @{$self->{open_elements}}, $i; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## As if <p>, then reprocess the current token my $el; $el = $self->{document}->createElementNS((HTML_NS), 'p'); $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $self->_data($el, implied => __LINE__); $insert->($self, $el, $open_tables); ## NOTE: Not inserted into |$self->{open_elements}|. 
} $token = $self->_get_next_token; next B; } elsif ({ a => 1, b => 1, big => 1, code=>1, em => 1, font => 1, i => 1, nobr => 1, s => 1, small => 1, strike => 1, strong => 1, tt => 1, u => 1, }->{$token->{tag_name}}) { $formatting_end_tag->($self, $active_formatting_elements, $open_tables, $token); next B; } elsif ($token->{tag_name} eq 'br') { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => 'br', token => $token); ## As if <br> my $insert = $self->{insertion_mode} & TABLE_IMS ? $insert_to_foster : $insert_to_current; $reconstruct_active_formatting_elements ->($self, $insert, $active_formatting_elements, $open_tables); my $el; $el = $self->{document}->createElementNS((HTML_NS), 'br'); $self->_data($el, manakai_source_line => $token->{line}) if defined $token->{line}; $self->_data($el, manakai_source_column => $token->{column}) if defined $token->{column}; $insert->($self, $el, $open_tables); ## Ignore the token. $token = $self->_get_next_token; next B; } else { if ($token->{tag_name} eq 'sarcasm') { sleep 0.001; # take a deep breath } ## Step 1 my $node_i = -1; my $node = $self->{open_elements}->[$node_i]; ## Step 2 LOOP: { my $node_tag_name = $node->[0]->tagName; $node_tag_name =~ tr/A-Z/a-z/; # for SVG camelCase tag names if ($node_tag_name eq $token->{tag_name}) { ## Step 1 ## generate implied end tags while ($self->{open_elements}->[-1]->[1] & END_TAG_OPTIONAL_EL and $self->{open_elements}->[-1]->[0]->localname ne $token->{tag_name}) { ## NOTE: |<ruby><rt></ruby>|. 
pop @{$self->{open_elements}}; $node_i++; } ## Step 2 my $current_tag_name = $self->{open_elements}->[-1]->[0]->tagName; $current_tag_name =~ tr/A-Z/a-z/; if ($current_tag_name ne $token->{tag_name}) { ## NOTE: <x><y></x> $self->{parse_error}->(level => $self->{level}->{must}, type => 'not closed', text => $self->{open_elements}->[-1]->[0] ->tagName, token => $token); } else { } ## Step 3 splice @{$self->{open_elements}}, $node_i if $node_i < 0; $token = $self->_get_next_token; last LOOP; } else { ## Step 3 if ($node->[1] & SPECIAL_EL or $node->[1] & SCOPING_EL) { ## "Special" $self->{parse_error}->(level => $self->{level}->{must}, type => 'unmatched end tag', text => $token->{tag_name}, token => $token); ## Ignore the token $token = $self->_get_next_token; last LOOP; ## NOTE: |<span><dd></span>a|: In Safari 3.1.2 and Opera ## 9.27, "a" is a child of <dd> (conforming). In ## Firefox 3.0.2, "a" is a child of <body>. In WinIE 7, ## "a" is a child of both <body> and <dd>. } } ## Step 4 $node_i--; $node = $self->{open_elements}->[$node_i]; ## Step 5; redo LOOP; } # LOOP next B; } } next B; } # B ## Stop parsing # MUST ## TODO: script stuffs } # _tree_construct_main ## XXX: How this method is organized is somewhat out of date, although ## it still does what the current spec documents. sub set_inner_html ($$$$;$) { my ($class, $self); if (ref $_[0]) { $self = shift; $class = ref $self; } else { $class = shift; $self = $class->new; } my $node = shift; # /context/ #my $s = \$_[0]; my $onerror = $_[1]; my $get_wrapper = $_[2] || sub ($) { return $_[0] }; my $nt = $node->node_type; #TOBY-TODO if ($nt == 9) { # Document (invoke the algorithm with no /context/ element) # MUST ## Step 1 # MUST ## TODO: If the document has an active parser, ... ## ISSUE: There is an issue in the spec. 
## Step 2 # MUST my @cn = $node->childNodes; for (@cn) { $node->removeChild ($_); } ## Step 3, 4, 5 # MUST $self->parse_char_string ($_[0] => $node, $onerror, $get_wrapper); } elsif ($nt == 1) { # Element (invoke the algorithm with /context/ element) ## TODO: If non-html element ## NOTE: Most of this code is copied from |parse_string| ## TODO: Support for $get_wrapper #TOBY-TODO ## F1. Create an HTML document. my $this_doc = $node->ownerDocument; my $implementation = ref($this_doc); my $doc = $implementation->createDocument; $self->_data($doc, manakai_is_html => 1); ## F2. Propagate quirkness flag my $node_doc = $node->ownerDocument; $self->_data($doc)->{'manakai_compat_mode'} = $self->_data($node_doc, 'manakai_compat_mode'); ## F3. Create an HTML parser my $p = $self; $p->{document} = $doc; ## Step 8 # MUST my $i = 0; $p->{line_prev} = $p->{line} = 1; $p->{column_prev} = $p->{column} = 0; require HTML::HTML5::Parser::Charset::DecodeHandle; my $input = HTML::HTML5::Parser::Charset::DecodeHandle::CharString->new (\($_[0])); $input = $get_wrapper->($input); $p->{set_nc} = sub { my $self = shift; my $char = ''; if (defined $self->{next_nc}) { $char = $self->{next_nc}; delete $self->{next_nc}; $self->{nc} = ord $char; } else { $self->{char_buffer} = ''; $self->{char_buffer_pos} = 0; my $count = $input->manakai_read_until ($self->{char_buffer}, qr/[^\x00\x0A\x0D\x{D800}-\x{DFFF}]/, $self->{char_buffer_pos}); if ($count) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); return; } if ($input->read ($char, 1)) { $self->{nc} = ord $char; } else { $self->{nc} = -1; return; } } ($p->{line_prev}, $p->{column_prev}) = ($p->{line}, $p->{column}); $p->{column}++; if ($self->{nc} == 0x000A) { # LF $p->{line}++; $p->{column} = 0; } elsif ($self->{nc} == 0x000D) { # CR ## TODO: support for abort/streaming my $next = ''; if ($input->read ($next, 1) and $next 
ne "\x0A") { $self->{next_nc} = $next; } $self->{nc} = 0x000A; # LF # MUST $p->{line}++; $p->{column} = 0; } elsif (0xD800 <= $self->{nc} and $self->{nc} <= 0xDFFF) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'surrogate'); ## XXX documentation $self->{nc} = 0xFFFD; # REPLACEMENT CHARACTER # MUST } }; $p->{read_until} = sub { #my ($scalar, $specials_range, $offset) = @_; return 0 if defined $p->{next_nc}; my $pattern = qr/[^$_[1]\x00\x0A\x0D\x{D800}-\x{DFFF}]/; my $offset = $_[2] || 0; if ($p->{char_buffer_pos} < length $p->{char_buffer}) { pos ($p->{char_buffer}) = $p->{char_buffer_pos}; if ($p->{char_buffer} =~ /\G(?>$pattern)+/) { substr ($_[0], $offset) = substr ($p->{char_buffer}, $-[0], $+[0] - $-[0]); my $count = $+[0] - $-[0]; if ($count) { $p->{column} += $count; $p->{char_buffer_pos} += $count; $p->{line_prev} = $p->{line}; $p->{column_prev} = $p->{column} - 1; $p->{nc} = -1; } return $count; } else { return 0; } } else { my $count = $input->manakai_read_until ($_[0], $pattern, $_[2]); if ($count) { $p->{column} += $count; $p->{column_prev} += $count; $p->{nc} = -1; } return $count; } }; # $p->{read_until} my $ponerror = $onerror || sub { my (%opt) = @_; my $line = $opt{line}; my $column = $opt{column}; if (defined $opt{token} and defined $opt{token}->{line}) { $line = $opt{token}->{line}; $column = $opt{token}->{column}; } warn "Parse error ($opt{type}) at line $line column $column\n"; }; $p->{parse_error} = sub { $ponerror->(line => $p->{line}, column => $p->{column}, @_); }; my $char_onerror = sub { my (undef, $type, %opt) = @_; $ponerror->(layer => 'encode', line => $p->{line}, column => $p->{column} + 1, %opt, type => $type); }; # $char_onerror $input->onerror ($char_onerror); $p->_initialize_tokenizer; $p->_initialize_tree_constructor; ## F4. If /context/ is not undef... ## F4.1. 
content model flag my $node_ns = $node->namespaceURI || ''; my $node_ln = $node->localname; if ($node_ns eq HTML_NS) { if ($node_ln eq 'title' or $node_ln eq 'textarea') { $p->{state} = RCDATA_STATE; } elsif ($node_ln eq 'script') { $p->{state} = SCRIPT_DATA_STATE; } elsif ({ style => 1, script => 1, xmp => 1, iframe => 1, noembed => 1, noframes => 1, noscript => 1, }->{$node_ln}) { $p->{state} = RAWTEXT_STATE; } elsif ($node_ln eq 'plaintext') { $p->{state} = PLAINTEXT_STATE; } $p->{inner_html_node} = [$node, $el_category->{$node_ln}]; } elsif ($node_ns eq SVG_NS) { $p->{inner_html_node} = [$node, $el_category_f->{$node_ns}->{$node_ln} || FOREIGN_EL | SVG_EL]; } elsif ($node_ns eq MML_NS) { $p->{inner_html_node} = [$node, $el_category_f->{$node_ns}->{$node_ln} || FOREIGN_EL | MML_EL]; } else { $p->{inner_html_node} = [$node, FOREIGN_EL]; } ## F4.2. Root |html| element my $root = $doc->createElementNS('http://www.w3.org/1999/xhtml', 'html'); ## F4.3. $doc->appendChild ($root); ## F4.4. push @{$p->{open_elements}}, [$root, $el_category->{html}]; undef $p->{head_element}; ## F4.5. $p->_reset_insertion_mode; ## F4.6. my $anode = $node; AN: while (defined $anode) { if ($anode->node_type == 1) { my $nsuri = $anode->namespaceURI; if (defined $nsuri and $nsuri eq 'http://www.w3.org/1999/xhtml') { if ($anode->tagName eq 'form') { $p->{form_element} = $anode; last AN; } } } $anode = $anode->parentNode; } # AN ## F.5. Set the input stream. $p->{confident} = 1; ## Confident: irrelevant. ## F.6. Start the parser. { my $self = $p; $token = $self->_get_next_token; } $p->_tree_construction_main; ## F.7. my @cn = $node->childNodes; for (@cn) { $node->removeChild ($_); } ## ISSUE: mutation events? read-only? ## Step 11 # MUST @cn = $root->childNodes; for (@cn) { $this_doc->adoptNode ($_); $node->appendChild ($_); } ## ISSUE: mutation events? $p->_terminate_tree_constructor; ## Remove self references. 
delete $p->{set_nc}; delete $p->{read_until}; delete $p->{parse_error}; } else { die "$0: |set_inner_html| is not defined for node of type $nt"; } } # set_inner_html } # tree construction stage package HTML::HTML5::Parser::TagSoupParser::RestartParser; sub new { my ($class, %opts) = @_; bless \%opts => $class; } sub throw { my ($class, %opts) = @_; die $class->new(%opts); } 1; # $Date: 2009/09/06 23:32:06 $ �����������������������������������������HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/UA.pm�������������������������������������������������0000644�0001750�0001750�00000012665�12166544311�016735� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::HTML5::Parser::UA; use 5.008001; use strict; BEGIN { $HTML::HTML5::Parser::UA::AUTHORITY = 'cpan:TOBYINK'; $HTML::HTML5::Parser::UA::VERSION = '0.301'; } use Encode qw(decode); use HTTP::Tiny; use URI::file; our $NO_LWP = '0'; sub get { my ($class, $uri, $ua) = @_; if (ref $ua and $ua->isa('HTTP::Tiny') and $uri =~ /^https?:/i) { goto \&_get_tiny } if (ref $ua and $ua->isa('LWP::UserAgent')) { goto \&_get_lwp } if (UNIVERSAL::can('LWP::UserAgent', 'can') and not $NO_LWP) { goto \&_get_lwp } if ($uri =~ /^file:/i) { goto \&_get_fs } goto \&_get_tiny; } sub _get_lwp { eval "require LWP::UserAgent; 1" or do { require Carp; Carp::croak("could not load LWP::UserAgent"); }; my ($class, $uri, $ua) = @_; $ua ||= LWP::UserAgent->new( agent => sprintf( "%s/%s ", 'HTML::HTML5::Parser', HTML::HTML5::Parser->VERSION, ), default_headers => HTTP::Headers->new( 'Accept' => join q(, ) => qw( text/html application/xhtml+xml;q=0.9 application/xml;q=0.1 text/xml;q=0.1 ) ), parse_head => 0, ); my $response = $ua->get($uri); my 
$h = $response->headers; my %header_hash = map { lc($_) => $h->header($_); } $h->header_field_names; return +{ success => $response->is_success, status => $response->code, reason => $response->message, headers => \%header_hash, content => $response->content, decoded_content => $response->decoded_content, }; } sub _get_tiny { my ($class, $uri, $ua) = @_; $ua ||= HTTP::Tiny->new( agent => sprintf("%s/%s", 'HTML::HTML5::Parser', HTML::HTML5::Parser->VERSION), default_headers => +{ 'Accept' => join(q(, ) => qw( text/html application/xhtml+xml;q=0.9 application/xml;q=0.1 text/xml;q=0.1 )), }, ); my $response = $ua->get($uri); if ($response->{headers}{'content-type'} =~ /charset=(\S+)/) { (my $encoding = $1) =~ s/["']//g; $response->{decoded_content} = eval { decode($encoding, $response->{content}) }; } $response->{decoded_content} = $response->{content} unless defined $response->{decoded_content}; return $response; } sub _get_fs { my $class = shift; my ($uri) = map { ref() ? $_ : URI->new($_) } @_; my $file = $uri->file; my ($status, $reason, $content, $content_type) = do { if (not -e $file) { (404 => 'Not Found', 'File not found.', 'text/plain') } elsif (not -r $file) { (403 => 'Forbidden', 'File not readable by effective guid.', 'text/plain') } else { (200 => 'OK') } }; $content ||= do { if (open my $fh, '<', $file) { local $/ = <$fh> } else { $status = 418; $reason = "I'm a teapot"; $content_type = 'text/plain'; $! 
} }; $content_type ||= 'text/xml' if $file =~ /\.xml$/i; $content_type ||= 'application/xhtml+xml' if $file =~ /\.xht(ml)?$/i; $content_type ||= 'text/html' if $file =~ /\.html?$/i; $content_type ||= 'application/octet-stream'; return +{ success => ($status == 200), status => $status, reason => $reason, headers => +{ 'content-type' => $content_type, 'content-length' => length($content), }, content => $content, decoded_content => $content, }; } 1; =head1 NAME HTML::HTML5::Parser::UA - simple web user agent class =head1 SYNOPSIS use aliased 'HTML::HTML5::Parser::UA'; my $response = UA->get($url); die unless $response->{success}; print $response->{decoded_content}; =head1 DESCRIPTION This is a simple wrapper around HTTP::Tiny and LWP::UserAgent to smooth out the API differences between them. It only supports bog standard C<< get($url) >> requests. If LWP::UserAgent is already in memory, this module will use that. If LWP::UserAgent is not in memory, then this module will use HTTP::Tiny (or direct filesystem access for "file://" URLs). If LWP::UserAgent is not in memory, and you attempt to request a URL that HTTP::Tiny cannot handle (e.g. an "ftp://" URL), then this module will load LWP::UserAgent and die if it cannot be loaded (e.g. is not installed). HTML::HTML5::Parser::UA is used by the C<parse_file> method of HTML::HTML5::Parser. =head2 Class Method =over =item C<< get($url, $ua) >> Gets the URL and returns a hashref similar to HTTP::Tiny's hashrefs, but with an additional C<decoded_content> key, which contains the response body, decoded into a Perl character string (not a byte string). If $ua is given (it's optional), then this user agent will be used to perform the actual request. Must be undef or an LWP::UserAgent object (or a subclass) or an HTTP::Tiny object (or a subclass). =back =head2 Package Variable =over =item C<< $HTML::HTML5::Parser::NO_LWP >> If true, avoids using LWP::UserAgent. 
=back =head1 MOTIVATION L<LWP::UserAgent> is a good piece of software but it has a dependency on L<HTML::Parser>. L<HTML::Parser> is only used to provide one fairly esoteric feature, which this package doesn't make use of. (It's the C<parse_head> option.) Because of that, I don't especially want HTML::HTML5::Parser to have a dependency on LWP::UserAgent. Hence this module. =head1 SEE ALSO L<HTML::HTML5::Parser>. =head1 AUTHOR Toby Inkster, E<lt>tobyink@cpan.orgE<gt> =head1 COPYRIGHT AND LICENSE Copyright (C) 2012 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER OF WARRANTIES THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. ���������������������������������������������������������������������������HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/Error.pm����������������������������������������������0000644�0001750�0001750�00000006047�12166544311�017516� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::HTML5::Parser::Error; use 5.008001; use strict; use warnings; our $VERSION = '0.301'; use overload '""' => \&to_string; sub new { my ($class, %args) = @_; bless \%args, $class; } sub level { my $self = shift; return { m => 'MUST', s => 'SHOULD', w => 'WARN', i => 'INFO', u => undef, }->{$self->{level}} || undef; } sub layer { my $self = shift; return $self->{layer} || undef; } sub type { my $self = shift; return $self->{type}||undef; } sub tag_name { my $self = shift; return undef unless $self->{token} && exists 
$self->{token}{tag_name}; return $self->{token}{tag_name}; } sub source_line { my $self = shift; if (wantarray) { return ($self->{line}, $self->{column}); } else { return $self->{line}; } } sub to_string { my $self = shift; my $msg = $self->type; my $level = $self->level; my $tag = $self->tag_name; my ($l, $c) = $self->source_line; my @details; push @details, sprintf('complicance: %s', $level) if defined $level; push @details, sprintf('line: %d', $l) if defined $l; push @details, sprintf('column: %d', $c) if defined $c; push @details, sprintf('tag: %s', $tag) if defined $tag; if (@details) { $msg .= " ["; $msg .= join '; ', @details; $msg .= "]"; } return $msg; } 1; =head1 NAME HTML::HTML5::Parser::Error - an error that occurred during parsing =head1 DESCRIPTION The C<error_handler> and C<errors> methods of C<HTML::HTML5::Parser> generate C<HTML::HTML5::Parser::Error> objects. C<HTML::HTML5::Parser::Error> overloads stringification, so can be printed, matched against regular expressions, etc. Note that L<HTML::HTML5::Parser> is not a validation tool, and there are many classes of error that it does not care about, so will not raise. =head2 Constructor =over =item C<< new(level=>$level, type=>$type, token=>$token, ...) >> Constructs a new C<HTML::HTML5::Parser::Error> object. =back =head2 Methods =over =item C<level> Returns the level of error. ('MUST', 'SHOULD', 'WARN', 'INFO' or undef.) =item C<layer> Returns the parsing layer involved, often undef. e.g. 'encode'. =item C<type> Returns the type of error as a string. =item C<tag_name> Returns the tag name (if any). =item C<source_line> ($line, $col) = $error->source_line(); $line = $error->source_line; In scalar context, C<source_line> returns the line number of the source code that triggered the error. In list context, returns a line/column pair. (Tab characters count as one column, not eight.) =item C<to_string> Returns a friendly error string. =back =head1 SEE ALSO L<HTML::HTML5::Parser>. 
=head1 AUTHOR Toby Inkster, E<lt>tobyink@cpan.orgE<gt> =head1 COPYRIGHT AND LICENSE Copyright (C) 2011-2012 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER OF WARRANTIES THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/Tokenizer.pm������������������������������������������0000644�0001750�0001750�00001101767�12166544311�020405� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::HTML5::Parser::Tokenizer; # -*- Perl -*- ## skip Test::Tabs use strict; our $VERSION='0.301'; ## This module implements the tokenization phase of both HTML5 and ## XML5. Notes like this are usually based on the latest HTML ## specification. Since XML is different from HTML, and since XML5 ## specification has not been maintained any more, there is a few ## differences from HTML's tokenization. Such differences are marked ## by prefix "XML5:". 
## Warnings that depend on the HTML/XML input stream, such as ones
## related to surrogate code positions, are not useful.
no warnings 'utf8';

## ------ Token types ------

## Exporter set-up.  Every token type constant is exportable on
## request, either by name or all together through the |:token| tag.
BEGIN {
  require Exporter;
  our @ISA;
  push @ISA, 'Exporter';

  ## Single list of the exportable names; both export variables below
  ## are populated from it (as independent arrays).
  my @token_types = qw(
    DOCTYPE_TOKEN
    COMMENT_TOKEN
    START_TAG_TOKEN
    END_TAG_TOKEN
    END_OF_FILE_TOKEN
    CHARACTER_TOKEN
    PI_TOKEN
    ABORT_TOKEN
    END_OF_DOCTYPE_TOKEN
    ATTLIST_TOKEN
    ELEMENT_TOKEN
    GENERAL_ENTITY_TOKEN
    PARAMETER_ENTITY_TOKEN
    NOTATION_TOKEN
  );

  our @EXPORT_OK = @token_types;
  our %EXPORT_TAGS = (token => [@token_types]);
}

sub DOCTYPE_TOKEN          () {  1 } ## XML5: No DOCTYPE token.
sub COMMENT_TOKEN          () {  2 }
sub START_TAG_TOKEN        () {  3 }
sub END_TAG_TOKEN          () {  4 }
sub END_OF_FILE_TOKEN      () {  5 }
sub CHARACTER_TOKEN        () {  6 }
sub PI_TOKEN               () {  7 } ## NOTE: XML only.
sub ABORT_TOKEN            () {  8 } ## NOTE: For internal processing.
sub END_OF_DOCTYPE_TOKEN   () {  9 } ## NOTE: XML only.
sub ATTLIST_TOKEN          () { 10 } ## NOTE: XML only.
sub ELEMENT_TOKEN          () { 11 } ## NOTE: XML only.
sub GENERAL_ENTITY_TOKEN   () { 12 } ## NOTE: XML only.
sub PARAMETER_ENTITY_TOKEN () { 13 } ## NOTE: XML only.
sub NOTATION_TOKEN         () { 14 } ## NOTE: XML only.

## XML5: XML5 has "empty tag token".  In this implementation, it is
## represented as a start tag token with $self->{self_closing} flag
## set to true.

## XML5: XML5 has "short end tag token".  In this implementation, it
## is represented as an end tag token with $token->{tag_name} flag set
## to an empty string.
package HTML::HTML5::Parser::TagSoupParser;

## Pull the token type constants (the |:token| export tag) into this
## package at compile time.
BEGIN { HTML::HTML5::Parser::Tokenizer->import (':token') }

use HTML::HTML5::Entities qw[%entity2char];

## ------ Tokenizer states ------

## Each constant below is the numeric identifier of one tokenizer
## state.  The values are used as the first index into the |$Action|
## and |$XMLAction| tables defined later in this file, so they must
## stay unique.  They are not consecutive in declaration order: 39 and
## 103 are retired (see the commented-out definitions) and 128 is
## currently the highest value in use.

sub DATA_STATE () { 0 }
sub RCDATA_STATE () { 107 }
sub RAWTEXT_STATE () { 108 }
sub SCRIPT_DATA_STATE () { 109 }
sub PLAINTEXT_STATE () { 110 }
sub TAG_OPEN_STATE () { 2 }
sub RCDATA_LT_STATE () { 111 }
sub RAWTEXT_LT_STATE () { 112 }
sub SCRIPT_DATA_LT_STATE () { 113 }
sub CLOSE_TAG_OPEN_STATE () { 3 }
sub RCDATA_END_TAG_OPEN_STATE () { 114 }
sub RAWTEXT_END_TAG_OPEN_STATE () { 115 }
sub SCRIPT_DATA_END_TAG_OPEN_STATE () { 116 }
sub SCRIPT_DATA_ESCAPE_START_STATE () { 1 }
sub SCRIPT_DATA_ESCAPE_START_DASH_STATE () { 12 }
sub SCRIPT_DATA_ESCAPED_STATE () { 117 }
sub SCRIPT_DATA_ESCAPED_DASH_STATE () { 118 }
sub SCRIPT_DATA_ESCAPED_DASH_DASH_STATE () { 119 }
sub SCRIPT_DATA_ESCAPED_LT_STATE () { 120 }
sub SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE () { 121 }
sub SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE () { 122 }
sub SCRIPT_DATA_DOUBLE_ESCAPED_STATE () { 123 }
sub SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE () { 124 }
sub SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE () { 125 }
sub SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE () { 126 }
sub SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE () { 127 }
sub TAG_NAME_STATE () { 4 }
sub BEFORE_ATTRIBUTE_NAME_STATE () { 5 }
sub ATTRIBUTE_NAME_STATE () { 6 }
sub AFTER_ATTRIBUTE_NAME_STATE () { 7 }
sub BEFORE_ATTRIBUTE_VALUE_STATE () { 8 }
sub ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE () { 9 }
sub ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE () { 10 }
sub ATTRIBUTE_VALUE_UNQUOTED_STATE () { 11 }
sub MARKUP_DECLARATION_OPEN_STATE () { 13 }
sub COMMENT_START_STATE () { 14 }
sub COMMENT_START_DASH_STATE () { 15 }
sub COMMENT_STATE () { 16 }
sub COMMENT_END_STATE () { 17 }
sub COMMENT_END_BANG_STATE () { 102 }
#sub COMMENT_END_SPACE_STATE () { 103 } ## REMOVED
sub COMMENT_END_DASH_STATE () { 18 }
sub BOGUS_COMMENT_STATE () { 19 }
sub DOCTYPE_STATE () { 20 }
sub BEFORE_DOCTYPE_NAME_STATE () { 21 }
sub DOCTYPE_NAME_STATE () { 22 }
sub AFTER_DOCTYPE_NAME_STATE () { 23 }
sub AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE () { 104 }
sub BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 24 }
sub DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE () { 25 }
sub DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE () { 26 }
sub AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE () { 27 }
sub BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 28 }
sub DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE () { 29 }
sub DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE () { 30 }
sub BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE () { 105 }
sub AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE () { 106 }
sub AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE () { 31 }
sub BOGUS_DOCTYPE_STATE () { 32 }
sub AFTER_ATTRIBUTE_VALUE_QUOTED_STATE () { 33 }
sub SELF_CLOSING_START_TAG_STATE () { 34 }
sub CDATA_SECTION_STATE () { 35 }
sub MD_HYPHEN_STATE () { 36 } # "markup declaration open state" in the spec
sub MD_DOCTYPE_STATE () { 37 } # "markup declaration open state" in the spec
sub MD_CDATA_STATE () { 38 } # "markup declaration open state" in the spec
#sub CDATA_RCDATA_CLOSE_TAG_STATE () { 39 } # "close tag open state" in the spec
sub CDATA_SECTION_MSE1_STATE () { 40 } # "CDATA section state" in the spec
sub CDATA_SECTION_MSE2_STATE () { 41 } # "CDATA section state" in the spec
sub PUBLIC_STATE () { 42 } # "after DOCTYPE name state" in the spec
sub SYSTEM_STATE () { 43 } # "after DOCTYPE name state" in the spec
##
## NOTE: "Entity data state", "entity in attribute value state", and
## the "consume a character reference" algorithm, are jointly
## implemented as the following six states:
sub ENTITY_STATE () { 44 }
sub ENTITY_HASH_STATE () { 45 }
sub NCR_NUM_STATE () { 46 }
sub HEXREF_X_STATE () { 47 }
sub HEXREF_HEX_STATE () { 48 }
sub ENTITY_NAME_STATE () { 49 }
##
## XML-only states
sub DATA_MSE1_STATE () { 50 }
sub DATA_MSE2_STATE () { 128 } # last
sub PI_STATE () { 51 }
sub PI_TARGET_STATE () { 52 }
sub PI_TARGET_AFTER_STATE () { 53 }
sub PI_DATA_STATE () { 54 }
sub PI_AFTER_STATE () { 55 }
sub PI_DATA_AFTER_STATE () { 56 }
sub DOCTYPE_INTERNAL_SUBSET_STATE () { 57 }
sub DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 58 }
sub BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE () { 59 }
sub DOCTYPE_TAG_STATE () { 60 }
sub DOCTYPE_MARKUP_DECLARATION_OPEN_STATE () { 61 }
sub MD_ATTLIST_STATE () { 62 }
sub MD_E_STATE () { 63 }
sub MD_ELEMENT_STATE () { 64 }
sub MD_ENTITY_STATE () { 65 }
sub MD_NOTATION_STATE () { 66 }
sub DOCTYPE_MD_STATE () { 67 }
sub BEFORE_MD_NAME_STATE () { 68 }
sub MD_NAME_STATE () { 69 }
sub DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE () { 70 }
sub DOCTYPE_ATTLIST_NAME_AFTER_STATE () { 71 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE () { 72 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE () { 73 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE () { 74 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE () { 75 }
sub BEFORE_ALLOWED_TOKEN_STATE () { 76 }
sub ALLOWED_TOKEN_STATE () { 77 }
sub AFTER_ALLOWED_TOKEN_STATE () { 78 }
sub AFTER_ALLOWED_TOKENS_STATE () { 79 }
sub BEFORE_ATTR_DEFAULT_STATE () { 80 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE () { 81 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE () { 82 }
sub DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE () { 83 }
sub AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE () { 84 }
sub BEFORE_NDATA_STATE () { 85 }
sub NDATA_STATE () { 86 }
sub AFTER_NDATA_STATE () { 87 }
sub BEFORE_NOTATION_NAME_STATE () { 88 }
sub NOTATION_NAME_STATE () { 89 }
sub DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE () { 90 }
sub DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE () { 91 }
sub ENTITY_VALUE_ENTITY_STATE () { 92 }
sub AFTER_ELEMENT_NAME_STATE () { 93 }
sub BEFORE_ELEMENT_CONTENT_STATE () { 94 }
sub CONTENT_KEYWORD_STATE () { 95 }
sub AFTER_CM_GROUP_OPEN_STATE () { 96 }
sub CM_ELEMENT_NAME_STATE () { 97 }
sub AFTER_CM_ELEMENT_NAME_STATE () { 98 }
sub AFTER_CM_GROUP_CLOSE_STATE () { 99 }
sub AFTER_MD_DEF_STATE () { 100 }
sub BOGUS_MD_STATE () { 101 }

## ------ Tree constructor state constants ------

## Whether the parsed string is in the foreign island or not affects
## how tokenization is done, unfortunately.  These are a copy of some
## of tokenization state constants.  See Whatpm::HTML for the full
## list and the descriptions for constants.

## A single-bit flag: bit 11 (decimal 2048).
sub FOREIGN_EL () { 0b1_00000000000 }

## ------ Character reference mappings ------

## Maps a code point to the code point that replaces it.  Keys and
## values are plain integers.  NOTE(review): presumably consulted when
## a numeric character reference is resolved - confirm at the use
## site, which is outside this part of the file.
my $charref_map = {
  0x00 => 0xFFFD, # REPLACEMENT CHARACTER
  0x0D => 0x000D, # CARRIAGE RETURN
  ## 0x80..0x9F: five entries (0x81, 0x8D, 0x8F, 0x90, 0x9D) map to
  ## themselves; the rest map to code points outside that range.
  0x80 => 0x20AC,
  0x81 => 0x0081,
  0x82 => 0x201A,
  0x83 => 0x0192,
  0x84 => 0x201E,
  0x85 => 0x2026,
  0x86 => 0x2020,
  0x87 => 0x2021,
  0x88 => 0x02C6,
  0x89 => 0x2030,
  0x8A => 0x0160,
  0x8B => 0x2039,
  0x8C => 0x0152,
  0x8D => 0x008D,
  0x8E => 0x017D,
  0x8F => 0x008F,
  0x90 => 0x0090,
  0x91 => 0x2018,
  0x92 => 0x2019,
  0x93 => 0x201C,
  0x94 => 0x201D,
  0x95 => 0x2022,
  0x96 => 0x2013,
  0x97 => 0x2014,
  0x98 => 0x02DC,
  0x99 => 0x2122,
  0x9A => 0x0161,
  0x9B => 0x203A,
  0x9C => 0x0153,
  0x9D => 0x009D,
  0x9E => 0x017E,
  0x9F => 0x0178,
}; # $charref_map
## Every surrogate code point maps to U+FFFD.
$charref_map->{$_} = 0xFFFD # REPLACEMENT CHARACTER
    for 0xD800..0xDFFF;
## These code points are listed in the map but map to themselves.
## NOTE(review): presumably their presence alone marks them as
## special (e.g. for error reporting) - confirm at the use site.
$charref_map->{$_} = $_
    for 0x0001..0x0008, 0x000B, 0x000E..0x001F, 0x007F,
        0xFDD0..0xFDEF,
        0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF,
        0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
        0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
        0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE,
        0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF;

## ------ Special character-like constants ------

## The "EOF" pseudo-character in the HTML parsing algorithm.
sub EOF_CHAR () { -1 }

## A pseudo-character code that can never appear in the input stream.
sub NEVER_CHAR () { -2 }

## ------ The tokenizer ------

## Implementations MUST act as if state machine in the spec

## Reset the tokenizer-related fields of the parser object and load
## the first input character.
##
## Invoked as a method: |$self->_initialize_tokenizer|.  (The |($)|
## prototype is kept for compatibility; Perl does not apply
## prototypes to method calls.)
##
## Expects |$self->{set_nc}| to be a code reference that, given the
## parser object, stores the next input character in |$self->{nc}|.
##
## Returns the fresh array reference stored in |$self->{token}| (the
## value of the last statement); callers seen in this file ignore it.
sub _initialize_tokenizer ($) {
  my $self = shift;

  ## NOTE: Fields set by |new| constructor:
  #$self->{level}
  #$self->{set_nc}
  #$self->{parse_error}
  #$self->{is_xml} (if XML)

  $self->{state} = DATA_STATE; # MUST
  #$self->{kwd} = ''; # State-dependent keyword; initialized when used
  #$self->{entity__value}; # initialized when used
  #$self->{entity__match}; # initialized when used
  undef $self->{ct}; # current token
  undef $self->{ca}; # current attribute
  undef $self->{last_stag_name}; # last emitted start tag name
  #$self->{prev_state}; # initialized when used
  delete $self->{self_closing};
  $self->{char_buffer} = '';
  $self->{char_buffer_pos} = 0;
  $self->{nc} = -1; # next input character
  #$self->{next_nc}

  ## The character buffer was emptied just above, so there can be no
  ## buffered character to consume here: the first character always
  ## comes from the |set_nc| callback.  (The former "take it from the
  ## buffer" branch could never run and has been removed.)
  $self->{set_nc}->($self);

  ## NOTE(review): presumably holds tokens that were pushed back for
  ## re-processing - confirm against |_get_next_token|.
  $self->{token} = [];
  # $self->{escape}
} # _initialize_tokenizer

## A token has:
##   ->{type} == DOCTYPE_TOKEN, START_TAG_TOKEN, END_TAG_TOKEN, COMMENT_TOKEN,
##       CHARACTER_TOKEN, END_OF_FILE_TOKEN, PI_TOKEN, or ABORT_TOKEN
##   ->{name} (DOCTYPE_TOKEN)
##   ->{tag_name} (START_TAG_TOKEN, END_TAG_TOKEN)
##   ->{target} (PI_TOKEN)
##   ->{pubid} (DOCTYPE_TOKEN)
##   ->{sysid} (DOCTYPE_TOKEN)
##   ->{quirks} == 1 or 0 (DOCTYPE_TOKEN): "force-quirks" flag
##   ->{attributes} isa HASH (START_TAG_TOKEN, END_TAG_TOKEN)
##        ->{name}
##        ->{value}
##        ->{has_reference} == 1 or 0
##        ->{index}: Index of the attribute in a tag.
##   ->{data} (COMMENT_TOKEN, CHARACTER_TOKEN, PI_TOKEN)
##   ->{has_reference} == 1 or 0 (CHARACTER_TOKEN)
##   ->{last_index} (ELEMENT_TOKEN): Next attribute's index - 1.
##   ->{has_internal_subset} = 1 or 0 (DOCTYPE_TOKEN)

## NOTE: The "self-closing flag" is held as |$self->{self_closing}|.
## |->{self_closing}| is used to save the value of |$self->{self_closing}|
## while the token is pushed back to the stack.

## Emitted token MUST immediately be handled by the tree construction state.

## Before each step, UA MAY check to see if either one of the scripts in
## "list of scripts that will execute as soon as possible" or the first
## script in the "list of scripts that will execute asynchronously",
## has completed loading.  If one has, then it MUST be executed
## and removed from the list.

## TODO: Polytheistic slash SHOULD NOT be used. (Applied only to atheists.)
## (This requirement was dropped from HTML5 spec, unfortunately.)

## Set of code points treated as space characters (code point => 1).
my $is_space = {
  map { $_ => 1 } (
    0x0009, # CHARACTER TABULATION (HT)
    0x000A, # LINE FEED (LF)
    #0x000B, # LINE TABULATION (VT) - not a space character
    0x000C, # FORM FEED (FF) ## XML5: Not a space character.
    0x000D, # CARRIAGE RETURN (CR)
    0x0020, # SPACE (SP)
  )
};

## Pseudo "character" keys of the action tables.  They stand for a
## class of input characters rather than for one code point.
sub KEY_ELSE_CHAR   () { 255 }
sub KEY_ULATIN_CHAR () { 254 }
sub KEY_LLATIN_CHAR () { 253 }
sub KEY_EOF_CHAR    () { 252 }
sub KEY_SPACE_CHAR  () { 251 }

## Action tables: |$table->[STATE]->[CHAR KEY]| is a hash describing
## what to do for that input in that state.  |$XMLAction| holds the
## entries that are specific to XML.
my $Action;
my $XMLAction;

{
  ## Handlers that several states share.  The same hash reference is
  ## stored in every slot that uses one of them.
  my $on_eof = {
    name => 'data eof',
    emit => END_OF_FILE_TOKEN,
    reconsume => 1,
  };
  my $on_null_replace = {
    name => 'rcdata null',
    emit => CHARACTER_TOKEN,
    emit_data => "\x{FFFD}",
    error => 'NULL',
  };
  my $on_rawtext_else = {
    name => 'rawtext else',
    emit => CHARACTER_TOKEN,
    emit_data_read_until => qq{\x00<},
  };

  ## -- Data state --
  my $data = ($Action->[DATA_STATE] ||= []);
  $data->[0x0026] = { # &
    name => 'data &',
    ## "entity data state" + "consume a character reference"
    state => ENTITY_STATE,
    state_set => {entity_add => -1, prev_state => DATA_STATE},
  };
  $data->[0x003C] = { # <
    name => 'data <',
    state => TAG_OPEN_STATE,
  };
  $data->[KEY_EOF_CHAR] = $on_eof;
  $data->[0x0000] = {
    name => 'data null',
    emit => CHARACTER_TOKEN,
    error => 'NULL',
  };
  $data->[KEY_ELSE_CHAR] = {
    name => 'data else',
    emit => CHARACTER_TOKEN,
    emit_data_read_until => qq{\x00<&},
  };

  ## -- Data state, XML-specific entries --
  my $xml_data = ($XMLAction->[DATA_STATE] ||= []);
  $xml_data->[0x005D] = { # ]
    name => 'data ]',
    state => DATA_MSE1_STATE,
    emit => CHARACTER_TOKEN,
  };
  $xml_data->[KEY_ELSE_CHAR] = {
    name => 'data else xml',
    emit => CHARACTER_TOKEN,
    emit_data_read_until => qq{\x00<&\]},
  };

  ## -- RCDATA state --
  my $rcdata = ($Action->[RCDATA_STATE] ||= []);
  $rcdata->[0x0026] = { # &
    name => 'rcdata &',
    ## "entity data state" + "consume a character reference"
    state => ENTITY_STATE,
    state_set => {entity_add => -1, prev_state => RCDATA_STATE},
  };
  $rcdata->[0x003C] = { # <
    name => 'rcdata <',
    state => RCDATA_LT_STATE,
  };
  $rcdata->[KEY_EOF_CHAR] = $on_eof;
  $rcdata->[0x0000] = $on_null_replace;
  $rcdata->[KEY_ELSE_CHAR] = {
    name => 'rcdata else',
    emit => CHARACTER_TOKEN,
    emit_data_read_until => qq{\x00<&},
  };

  ## -- RAWTEXT state --
  my $rawtext = ($Action->[RAWTEXT_STATE] ||= []);
  $rawtext->[0x003C] = { # <
    name => 'rawtext <',
    state => RAWTEXT_LT_STATE,
  };
  $rawtext->[KEY_EOF_CHAR] = $on_eof;
  $rawtext->[0x0000] = $on_null_replace;
  $rawtext->[KEY_ELSE_CHAR] = $on_rawtext_else;

  ## -- Script data state --
  my $script_data = ($Action->[SCRIPT_DATA_STATE] ||= []);
  $script_data->[0x003C] = { # <
    name => 'script data <',
    state => SCRIPT_DATA_LT_STATE,
  };
  $script_data->[KEY_EOF_CHAR] = $on_eof;
  $script_data->[0x0000] = $on_null_replace;
  $script_data->[KEY_ELSE_CHAR] = $on_rawtext_else;

  ## -- PLAINTEXT state --
  my $plaintext = ($Action->[PLAINTEXT_STATE] ||= []);
  $plaintext->[KEY_EOF_CHAR] = $on_eof;
  $plaintext->[0x0000] = $on_null_replace;
  $plaintext->[KEY_ELSE_CHAR] = {
    name => 'plaintext else',
    emit => CHARACTER_TOKEN,
    emit_data_read_until => qq{\x00},
  };
}

# "Tag open state" is known as "tag state" in XML5.
## Action table entries for the tag open state.  |$html| and |$xml|
## are the per-state rows of |$Action| and |$XMLAction|; the row is
## indexed by code point or by one of the |KEY_*| character classes.
{
  my $html = ($Action->[TAG_OPEN_STATE] ||= []);
  my $xml = ($XMLAction->[TAG_OPEN_STATE] ||= []);

  ## Builds a new "current token" description for a start tag whose
  ## first tag name character is being consumed.  The argument is
  ## stored as |append_tag_name|.
  my $new_start_tag = sub {
    my ($append) = @_;
    return {
      type => START_TAG_TOKEN,
      delta => 1,
      append_tag_name => $append,
    };
  };

  $html->[0x0021] = { # !
    name => 'tag open !',
    state => MARKUP_DECLARATION_OPEN_STATE,
  };
  $html->[0x002F] = { # /
    name => 'tag open /',
    state => CLOSE_TAG_OPEN_STATE,
  };

  $html->[KEY_ULATIN_CHAR] = {
    name => 'tag open uc',
    ct => $new_start_tag->(0x0020), # UC -> lc
    state => TAG_NAME_STATE,
  };
  $xml->[KEY_ULATIN_CHAR] = {
    name => 'tag open uc xml',
    ct => $new_start_tag->(0x0000),
    state => TAG_NAME_STATE,
  };
  $html->[KEY_LLATIN_CHAR] = {
    name => 'tag open lc',
    ct => $new_start_tag->(0x0000),
    state => TAG_NAME_STATE,
  };

  $html->[0x003F] = { # ?
    name => 'tag open ?',
    state => BOGUS_COMMENT_STATE,
    error => 'pio',
    error_delta => 1,
    ct => {
      type => COMMENT_TOKEN,
    },
    reconsume => 1, ## $self->{nc} is intentionally left as is
  };
  $xml->[0x003F] = { # ?
    name => 'tag open ? xml',
    state => PI_STATE,
  };

  ## One handler, stored under three keys: ">", any space character,
  ## and "anything else".
  my $bare_lt = {
    name => 'tag open else',
    error => 'bare stago',
    error_delta => 1,
    state => DATA_STATE,
    reconsume => 1,
    emit => CHARACTER_TOKEN,
    emit_data => '<',
    emit_delta => 1,
  };
  $html->[0x003E] = $bare_lt; # >
  $html->[KEY_SPACE_CHAR] = $bare_lt;
  $html->[KEY_ELSE_CHAR] = $bare_lt;

  $xml->[0x0000] = {
    name => 'tag open null xml',
    ct => $new_start_tag->(0xFFFD),
    error => 'NULL',
    state => TAG_NAME_STATE,
  };
}

## XML5: "<:" has a parse error.
## Tokenizer action table entries for the "tag open" family of states,
## the "tag name" state, and the script data escape states.
##
## Entries are stored as $Action->[STATE]->[KEY], where KEY is either a
## literal code point or one of the KEY_*_CHAR character class keys.
## When the tokenizer is in XML mode, |_get_next_token| consults the
## $XMLAction entry for the same STATE/KEY first.  Fields read by
## |_get_next_token|:
##
##   state       - state to switch to
##   reconsume   - true if the current input character is kept
##   error       - parse error type to report
##   error_delta - distance from the previous column to the error column
##   ct          - current token: |type| (with |delta| and |data|)
##                 creates a new token; |append_tag_name| is added to
##                 the code point of the input character, and the
##                 resulting character is appended to the tag name
##   buffer      - |clear| empties the temporary buffer ($self->{kwd});
##                 |append| is added to the code point of the input
##                 character, which is then appended to the buffer
##   emit        - '' emits the current token; otherwise the type of a
##                 new token whose data is |emit_data| (plus the input
##                 character if |emit_data_append| is true)
##   emit_delta  - distance from the previous column to the token column
##
## An offset of 0x0020 lowercases an ASCII uppercase letter, 0x0000
## keeps the character as is, and 0xFFFD (used for U+0000 only) yields
## U+FFFD.  |name| is a label for the entry.

$XMLAction->[TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
  name => 'tag open else xml',
  ct => {
    type => START_TAG_TOKEN,
    delta => 1,
    append_tag_name => 0x0000,
  },
  state => TAG_NAME_STATE,
};

## "<" followed by "/" in RCDATA, RAWTEXT, and script data: start
## matching an end tag with an empty temporary buffer.
$Action->[RCDATA_LT_STATE]->[0x002F] = {
  name => 'rcdata lt /',
  state => RCDATA_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[RAWTEXT_LT_STATE]->[0x002F] = {
  name => 'rawtext lt /',
  state => RAWTEXT_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_LT_STATE]->[0x002F] = {
  name => 'script data lt /',
  state => SCRIPT_DATA_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[0x002F] = {
  name => 'script data escaped lt /',
  state => SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE,
  buffer => {clear => 1},
};
$Action->[SCRIPT_DATA_LT_STATE]->[0x0021] = {
  name => 'script data lt !',
  state => SCRIPT_DATA_ESCAPE_START_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '<!',
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'script data escaped lt uc',
  emit => CHARACTER_TOKEN,
  emit_data => '<',
  emit_data_append => 1,
  buffer => {clear => 1, append => 0x0020}, # UC -> lc
  state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_LLATIN_CHAR] = {
  name => 'script data escaped lt lc',
  emit => CHARACTER_TOKEN,
  emit_data => '<',
  emit_data_append => 1,
  buffer => {clear => 1, append => 0x0000},
  state => SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
};

## "<" followed by anything else: the "<" was plain text.
$Action->[RCDATA_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rcdata lt else',
  state => RCDATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[RAWTEXT_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'rawtext lt else',
  state => RAWTEXT_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data lt else',
  state => SCRIPT_DATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_ESCAPED_LT_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data escaped lt else',
  state => SCRIPT_DATA_ESCAPED_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};

## XXX "End tag token" in latest HTML5 and in XML5.
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'end tag open uc',
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
    append_tag_name => 0x0020, # UC -> lc
  },
  state => TAG_NAME_STATE,
};
$XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'end tag open uc xml',
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
    append_tag_name => 0x0000,
  },
  state => TAG_NAME_STATE,
};
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_LLATIN_CHAR] = {
  name => 'end tag open lc',
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
    append_tag_name => 0x0000,
  },
  state => TAG_NAME_STATE,
};
$Action->[CLOSE_TAG_OPEN_STATE]->[0x003E] = {
  name => 'end tag open >',
  error => 'empty end tag',
  error_delta => 2, # "<" in "</>"
  state => DATA_STATE,
};
## XML5: No parse error.
## NOTE: This parser raises a parse error, since it supports XML1,
## not XML5.
## NOTE: A short end tag token.
$XMLAction->[CLOSE_TAG_OPEN_STATE]->[0x003E] = {
  name => 'end tag open > xml',
  error => 'empty end tag',
  error_delta => 2, # "<" in "</>"
  state => DATA_STATE,
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
  },
  emit => '',
};
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_EOF_CHAR] = {
  name => 'end tag open eof',
  error => 'bare etago',
  state => DATA_STATE,
  reconsume => 1,
  emit => CHARACTER_TOKEN,
  emit_data => '</',
  emit_delta => 2,
};
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_SPACE_CHAR] =
$Action->[CLOSE_TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
  name => 'end tag open else',
  error => 'bogus end tag',
  error_delta => 2, # "<" of "</"
  state => BOGUS_COMMENT_STATE,
  ct => {
    type => COMMENT_TOKEN,
    delta => 2, # "<" of "</"
  },
  reconsume => 1,
  ## NOTE: $self->{nc} is intentionally left as is.  Although the
  ## "anything else" case of the spec does not explicitly state that
  ## the next input character is to be reconsumed, it will be included
  ## in the |data| of the comment token generated from the bogus end
  ## tag, as defined in the "bogus comment state" entry.
};
$XMLAction->[CLOSE_TAG_OPEN_STATE]->[0x0000] = {
  name => 'end tag open null xml',
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
    append_tag_name => 0xFFFD,
  },
  error => 'NULL',
  state => TAG_NAME_STATE, ## XML5: "end tag name state".
};
## XML5: "</:" is a parse error.
$XMLAction->[CLOSE_TAG_OPEN_STATE]->[KEY_ELSE_CHAR] = {
  name => 'end tag open else xml',
  ct => {
    type => END_TAG_TOKEN,
    delta => 2,
    append_tag_name => 0x0000,
  },
  state => TAG_NAME_STATE, ## XML5: "end tag name state".
};

## This switch-case implements "tag name state", "RCDATA end tag
## name state", "RAWTEXT end tag name state", and "script data
## end tag name state" jointly with the implementation of
## "RCDATA end tag open state" and so on.
$Action->[TAG_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'tag name sp',
  state => BEFORE_ATTRIBUTE_NAME_STATE,
};
$Action->[TAG_NAME_STATE]->[0x003E] = {
  name => 'tag name >',
  state => DATA_STATE,
  emit => '',
};
$Action->[TAG_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'tag name uc',
  ct => {
    append_tag_name => 0x0020, # UC -> lc
  },
};
$XMLAction->[TAG_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'tag name uc xml',
  ct => {
    append_tag_name => 0x0000,
  },
};
$Action->[TAG_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'tag name eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  reconsume => 1,
};
$Action->[TAG_NAME_STATE]->[0x002F] = {
  name => 'tag name /',
  state => SELF_CLOSING_START_TAG_STATE,
};
$Action->[TAG_NAME_STATE]->[0x0000] = {
  name => 'tag name null',
  ct => {
    append_tag_name => 0xFFFD,
  },
  error => 'NULL',
};
$Action->[TAG_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => 'tag name else',
  ct => {
    append_tag_name => 0x0000,
  },
};

## Script data escape start: "<!" has been emitted; "--" enters the
## escaped mode.
$Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[0x002D] = {
  name => 'script data escape start -',
  state => SCRIPT_DATA_ESCAPE_START_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[0x002D] = {
  name => 'script data escape start dash -',
  ## HTML: "Switch to the script data escaped dash dash state."  Two
  ## hyphens have just been seen, so an immediately following ">"
  ## (i.e. "<!-->") must be able to leave the escaped mode again.
  state => SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR] = {
  name => 'script data escape start else',
  state => SCRIPT_DATA_STATE,
  reconsume => 1,
};
$Action->[SCRIPT_DATA_ESCAPE_START_DASH_STATE]->[KEY_ELSE_CHAR] =
    $Action->[SCRIPT_DATA_ESCAPE_START_STATE]->[KEY_ELSE_CHAR];

## "-" in the escaped and double escaped states.
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x002D] = {
  name => 'script data escaped -',
  state => SCRIPT_DATA_ESCAPED_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x002D] = {
  name => 'script data escaped dash -',
  state => SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x002D] = {
  name => 'script data escaped dash dash -',
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x002D] = {
  name => 'script data double escaped -',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x002D] = {
  name => 'script data double escaped dash -',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x002D] = {
  name => 'script data double escaped dash dash -',
  emit => CHARACTER_TOKEN,
  emit_data => '-',
};

## "<" in the escaped and double escaped states.
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x003C] = {
  name => 'script data escaped <',
  state => SCRIPT_DATA_ESCAPED_LT_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x003C] = {
  name => 'script data escaped dash <',
  state => SCRIPT_DATA_ESCAPED_LT_STATE,
};
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003C] = {
  name => 'script data escaped dash dash <',
  state => SCRIPT_DATA_ESCAPED_LT_STATE,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x003C] = {
  name => 'script data double escaped <',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x003C] = {
  name => 'script data double escaped dash <',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x003C] = {
  name => 'script data double escaped dash dash <',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '<',
};

## "-->" ends both the escaped and the double escaped mode.
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003E] = {
  name => 'script data escaped dash dash >',
  state => SCRIPT_DATA_STATE,
  emit => CHARACTER_TOKEN,
  emit_data => '>',
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x003E] =
    $Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x003E];

## EOF in any escaped / double escaped state (one shared entry).
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[KEY_EOF_CHAR] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[KEY_EOF_CHAR] = {
  name => 'script data escaped eof',
  error => 'eof in escaped script data', # XXXdocumentation
  state => DATA_STATE,
  reconsume => 1,
};

## U+0000 is replaced by U+FFFD.  The escaped states fall back to the
## "script data escaped state"; the double escaped states must fall
## back to the "script data double escaped state", so they get an
## entry of their own rather than sharing the escaped one.
$Action->[SCRIPT_DATA_ESCAPED_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_ESCAPED_DASH_DASH_STATE]->[0x0000] = {
  name => 'script data escaped null',
  emit => CHARACTER_TOKEN,
  emit_data => "\x{FFFD}",
  error => 'NULL',
  state => SCRIPT_DATA_ESCAPED_STATE,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE]->[0x0000] =
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE]->[0x0000] = {
  name => 'script data double escaped null',
  emit => CHARACTER_TOKEN,
  emit_data => "\x{FFFD}",
  error => 'NULL',
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
};
## "Anything else" in the six script data escaped / double escaped
## states: the input character is emitted as a character token and the
## tokenizer switches to the dash-less state of the same family.  Each
## state gets a hash of its own.
for my $row (
  [SCRIPT_DATA_ESCAPED_STATE,
   'script data escaped else',
   SCRIPT_DATA_ESCAPED_STATE],
  [SCRIPT_DATA_ESCAPED_DASH_STATE,
   'script data escaped dash else',
   SCRIPT_DATA_ESCAPED_STATE],
  [SCRIPT_DATA_ESCAPED_DASH_DASH_STATE,
   'script data escaped dash dash else',
   SCRIPT_DATA_ESCAPED_STATE],
  [SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
   'script data double escaped else',
   SCRIPT_DATA_DOUBLE_ESCAPED_STATE],
  [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_STATE,
   'script data double escaped dash else',
   SCRIPT_DATA_DOUBLE_ESCAPED_STATE],
  [SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH_STATE,
   'script data double escaped dash dash else',
   SCRIPT_DATA_DOUBLE_ESCAPED_STATE],
) {
  my ($from_state, $label, $to_state) = @$row;
  $Action->[$from_state][KEY_ELSE_CHAR] = {
    name => $label,
    emit => CHARACTER_TOKEN,
    state => $to_state,
  };
}

## The "script data double escape start" and "script data double escape
## end" states use the same entries (the very same hash is stored under
## both states).
{
  my @double_escape_states = (
    SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE,
    SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE,
  );

  ## White space, ">", and "/": |skip| makes |_get_next_token| bypass
  ## the table and fall through to its hand-written code for these
  ## two states.
  my $delimiter = {
    name => 'script data double escape start sp>/',
    skip => 1,
  };
  for my $state (@double_escape_states) {
    $Action->[$state][$_] = $delimiter
        for (KEY_SPACE_CHAR, 0x003E, 0x002F);
  }

  ## Letters are emitted as is and collected, lowercased, in the
  ## temporary buffer.
  my $uppercase = {
    name => 'script data double escape start uc',
    emit => CHARACTER_TOKEN,
    buffer => {append => 0x0020}, # UC -> lc
  };
  $Action->[$_][KEY_ULATIN_CHAR] = $uppercase for @double_escape_states;

  my $lowercase = {
    name => 'script data double escape start lc',
    emit => CHARACTER_TOKEN,
    buffer => {append => 0x0000},
  };
  $Action->[$_][KEY_LLATIN_CHAR] = $lowercase for @double_escape_states;
}

$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE][KEY_ELSE_CHAR] = {
  name => 'script data double escape start else',
  reconsume => 1,
  state => SCRIPT_DATA_ESCAPED_STATE,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE][KEY_ELSE_CHAR] = {
  name => 'script data double escape end else',
  reconsume => 1,
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
};

## After "<" in the double escaped mode.
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE][0x002F] = {
  name => 'script data double escaped lt /',
  emit => CHARACTER_TOKEN,
  emit_data => '/',
  buffer => {clear => 1},
  state => SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE,
};
$Action->[SCRIPT_DATA_DOUBLE_ESCAPED_LT_STATE][KEY_ELSE_CHAR] = {
  name => 'script data double escaped lt else',
  reconsume => 1,
  state => SCRIPT_DATA_DOUBLE_ESCAPED_STATE,
};

## XML5: Part of the "data state".
## DATA_MSE1_STATE / DATA_MSE2_STATE track "]" and "]]" so that a
## following ">" can be reported as an unmatched marked section end.
$Action->[DATA_MSE1_STATE][0x005D] = {
  name => 'data mse1 ]',
  emit => CHARACTER_TOKEN,
  emit_data => ']',
  state => DATA_MSE2_STATE,
};
$Action->[DATA_MSE1_STATE][KEY_ELSE_CHAR] = {
  name => 'data mse1 else',
  reconsume => 1,
  state => DATA_STATE,
};
$Action->[DATA_MSE2_STATE][0x003E] = {
  name => 'data mse2 >',
  emit => CHARACTER_TOKEN,
  emit_data => '>',
  error => 'unmatched mse', # XML5: Not a parse error. # XXXdocumentation
  error_delta => 2,
  state => DATA_STATE,
};
$Action->[DATA_MSE2_STATE][0x005D] = {
  name => 'data mse2 ]',
  emit => CHARACTER_TOKEN,
  emit_data => ']',
};
$Action->[DATA_MSE2_STATE][KEY_ELSE_CHAR] = {
  name => 'data mse2 else',
  reconsume => 1,
  state => DATA_STATE,
};

## XML5: "Tag attribute name before state".
## Action table entries for the "before attribute name state".
##
## |ca| describes what |_get_next_token| does with the current
## attribute: |set_name| starts a new attribute whose name is the input
## character with the given offset added to its code point (0x0020
## lowercases an ASCII uppercase letter, 0x0000 keeps the character,
## 0xFFFD turns U+0000 into U+FFFD).  |emit => ''| emits the current
## tag token.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'before attr name sp',
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003E] = {
  name => 'before attr name >',
  emit => '',
  state => DATA_STATE,
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'before attr name uc',
  ca => {
    set_name => 0x0020, # UC -> lc
  },
  state => ATTRIBUTE_NAME_STATE,
};
$XMLAction->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'before attr name uc xml',
  ca => {
    set_name => 0x0000,
  },
  state => ATTRIBUTE_NAME_STATE,
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x002F] = {
  name => 'before attr name /',
  state => SELF_CLOSING_START_TAG_STATE,
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'before attr name eof',
  error => 'unclosed tag',
  state => DATA_STATE,
  ## Reconsume the EOF in the data state, like the other "unclosed
  ## tag" EOF entries, instead of asking the input for a character
  ## beyond the end of the stream.
  reconsume => 1,
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0022] =
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0027] =
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003C] =
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x003D] = {
  name => q[before attr name "'<=],
  error => 'bad attribute name', ## XML5: Not a parse error.
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[0x0000] = {
  name => 'before attr name null',
  ca => {set_name => 0xFFFD},
  error => 'NULL',
  state => ATTRIBUTE_NAME_STATE,
};
## XML5: ":" raises a parse error and is ignored.
$Action->[BEFORE_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => 'before attr name else',
  ca => {set_name => 0x0000},
  state => ATTRIBUTE_NAME_STATE,
};

## XML5: "Tag attribute name state".
## Action table entries for the "attribute name state".
##
## |ca| describes what |_get_next_token| does with the current
## attribute: |name| appends the input character (with the given
## offset added to its code point) to the attribute name; |leave|
## finishes the name, i.e. the attribute is added to the current token
## unless an attribute of the same name is already there, in which
## case a 'duplicate attribute' parse error is reported and the new
## attribute is dropped.
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = {
  name => 'attr name sp',
  ca => {leave => 1},
  state => AFTER_ATTRIBUTE_NAME_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x003D] = {
  name => 'attr name =',
  ca => {leave => 1},
  state => BEFORE_ATTRIBUTE_VALUE_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x003E] = {
  name => 'attr name >',
  ca => {leave => 1},
  emit => '',
  state => DATA_STATE,
};
$XMLAction->[ATTRIBUTE_NAME_STATE]->[0x003E] = {
  name => 'attr name > xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  ca => {leave => 1},
  emit => '',
  state => DATA_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  name => 'attr name uc',
  ca => {name => 0x0020}, # UC -> lc
};
$XMLAction->[ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = {
  ## Labelled "... xml" like the other XML overrides, so that it can be
  ## told apart from the HTML entry above.
  name => 'attr name uc xml',
  ca => {name => 0x0000},
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x002F] = {
  name => 'attr name /',
  ca => {leave => 1},
  state => SELF_CLOSING_START_TAG_STATE,
};
$XMLAction->[ATTRIBUTE_NAME_STATE]->[0x002F] = {
  name => 'attr name / xml',
  error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation
  ca => {leave => 1},
  state => SELF_CLOSING_START_TAG_STATE,
};
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = {
  name => 'attr name eof',
  error => 'unclosed tag',
  ca => {leave => 1},
  state => DATA_STATE,
  reconsume => 1,
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x0022] =
$Action->[ATTRIBUTE_NAME_STATE]->[0x0027] =
$Action->[ATTRIBUTE_NAME_STATE]->[0x003C] = {
  name => q[attr name "'<],
  error => 'bad attribute name', ## XML5: Not a parse error.
  ca => {name => 0x0000},
};
$Action->[ATTRIBUTE_NAME_STATE]->[0x0000] = {
  name => 'attr name null',
  ca => {name => 0xFFFD},
  error => 'NULL',
};
$Action->[ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = {
  name => 'attr name else',
  ca => {name => 0x0000},
};

## XML5: "Tag attribute name after state".
$Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_SPACE_CHAR] = { name => 'after attr name sp', }; $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003D] = { name => 'after attr name =', state => BEFORE_ATTRIBUTE_VALUE_STATE, }; $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = { name => 'after attr name >', emit => '', state => DATA_STATE, }; $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003E] = { name => 'after attr name > xml', error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation emit => '', state => DATA_STATE, }; $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = { name => 'after attr name uc', ca => {set_name => 0x0020}, # UC -> lc state => ATTRIBUTE_NAME_STATE, }; $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ULATIN_CHAR] = { name => 'after attr name uc xml', ca => {set_name => 0x0000}, state => ATTRIBUTE_NAME_STATE, }; $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = { name => 'after attr name /', state => SELF_CLOSING_START_TAG_STATE, }; $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[0x002F] = { name => 'after attr name / xml', error => 'no attr value', ## XML5: Not a parse error. # XXXdocumentation state => SELF_CLOSING_START_TAG_STATE, }; $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_EOF_CHAR] = { name => 'after attr name eof', error => 'unclosed tag', state => DATA_STATE, reconsume => 1, }; $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0022] = $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0027] = $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x003C] = { name => q[after attr name "'<], error => 'bad attribute name', ## XML5: Not a parse error. #error2(xml) => 'no attr value', ## XML5: Not a parse error. ca => {set_name => 0x0000}, state => ATTRIBUTE_NAME_STATE, }; $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[0x0000] = { name => q[after attr name else], ca => {set_name => 0xFFFD}, error => 'NULL', #error2(xml) => 'no attr value', ## XML5: Not a parse error. 
state => ATTRIBUTE_NAME_STATE, }; $Action->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = { name => q[after attr name else], ca => {set_name => 0x0000}, state => ATTRIBUTE_NAME_STATE, }; $XMLAction->[AFTER_ATTRIBUTE_NAME_STATE]->[KEY_ELSE_CHAR] = { name => q[after attr name else], error => 'no attr value', ## XML5: Not a parse error. ca => {set_name => 0x0000}, state => ATTRIBUTE_NAME_STATE, }; ## XML5: "Tag attribute value before state". $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_SPACE_CHAR] = { name => 'before attr value sp', }; $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0022] = { name => 'before attr value "', state => ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE, }; $XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = { name => 'before attr value &', error => 'unquoted attr value', ## XML5: Not a parse error. state => ATTRIBUTE_VALUE_UNQUOTED_STATE, reconsume => 1, }; $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0026] = { name => 'before attr value &', state => ATTRIBUTE_VALUE_UNQUOTED_STATE, reconsume => 1, }; $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0027] = { name => "before attr value '", state => ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE, }; $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003E] = { name => 'before attr value >', error => 'empty unquoted attribute value', emit => '', state => DATA_STATE, }; $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_EOF_CHAR] = { name => 'before attr value eof', error => 'unclosed tag', state => DATA_STATE, }; $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003C] = $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x003D] = $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0060] = { name => 'before attr value <=`', error => 'bad attribute value', ## XML5: Not a parse error. #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error. 
ca => {value => 1}, state => ATTRIBUTE_VALUE_UNQUOTED_STATE, }; $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[0x0000] = { name => 'before attr value null', ca => {value => "\x{FFFD}"}, error => 'NULL', #error2(xml) => 'unquoted attr value', ## XML5: Not a parse error. state => ATTRIBUTE_VALUE_UNQUOTED_STATE, }; $XMLAction->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = { name => 'before attr value else xml', error => 'unquoted attr value', ## XML5: Not a parse error. # XXXdocumentation ca => {value => 1}, state => ATTRIBUTE_VALUE_UNQUOTED_STATE, }; $Action->[BEFORE_ATTRIBUTE_VALUE_STATE]->[KEY_ELSE_CHAR] = { name => 'before attr value else', ca => {value => 1}, state => ATTRIBUTE_VALUE_UNQUOTED_STATE, }; $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_SPACE_CHAR] = { name => 'after attr value quoted sp', state => BEFORE_ATTRIBUTE_NAME_STATE, }; $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[0x003E] = { name => 'after attr value quoted >', emit => '', state => DATA_STATE, }; $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[0x002F] = { name => 'after attr value quoted /', state => SELF_CLOSING_START_TAG_STATE, }; $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_EOF_CHAR] = { name => 'after attr value quoted eof', error => 'unclosed tag', state => DATA_STATE, reconsume => 1, }; $Action->[AFTER_ATTRIBUTE_VALUE_QUOTED_STATE]->[KEY_ELSE_CHAR] = { name => 'after attr value quoted else', error => 'no space between attributes', state => BEFORE_ATTRIBUTE_NAME_STATE, reconsume => 1, }; $Action->[SELF_CLOSING_START_TAG_STATE]->[0x003E] = { name => 'self closing start tag >', skip => 1, }; $Action->[SELF_CLOSING_START_TAG_STATE]->[KEY_EOF_CHAR] = { name => 'self closing start tag eof', error => 'unclosed tag', state => DATA_STATE, ## XML5: "Tag attribute name before state". reconsume => 1, }; $Action->[SELF_CLOSING_START_TAG_STATE]->[KEY_ELSE_CHAR] = { name => 'self closing start tag else', error => 'nestc', # XXX This error type is wrong. 
state => BEFORE_ATTRIBUTE_NAME_STATE, reconsume => 1, }; $Action->[MD_HYPHEN_STATE]->[0x002D] = { name => 'md hyphen -', ct => {type => COMMENT_TOKEN, data => '', delta => 3}, state => COMMENT_START_STATE, ## XML5: "comment state". }; $Action->[MD_HYPHEN_STATE]->[KEY_ELSE_CHAR] = { name => 'md hyphen else', error => 'bogus comment', error_delta => 3, state => BOGUS_COMMENT_STATE, reconsume => 1, ct => {type => COMMENT_TOKEN, data => '-', delta => 3}, }; my $c_to_key = []; $c_to_key->[255] = KEY_EOF_CHAR; # EOF_CHAR $c_to_key->[$_] = $_ for 0x0000..0x007F; $c_to_key->[$_] = KEY_SPACE_CHAR for keys %$is_space; $c_to_key->[$_] = KEY_ULATIN_CHAR for 0x0041..0x005A; $c_to_key->[$_] = KEY_LLATIN_CHAR for 0x0061..0x007A; sub _get_next_token ($) { my $self = shift; if ($self->{self_closing}) { ## NOTE: The |$self->{self_closing}| flag can never be set to ## tokens except for start tag tokens. A start tag token is ## always set to |$self->{ct}| before it is emitted. $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct}); delete $self->{self_closing}; } if (@{$self->{token}}) { $self->{self_closing} = $self->{token}->[0]->{self_closing}; return shift @{$self->{token}}; } A: { my $nc = $self->{nc}; my $state = $self->{state}; my $c = $nc > 0x007F ? 
KEY_ELSE_CHAR : $c_to_key->[$nc]; my $action = $Action->[$state]->[$c] || $Action->[$state]->[KEY_ELSE_CHAR]; if ($self->{is_xml}) { $action = $XMLAction->[$state]->[$c] || $Action->[$state]->[$c] || $XMLAction->[$state]->[KEY_ELSE_CHAR] || $Action->[$state]->[KEY_ELSE_CHAR]; } if ($action and not $action->{skip}) { if (defined $action->{error}) { if ($action->{error_delta}) { $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error}, line => $self->{line_prev}, column => $self->{column_prev} - $action->{error_delta} + 1); } else { $self->{parse_error}->(level => $self->{level}->{must}, type => $action->{error}); } } if (defined $action->{state}) { $self->{state} = $action->{state}; if ($action->{state_set}) { for (keys %{$action->{state_set}}) { $self->{$_} = $action->{state_set}->{$_}; } } } if (my $act = $action->{ct}) { if (defined $act->{type}) { $self->{ct} = {type => $act->{type}, tag_name => '', data => $act->{data}}; if ($act->{delta}) { $self->{ct}->{line} = $self->{line_prev}; $self->{ct}->{column} = $self->{column_prev} - $act->{delta} + 1; } else { $self->{ct}->{line} = $self->{line}; $self->{ct}->{column} = $self->{column}; } } if (defined $act->{append_tag_name}) { $self->{ct}->{tag_name} .= chr ($nc + $act->{append_tag_name}); } } if (my $aca = $action->{ca}) { if ($aca->{value}) { $self->{ca}->{value} .= $aca->{value} ne '1' ? $aca->{value} : chr $nc; } elsif (defined $aca->{name}) { $self->{ca}->{name} .= chr ($nc + $aca->{name}); } elsif (defined $aca->{set_name}) { $self->{ca} = { name => chr ($nc + $aca->{set_name}), value => '', line => $self->{line}, column => $self->{column}, }; } elsif ($aca->{leave}) { if (exists $self->{ct}->{attributes}->{$self->{ca}->{name}}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'duplicate attribute', text => $self->{ca}->{name}, line => $self->{ca}->{line}, column => $self->{ca}->{column}); ## Discard $self->{ca}. 
} else { $self->{ct}->{attributes}->{$self->{ca}->{name}} = $self->{ca}; $self->{ca}->{index} = ++$self->{ct}->{last_index}; } } } if (defined $action->{buffer}) { $self->{kwd} = '' if $action->{buffer}->{clear}; $self->{kwd} .= chr ($nc + $action->{buffer}->{append}) if defined $action->{buffer}->{append}; } if (defined $action->{emit}) { if ($action->{emit} eq '') { if ($self->{ct}->{type} == START_TAG_TOKEN) { $self->{last_stag_name} = $self->{ct}->{tag_name}; } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { if ($self->{ct}->{attributes}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); } else { } } else { die "$0: $self->{ct}->{type}: Unknown token type"; } if ($action->{reconsume}) { # } else { if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } } return ($self->{ct}); } else { my $token = {type => $action->{emit}}; if (defined $action->{emit_data}) { $token->{data} = $action->{emit_data}; if ($action->{emit_data_append}) { $token->{data} .= chr $nc; } } elsif ($action->{emit} == CHARACTER_TOKEN) { $token->{data} .= chr $nc; } if ($action->{emit_delta}) { $token->{line} = $self->{line_prev}; $token->{column} = $self->{column_prev} - $action->{emit_delta} + 1; } else { $token->{line} = $self->{line}; $token->{column} = $self->{column}; } if (defined $action->{emit_data_read_until}) { $self->{read_until}->($token->{data}, $action->{emit_data_read_until}, length $token->{data}); } if ($action->{reconsume}) { # } else { if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } } return ($token); 
} } else { if ($action->{reconsume}) { # } else { if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } } } redo A; } if ({ (RCDATA_END_TAG_OPEN_STATE) => 1, (RAWTEXT_END_TAG_OPEN_STATE) => 1, (SCRIPT_DATA_END_TAG_OPEN_STATE) => 1, (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) => 1, }->{$state}) { ## This switch-case implements "RCDATA end tag open state", ## "RAWTEXT end tag open state", "script data end tag open ## state", "RCDATA end tag name state", "RAWTEXT end tag name ## state", and "script end tag name state" jointly with the ## implementation of the "tag name" state. my ($l, $c) = ($self->{line_prev}, $self->{column_prev} - 1); # "<"of"</" if (defined $self->{last_stag_name}) { # } else { ## No start tag token has ever been emitted ## NOTE: See <http://krijnhoetmer.nl/irc-logs/whatwg/20070626#l-564>. $self->{state} = { (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE, (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE, (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE, (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) => SCRIPT_DATA_ESCAPED_STATE, }->{$state} or die "${state}'s next state not found"; ## Reconsume. return ({type => CHARACTER_TOKEN, data => '</', line => $l, column => $c}); redo A; } my $ch = substr $self->{last_stag_name}, length $self->{kwd}, 1; if (length $ch) { my $CH = $ch; $ch =~ tr/a-z/A-Z/; my $nch = chr $nc; if ($nch eq $ch or $nch eq $CH) { ## Stay in the state. 
$self->{kwd} .= $nch; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{state} = { (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE, (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE, (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE, (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) => SCRIPT_DATA_ESCAPED_STATE, }->{$state} or die "${state}'s next state not found"; ## Reconsume. return ({type => CHARACTER_TOKEN, data => '</' . $self->{kwd}, line => $self->{line_prev}, column => $self->{column_prev} - 1 - length $self->{kwd}, }); redo A; } } else { # after "</{tag-name}" unless ($is_space->{$nc} or { 0x003E => 1, # > 0x002F => 1, # / }->{$nc}) { ## Reconsume. $self->{state} = { (RCDATA_END_TAG_OPEN_STATE) => RCDATA_STATE, (RAWTEXT_END_TAG_OPEN_STATE) => RAWTEXT_STATE, (SCRIPT_DATA_END_TAG_OPEN_STATE) => SCRIPT_DATA_STATE, (SCRIPT_DATA_ESCAPED_END_TAG_OPEN_STATE) => SCRIPT_DATA_ESCAPED_STATE, }->{$self->{state}} or die "${state}'s next state not found"; return ({type => CHARACTER_TOKEN, data => '</' . $self->{kwd}, line => $self->{line_prev}, column => $self->{column_prev} - 1 - length $self->{kwd}, }); redo A; } else { $self->{ct} = {type => END_TAG_TOKEN, tag_name => $self->{last_stag_name}, line => $self->{line_prev}, column => $self->{column_prev} - 1 - length $self->{kwd}}; $self->{state} = TAG_NAME_STATE; ## Reconsume. redo A; } } } elsif ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE or $state == SCRIPT_DATA_DOUBLE_ESCAPE_END_STATE) { if ($is_space->{$nc} or $nc == 0x002F or # / $nc == 0x003E) { # > my $token = {type => CHARACTER_TOKEN, data => chr $nc, line => $self->{line}, column => $self->{column}}; if ($state == SCRIPT_DATA_DOUBLE_ESCAPE_START_STATE) { $self->{state} = $self->{kwd} eq 'script' # "temporary buffer" ? 
SCRIPT_DATA_DOUBLE_ESCAPED_STATE : SCRIPT_DATA_ESCAPED_STATE; } else { $self->{state} = $self->{kwd} eq 'script' # "temporary buffer" ? SCRIPT_DATA_ESCAPED_STATE : SCRIPT_DATA_DOUBLE_ESCAPED_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($token); redo A; } else { die "$state/$nc is implemented"; } } elsif ($state == ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE) { ## XML5: "Tag attribute value double quoted state" and "DOCTYPE ## ATTLIST attribute value double quoted state". if ($nc == 0x0022) { # " if ($self->{ct}->{type} == ATTLIST_TOKEN) { ## XML5: "DOCTYPE ATTLIST name after state". push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE; } else { ## XML5: "Tag attribute name before state". $self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0026) { # & ## XML5: Not defined yet. ## NOTE: In the spec, the tokenizer is switched to the ## "entity in attribute value state". In this implementation, the ## tokenizer is switched to the |ENTITY_STATE|, which is an ## implementation of the "consume a character reference" algorithm. 
$self->{prev_state} = $state; $self->{entity_add} = 0x0022; # " $self->{state} = ENTITY_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{is_xml} and $is_space->{$nc}) { $self->{ca}->{value} .= ' '; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value'); if ($self->{ct}->{type} == START_TAG_TOKEN) { $self->{last_stag_name} = $self->{ct}->{tag_name}; $self->{state} = DATA_STATE; ## reconsume return ($self->{ct}); # start tag redo A; } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { if ($self->{ct}->{attributes}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); } else { ## NOTE: This state should never be reached. } $self->{state} = DATA_STATE; ## reconsume ## Discard the token. #return ($self->{ct}); # end tag redo A; } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { ## XML5: No parse error above; not defined yet. push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## Reconsume. ## Discard the token. 
#return ($self->{ct}); # ATTLIST redo A; } else { die "$0: $self->{ct}->{type}: Unknown token type"; } } elsif ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); $self->{ca}->{value} .= "\x{FFFD}"; ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { ## XML5 [ATTLIST]: Not defined yet. if ($self->{is_xml} and $nc == 0x003C) { # < ## XML5: Not a parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type } else { } $self->{ca}->{value} .= chr ($nc); $self->{read_until}->($self->{ca}->{value}, qq[\x00"&<\x09\x0C\x20], length $self->{ca}->{value}); ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE) { ## XML5: "Tag attribute value single quoted state" and "DOCTYPE ## ATTLIST attribute value single quoted state". if ($nc == 0x0027) { # ' if ($self->{ct}->{type} == ATTLIST_TOKEN) { ## XML5: "DOCTYPE ATTLIST name after state". push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE; } else { ## XML5: "Before attribute name state" (sic). 
$self->{state} = AFTER_ATTRIBUTE_VALUE_QUOTED_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0026) { # & ## XML5: Not defined yet. ## NOTE: In the spec, the tokenizer is switched to the ## "entity in attribute value state". In this implementation, the ## tokenizer is switched to the |ENTITY_STATE|, which is an ## implementation of the "consume a character reference" algorithm. $self->{entity_add} = 0x0027; # ' $self->{prev_state} = $state; $self->{state} = ENTITY_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{is_xml} and $is_space->{$nc}) { $self->{ca}->{value} .= ' '; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed attribute value'); if ($self->{ct}->{type} == START_TAG_TOKEN) { $self->{last_stag_name} = $self->{ct}->{tag_name}; $self->{state} = DATA_STATE; ## reconsume ## Discard the token. #return ($self->{ct}); # start tag redo A; } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { if ($self->{ct}->{attributes}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); } else { ## NOTE: This state should never be reached. 
} $self->{state} = DATA_STATE; ## reconsume ## Discard the token. #return ($self->{ct}); # end tag redo A; } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { ## XML5: No parse error above; not defined yet. push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## Reconsume. ## Discard the token. #return ($self->{ct}); # ATTLIST redo A; } else { die "$0: $self->{ct}->{type}: Unknown token type"; } } elsif ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); $self->{ca}->{value} .= "\x{FFFD}"; ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { ## XML5 [ATTLIST]: Not defined yet. if ($self->{is_xml} and $nc == 0x003C) { # < ## XML5: Not a parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'lt in attr value'); ## TODO: type } else { } $self->{ca}->{value} .= chr ($nc); $self->{read_until}->($self->{ca}->{value}, qq[\x00'&<\x09\x0C\x20], length $self->{ca}->{value}); ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == ATTRIBUTE_VALUE_UNQUOTED_STATE) { ## XML5: "Tag attribute value unquoted state". if ($is_space->{$nc}) { if ($self->{ct}->{type} == ATTLIST_TOKEN) { push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; } else { ## XML5: "Tag attribute name before state". 
$self->{state} = BEFORE_ATTRIBUTE_NAME_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0026) { # & ## XML5: Not defined yet. ## NOTE: In the spec, the tokenizer is switched to the ## "character reference in attribute value state". In this ## implementation, the tokenizer is switched to the ## |ENTITY_STATE|, which is an implementation of the "consume ## a character reference" algorithm. $self->{entity_add} = 0x003E; # > $self->{prev_state} = $state; $self->{state} = ENTITY_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > if ($self->{ct}->{type} == START_TAG_TOKEN) { $self->{last_stag_name} = $self->{ct}->{tag_name}; $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # start tag redo A; } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { if ($self->{ct}->{attributes}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); } else { ## NOTE: This state should never be reached. 
} $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # end tag redo A; } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } else { die "$0: $self->{ct}->{type}: Unknown token type"; } } elsif ($nc == -1) { if ($self->{ct}->{type} == START_TAG_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag'); $self->{last_stag_name} = $self->{ct}->{tag_name}; $self->{state} = DATA_STATE; ## reconsume ## Discard the token. #return ($self->{ct}); # start tag redo A; } elsif ($self->{ct}->{type} == END_TAG_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed tag'); if ($self->{ct}->{attributes}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); } else { ## NOTE: This state should never be reached. } $self->{state} = DATA_STATE; ## reconsume ## Discard the token. #return ($self->{ct}); # end tag redo A; } elsif ($self->{ct}->{type} == ATTLIST_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## Reconsume. ## Discard the token. 
#return ($self->{ct}); # ATTLIST redo A; } else { die "$0: $self->{ct}->{type}: Unknown token type"; } } elsif ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); $self->{ca}->{value} .= "\x{FFFD}"; ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { if ({ 0x0022 => 1, # " 0x0027 => 1, # ' 0x003D => 1, # = 0x003C => 1, # < 0x0060 => 1, # ` }->{$nc}) { ## XML5: Not a parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'bad attribute value'); } else { } $self->{ca}->{value} .= chr ($nc); $self->{read_until}->($self->{ca}->{value}, qq[\x00"'=&` \x09\x0C<>], length $self->{ca}->{value}); ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == SELF_CLOSING_START_TAG_STATE) { ## XML5: "Empty tag state". 
if ($nc == 0x003E) { # > if ($self->{ct}->{type} == END_TAG_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'nestc', token => $self->{ct}); ## XXX: Different type than slash in start tag if ($self->{ct}->{attributes}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'end tag attribute'); } else { } ## XXX: Test |<title></title/>| } else { $self->{self_closing} = 1; } $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # start tag or end tag redo A; } else { die "$state/$nc is implemented"; } } elsif ($state == BOGUS_COMMENT_STATE) { ## XML5: "Bogus comment state" and "DOCTYPE bogus comment state". ## NOTE: Unlike spec's "bogus comment state", this implementation ## consumes characters one-by-one basis. if ($nc == 0x003E) { # > if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # comment redo A; } elsif ($nc == -1) { if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } ## reconsume return ($self->{ct}); # comment redo A; } elsif ($nc == 0x0000) { $self->{ct}->{data} .= "\x{FFFD}"; # comment ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{data} .= chr ($nc); # comment $self->{read_until}->($self->{ct}->{data}, qq[\x00>], length $self->{ct}->{data}); ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == MARKUP_DECLARATION_OPEN_STATE) { ## XML5: "Markup declaration state". if ($nc == 0x002D) { # - $self->{state} = MD_HYPHEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0044 or # D $nc == 0x0064) { # d ## ASCII case-insensitive. 
$self->{state} = MD_DOCTYPE_STATE; $self->{kwd} = chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; # $nc == 0x005B) { # [ $self->{state} = MD_CDATA_STATE; $self->{kwd} = '['; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { } $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', line => $self->{line_prev}, column => $self->{column_prev} - 1); ## Reconsume. $self->{state} = BOGUS_COMMENT_STATE; $self->{ct} = {type => COMMENT_TOKEN, data => '', line => $self->{line_prev}, column => $self->{column_prev} - 1, }; redo A; } elsif ($state == MD_DOCTYPE_STATE) { ## ASCII case-insensitive. if ($nc == [ undef, 0x004F, # O 0x0043, # C 0x0054, # T 0x0059, # Y 0x0050, # P NEVER_CHAR, # (E) ]->[length $self->{kwd}] or $nc == [ undef, 0x006F, # o 0x0063, # c 0x0074, # t 0x0079, # y 0x0070, # p NEVER_CHAR, # (e) ]->[length $self->{kwd}]) { ## Stay in the state. $self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ((length $self->{kwd}) == 6 and ($nc == 0x0045 or # E $nc == 0x0065)) { # e if ($self->{is_xml} and ($self->{kwd} ne 'DOCTYP' or $nc == 0x0065)) { ## XML5: case-sensitive. 
$self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO text => 'DOCTYPE', line => $self->{line_prev}, column => $self->{column_prev} - 5); } else { } $self->{state} = DOCTYPE_STATE; $self->{ct} = {type => DOCTYPE_TOKEN, quirks => 1, line => $self->{line_prev}, column => $self->{column_prev} - 7, }; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', line => $self->{line_prev}, column => $self->{column_prev} - 1 - length $self->{kwd}); $self->{state} = BOGUS_COMMENT_STATE; ## Reconsume. $self->{ct} = {type => COMMENT_TOKEN, data => $self->{kwd}, line => $self->{line_prev}, column => $self->{column_prev} - 1 - length $self->{kwd}, }; redo A; } } elsif ($state == MD_CDATA_STATE) { if ($nc == { '[' => 0x0043, # C '[C' => 0x0044, # D '[CD' => 0x0041, # A '[CDA' => 0x0054, # T '[CDAT' => 0x0041, # A '[CDATA' => NEVER_CHAR, # ([) }->{$self->{kwd}}) { ## Stay in the state. 
$self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{kwd} eq '[CDATA' and $nc == 0x005B) { # [ if ($self->{is_xml} and not $self->{tainted} and @{$self->{open_elements} or []} == 0) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'cdata outside of root element', line => $self->{line_prev}, column => $self->{column_prev} - 7); $self->{tainted} = 1; } else { } $self->{ct} = {type => CHARACTER_TOKEN, data => '', line => $self->{line_prev}, column => $self->{column_prev} - 7}; $self->{state} = CDATA_SECTION_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', line => $self->{line_prev}, column => $self->{column_prev} - 1 - length $self->{kwd}); $self->{state} = BOGUS_COMMENT_STATE; ## Reconsume. 
$self->{ct} = {type => COMMENT_TOKEN, data => $self->{kwd}, line => $self->{line_prev}, column => $self->{column_prev} - 1 - length $self->{kwd}, }; redo A; } } elsif ($state == COMMENT_START_STATE) { if ($nc == 0x002D) { # - $self->{state} = COMMENT_START_DASH_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # comment redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } ## reconsume return ($self->{ct}); # comment redo A; } elsif ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); $self->{ct}->{data} .= "\x{FFFD}"; # comment $self->{state} = COMMENT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{data} # comment .= chr ($nc); $self->{state} = COMMENT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = 
$self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == COMMENT_START_DASH_STATE) { if ($nc == 0x002D) { # - $self->{state} = COMMENT_END_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment'); if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # comment redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } ## reconsume return ($self->{ct}); # comment redo A; } elsif ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); $self->{ct}->{data} .= "-\x{FFFD}"; # comment $self->{state} = COMMENT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{data} # comment .= '-' . 
chr ($nc); $self->{state} = COMMENT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == COMMENT_STATE) { ## XML5: "Comment state" and "DOCTYPE comment state". if ($nc == 0x002D) { # - $self->{state} = COMMENT_END_DASH_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } ## reconsume return ($self->{ct}); # comment redo A; } elsif ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); $self->{ct}->{data} .= "\x{FFFD}"; # comment if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{data} .= chr ($nc); # comment $self->{read_until}->($self->{ct}->{data}, qq[-\x00], length $self->{ct}->{data}); ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == COMMENT_END_DASH_STATE) { ## XML5: "Comment dash state" and "DOCTYPE 
comment dash state". if ($nc == 0x002D) { # - $self->{state} = COMMENT_END_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } ## reconsume return ($self->{ct}); # comment redo A; } elsif ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); $self->{ct}->{data} .= "-\x{FFFD}"; # comment $self->{state} = COMMENT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{data} .= '-' . chr ($nc); # comment $self->{state} = COMMENT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == COMMENT_END_STATE or $state == COMMENT_END_BANG_STATE) { ## XML5: "Comment end state" and "DOCTYPE comment end state". ## (No comment end bang state.) 
if ($nc == 0x003E) { # > if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # comment redo A; } elsif ($nc == 0x002D) { # - if ($state == COMMENT_END_BANG_STATE) { $self->{ct}->{data} .= '--!'; # comment $self->{state} = COMMENT_END_DASH_STATE; } else { ## XML5: Not a parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'dash in comment', line => $self->{line_prev}, column => $self->{column_prev}); $self->{ct}->{data} .= '-'; # comment ## Stay in the state } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($state != COMMENT_END_BANG_STATE and $nc == 0x0021) { # ! $self->{parse_error}->(level => $self->{level}->{must}, type => 'comment end bang'); # XXX error type $self->{state} = COMMENT_END_BANG_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed comment'); if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } ## Reconsume. 
return ($self->{ct}); # comment redo A; } elsif ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); if ($state == COMMENT_END_BANG_STATE) { $self->{ct}->{data} .= "--!\x{FFFD}"; # comment } else { $self->{ct}->{data} .= "--\x{FFFD}"; # comment } $self->{state} = COMMENT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { if ($state == COMMENT_END_BANG_STATE) { $self->{ct}->{data} .= '--!' . chr ($nc); # comment } else { $self->{ct}->{data} .= '--' . chr ($nc); # comment } $self->{state} = COMMENT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_STATE) { if ($is_space->{$nc}) { $self->{state} = BEFORE_DOCTYPE_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); $self->{ct}->{quirks} = 1; $self->{state} = DATA_STATE; ## Reconsume. return ($self->{ct}); # DOCTYPE (quirks) redo A; } else { ## XML5: Swith to the bogus comment state. $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before DOCTYPE name'); $self->{state} = BEFORE_DOCTYPE_NAME_STATE; ## reconsume redo A; } } elsif ($state == BEFORE_DOCTYPE_NAME_STATE) { ## XML5: "DOCTYPE root name before state". 
if ($is_space->{$nc}) { ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE (quirks) redo A; } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z $self->{ct}->{name} # DOCTYPE = chr ($nc + ($self->{is_xml} ? 0 : 0x0020)); delete $self->{ct}->{quirks}; $self->{state} = DOCTYPE_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); $self->{state} = DATA_STATE; ## reconsume return ($self->{ct}); # DOCTYPE (quirks) redo A; } elsif ($self->{is_xml} and $nc == 0x005B) { # [ $self->{parse_error}->(level => $self->{level}->{must}, type => 'no DOCTYPE name'); $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; $self->{ct}->{has_internal_subset} = 1; # DOCTYPE $self->{in_subset} = 1; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { 
$self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } elsif ($nc == 0x0000) { $self->{ct}->{name} = "\x{FFFD}"; delete $self->{ct}->{quirks}; $self->{state} = DOCTYPE_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{name} = chr $nc; delete $self->{ct}->{quirks}; $self->{state} = DOCTYPE_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_NAME_STATE) { ## XML5: "DOCTYPE root name state". if ($is_space->{$nc}) { $self->{state} = AFTER_DOCTYPE_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } elsif (0x0041 <= $nc and $nc <= 0x005A) { # A..Z $self->{ct}->{name} # DOCTYPE .= chr ($nc + ($self->{is_xml} ? 0 : 0x0020)); delete $self->{ct}->{quirks}; ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); $self->{state} = DATA_STATE; ## reconsume $self->{ct}->{quirks} = 1; return ($self->{ct}); # DOCTYPE redo A; } elsif ($self->{is_xml} and $nc == 0x005B) { # [ $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; $self->{ct}->{has_internal_subset} = 1; # DOCTYPE $self->{in_subset} = 1; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } elsif ($nc == 0x0000) { $self->{ct}->{name} .= "\x{FFFD}"; # DOCTYPE ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{name} .= chr ($nc); # DOCTYPE ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == AFTER_DOCTYPE_NAME_STATE) { ## XML5: Corresponding to XML5's "DOCTYPE root name after ## state", but implemented differently. 
if ($is_space->{$nc}) { ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == -1) { if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } ## Reconsume. 
return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == 0x0050 or # P $nc == 0x0070) { # p $self->{state} = PUBLIC_STATE; $self->{kwd} = chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0053 or # S $nc == 0x0073) { # s $self->{state} = SYSTEM_STATE; $self->{kwd} = chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022 and # " ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) { $self->{state} = DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE; $self->{ct}->{value} = ''; # ENTITY if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027 and # ' ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN or $self->{ct}->{type} == PARAMETER_ENTITY_TOKEN)) { $self->{state} = DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE; $self->{ct}->{value} = ''; # ENTITY if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{is_xml} and $self->{ct}->{type} == DOCTYPE_TOKEN and $nc == 0x005B) { # [ $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; 
$self->{ct}->{has_internal_subset} = 1; # DOCTYPE $self->{in_subset} = 1; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name'); ## TODO: type if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{ct}->{quirks} = 1; $self->{state} = BOGUS_DOCTYPE_STATE; } else { $self->{state} = BOGUS_MD_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == PUBLIC_STATE) { ## ASCII case-insensitive if ($nc == [ undef, 0x0055, # U 0x0042, # B 0x004C, # L 0x0049, # I NEVER_CHAR, # (C) ]->[length $self->{kwd}] or $nc == [ undef, 0x0075, # u 0x0062, # b 0x006C, # l 0x0069, # i NEVER_CHAR, # (c) ]->[length $self->{kwd}]) { ## Stay in the state. 
$self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ((length $self->{kwd}) == 5 and ($nc == 0x0043 or # C $nc == 0x0063)) { # c if ($self->{is_xml} and ($self->{kwd} ne 'PUBLI' or $nc == 0x0063)) { # c $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type text => 'PUBLIC', line => $self->{line_prev}, column => $self->{column_prev} - 4); } else { } $self->{state} = AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type line => $self->{line_prev}, column => $self->{column_prev} + 1 - length $self->{kwd}); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{ct}->{quirks} = 1; $self->{state} = BOGUS_DOCTYPE_STATE; } else { $self->{state} = BOGUS_MD_STATE; } ## Reconsume. redo A; } } elsif ($state == SYSTEM_STATE) { ## ASCII case-insensitive if ($nc == [ undef, 0x0059, # Y 0x0053, # S 0x0054, # T 0x0045, # E NEVER_CHAR, # (M) ]->[length $self->{kwd}] or $nc == [ undef, 0x0079, # y 0x0073, # s 0x0074, # t 0x0065, # e NEVER_CHAR, # (m) ]->[length $self->{kwd}]) { ## Stay in the state. 
$self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ((length $self->{kwd}) == 5 and ($nc == 0x004D or # M $nc == 0x006D)) { # m if ($self->{is_xml} and ($self->{kwd} ne 'SYSTE' or $nc == 0x006D)) { # m $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type text => 'SYSTEM', line => $self->{line_prev}, column => $self->{column_prev} - 4); } else { } $self->{state} = AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after DOCTYPE name', ## TODO: type line => $self->{line_prev}, column => $self->{column_prev} + 1 - length $self->{kwd}); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{ct}->{quirks} = 1; $self->{state} = BOGUS_DOCTYPE_STATE; } else { $self->{state} = BOGUS_MD_STATE; } ## Reconsume. redo A; } } elsif ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE or $state == BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { if ($is_space->{$nc}) { ## Stay in or switch to the state. 
$self->{state} = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022) { # " if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation } else { } $self->{ct}->{pubid} = ''; # DOCTYPE $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' if ($state == AFTER_DOCTYPE_PUBLIC_KEYWORD_STATE) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before pubid literal'); # XXX documentation } else { } $self->{ct}->{pubid} = ''; # DOCTYPE $self->{state} = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr 
($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == EOF_CHAR) { if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } ## Reconsume. return ($self->{ct}); # DOCTYPE redo A; } elsif ($self->{is_xml} and $self->{ct}->{type} == DOCTYPE_TOKEN and $nc == 0x005B) { # [ $self->{parse_error}->(level => $self->{level}->{must}, type => 'no PUBLIC literal'); $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; $self->{ct}->{has_internal_subset} = 1; # DOCTYPE $self->{in_subset} = 1; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{ct}->{quirks} = 1; $self->{state} = BOGUS_DOCTYPE_STATE; } else { $self->{state} = BOGUS_MD_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED_STATE) { if ($nc == 0x0022) { # " $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = 
$self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } ## Reconsume. return ($self->{ct}); # DOCTYPE redo A; } elsif ($nc == 0x0000) { $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION $self->{read_until}->($self->{ct}->{pubid}, qq[\x00">], length $self->{ct}->{pubid}); ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED_STATE) { if ($nc == 0x0027) { # ' $self->{state} = AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed PUBLIC literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } ## reconsume return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == 0x0000) { $self->{ct}->{pubid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{pubid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION $self->{read_until}->($self->{ct}->{pubid}, qq[\x00'>], length $self->{ct}->{pubid}); ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE or $state == BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE) { if ($is_space->{$nc}) { ## Stay in or switch to the state. $self->{state} = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDS_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022) { # " if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation } else { } $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' if ($state == AFTER_DOCTYPE_PUBLIC_IDENTIFIER_STATE) { $self->{parse_error}->(level => 
$self->{level}->{must}, type => 'no space before system literal'); # XXX documentation } else { } $self->{ct}->{sysid} = ''; # DOCTYPE/ENTITY/NOTATION $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > if ($self->{ct}->{type} == DOCTYPE_TOKEN) { if ($self->{is_xml}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); } else { } $self->{state} = DATA_STATE; } else { if ($self->{ct}->{type} == NOTATION_TOKEN) { } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); } $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == EOF_CHAR) { if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } ## Reconsume. 
return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($self->{is_xml} and $self->{ct}->{type} == DOCTYPE_TOKEN and $nc == 0x005B) { # [ $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; $self->{ct}->{has_internal_subset} = 1; # DOCTYPE $self->{in_subset} = 1; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after PUBLIC literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{ct}->{quirks} = 1; $self->{state} = BOGUS_DOCTYPE_STATE; } else { $self->{state} = BOGUS_MD_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE or $state == BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE) { if ($is_space->{$nc}) { ## Stay in or switch to the state. 
$self->{state} = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022) { # " if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation } else { } $self->{ct}->{sysid} = ''; # DOCTYPE $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' if ($state == AFTER_DOCTYPE_SYSTEM_KEYWORD_STATE) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before system literal'); # XXX documentation } else { } $self->{ct}->{sysid} = ''; # DOCTYPE $self->{state} = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; 
$self->{ct}->{quirks} = 1; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == EOF_CHAR) { if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } ## Reconsume. return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($self->{is_xml} and $self->{ct}->{type} == DOCTYPE_TOKEN and $nc == 0x005B) { # [ $self->{parse_error}->(level => $self->{level}->{must}, type => 'no SYSTEM literal'); $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; $self->{ct}->{has_internal_subset} = 1; # DOCTYPE $self->{in_subset} = 1; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{ct}->{quirks} = 1; $self->{state} = BOGUS_DOCTYPE_STATE; } else { $self->{state} = BOGUS_MD_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED_STATE) { if ($nc == 0x0022) { # " $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = 
$self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif (not $self->{is_xml} and $nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } ## reconsume return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == 0x0000) { $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION $self->{read_until}->($self->{ct}->{sysid}, qq[\x00">], length $self->{ct}->{sysid}); ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED_STATE) { if ($nc == 0x0027) { # ' $self->{state} = AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif (not $self->{is_xml} and $nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } $self->{ct}->{quirks} = 1; return ($self->{ct}); # DOCTYPE redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed SYSTEM literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } ## reconsume return ($self->{ct}); # 
DOCTYPE/ENTITY/NOTATION redo A; } elsif ($nc == 0x0000) { $self->{ct}->{sysid} .= "\x{FFFD}"; # DOCTYPE/ENTITY/NOTATION ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{sysid} .= chr $nc; # DOCTYPE/ENTITY/NOTATION $self->{read_until}->($self->{ct}->{sysid}, qq[\x00'>], length $self->{ct}->{sysid}); ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == AFTER_DOCTYPE_SYSTEM_IDENTIFIER_STATE) { if ($is_space->{$nc}) { if ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN) { $self->{state} = BEFORE_NDATA_STATE; } else { ## Stay in the state } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{state} = DATA_STATE; } else { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and ($nc == 0x004E or # N $nc == 0x006E)) { # n $self->{parse_error}->(level => 
$self->{level}->{must}, type => 'no space before NDATA'); ## TODO: type $self->{state} = NDATA_STATE; $self->{kwd} = chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { if ($self->{ct}->{type} == DOCTYPE_TOKEN) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); $self->{state} = DATA_STATE; $self->{ct}->{quirks} = 1; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } ## reconsume return ($self->{ct}); # DOCTYPE/ENTITY/NOTATION redo A; } elsif ($self->{is_xml} and $self->{ct}->{type} == DOCTYPE_TOKEN and $nc == 0x005B) { # [ $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; $self->{ct}->{has_internal_subset} = 1; # DOCTYPE $self->{in_subset} = 1; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal'); if ($self->{ct}->{type} == DOCTYPE_TOKEN) { #$self->{ct}->{quirks} = 1; $self->{state} = BOGUS_DOCTYPE_STATE; } else { $self->{state} = BOGUS_MD_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == BEFORE_NDATA_STATE) { if ($is_space->{$nc}) { ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ENTITY redo A; } elsif ($nc == 0x004E or # N $nc == 0x006E) { # n $self->{state} = NDATA_STATE; $self->{kwd} = chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## reconsume return ($self->{ct}); # ENTITY redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after SYSTEM literal'); $self->{state} = BOGUS_MD_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == BOGUS_DOCTYPE_STATE) { if ($nc == 0x003E) { # > $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, 
$self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } elsif ($self->{is_xml} and $nc == 0x005B) { # [ $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; $self->{ct}->{has_internal_subset} = 1; # DOCTYPE $self->{in_subset} = 1; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # DOCTYPE redo A; } elsif ($nc == -1) { $self->{state} = DATA_STATE; ## reconsume return ($self->{ct}); # DOCTYPE redo A; } else { my $s = ''; $self->{read_until}->($s, q{>[}, 0); ## Stay in the state if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == CDATA_SECTION_STATE) { ## NOTE: "CDATA section state" in the state is jointly implemented ## by three states, |CDATA_SECTION_STATE|, |CDATA_SECTION_MSE1_STATE|, ## and |CDATA_SECTION_MSE2_STATE|. ## XML5: "CDATA state". if ($nc == 0x005D) { # ] $self->{state} = CDATA_SECTION_MSE1_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == -1) { if ($self->{is_xml}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no mse'); ## TODO: type } else { } $self->{state} = DATA_STATE; ## Reconsume. if (length $self->{ct}->{data}) { # character return ($self->{ct}); # character } else { ## No token to emit. $self->{ct} is discarded. 
} redo A; } else { $self->{ct}->{data} .= chr $nc; $self->{read_until}->($self->{ct}->{data}, qq<\x00]>, length $self->{ct}->{data}); ## NOTE: NULLs are left as is (see spec's comment). However, ## a token cannot contain more than one U+0000 NULL character ## for the ease of processing in the tree constructor. ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } ## ISSUE: "text tokens" in spec. } elsif ($state == CDATA_SECTION_MSE1_STATE) { ## XML5: "CDATA bracket state". if ($nc == 0x005D) { # ] $self->{state} = CDATA_SECTION_MSE2_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { ## XML5: If EOF, "]" is not appended and changed to the data state. $self->{ct}->{data} .= ']'; $self->{state} = CDATA_SECTION_STATE; ## XML5: Stay in the state. ## Reconsume. redo A; } } elsif ($state == CDATA_SECTION_MSE2_STATE) { ## XML5: "CDATA end state". if ($nc == 0x003E) { # > $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } if (length $self->{ct}->{data}) { # character return ($self->{ct}); # character } else { ## No token to emit. $self->{ct} is discarded. } redo A; } elsif ($nc == 0x005D) { # ] # character $self->{ct}->{data} .= ']'; ## Add first "]" of "]]]". ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{data} .= ']]'; # character $self->{state} = CDATA_SECTION_STATE; ## Reconsume. ## XML5: Emit. redo A; } } elsif ($state == ENTITY_STATE) { if ($is_space->{$nc} or { 0x003C => 1, 0x0026 => 1, -1 => 1, # <, & ## Following characters are added here to detect parse ## error for "=" of "&=" in an unquoted attribute value. ## Though this disagree with the Web Applications 1.0 ## spec, the result token sequences of both algorithms ## should be same, as these characters cannot form a part ## of character references. 0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', ` 0x003D => 1, # = ## As a result of the addition above, the following clause ## has no effect in fact. $self->{entity_add} => 1, }->{$nc}) { if ($self->{is_xml}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', line => $self->{line_prev}, column => $self->{column_prev} + ($nc == -1 ? 1 : 0)); } else { ## No error } ## Don't consume ## Return nothing. 
# } elsif ($nc == 0x0023) { # # $self->{state} = ENTITY_HASH_STATE; $self->{kwd} = '#'; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{is_xml} or (0x0041 <= $nc and $nc <= 0x005A) or # A..Z (0x0061 <= $nc and $nc <= 0x007A)) { # a..z #require HTML::HTML5::Parser::NamedEntityList; $self->{state} = ENTITY_NAME_STATE; $self->{kwd} = chr $nc; $self->{entity__value} = $self->{kwd}; $self->{entity__match} = 0; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { ## Return nothing. # } ## We implement the "consume a character reference" in a ## slightly different way from the spec's algorithm, though the ## end result should be exactly same. ## NOTE: No character is consumed by the "consume a character ## reference" algorithm. In other word, there is an "&" character ## that does not introduce a character reference, which would be ## appended to the parent element or the attribute value in later ## process of the tokenizer. if ($self->{prev_state} == DATA_STATE or $self->{prev_state} == RCDATA_STATE) { $self->{state} = $self->{prev_state}; ## Reconsume. return ({type => CHARACTER_TOKEN, data => '&', line => $self->{line_prev}, column => $self->{column_prev}, }); redo A; } else { $self->{ca}->{value} .= '&'; $self->{state} = $self->{prev_state}; ## Reconsume. 
redo A; } } elsif ($state == ENTITY_HASH_STATE) { if ($nc == 0x0078) { # x $self->{state} = HEXREF_X_STATE; $self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0058) { # X if ($self->{is_xml}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'uppercase hcro'); ## TODO: type } $self->{state} = HEXREF_X_STATE; $self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif (0x0030 <= $nc and $nc <= 0x0039) { # 0..9 $self->{state} = NCR_NUM_STATE; $self->{kwd} = $nc - 0x0030; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare nero', line => $self->{line_prev}, column => $self->{column_prev} - 1); ## NOTE: According to the spec algorithm, nothing is returned, ## and then "&#" is appended to the parent element or the attribute ## value in the later processing. if ($self->{prev_state} == DATA_STATE or $self->{prev_state} == RCDATA_STATE) { $self->{state} = $self->{prev_state}; ## Reconsume. return ({type => CHARACTER_TOKEN, data => '&#', line => $self->{line_prev}, column => $self->{column_prev} - 1, }); redo A; } else { $self->{ca}->{value} .= '&#'; $self->{state} = $self->{prev_state}; ## Reconsume. 
redo A; } } } elsif ($state == NCR_NUM_STATE) { if (0x0030 <= $nc and $nc <= 0x0039) { # 0..9 $self->{kwd} *= 10; $self->{kwd} += $nc - 0x0030; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003B) { # ; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } # } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc'); ## Reconsume. # } my $code = $self->{kwd}; my $l = $self->{line_prev}; my $c = $self->{column_prev}; if ((not $self->{is_xml} and $charref_map->{$code}) or ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or ($self->{is_xml} and $code == 0x0000)) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c); $code = $charref_map->{$code}; } elsif ($code > 0x10FFFF) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', text => (sprintf 'U-%08X', $code), line => $l, column => $c); $code = 0xFFFD; } if ($self->{prev_state} == DATA_STATE or $self->{prev_state} == RCDATA_STATE) { $self->{state} = $self->{prev_state}; ## Reconsume. return ({type => CHARACTER_TOKEN, data => chr $code, has_reference => 1, line => $l, column => $c, }); redo A; } else { $self->{ca}->{value} .= chr $code; $self->{ca}->{has_reference} = 1; $self->{state} = $self->{prev_state}; ## Reconsume. 
redo A; } } elsif ($state == HEXREF_X_STATE) { if ((0x0030 <= $nc and $nc <= 0x0039) or (0x0041 <= $nc and $nc <= 0x0046) or (0x0061 <= $nc and $nc <= 0x0066)) { # 0..9, A..F, a..f $self->{state} = HEXREF_HEX_STATE; $self->{kwd} = 0; ## Reconsume. redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare hcro', line => $self->{line_prev}, column => $self->{column_prev} - 2); ## NOTE: According to the spec algorithm, nothing is returned, ## and then "&#" followed by "X" or "x" is appended to the parent ## element or the attribute value in the later processing. if ($self->{prev_state} == DATA_STATE or $self->{prev_state} == RCDATA_STATE) { $self->{state} = $self->{prev_state}; ## Reconsume. return ({type => CHARACTER_TOKEN, data => '&' . $self->{kwd}, line => $self->{line_prev}, column => $self->{column_prev} - length $self->{kwd}, }); redo A; } else { $self->{ca}->{value} .= '&' . $self->{kwd}; $self->{state} = $self->{prev_state}; ## Reconsume. redo A; } } } elsif ($state == HEXREF_HEX_STATE) { if (0x0030 <= $nc and $nc <= 0x0039) { # 0..9 $self->{kwd} *= 0x10; $self->{kwd} += $nc - 0x0030; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif (0x0061 <= $nc and $nc <= 0x0066) { # a..f $self->{kwd} *= 0x10; $self->{kwd} += $nc - 0x0060 + 9; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif (0x0041 <= $nc and $nc <= 0x0046) { # A..F $self->{kwd} *= 0x10; $self->{kwd} += $nc - 0x0040 + 9; ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003B) { # ; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } # } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc', line => $self->{line}, column => $self->{column}); ## Reconsume. # } my $code = $self->{kwd}; my $l = $self->{line_prev}; my $c = $self->{column_prev}; if ((not $self->{is_xml} and $charref_map->{$code}) or ($self->{is_xml} and 0xD800 <= $code and $code <= 0xDFFF) or ($self->{is_xml} and $code == 0x0000)) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', text => (sprintf 'U+%04X', $code), line => $l, column => $c); $code = $charref_map->{$code}; } elsif ($code > 0x10FFFF) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'invalid character reference', text => (sprintf 'U-%08X', $code), line => $l, column => $c); $code = 0xFFFD; } if ($self->{prev_state} == DATA_STATE or $self->{prev_state} == RCDATA_STATE) { $self->{state} = $self->{prev_state}; ## Reconsume. return ({type => CHARACTER_TOKEN, data => chr $code, has_reference => 1, line => $l, column => $c, }); redo A; } else { $self->{ca}->{value} .= chr $code; $self->{ca}->{has_reference} = 1; $self->{state} = $self->{prev_state}; ## Reconsume. 
redo A; } } elsif ($state == ENTITY_NAME_STATE) { if ((0x0041 <= $nc and # a $nc <= 0x005A) or # x (0x0061 <= $nc and # a $nc <= 0x007A) or # z (0x0030 <= $nc and # 0 $nc <= 0x0039) or # 9 $nc == 0x003B or # ; ($self->{is_xml} and not ($is_space->{$nc} or { 0x003C => 1, 0x0026 => 1, -1 => 1, # <, & ## See comment in the |ENTITY_STATE|'s |if| ## statement for the rationale of addition of these ## characters. 0x0022 => 1, 0x0027 => 1, 0x0060 => 1, # ", ', ` 0x003D => 1, # = ## This is redundant for the same reason. $self->{entity_add} => 1, }->{$nc}))) { #local %entity2char; $self->{kwd} .= chr $nc; ## Bare entity name. if (defined $entity2char{$self->{kwd}} or ## HTML charrefs. $self->{ge}->{$self->{kwd}}) { ## XML general entities. if ($nc == 0x003B) { # ; if (defined $self->{ge}->{$self->{kwd}}) { ## A declared XML entity. if ($self->{ge}->{$self->{kwd}}->{only_text}) { $self->{entity__value} = $self->{ge}->{$self->{kwd}}->{value}; } else { if (defined $self->{ge}->{$self->{kwd}}->{notation}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unparsed entity', ## TODO: type value => $self->{kwd}); } else { } $self->{entity__value} = '&' . $self->{kwd}; ## TODO: expand } } else { ## An HTML character reference. if ($self->{is_xml}) { ## Not a declared XML entity. $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## TODO: type value => $self->{kwd}, level => { 'amp;' => $self->{level}->{warn}, 'quot;' => $self->{level}->{warn}, 'lt;' => $self->{level}->{warn}, 'gt;' => $self->{level}->{warn}, 'apos;' => $self->{level}->{warn}, }->{$self->{kwd}} || $self->{level}->{must}, line => $self->{line_prev}, column => $self->{column} - length $self->{kwd}); } else { } $self->{entity__value} = $entity2char{$self->{kwd}}; } $self->{entity__match} = 1; ## Matched exactly with ";" entity. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } # } else { $self->{entity__value} = $entity2char{$self->{kwd}}; $self->{entity__match} = -1; ## Exactly matched to non-";" entity. ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } else { if ($nc == 0x003B) { # ; ## A reserved HTML character reference or an undeclared ## XML entity reference. $self->{parse_error}->(level => $self->{level}->{must}, type => 'entity not declared', ## XXXtype value => $self->{kwd}, level => $self->{level}->{must}, line => $self->{line_prev}, column => $self->{column} - length $self->{kwd}); $self->{entity__value} .= chr $nc; $self->{entity__match} *= 2; ## Matched (positive) or not (zero) if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } # } else { $self->{entity__value} .= chr $nc; $self->{entity__match} *= 2; ## Matched (positive) or not (zero) ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } } elsif ($nc == 0x003D) { # = if ($self->{entity__match} < 0 and $self->{prev_state} != DATA_STATE and # in attribute $self->{prev_state} != RCDATA_STATE) { $self->{entity__match} = 0; } } my $data; my $has_ref; if ($self->{entity__match} > 0) { ## A ";" entity. $data = $self->{entity__value}; ## Strictly speaking the $has_ref flag should not be set if ## there is no matched entity. However, this flag is used ## only in contexts where use of an ## unexpanded-entity-reference-like string is in no way ## allowed, so it should not make any difference in theory. $has_ref = 1; # } elsif ($self->{entity__match} < 0) { ## Matched to non-";" entity. if ($self->{prev_state} != DATA_STATE and # in attribute $self->{prev_state} != RCDATA_STATE and $self->{entity__match} < -1) { ## In attribute-value contexts, matched non-";" string is ## left as is if there is trailing alphabetical letters. $data = '&' . $self->{kwd}; # } else { ## In attribute-value contexts, exactly matched non-";" ## string is replaced as a character reference. In any ## context, matched non-";" string with or without trailing ## alphabetical letters is replaced as a character reference ## (with trailing letters). Note that use of a no-";" ## character reference is always non-conforming. $self->{parse_error}->(level => $self->{level}->{must}, type => 'no refc'); $data = $self->{entity__value}; $has_ref = 1; # } } else { ## Unmatched string. if ($self->{is_xml} and not $self->{kwd} =~ /;$/) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', line => $self->{line_prev}, column => $self->{column_prev} - length $self->{kwd}); } else { } $data = '&' . 
$self->{kwd}; # } ## NOTE: In these cases, when a character reference is found, ## it is consumed and a character token is returned, or, otherwise, ## nothing is consumed and returned, according to the spec algorithm. ## In this implementation, anything that has been examined by the ## tokenizer is appended to the parent element or the attribute value ## as string, either literal string when no character reference or ## entity-replaced string otherwise, in this stage, since any characters ## that would not be consumed are appended in the data state or in an ## appropriate attribute value state anyway. if ($self->{prev_state} == DATA_STATE or $self->{prev_state} == RCDATA_STATE) { $self->{state} = $self->{prev_state}; ## Reconsume. return ({type => CHARACTER_TOKEN, data => $data, has_reference => $has_ref, line => $self->{line_prev}, column => $self->{column_prev} + 1 - length $self->{kwd}, }); redo A; } else { $self->{ca}->{value} .= $data; $self->{ca}->{has_reference} = 1 if $has_ref; $self->{state} = $self->{prev_state}; ## Reconsume. redo A; } ## ========== XML-only states ========== } elsif ($state == PI_STATE) { ## XML5: "Pi state" and "DOCTYPE pi state". if ($is_space->{$nc} or $nc == 0x003F or # ? $nc == -1) { ## XML5: U+003F: "pi state": Same as "Anything else"; "DOCTYPE ## pi state": Switch to the "DOCTYPE pi after state". EOF: ## "DOCTYPE pi state": Parse error, switch to the "data ## state". $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare pio', ## TODO: type line => $self->{line_prev}, column => $self->{column_prev} - 1 * ($nc != -1)); $self->{state} = BOGUS_COMMENT_STATE; ## Reconsume. $self->{ct} = {type => COMMENT_TOKEN, data => '?', line => $self->{line_prev}, column => $self->{column_prev} - 1 * ($nc != -1), }; redo A; } else { ## XML5: "DOCTYPE pi state": Stay in the state. if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct} = {type => PI_TOKEN, target => $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc, data => '', line => $self->{line_prev}, column => $self->{column_prev} - 1, }; $self->{state} = PI_TARGET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == PI_TARGET_STATE) { if ($is_space->{$nc}) { $self->{state} = PI_TARGET_AFTER_STATE; $self->{kwd} = chr $nc; # "temporary buffer" if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } ## Reconsume. return ({type => COMMENT_TOKEN, data => '?' . $self->{ct}->{target}, line => $self->{ct}->{line}, column => $self->{ct}->{column}}); redo A; } elsif ($nc == 0x003F) { # ? $self->{state} = PI_AFTER_STATE; $self->{kwd} = ''; # "temporary buffer" if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { ## XML5: typo ("tag name" -> "target") if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{target} .= $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc; # pi if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == PI_TARGET_AFTER_STATE) { if ($is_space->{$nc}) { $self->{kwd} .= chr $nc; # "temporary buffer" ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{state} = PI_DATA_STATE; ## Reprocess. redo A; } } elsif ($state == PI_DATA_STATE) { if ($nc == 0x003F) { # ? $self->{state} = PI_DATA_AFTER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no pic'); ## TODO: type if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state" } else { $self->{state} = DATA_STATE; } ## Reprocess. return ({type => COMMENT_TOKEN, data => '?' . $self->{ct}->{target} . $self->{kwd} . # "temporary buffer" $self->{ct}->{data}, line => $self->{ct}->{line}, column => $self->{ct}->{column}}); redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{data} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # pi $self->{read_until}->($self->{ct}->{data}, qq[\x00?], length $self->{ct}->{data}); ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Reprocess. redo A; } } elsif ($state == PI_AFTER_STATE) { ## XML5: Part of "Pi after state". if ($nc == 0x003E) { # > if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # pi redo A; } elsif ($nc == 0x003F) { # ? $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type line => $self->{line_prev}, column => $self->{column_prev}); ## XML5: no error $self->{ct}->{data} .= '?'; $self->{state} = PI_DATA_AFTER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no s after target', ## TODO: type line => $self->{line_prev}, column => $self->{column_prev} + 1 * ($nc == -1)); ## XML5: no error $self->{ct}->{data} .= '?'; ## XML5: not appended $self->{state} = PI_DATA_STATE; ## Reprocess. redo A; } } elsif ($state == PI_DATA_AFTER_STATE) { ## XML5: Same as "pi after state" and "DOCTYPE pi after state". 
if ($nc == 0x003E) { # > if ($self->{in_subset}) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; } else { $self->{state} = DATA_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # pi redo A; } elsif ($nc == 0x003F) { # ? $self->{ct}->{data} .= '?'; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{ct}->{data} .= '?'; ## XML5: not appended $self->{state} = PI_DATA_STATE; ## Reprocess. redo A; } } elsif ($state == DOCTYPE_INTERNAL_SUBSET_STATE) { if ($nc == 0x003C) { # < $self->{state} = DOCTYPE_TAG_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0025) { # % ## XML5: Not defined yet. 
## TODO: parameter entity expansion if (not $self->{stop_processing} and not $self->{document}->xml_standalone) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'stop processing', ## TODO: type level => $self->{level}->{info}); $self->{stop_processing} = 1; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x005D) { # ] delete $self->{in_subset}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($is_space->{$nc}) { ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed internal subset'); ## TODO: type delete $self->{in_subset}; $self->{state} = DATA_STATE; ## Reconsume. return ({type => END_OF_DOCTYPE_TOKEN}); redo A; } else { unless ($self->{internal_subset_tainted}) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'string in internal subset'); $self->{internal_subset_tainted} = 1; } ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { if ($nc == 0x003E) { # > $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ({type => END_OF_DOCTYPE_TOKEN}); redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed DOCTYPE'); $self->{state} = DATA_STATE; ## Reconsume. return ({type => END_OF_DOCTYPE_TOKEN}); redo A; } else { ## XML5: No parse error and stay in the state. $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after internal subset'); ## TODO: type $self->{state} = BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == BOGUS_DOCTYPE_INTERNAL_SUBSET_AFTER_STATE) { if ($nc == 0x003E) { # > $self->{state} = DATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ({type => END_OF_DOCTYPE_TOKEN}); redo A; } elsif ($nc == EOF_CHAR) { $self->{state} = DATA_STATE; ## Reconsume. 
return ({type => END_OF_DOCTYPE_TOKEN}); redo A; } else { ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_TAG_STATE) { if ($nc == 0x0021) { # ! $self->{state} = DOCTYPE_MARKUP_DECLARATION_OPEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003F) { # ? $self->{state} = PI_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago'); $self->{state} = DATA_STATE; ## Reconsume. redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare stago', ## XML5: Not a parse error. line => $self->{line_prev}, column => $self->{column_prev}); $self->{state} = BOGUS_COMMENT_STATE; $self->{ct} = {type => COMMENT_TOKEN, data => '', }; ## NOTE: Will be discarded. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_MARKUP_DECLARATION_OPEN_STATE) { ## XML5: "DOCTYPE markup declaration state". 
if ($nc == 0x002D) { # - $self->{state} = MD_HYPHEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0045 or # E $nc == 0x0065) { # e $self->{state} = MD_E_STATE; $self->{kwd} = chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0041 or # A $nc == 0x0061) { # a $self->{state} = MD_ATTLIST_STATE; $self->{kwd} = chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x004E or # N $nc == 0x006E) { # n $self->{state} = MD_NOTATION_STATE; $self->{kwd} = chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { # } ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', line => $self->{line_prev}, column => $self->{column_prev} - 1); ## Reconsume. $self->{state} = BOGUS_COMMENT_STATE; $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded. 
redo A; } elsif ($state == MD_E_STATE) { if ($nc == 0x004E or # N $nc == 0x006E) { # n $self->{state} = MD_ENTITY_STATE; $self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x004C or # L $nc == 0x006C) { # l ## XML5: <!ELEMENT> not supported. $self->{state} = MD_ELEMENT_STATE; $self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', line => $self->{line_prev}, column => $self->{column_prev} - 2 + 1 * ($nc == EOF_CHAR)); ## Reconsume. $self->{state} = BOGUS_COMMENT_STATE; $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded redo A; } } elsif ($state == MD_ENTITY_STATE) { if ($nc == [ undef, undef, 0x0054, # T 0x0049, # I 0x0054, # T NEVER_CHAR, # (Y) ]->[length $self->{kwd}] or $nc == [ undef, undef, 0x0074, # t 0x0069, # i 0x0074, # t NEVER_CHAR, # (y) ]->[length $self->{kwd}]) { ## Stay in the state. 
$self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ((length $self->{kwd}) == 5 and ($nc == 0x0059 or # Y $nc == 0x0079)) { # y if ($self->{kwd} ne 'ENTIT' or $nc == 0x0079) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type text => 'ENTITY', line => $self->{line_prev}, column => $self->{column_prev} - 4); } $self->{ct} = {type => GENERAL_ENTITY_TOKEN, name => '', line => $self->{line_prev}, column => $self->{column_prev} - 6}; $self->{state} = DOCTYPE_MD_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', line => $self->{line_prev}, column => $self->{column_prev} - 1 - (length $self->{kwd}) + 1 * ($nc == EOF_CHAR)); $self->{state} = BOGUS_COMMENT_STATE; ## Reconsume. $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded redo A; } } elsif ($state == MD_ELEMENT_STATE) { if ($nc == [ undef, undef, 0x0045, # E 0x004D, # M 0x0045, # E 0x004E, # N NEVER_CHAR, # (T) ]->[length $self->{kwd}] or $nc == [ undef, undef, 0x0065, # e 0x006D, # m 0x0065, # e 0x006E, # n NEVER_CHAR, # (t) ]->[length $self->{kwd}]) { ## Stay in the state. 
$self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ((length $self->{kwd}) == 6 and ($nc == 0x0054 or # T $nc == 0x0074)) { # t if ($self->{kwd} ne 'ELEMEN' or $nc == 0x0074) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type text => 'ELEMENT', line => $self->{line_prev}, column => $self->{column_prev} - 5); } $self->{ct} = {type => ELEMENT_TOKEN, name => '', line => $self->{line_prev}, column => $self->{column_prev} - 7}; $self->{state} = DOCTYPE_MD_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', line => $self->{line_prev}, column => $self->{column_prev} - 1 - (length $self->{kwd}) + 1 * ($nc == EOF_CHAR)); $self->{state} = BOGUS_COMMENT_STATE; ## Reconsume. $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded redo A; } } elsif ($state == MD_ATTLIST_STATE) { if ($nc == [ undef, 0x0054, # T 0x0054, # T 0x004C, # L 0x0049, # I 0x0053, # S NEVER_CHAR, # (T) ]->[length $self->{kwd}] or $nc == [ undef, 0x0074, # t 0x0074, # t 0x006C, # l 0x0069, # i 0x0073, # s NEVER_CHAR, # (t) ]->[length $self->{kwd}]) { ## Stay in the state. 
$self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ((length $self->{kwd}) == 6 and ($nc == 0x0054 or # T $nc == 0x0074)) { # t if ($self->{kwd} ne 'ATTLIS' or $nc == 0x0074) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type text => 'ATTLIST', line => $self->{line_prev}, column => $self->{column_prev} - 5); } $self->{ct} = {type => ATTLIST_TOKEN, name => '', attrdefs => [], line => $self->{line_prev}, column => $self->{column_prev} - 7}; $self->{state} = DOCTYPE_MD_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', line => $self->{line_prev}, column => $self->{column_prev} - 1 - (length $self->{kwd}) + 1 * ($nc == EOF_CHAR)); $self->{state} = BOGUS_COMMENT_STATE; ## Reconsume. $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded redo A; } } elsif ($state == MD_NOTATION_STATE) { if ($nc == [ undef, 0x004F, # O 0x0054, # T 0x0041, # A 0x0054, # T 0x0049, # I 0x004F, # O NEVER_CHAR, # (N) ]->[length $self->{kwd}] or $nc == [ undef, 0x006F, # o 0x0074, # t 0x0061, # a 0x0074, # t 0x0069, # i 0x006F, # o NEVER_CHAR, # (n) ]->[length $self->{kwd}]) { ## Stay in the state. 
$self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ((length $self->{kwd}) == 7 and ($nc == 0x004E or # N $nc == 0x006E)) { # n if ($self->{kwd} ne 'NOTATIO' or $nc == 0x006E) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type text => 'NOTATION', line => $self->{line_prev}, column => $self->{column_prev} - 6); } $self->{ct} = {type => NOTATION_TOKEN, name => '', line => $self->{line_prev}, column => $self->{column_prev} - 8}; $self->{state} = DOCTYPE_MD_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bogus comment', line => $self->{line_prev}, column => $self->{column_prev} - 1 - (length $self->{kwd}) + 1 * ($nc == EOF_CHAR)); $self->{state} = BOGUS_COMMENT_STATE; ## Reconsume. $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded redo A; } } elsif ($state == DOCTYPE_MD_STATE) { ## XML5: "DOCTYPE ENTITY state", "DOCTYPE ATTLIST state", and ## "DOCTYPE NOTATION state". if ($is_space->{$nc}) { ## XML5: [NOTATION] Switch to the "DOCTYPE NOTATION identifier state". 
$self->{state} = BEFORE_MD_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and $nc == 0x0025) { # % ## XML5: Switch to the "DOCTYPE bogus comment state". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". ## Reconsume. redo A; } elsif ($nc == 0x003E) { # > ## XML5: Switch to the "DOCTYPE bogus comment state". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { ## XML5: Switch to the "DOCTYPE bogus comment state". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before md name'); ## TODO: type $self->{state} = BEFORE_MD_NAME_STATE; redo A; } } elsif ($state == BEFORE_MD_NAME_STATE) { ## XML5: "DOCTYPE ENTITY parameter state", "DOCTYPE ENTITY type ## before state", "DOCTYPE ATTLIST name before state". 
if ($is_space->{$nc}) { ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($self->{ct}->{type} == GENERAL_ENTITY_TOKEN and $nc == 0x0025) { # % $self->{state} = DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > ## XML5: Same as "Anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". ## Reconsume. redo A; } else { ## XML5: [ATTLIST] Not defined yet. if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{name} .= $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc; $self->{state} = MD_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ENTITY_PARAMETER_BEFORE_STATE) { if ($is_space->{$nc}) { ## XML5: Switch to the "DOCTYPE ENTITY parameter state". $self->{ct}->{type} = PARAMETER_ENTITY_TOKEN; $self->{state} = BEFORE_MD_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > ## XML5: Same as "Anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md name'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". ## Reconsume. redo A; } else { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space after ENTITY percent'); ## TODO: type $self->{state} = BOGUS_COMMENT_STATE; $self->{ct} = {type => COMMENT_TOKEN, data => ''}; ## Will be discarded ## Reconsume. redo A; } } elsif ($state == MD_NAME_STATE) { ## XML5: "DOCTYPE ENTITY name state" and "DOCTYPE ATTLIST name state". 
if ($is_space->{$nc}) { if ($self->{ct}->{type} == ATTLIST_TOKEN) { $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; } elsif ($self->{ct}->{type} == ELEMENT_TOKEN) { $self->{state} = AFTER_ELEMENT_NAME_STATE; } else { # ENTITY/NOTATION $self->{state} = AFTER_DOCTYPE_NAME_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > if ($self->{ct}->{type} == ATTLIST_TOKEN) { # } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type } $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ELEMENT/ENTITY/ATTLIST/NOTATION redo A; } elsif ($nc == EOF_CHAR) { ## XML5: [ATTLIST] No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". ## Reconsume. redo A; } else { ## XML5: [ATTLIST] Not defined yet. if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ATTLIST_NAME_AFTER_STATE) { if ($is_space->{$nc}) { ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". ## Discard the current token. redo A; } else { ## XML5: Not defined yet. if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ca} = {name => $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc, # attrdef tokens => [], line => $self->{line}, column => $self->{column}}; $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_STATE) { if ($is_space->{$nc}) { $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > ## XML5: Same as "anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == 0x0028) { # ( ## XML5: Same as "anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. 
$self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { ## XML5: Not defined yet. if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ca}->{name} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_NAME_AFTER_STATE) { if ($is_space->{$nc}) { ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > ## XML5: Same as "anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr type'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == 0x0028) { # ( ## XML5: Same as "anything else". 
$self->{state} = BEFORE_ALLOWED_TOKEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the token. redo A; } else { ## XML5: Not defined yet. $self->{ca}->{type} = chr $nc; $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_STATE) { if ($is_space->{$nc}) { $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0023) { # # ## XML5: Same as "anything else". 
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022) { # " ## XML5: Same as "anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' ## XML5: Same as "anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > ## XML5: Same as "anything else". 
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == 0x0028) { # ( ## XML5: Same as "anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before paren'); ## TODO: type $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the token. redo A; } else { ## XML5: Not defined yet. $self->{ca}->{type} .= chr $nc; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_TYPE_AFTER_STATE) { if ($is_space->{$nc}) { ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0028) { # ( ## XML5: Same as "anything else". $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0023) { # # $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022) { # " ## XML5: Same as "anything else". $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' ## XML5: Same as "anything else". $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > ## XML5: Same as "anything else". 
$self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { ## XML5: Switch to the "DOCTYPE bogus comment state". $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; ## Reconsume. redo A; } } elsif ($state == BEFORE_ALLOWED_TOKEN_STATE) { if ($is_space->{$nc}) { ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x007C) { # | $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0029) { # ) $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty allowed token'); ## TODO: type $self->{state} = AFTER_ALLOWED_TOKENS_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { if ($nc == 0x000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } push @{$self->{ca}->{tokens}}, $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc; $self->{state} = ALLOWED_TOKEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == ALLOWED_TOKEN_STATE) { if ($is_space->{$nc}) { $self->{state} = AFTER_ALLOWED_TOKEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x007C) { # | $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0029) { # ) $self->{state} = AFTER_ALLOWED_TOKENS_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse 
error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ca}->{tokens}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == AFTER_ALLOWED_TOKEN_STATE) { if ($is_space->{$nc}) { ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x007C) { # | $self->{state} = BEFORE_ALLOWED_TOKEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0029) { # ) $self->{state} = AFTER_ALLOWED_TOKENS_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed allowed tokens'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'space in allowed token', ## TODO: type line => $self->{line_prev}, column => $self->{column_prev}); if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ca}->{tokens}->[-1] .= ' ' . ($nc == 0x0000 ? "\x{FFFD}" : chr $nc); $self->{state} = ALLOWED_TOKEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == AFTER_ALLOWED_TOKENS_STATE) { if ($is_space->{$nc}) { $self->{state} = BEFORE_ATTR_DEFAULT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0023) { # # $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022) { # " $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default 
value'); ## TODO: type $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. 
redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; ## Reconsume. redo A; } } elsif ($state == BEFORE_ATTR_DEFAULT_STATE) { if ($is_space->{$nc}) { ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0023) { # # $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022) { # " $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = 
$self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unquoted attr value'); ## TODO: type $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; ## Reconsume. redo A; } } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_BEFORE_STATE) { if ($is_space->{$nc}) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'no default type'); ## TODO: type $self->{state} = BOGUS_MD_STATE; ## Reconsume. redo A; } elsif ($nc == 0x0022) { # " # XXX parse error? ## XML5: Same as "anything else". $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' # XXX parse error? ## XML5: Same as "anything else". 
$self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > ## XML5: Same as "anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no attr default'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. 
redo A; } else { $self->{ca}->{default} = chr $nc; $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_STATE) { if ($is_space->{$nc}) { $self->{state} = DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022) { # " ## XML5: Same as "anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' ## XML5: Same as "anything else". $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before default value'); ## TODO: type $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > ## XML5: Same as "anything else". 
push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { $self->{ca}->{default} .= chr $nc; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ATTLIST_ATTRIBUTE_DECLARATION_AFTER_STATE) { if ($is_space->{$nc}) { ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0022) { # " $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_DOUBLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0027) { # ' $self->{ca}->{value} = ''; $self->{state} = ATTRIBUTE_VALUE_SINGLE_QUOTED_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST redo A; } elsif ($nc == EOF_CHAR) { ## XML5: No parse error. $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## XML5: "Data state". 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { ## XML5: Not defined yet. if ($self->{ca}->{default} eq 'FIXED') { $self->{state} = ATTRIBUTE_VALUE_UNQUOTED_STATE; } else { push @{$self->{ct}->{attrdefs}}, $self->{ca}; $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; } ## Reconsume. redo A; } } elsif ($state == AFTER_ATTLIST_ATTR_VALUE_QUOTED_STATE) { if ($is_space->{$nc} or $nc == EOF_CHAR or $nc == 0x003E) { # > $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; ## Reconsume. redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'no space before attr name'); ## TODO: type $self->{state} = DOCTYPE_ATTLIST_NAME_AFTER_STATE; ## Reconsume. redo A; } } elsif ($state == NDATA_STATE) { ## ASCII case-insensitive if ($nc == [ undef, 0x0044, # D 0x0041, # A 0x0054, # T NEVER_CHAR, # (A) ]->[length $self->{kwd}] or $nc == [ undef, 0x0064, # d 0x0061, # a 0x0074, # t NEVER_CHAR, # (a) ]->[length $self->{kwd}]) { ## Stay in the state. 
$self->{kwd} .= chr $nc; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ((length $self->{kwd}) == 4 and ($nc == 0x0041 or # A $nc == 0x0061)) { # a if ($self->{kwd} ne 'NDAT' or $nc == 0x0061) { # a $self->{parse_error}->(level => $self->{level}->{must}, type => 'lowercase keyword', ## TODO: type text => 'NDATA', line => $self->{line_prev}, column => $self->{column_prev} - 4); } else { } $self->{state} = AFTER_NDATA_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type line => $self->{line_prev}, column => $self->{column_prev} + 1 - length $self->{kwd}); $self->{state} = BOGUS_MD_STATE; ## Reconsume. 
redo A; } } elsif ($state == AFTER_NDATA_STATE) { if ($is_space->{$nc}) { $self->{state} = BEFORE_NOTATION_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ENTITY redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after literal', ## TODO: type line => $self->{line_prev}, column => $self->{column_prev} + 1 - length $self->{kwd}); $self->{state} = BOGUS_MD_STATE; ## Reconsume. redo A; } } elsif ($state == BEFORE_NOTATION_NAME_STATE) { if ($is_space->{$nc}) { ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'no notation name'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ENTITY redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{notation} = $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc; # ENTITY $self->{state} = NOTATION_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == NOTATION_NAME_STATE) { if ($is_space->{$nc}) { $self->{state} = AFTER_MD_DEF_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ENTITY redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## The current token. redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{notation} .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ENTITY ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ENTITY_VALUE_DOUBLE_QUOTED_STATE) { if ($nc == 0x0022) { # " $self->{state} = AFTER_MD_DEF_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0026) { # & $self->{prev_state} = $state; $self->{state} = ENTITY_VALUE_ENTITY_STATE; $self->{entity_add} = 0x0022; # " if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; ## TODO: % } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## Reconsume. ## Discard the current token. redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{value} .= $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc; # ENTITY if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == DOCTYPE_ENTITY_VALUE_SINGLE_QUOTED_STATE) { if ($nc == 0x0027) { # ' $self->{state} = AFTER_MD_DEF_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0026) { # & $self->{prev_state} = $state; $self->{state} = ENTITY_VALUE_ENTITY_STATE; $self->{entity_add} = 0x0027; # ' if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; ## TODO: % } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed entity value'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## Reconsume. ## Discard the current token. redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{value} .= $nc == 0x0000 ? 
"\x{FFFD}" : chr $nc; # ENTITY if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == ENTITY_VALUE_ENTITY_STATE) { if ($is_space->{$nc} or { 0x003C => 1, 0x0026 => 1, (EOF_CHAR) => 1, # <, & $self->{entity_add} => 1, }->{$nc}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'bare ero', line => $self->{line_prev}, column => $self->{column_prev} + ($nc == EOF_CHAR ? 1 : 0)); ## Don't consume ## Return nothing. # } elsif ($nc == 0x0023) { # # $self->{ca} = $self->{ct}; $self->{state} = ENTITY_HASH_STATE; $self->{kwd} = '#'; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { # } $self->{ct}->{value} .= '&'; $self->{state} = $self->{prev_state}; ## Reconsume. 
redo A; } elsif ($state == AFTER_ELEMENT_NAME_STATE) { if ($is_space->{$nc}) { $self->{state} = BEFORE_ELEMENT_CONTENT_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0028) { # ( $self->{state} = AFTER_CM_GROUP_OPEN_STATE; $self->{ct}->{content} = ['(']; $self->{group_depth} = 1; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'no md def'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ELEMENT redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{content} = [$nc == 0x0000 ? 
"\x{FFFD}" : chr $nc]; $self->{state} = CONTENT_KEYWORD_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == CONTENT_KEYWORD_STATE) { if ($is_space->{$nc}) { $self->{state} = AFTER_MD_DEF_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ELEMENT redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; # ELEMENT ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == AFTER_CM_GROUP_OPEN_STATE) { if ($is_space->{$nc}) { ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0028) { # ( $self->{group_depth}++; push @{$self->{ct}->{content}}, chr $nc; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x007C or # | $nc == 0x002C) { # , $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0029) { # ) $self->{parse_error}->(level => $self->{level}->{must}, type => 'empty element name'); ## TODO: type push @{$self->{ct}->{content}}, chr $nc; $self->{group_depth}--; $self->{state} = AFTER_CM_GROUP_CLOSE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ELEMENT redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type #push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. 
redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } push @{$self->{ct}->{content}}, $nc == 0x0000 ? "\x{FFFD}" : chr $nc; $self->{state} = CM_ELEMENT_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == CM_ELEMENT_NAME_STATE) { if ($is_space->{$nc}) { $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x002A or # * $nc == 0x002B or # + $nc == 0x003F) { # ? push @{$self->{ct}->{content}}, chr $nc; $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x007C or # | $nc == 0x002C) { # , push @{$self->{ct}->{content}}, $nc == 0x007C ? 
' | ' : ', '; $self->{state} = AFTER_CM_GROUP_OPEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0029) { # ) $self->{group_depth}--; push @{$self->{ct}->{content}}, chr $nc; $self->{state} = AFTER_CM_GROUP_CLOSE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ELEMENT redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type #push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the token. 
redo A; } else { if ($nc == 0x0000) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'NULL'); } $self->{ct}->{content}->[-1] .= $nc == 0x0000 ? "\x{FFFD}" : chr $nc; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == AFTER_CM_ELEMENT_NAME_STATE) { if ($is_space->{$nc}) { ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x007C or # | $nc == 0x002C) { # , push @{$self->{ct}->{content}}, $nc == 0x007C ? ' | ' : ', '; $self->{state} = AFTER_CM_GROUP_OPEN_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0029) { # ) $self->{group_depth}--; push @{$self->{ct}->{content}}, chr $nc; $self->{state} = AFTER_CM_GROUP_CLOSE_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < 
length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ELEMENT redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type #push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'after element name'); ## TODO: type push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; $self->{state} = BOGUS_MD_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } elsif ($state == AFTER_CM_GROUP_CLOSE_STATE) { if ($is_space->{$nc}) { if ($self->{group_depth}) { $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; } else { $self->{state} = AFTER_MD_DEF_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x002A or # * $nc == 0x002B or # + $nc == 0x003F) { # ? 
push @{$self->{ct}->{content}}, chr $nc; if ($self->{group_depth}) { $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; } else { $self->{state} = AFTER_MD_DEF_STATE; } if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x0029) { # ) if ($self->{group_depth}) { $self->{group_depth}--; push @{$self->{ct}->{content}}, chr $nc; ## Stay in the state. if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type $self->{state} = BOGUS_MD_STATE; ## Reconsume. 
redo A; } } elsif ($nc == 0x003E) { # > if ($self->{group_depth}) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed cm group'); ## TODO: type push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; } $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ELEMENT redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type #push @{$self->{ct}->{content}}, (')') x $self->{group_depth}; $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { if ($self->{group_depth}) { $self->{state} = AFTER_CM_ELEMENT_NAME_STATE; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type $self->{state} = BOGUS_MD_STATE; } ## Reconsume. redo A; } } elsif ($state == AFTER_MD_DEF_STATE) { if ($is_space->{$nc}) { ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } elsif ($nc == 0x003E) { # > $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ENTITY/ELEMENT redo A; } elsif ($nc == EOF_CHAR) { $self->{parse_error}->(level => $self->{level}->{must}, type => 'unclosed md'); ## TODO: type $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } ## Discard the current token. redo A; } else { $self->{parse_error}->(level => $self->{level}->{must}, type => 'string after md def'); ## TODO: type $self->{state} = BOGUS_MD_STATE; ## Reconsume. redo A; } } elsif ($state == BOGUS_MD_STATE) { if ($nc == 0x003E) { # > $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } return ($self->{ct}); # ATTLIST/ENTITY/NOTATION redo A; } elsif ($nc == EOF_CHAR) { $self->{state} = DOCTYPE_INTERNAL_SUBSET_STATE; ## Reconsume. ## Discard the current token. redo A; } else { ## Stay in the state. 
if ($self->{char_buffer_pos} < length $self->{char_buffer}) { $self->{line_prev} = $self->{line}; $self->{column_prev} = $self->{column}; $self->{column}++; $self->{nc} = ord substr ($self->{char_buffer}, $self->{char_buffer_pos}++, 1); } else { $self->{set_nc}->($self); } redo A; } } else { die "$0: $state: Unknown state"; } } # A die "$0: _get_next_token: unexpected case"; } # _get_next_token 1; # Copyright 2007-2011 Wakaba <w@suika.fam.cx>. # # This library is free software; you can redistribute it and/or modify # it under the same terms as Perl itself. ���������HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/Charset/����������������������������������������������0000755�0001750�0001750�00000000000�12166545247�017462� 5����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/Charset/UniversalCharDet.pm���������������������������0000644�0001750�0001750�00000001633�12166544311�023215� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::HTML5::Parser::Charset::UniversalCharDet; ## skip Test::Tabs use strict; use IO::HTML (); our $VERSION='0.301'; our $DEBUG; # this really shouldn't work, but for some reason it does... sub _detect { return +{ encoding => 'UTF-8' } if !utf8::is_utf8($_[0]); # huh? 
open my $fh, '<:raw', \$_[0]; my $e = IO::HTML::sniff_encoding($fh => 'string'); return +{ encoding => $e } if defined $e; return +{}; } sub detect_byte_string ($$) { my $de; eval { $de = _detect $_[1]; 1; } or do { warn $@ unless $DEBUG; die $@ if $DEBUG; }; if (defined $de and defined $de->{encoding}) { return lc $de->{encoding}; } else { return undef; } } # detect_byte_string #Copyright 2007-2011 Wakaba <w@suika.fam.cx> #Copyright 2009-2012 Toby Inkster <tobyink@cpan.org> # #This library is free software; you can redistribute it #and/or modify it under the same terms as Perl itself. 1; �����������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/Charset/WebThai.pm������������������������������������0000644�0001750�0001750�00000003533�12166544311�021336� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������#!/usr/bin/perl package HTML::HTML5::Parser::Charset::WebThai; ## skip Test::Tabs use strict; our $VERSION='0.301'; ## NOTE: This module does not expect that its standalone uses. ## See Message::Charset::Info for how it is used. 
require Encode::Encoding;
push our @ISA, 'Encode::Encoding';

## Register this class with Encode under the name "web-thai".
__PACKAGE__->Define (qw/web-thai/);

## encode ($self, $str, $chk)
##
## Without $chk: every character outside ASCII, U+00A0 and the Thai
## ranges U+0E01-U+0E3A / U+0E3F-U+0E5B is replaced by "?" in a copy
## of $str, and the copy is encoded as iso-8859-11.
##
## With a true $chk: the longest leading run of encodable characters
## is removed from the caller's string ($_[1] is an alias) and returned
## encoded; the empty string is returned when there is no such run.
sub encode ($$;$) {
  unless ($_[2]) {
    (my $lossy = $_[1])
        =~ s/[^\x00-\x7F\xA0\x{0E01}-\x{0E3A}\x{0E3F}-\x{0E5B}]/?/g;
    return Encode::encode ('iso-8859-11', $lossy);
  }

  return ''
      unless $_[1] =~ s/^([\x00-\x7F\xA0\x{0E01}-\x{0E3A}\x{0E3F}-\x{0E5B}]+)//;
  return Encode::encode ('iso-8859-11', $1);
} # encode

## decode ($self, $s, $chk)
##
## Without $chk: the whole byte string is decoded as windows-874.
##
## With a true $chk: leading bytes in 0x00-0x7F, 0xA0-0xDA, 0xDF-0xFB
## are removed from the caller's string and decoded as iso-8859-11;
## whatever follows the first other byte is left in $_[1].
sub decode ($$;$) {
  return Encode::decode ('windows-874', $_[1]) unless $_[2];

  my $text = '';
  $text .= Encode::decode ('iso-8859-11', $1)
      while $_[1] =~ s/^([\x00-\x7F\xA0-\xDA\xDF-\xFB]+)//;
  return $text;
} # decode

package HTML::HTML5::Parser::Charset::WebThai::WebTIS620;
push our @ISA, 'Encode::Encoding';

## Register this class with Encode under the name "web-tis-620".
__PACKAGE__->Define (qw/web-tis-620/);

## encode ($self, $str, $chk)
##
## Same contract as the web-thai encoder above, except that U+00A0 is
## not accepted and the target encoding is tis-620.
sub encode ($$;$) {
  unless ($_[2]) {
    (my $lossy = $_[1])
        =~ s/[^\x00-\x7F\x{0E01}-\x{0E3A}\x{0E3F}-\x{0E5B}]/?/g;
    return Encode::encode ('tis-620', $lossy);
  }

  return ''
      unless $_[1] =~ s/^([\x00-\x7F\x{0E01}-\x{0E3A}\x{0E3F}-\x{0E5B}]+)//;
  return Encode::encode ('tis-620', $1);
} # encode

## decode ($self, $s, $chk)
##
## Without $chk: the whole byte string is decoded as windows-874.
##
## With a true $chk: leading bytes in 0x00-0x7F, 0xA1-0xDA, 0xDF-0xFB
## are removed from the caller's string and decoded as tis-620.
sub decode ($$;$) {
  return Encode::decode ('windows-874', $_[1]) unless $_[2];

  my $text = '';
  $text .= Encode::decode ('tis-620', $1)
      while $_[1] =~ s/^([\x00-\x7F\xA1-\xDA\xDF-\xFB]+)//;
  return $text;
} # decode

1;
## $Date: 2008/09/10 10:27:09 $
## [archive member-header residue follows in the original; NUL padding omitted]
## HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/Charset/WebLatin1.pm 0000644 0001750 0001750 00000010120 12166544311 021567 0 ustar
## [archive member-header residue precedes this file in the original; NUL padding omitted] tai tai
#!/usr/bin/perl
package HTML::HTML5::Parser::Charset::WebLatin1;
## skip Test::Tabs
use strict;

our $VERSION='0.301';

## NOTE: This module is not meant to be used standalone.
## See Message::Charset::Info for how it is used.

require Encode::Encoding;
push our @ISA, 'Encode::Encoding';

## Register this class with Encode under the name "web-latin1".
__PACKAGE__->Define (qw/web-latin1/);

## encode ($self, $str, $chk)
##
## Without $chk: characters outside U+0000-U+007F and U+00A0-U+00FF are
## replaced by "?" in a copy of $str, which is then encoded as
## iso-8859-1.
##
## With a true $chk: the longest leading run of encodable characters is
## removed from the caller's string ($_[1] is an alias) and returned
## encoded; the empty string is returned when there is no such run.
sub encode ($$;$) {
  unless ($_[2]) {
    (my $lossy = $_[1]) =~ s/[^\x00-\x7F\xA0-\xFF]/?/g;
    return Encode::encode ('iso-8859-1', $lossy);
  }

  return '' unless $_[1] =~ s/^([\x00-\x7F\xA0-\xFF]+)//;
  return Encode::encode ('iso-8859-1', $1);
} # encode

## decode ($self, $s, $chk)
##
## Without $chk: a copy of $s is returned in which bytes 0x80-0x9F are
## translated through the table below; all other bytes are kept as is.
##
## With a true $chk: leading bytes in 0x00-0x7F / 0xA0-0xFF are removed
## from the caller's string and returned unchanged.  The original
## carries a disabled (commented-out) branch that would have mapped
## selected 0x80-0x9F bytes through the same table, so in this mode
## processing stops at the first such byte.
sub decode ($$;$) {
  unless ($_[2]) {
    (my $mapped = $_[1]) =~ tr/\x80-\x9F/\x{20AC}\x{FFFD}\x{201A}\x{0192}\x{201E}\x{2026}\x{2020}\x{2021}\x{02C6}\x{2030}\x{0160}\x{2039}\x{0152}\x{FFFD}\x{017D}\x{FFFD}\x{FFFD}\x{2018}\x{2019}\x{201C}\x{201D}\x{2022}\x{2013}\x{2014}\x{02DC}\x{2122}\x{0161}\x{203A}\x{0153}\x{FFFD}\x{017E}\x{0178}/;
    return $mapped;
  }

  my $text = '';
  $text .= $1 while $_[1] =~ s/^([\x00-\x7F\xA0-\xFF]+)//;
  return $text;
} # decode

package HTML::HTML5::Parser::Charset::USASCII;
push our @ISA, 'Encode::Encoding';

## Register this class with Encode under the name "web-latin1-us-ascii".
__PACKAGE__->Define (qw/web-latin1-us-ascii/);

## encode ($self, $str, $chk)
##
## Same contract as the web-latin1 encoder, but only U+0000-U+007F is
## accepted; the result is still produced through iso-8859-1.
sub encode ($$;$) {
  unless ($_[2]) {
    (my $lossy = $_[1]) =~ s/[^\x00-\x7F]/?/g;
    return Encode::encode ('iso-8859-1', $lossy);
  }

  return '' unless $_[1] =~ s/^([\x00-\x7F]+)//;
  return Encode::encode ('iso-8859-1', $1);
} # encode

## decode ($self, $s, $chk)
##
## Without $chk: a copy of $s is returned in which bytes 0x80-0x9F are
## translated through the table below and 0xA0-0xFF map to themselves.
##
## With a true $chk: only leading 0x00-0x7F bytes are removed from the
## caller's string and returned.  A branch handling bytes >= 0x80 is
## commented out in the original, so it is not reproduced here.
sub decode ($$;$) {
  unless ($_[2]) {
    (my $mapped = $_[1]) =~ tr/\x80-\xFF/\x{20AC}\x{FFFD}\x{201A}\x{0192}\x{201E}\x{2026}\x{2020}\x{2021}\x{02C6}\x{2030}\x{0160}\x{2039}\x{0152}\x{FFFD}\x{017D}\x{FFFD}\x{FFFD}\x{2018}\x{2019}\x{201C}\x{201D}\x{2022}\x{2013}\x{2014}\x{02DC}\x{2122}\x{0161}\x{203A}\x{0153}\x{FFFD}\x{017E}\x{0178}\xA0-\xFF/;
    return $mapped;
  }

  my $text = '';
  $text .= $1 while $_[1] =~ s/^([\x00-\x7F]+)//;
  return $text;
} # decode

package HTML::HTML5::Parser::Charset::WebLatin5;
push our @ISA, 'Encode::Encoding';

## Register this class with Encode under the name "web-latin5".
__PACKAGE__->Define (qw/web-latin5/);

## encode ($self, $str, $chk)
##
## Only U+0000-U+007F is accepted (anything else becomes "?" without
## $chk, or ends the consumed prefix with $chk); the result is
## produced through iso-8859-9.
sub encode ($$;$) {
  unless ($_[2]) {
    (my $lossy = $_[1]) =~ s/[^\x00-\x7F]/?/g;
    return Encode::encode ('iso-8859-9', $lossy);
  }

  return '' unless $_[1] =~ s/^([\x00-\x7F]+)//;
  return Encode::encode ('iso-8859-9', $1);
} # encode

## decode ($self, $s, $chk)
##
## Without $chk: the whole byte string is decoded as windows-1254.
##
## With a true $chk: leading bytes in 0x00-0x7F / 0xA0-0xFF are removed
## from the caller's string and decoded as windows-1254.  A branch that
## would have mapped selected 0x80-0x9F bytes individually is commented
## out in the original, so processing stops at the first such byte.
sub decode ($$;$) {
  return scalar Encode::decode ('windows-1254', $_[1]) unless $_[2];

  my $text = '';
  $text .= Encode::decode ('windows-1254', $1)
      while $_[1] =~ s/^([\x00-\x7F\xA0-\xFF]+)//;
  return $text;
} # decode

1;
## $Date: 2008/09/10
10:27:09 $ ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm�������������������������������0000644�0001750�0001750�00000154622�12166544311�022320� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::HTML5::Parser::Charset::DecodeHandle; ## skip Test::Tabs use strict; our $VERSION = '0.301'; ## NOTE: |Message::Charset::Info| uses this module without calling ## the constructor. 
use HTML::HTML5::Parser::Charset::Info; my $XML_AUTO_CHARSET = q<http://suika.fam.cx/www/2006/03/xml-entity/>; my $IANA_CHARSET = q<urn:x-suika-fam-cx:charset:>; my $PERL_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.>; my $XML_CHARSET = q<http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.>; ## ->create_decode_handle ($charset_uri, $byte_stream, $onerror) sub create_decode_handle ($$$;$) { my $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$_[1]}; my $obj = { category => 0, char_buffer => \(my $s = ''), char_buffer_pos => 0, character_queue => [], filehandle => $_[2], charset => $_[1], byte_buffer => '', onerror => $_[3] || sub {}, #onerror_set }; if ($csdef->{uri}->{$XML_AUTO_CHARSET} or $obj->{charset} eq $XML_AUTO_CHARSET) { my $b = ''; # UTF-8 w/o BOM $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; if (read $obj->{filehandle}, $b, 256) { no warnings "substr"; no warnings "uninitialized"; if (substr ($b, 0, 1) eq "<") { if (substr ($b, 1, 1) eq "?") { # ASCII8 if ($b =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii8} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant}}; } } elsif (substr ($b, 1, 1) eq "\x00") { if (substr ($b, 2, 2) eq "?\x00") { # ASCII16LE my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? 
encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii16} or $csdef->{ascii16be} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant16le}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant16le}}; } } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321 my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian3412} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant32endian4321}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}}; } } } } elsif (substr ($b, 0, 3) eq "\xEF\xBB\xBF") { # UTF8 $obj->{has_bom} = 1; substr ($b, 0, 3) = ''; my $c = $b; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? 
encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{utf8_encoding_scheme} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant}}; } } elsif (substr ($b, 0, 2) eq "\x00<") { if (substr ($b, 2, 2) eq "\x00?") { # ASCII16BE my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii16} or $csdef->{ascii16le} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant16be}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant16be}}; } } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412 my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? 
encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant32endian3412}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}}; } } } elsif (substr ($b, 0, 2) eq "\xFE\xFF") { if (substr ($b, 2, 2) eq "\x00<") { # ASCII16BE $obj->{has_bom} = 1; substr ($b, 0, 2) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii16} or $csdef->{ascii16le} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; $obj->{input_encoding} = 'UTF-16'; } if (defined $csdef->{no_bom_variant16be}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant16be}}; } } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian3412 $obj->{has_bom} = 1; substr ($b, 0, 4) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? 
encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian4321} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; $obj->{input_encoding} = 'UTF-16'; $obj->{byte_buffer} .= "\x00\x00"; } if (defined $csdef->{no_bom_variant32endian3412}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian3412}}; } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; $obj->{input_encoding} = 'UTF-16'; substr ($b, 0, 2) = ''; $obj->{has_bom} = 1; } } elsif (substr ($b, 0, 2) eq "\xFF\xFE") { if (substr ($b, 2, 2) eq "<\x00") { # ASCII16LE $obj->{has_bom} = 1; substr ($b, 0, 2) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii16} or $csdef->{ascii16be} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'}; $obj->{input_encoding} = 'UTF-16'; } if (defined $csdef->{no_bom_variant16le}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant16le}}; } } elsif (substr ($b, 2, 2) eq "\x00\x00") { # ASCII32Endian4321 $obj->{has_bom} = 1; substr ($b, 0, 4) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? 
encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian3412} or not $csdef->{bom_allowed}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'}; $obj->{input_encoding} = 'UTF-16'; $obj->{byte_buffer} .= "\x00\x00"; } if (defined $csdef->{no_bom_variant32endian4321}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian4321}}; } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'}; $obj->{input_encoding} = 'UTF-16'; substr ($b, 0, 2) = ''; $obj->{has_bom} = 1; } } elsif (substr ($b, 0, 2) eq "\x00\x00") { if (substr ($b, 2, 2) eq "\x00<") { # ASCII32Endian1234 my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian3412} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant32endian1234}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}}; } } elsif (substr ($b, 2, 2) eq "<\x00") { # ASCII32Endian2143 my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? 
encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian3412} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; } if (defined $csdef->{no_bom_variant32endian2143}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}}; } } elsif (substr ($b, 2, 2) eq "\xFE\xFF") { # ASCII32Endian1234 $obj->{has_bom} = 1; substr ($b, 0, 4) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian2143} or $csdef->{ascii32endian3412} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; $obj->{has_bom} = 0; $obj->{byte_buffer} .= "\x00\x00\xFE\xFF"; } if (defined $csdef->{no_bom_variant32endian1234}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian1234}}; } } elsif (substr ($b, 2, 2) eq "\xFF\xFE") { # ASCII32Endian2143 $obj->{has_bom} = 1; substr ($b, 0, 4) = ''; my $c = $b; $c =~ tr/\x00//d; if ($c =~ /^<\?xml\s+(?:version\s*=\s*["'][^"']*["']\s*)? 
encoding\s*=\s*["']([^"']*)/x) { $obj->{input_encoding} = $1; my $uri = name_to_uri (undef, 'xml', $obj->{input_encoding}); $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; if (not $csdef->{ascii32} or $csdef->{ascii32endian1234} or $csdef->{ascii32endian3412} or $csdef->{ascii32endian4321} or $csdef->{bom_required}) { $obj->{onerror}->(undef, 'charset-name-mismatch-error', charset_uri => $uri, charset_name => $obj->{input_encoding}); } } else { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; $obj->{input_encoding} = 'UTF-8'; $obj->{has_bom} = 0; $obj->{byte_buffer} .= "\x00\x00\xFF\xFE"; } if (defined $csdef->{no_bom_variant32endian2143}) { $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$csdef->{no_bom_variant32endian2143}}; } } # \x4C\x6F\xA7\x94 EBCDIC } # buffer $obj->{byte_buffer} .= $b; } # read } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-8'}) { ## BOM is optional. my $b = ''; if (read $obj->{filehandle}, $b, 3) { if ($b eq "\xEF\xBB\xBF") { $obj->{has_bom} = 1; } else { $obj->{byte_buffer} .= $b; } } $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-8'}; # UTF-8 w/o BOM } elsif ($csdef->{uri}->{$XML_CHARSET.'utf-16'}) { ## BOM is mandated. 
my $b = ''; if (read $obj->{filehandle}, $b, 2) { if ($b eq "\xFE\xFF") { $obj->{has_bom} = 1; # UTF-16BE w/o BOM $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; } elsif ($b eq "\xFF\xFE") { $obj->{has_bom} = 1; # UTF-16LE w/o BOM $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16le'}; } else { $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset}); $obj->{has_bom} = 0; $obj->{byte_buffer} .= $b; # UTF-16BE w/o BOM $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; } } else { $obj->{onerror}->(undef, 'no-bom-error', charset_uri => $obj->{charset}); $obj->{has_bom} = 0; # UTF-16BE w/o BOM $csdef = $HTML::HTML5::Parser::Charset::CharsetDef->{$PERL_CHARSET.'utf-16be'}; } } if ($csdef->{uri}->{$XML_CHARSET.'iso-2022-jp'}) { $obj->{state_2440} = 'gl-jis-1997-swapped'; $obj->{state_2442} = 'gl-jis-1997'; $obj->{state} = 'state_2842'; require Encode::GLJIS1997Swapped; require Encode::GLJIS1997; if (Encode::find_encoding ($obj->{state_2440}) and Encode::find_encoding ($obj->{state_2442})) { return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP'; } } elsif ($csdef->{uri}->{$IANA_CHARSET.'iso-2022-jp'}) { $obj->{state_2440} = 'gl-jis-1978'; $obj->{state_2442} = 'gl-jis-1983'; $obj->{state} = 'state_2842'; require Encode::GLJIS1978; require Encode::GLJIS1983; if (Encode::find_encoding ($obj->{state_2440}) and Encode::find_encoding ($obj->{state_2442})) { return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP'; } } elsif (defined $csdef->{perl_name}->[0]) { if ($csdef->{uri}->{$XML_CHARSET.'euc-jp'} or $csdef->{uri}->{$IANA_CHARSET.'euc-jp'}) { $obj->{perl_encoding_name} = $csdef->{perl_name}->[0]; require Encode::EUCJP1997; if (Encode::find_encoding ($obj->{perl_encoding_name})) { $obj->{category} |= HTML::HTML5::Parser::Charset::Info::CHARSET_CATEGORY_EUCJP; return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode'; } } 
elsif ($csdef->{uri}->{$XML_CHARSET.'shift_jis'} or $csdef->{uri}->{$IANA_CHARSET.'shift_jis'}) { $obj->{perl_encoding_name} = $csdef->{perl_name}->[0]; require Encode::ShiftJIS1997; if (Encode::find_encoding ($obj->{perl_encoding_name})) { return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode'; } } elsif ($csdef->{is_block_safe}) { $obj->{perl_encoding_name} = $csdef->{perl_name}->[0]; require Encode; if (Encode::find_encoding ($obj->{perl_encoding_name})) { return bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode'; } } } $obj->{onerror}->(undef, 'charset-not-supported-error', charset_uri => $obj->{charset}); return undef; } # create_decode_handle sub name_to_uri ($$$) { my $domain = $_[1]; my $name = lc $_[2]; if ($domain eq 'ietf') { return $IANA_CHARSET . $name; } elsif ($domain eq 'xml') { if ({ 'utf-8' => 1, 'utf-16' => 1, 'iso-10646-ucs-2' => 1, 'iso-10646-ucs-4' => 1, 'iso-8859-1' => 1, 'iso-8859-2' => 1, 'iso-8859-3' => 1, 'iso-8859-4' => 1, 'iso-8859-5' => 1, 'iso-8859-6' => 1, 'iso-8859-7' => 1, 'iso-8859-8' => 1, 'iso-8859-9' => 1, 'iso-8859-10' => 1, 'iso-8859-11' => 1, 'iso-8859-13' => 1, 'iso-8859-14' => 1, 'iso-8859-15' => 1, 'iso-8859-16' => 1, 'iso-2022-jp' => 1, 'shift_jis' => 1, 'euc-jp' => 1, }->{$name}) { return $XML_CHARSET . $name; } my $uri = $IANA_CHARSET . $name; return $uri if $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}; return $XML_CHARSET . 
$name; } else { return undef; } } # name_to_uri sub uri_to_name ($$$) { my (undef, $domain, $uri) = @_; if ($domain eq 'xml') { my $v = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}->{xml_name}; return $v if defined $v; if (substr ($uri, 0, length $XML_CHARSET) eq $XML_CHARSET) { return substr ($uri, length $XML_CHARSET); } $domain = 'ietf'; ## TODO: XML encoding name has smaller range } if ($domain eq 'ietf') { my $v = $HTML::HTML5::Parser::Charset::CharsetDef->{$uri}->{iana_name}; return $v->[0] if defined $v; if (substr ($uri, 0, length $IANA_CHARSET) eq $IANA_CHARSET) { return substr ($uri, length $IANA_CHARSET); } } return undef; } # uri_to_name require IO::Handle; package HTML::HTML5::Parser::Charset::DecodeHandle::ByteBuffer; ## NOTE: Provides a byte buffer wrapper object. sub new ($$) { my $self = bless { buffer => '', }, shift; $self->{filehandle} = shift; return $self; } # new sub read { my $self = shift; my $pos = length $self->{buffer}; my $r = $self->{filehandle}->read ($self->{buffer}, $_[1], $pos); substr ($_[0], $_[2]) = substr ($self->{buffer}, $pos); ## NOTE: This would do different behavior from Perl's standard ## |read| when $pos points beyond the end of the string. return $r; } # read sub close { $_[0]->{filehandle}->close } package HTML::HTML5::Parser::Charset::DecodeHandle::CharString; ## NOTE: Same as Perl's standard |open $handle, '<', \$char_string|, ## but supports |ungetc| and other extensions. sub new ($$) { my $self = bless {pos => 0}, shift; $self->{string} = shift; # must be a scalar ref return $self; } # new sub getc ($) { my $self = shift; if ($self->{pos} < length ${$self->{string}}) { return substr ${$self->{string}}, $self->{pos}++, 1; } else { return undef; } } # getc sub read ($$$$) { #my ($self, $scalar, $length, $offset) = @_; my $self = $_[0]; my $length = $_[2] || 0; my $offset = $_[3] || 0; ## NOTE: We don't support standard Perl semantics if $offset is ## greater than the length of $scalar. 
substr ($_[1], $offset) = substr (${$self->{string}}, $self->{pos}, $length); my $count = (length $_[1]) - $offset; $self->{pos} += $count; return $count; } # read sub manakai_read_until ($$$;$) { #my ($self, $scalar, $pattern, $offset) = @_; my $self = $_[0]; pos (${$self->{string}}) = $self->{pos}; if (${$self->{string}} =~ /\G(?>$_[2])+/) { substr ($_[1], $_[3]) = substr (${$self->{string}}, $-[0], $+[0] - $-[0]); $self->{pos} += $+[0] - $-[0]; return $+[0] - $-[0]; } else { return 0; } } # manakai_read_until sub ungetc ($$) { my $self = shift; ## Ignore second parameter. $self->{pos}-- if $self->{pos} > 0; } # ungetc sub close ($) { } sub onerror ($;$) { } package HTML::HTML5::Parser::Charset::DecodeHandle::Encode; ## NOTE: Provides a Perl |Encode| module wrapper object. sub charset ($) { $_[0]->{charset} } sub close ($) { $_[0]->{filehandle}->close } sub getc ($) { my $c = ''; my $l = $_[0]->read ($c, 1); if ($l) { return $c; } else { return undef; } } # getc sub read ($$$;$) { my $self = $_[0]; #my $scalar = $_[1]; my $length = $_[2]; my $offset = $_[3] || 0; my $count = 0; my $eof; ## NOTE: It is incompatible with the standard Perl semantics ## if $offset is greater than the length of $scalar. 
  ## Each pass through block |A| (re-entered with |redo A|):
  ##   1. moves decoded characters from |char_buffer| into the
  ##      caller's scalar and returns once |$length| is satisfied;
  ##   2. reads more bytes from the file handle into |byte_buffer|;
  ##   3. decodes |byte_buffer| into a fresh |char_buffer|;
  ##   4. if nothing could be decoded while bytes remain, reports one
  ##      error through |onerror| and appends a replacement to
  ##      |char_buffer|.
  A: {
    return $count if $length < 1;

    ## Step 1: drain what is already decoded.
    if (my $l = (length ${$self->{char_buffer}}) - $self->{char_buffer_pos}) {
      if ($l >= $length) {
        ## The buffer alone satisfies the request.
        substr ($_[1], $offset)
            = substr (${$self->{char_buffer}}, $self->{char_buffer_pos},
                      $length);
        $count += $length;
        $self->{char_buffer_pos} += $length;
        $length = 0;
        return $count;
      } else {
        ## Take everything that is left and reset the buffer.
        substr ($_[1], $offset)
            = substr (${$self->{char_buffer}}, $self->{char_buffer_pos});
        $count += $l;
        $length -= $l;
        ${$self->{char_buffer}} = '';
        $self->{char_buffer_pos} = 0;
      }
      $offset = length $_[1];
    }

    ## |$eof| was set by an earlier pass whose |read| returned a false
    ## value.
    if ($eof) {
      return $count;
    }

    ## Step 2: refill |byte_buffer|.  |continue| is set below when a
    ## decode left bytes behind; if no further input can be read in
    ## that case, those bytes are treated as an error.
    my $error;
    if ($self->{continue}) {
      if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
                                     length $self->{byte_buffer})) {
        #
      } else {
        $error = 1;
      }
      $self->{continue} = 0;
    } elsif (512 > length $self->{byte_buffer}) {
      if ($self->{filehandle}->read ($self->{byte_buffer}, 256,
                                     length $self->{byte_buffer})) {
        #
      } else {
        $eof = 1;
      }
    }

    unless ($error) {
      ## Strip a byte order mark once, if a pattern for it was
      ## configured.
      if (not $self->{bom_checked}) {
        if (defined $self->{bom_pattern}) {
          if ($self->{byte_buffer} =~ s/^$self->{bom_pattern}//) {
            $self->{has_bom} = 1;
          }
        }
        $self->{bom_checked} = 1;
      }

      ## Step 3: decode.  |byte_buffer| is passed in place, and the
      ## code below treats whatever is left in it afterwards as input
      ## that has not been decoded yet.
      my $string = do {
        ## NOTE: Being a |BEGIN| block, this runs once at compile time
        ## and installs a |__WARN__| handler that is not localized; it
        ## drops warnings whose text starts with "Code point".
        BEGIN { $SIG{__WARN__} = sub { warn $_[0] unless $_[0] =~ /^Code point/ } }
        Encode::decode ($self->{perl_encoding_name},
                        $self->{byte_buffer},
                        Encode::FB_QUIET ());
      };
      if (length $string) {
        $self->{char_buffer} = \$string;
        $self->{char_buffer_pos} = 0;
        if (length $self->{byte_buffer}) {
          $self->{continue} = 1;
        }
      } else {
        if (length $self->{byte_buffer}) {
          $error = 1;
        } else {
          ## NOTE: No further input.
          redo A;
        }
      }
    }

    ## Step 4: error handling.
    if ($error) {
      ## Remove the first pending byte; more bytes may be appended to
      ## |$r| below.
      my $r = substr $self->{byte_buffer}, 0, 1, '';
      my $fallback;
      my $etype = 'illegal-octets-error';
      my %earg;
      if ($self->{category}
              & HTML::HTML5::Parser::Charset::Info::CHARSET_CATEGORY_SJIS) {
        if ($r =~ /^[\x81-\x9F\xE0-\xFC]/) {
          if ($self->{byte_buffer} =~ s/(.)//s) {
            $r .= $1; # not limited to \x40-\xFC - \x7F
            $etype = 'unassigned-code-point-error';
          }
          ## NOTE: Range [\xF0-\xFC] is unassigned and may be used as a
          ## single-byte character or as the first-byte of a double-byte
          ## character, according to JIS X 0208:1997 Appendix 1.  However, the
          ## current practice is using the range as first-bytes of double-byte
          ## characters.
        } elsif ($r =~ /^[\x80\xA0\xFD-\xFF]/) {
          $etype = 'unassigned-code-point-error';
        }
      } elsif ($self->{category}
                   & HTML::HTML5::Parser::Charset::Info::CHARSET_CATEGORY_EUCJP) {
        if ($r =~ /^[\xA1-\xFE]/) {
          if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
            $r .= $1;
            $etype = 'unassigned-code-point-error';
          }
        } elsif ($r eq "\x8F") {
          if ($self->{byte_buffer} =~ s/^([\xA1-\xFE][\xA1-\xFE]?)//) {
            $r .= $1;
            $etype = 'unassigned-code-point-error' if length $1 == 2;
          }
        } elsif ($r eq "\x8E") {
          if ($self->{byte_buffer} =~ s/^([\xA1-\xFE])//) {
            $r .= $1;
            $etype = 'unassigned-code-point-error';
          }
        } elsif ($r eq "\xA0" or $r eq "\xFF") {
          $etype = 'unassigned-code-point-error';
        }
      } else {
        ## Other charsets: look the byte up in the per-handle
        ## |fallback| table.
        $fallback = $self->{fallback}->{$r};
        if (defined $fallback) {
          ## NOTE: This is an HTML5 parse error.
          $etype = 'fallback-char-error';
          $earg{char} = \$fallback;
        } elsif (exists $self->{fallback}->{$r}) {
          ## NOTE: This is an HTML5 parse error.  In addition, the octet
          ## is not assigned with a character.
          $etype = 'fallback-unassigned-error';
        }
      }

      ## NOTE: Fixup line/column number by counting the number of
      ## lines/columns in the string that is to be returned by this
      ## method call.
      my $line_diff = 0;
      my $col_diff = 0;
      my $set_col;
      ## First, the last |$count| characters of the caller's scalar,
      ## i.e. what this call has produced so far.  CR LF counts as one
      ## line break.
      for (my $i = 0; $i < $count; $i++) {
        my $s = substr $_[1], $i - $count, 1;
        if ($s eq "\x0D") {
          $line_diff++;
          $col_diff = 0;
          $set_col = 1;
          $i++ if substr ($_[1], $i - $count + 1, 1) eq "\x0A";
        } elsif ($s eq "\x0A") {
          $line_diff++;
          $col_diff = 0;
          $set_col = 1;
        } else {
          $col_diff++;
        }
      }
      ## Then the positions from |char_buffer_pos| to the end of
      ## |char_buffer|.
      ## NOTE(review): |$i| and |$cb_length| below are positions in
      ## |char_buffer|, but the characters are taken from |$_[1]|;
      ## likewise the test for a trailing CR just below looks at
      ## |char_buffer| rather than at |$_[1]|.  Possibly the two
      ## buffers are swapped here - confirm before changing.
      my $i = $self->{char_buffer_pos};
      if ($count and substr (${$self->{char_buffer}}, -1, 1) eq "\x0D") {
        if (substr (${$self->{char_buffer}}, $i, 1) eq "\x0A") {
          $i++;
        }
      }
      my $cb_length = length ${$self->{char_buffer}};
      for (; $i < $cb_length; $i++) {
        my $s = substr $_[1], $i, 1;
        if ($s eq "\x0D") {
          $line_diff++;
          $col_diff = 0;
          $set_col = 1;
          $i++ if substr ($_[1], $i + 1, 1) eq "\x0A";
        } elsif ($s eq "\x0A") {
          $line_diff++;
          $col_diff = 0;
          $set_col = 1;
        } else {
          $col_diff++;
        }
      }
      $self->{onerror}->($self, $etype, octets => \$r, %earg,
                         level => $self->{level}->{$self->{error_level}->{$etype}},
                         line_diff => $line_diff,
                         ($set_col ? (column => 1) : ()),
                         column_diff => $col_diff);
      ## NOTE: Error handler may modify |octets| parameter, which
      ## would be returned as part of the output.  Note that what
      ## is returned would affect what |manakai_read_until| returns.
      ${$self->{char_buffer}} .= defined $fallback ? $fallback : $r;
    }

    redo A;
  } # A
} # read

## ->manakai_read_until ($scalar, $pattern, $offset)
##
## Reads up to 255 characters through |read| and matches one or more
## repetitions of |$pattern| at the start of them.  On a match,
## replaces |$scalar| from |$offset| onwards with the matched text,
## puts the unmatched remainder back into |char_buffer| and returns
## the length of the match.  Without a match, puts everything that was
## read back and returns 0.
sub manakai_read_until ($$$;$) {
  #my ($self, $scalar, $pattern, $offset) = @_;
  my $self = $_[0];
  my $s = '';
  $self->read ($s, 255);
  if ($s =~ /^(?>$_[2])+/) {
    my $rem_length = (length $s) - $+[0];
    if ($rem_length) {
      ## Put the unmatched tail back: either by stepping the buffer
      ## position back, or by overwriting the consumed front part of
      ## the buffer with the tail.
      if ($self->{char_buffer_pos} > $rem_length) {
        $self->{char_buffer_pos} -= $rem_length;
      } else {
        substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos})
            = substr ($s, $+[0]);
        $self->{char_buffer_pos} = 0;
      }
    }
    substr ($_[1], $_[3]) = substr ($s, $-[0], $+[0] - $-[0]);
    return $+[0];
  } elsif (length $s) {
    ## No match: put back everything that was read.
    if ($self->{char_buffer_pos} > length $s) {
      $self->{char_buffer_pos} -= length $s;
    } else {
      substr (${$self->{char_buffer}}, 0, $self->{char_buffer_pos}) = $s;
      $self->{char_buffer_pos} = 0;
    }
  }
  return 0;
} # manakai_read_until

## Returns the |has_bom| value stored in the object.
sub has_bom ($) { $_[0]->{has_bom} }

## Returns the stored |input_encoding| if defined; otherwise the name
## obtained from |uri_to_name| (domain 'xml') for the stored |charset|
## URI; |undef| when neither is available.
sub input_encoding ($) {
  my $v = $_[0]->{input_encoding};
  return $v if defined $v;

  my $uri = $_[0]->{charset};
  if (defined $uri) {
    return HTML::HTML5::Parser::Charset::DecodeHandle->uri_to_name (xml => $uri);
  }

  return undef;
} # input_encoding

## ->onerror ([$code])
##
## With an argument: a true value installs it as the error handler and
## marks it as explicitly set; a false value installs an empty handler
## and clears the mark.  Returns the handler only while it is marked
## as explicitly set, and |undef| otherwise.
sub onerror ($;$) {
  if (@_ > 1) {
    if ($_[1]) {
      $_[0]->{onerror} = $_[1];
      $_[0]->{onerror_set} = 1;
    } else {
      $_[0]->{onerror} = sub { };
      delete $_[0]->{onerror_set};
    }
  }

  return $_[0]->{onerror_set} ?
$_[0]->{onerror} : undef; } # onerror sub ungetc ($$) { unshift @{$_[0]->{character_queue}}, chr int ($_[1] or 0); } # ungetc package HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP; push our @ISA, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode'; sub getc ($) { my $self = $_[0]; return shift @{$self->{character_queue}} if @{$self->{character_queue}}; my $r; A: { my $error; if ($self->{continue}) { if ($self->{filehandle}->read ($self->{byte_buffer}, 256, length $self->{byte_buffer})) { # } else { $error = 1; } $self->{continue} = 0; } elsif (512 > length $self->{byte_buffer}) { $self->{filehandle}->read ($self->{byte_buffer}, 256, length $self->{byte_buffer}); } unless ($error) { if ($self->{byte_buffer} =~ s/^\x1B(\x24[\x40\x42]|\x28[\x42\x4A])//) { $self->{state} = { "\x24\x40" => 'state_2440', "\x24\x42" => 'state_2442', "\x28\x42" => 'state_2842', "\x28\x4A" => 'state_284A', }->{$1}; redo A; } elsif ($self->{state} eq 'state_2842') { # IRV if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) { push @{$self->{character_queue}}, split //, $1; $r = shift @{$self->{character_queue}}; } else { if (length $self->{byte_buffer}) { $error = 1; } else { $r = undef; } } } elsif ($self->{state} eq 'state_284A') { # 0201 if ($self->{byte_buffer} =~ s/^([\x00-\x0D\x10-\x1A\x1C-\x7F]+)//) { my $v = $1; $v =~ tr/\x5C\x7E/\xA5\x{203E}/; push @{$self->{character_queue}}, split //, $v; $r = shift @{$self->{character_queue}}; } else { if (length $self->{byte_buffer}) { $error = 1; } else { $r = undef; $self->{onerror}->($self, 'invalid-state-error', state => $self->{state}, level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}}); } } } elsif ($self->{state} eq 'state_2442') { # 1983 my $v = Encode::decode ($self->{state_2442}, $self->{byte_buffer}, Encode::FB_QUIET ()); if (length $v) { push @{$self->{character_queue}}, split //, $v; $r = shift @{$self->{character_queue}}; } else { if (length $self->{byte_buffer}) { $error = 1; } else { 
$r = undef; $self->{onerror}->($self, 'invalid-state-error', state => $self->{state}, level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}}); } } } elsif ($self->{state} eq 'state_2440') { # 1978 my $v = Encode::decode ($self->{state_2440}, $self->{byte_buffer}, Encode::FB_QUIET ()); if (length $v) { push @{$self->{character_queue}}, split //, $v; $r = shift @{$self->{character_queue}}; } else { if (length $self->{byte_buffer}) { $error = 1; } else { $r = undef; $self->{onerror}->($self, 'invalid-state-error', state => $self->{state}, level => $self->{level}->{$self->{error_level}->{'invalid-state-error'}}); } } } else { $error = 1; } } if ($error) { $r = substr $self->{byte_buffer}, 0, 1, ''; my $etype = 'illegal-octets-error'; if (($self->{state} eq 'state_2442' or $self->{state} eq 'state_2440') and $r =~ /^[\x21-\x7E]/ and $self->{byte_buffer} =~ s/^([\x21-\x7E])//) { $r .= $1; $etype = 'unassigned-code-point-error'; } elsif ($r eq "\x1B" and $self->{byte_buffer} =~ s/^\(H//) { # Old 0201 $r .= "(H"; $self->{state} = 'state_284A'; } $self->{onerror}->($self, $etype, octets => \$r, level => $self->{level}->{$self->{error_level}->{$etype}}); } } # A return $r; } # getc ## TODO: This is not good for performance. Should be replaced ## by read-centric implementation. sub read ($$$;$) { #my ($self, $scalar, $length, $offset) = @_; my $length = $_[2]; my $r = ''; while ($length > 0) { my $c = $_[0]->getc; last unless defined $c; $r .= $c; $length--; } substr ($_[1], $_[3]) = $r; ## NOTE: This would do different thing from what Perl's |read| do ## if $offset points beyond the end of the $scalar. 
return length $r; } # read sub manakai_read_until ($$$;$) { #my ($self, $scalar, $pattern, $offset) = @_; my $self = $_[0]; my $c = $self->getc; if ($c =~ /^$_[2]/) { substr ($_[1], $_[3]) = $c; return 1; } elsif (defined $c) { $self->ungetc (ord $c); return 0; } else { return 0; } } # manakai_read_until $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us-ascii'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:us'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso646-us'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:cp367'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ibm367'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1986'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ansi_x3.4-1968'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-ir-6'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:csascii'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_646.irv:1991'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:ascii'} = {ascii8 => '1', is_block_safe => '1', ietf_name => ['ansi_x3.4-1968', 'ansi_x3.4-1986', 'ascii', 'cp367', 'csascii', 'ibm367', 'iso-ir-6', 'iso646-us', 'iso_646.irv:1991', 'us', 'us-ascii', 'us-ascii'], mime_name => 'us-ascii', perl_name => ['ascii', 'iso-646-us', 'us-ascii'], utf8_encoding_scheme => '1', 'uri', {'urn:x-suika-fam-cx:charset:ansi_x3.4-1968', '1', 'urn:x-suika-fam-cx:charset:ansi_x3.4-1986', '1', 'urn:x-suika-fam-cx:charset:ascii', '1', 'urn:x-suika-fam-cx:charset:cp367', '1', 'urn:x-suika-fam-cx:charset:csascii', '1', 'urn:x-suika-fam-cx:charset:ibm367', '1', 'urn:x-suika-fam-cx:charset:iso-ir-6', '1', 'urn:x-suika-fam-cx:charset:iso646-us', '1', 'urn:x-suika-fam-cx:charset:iso_646.irv:1991', '1', 
'urn:x-suika-fam-cx:charset:us', '1', 'urn:x-suika-fam-cx:charset:us-ascii', '1'}, }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl'} = {perl_name => ['ascii-ctrl'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ascii-ctrl', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null'} = {perl_name => ['null'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.null', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8'} = {ascii8 => '1', bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8', utf8_encoding_scheme => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-8', '1'}, xml_name => 'UTF-8', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279'} = {ascii8 => '1', bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8', utf8_encoding_scheme => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/UTF-8.RFC2279', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8'} = { ascii8 => 1, is_block_safe => '1', perl_name => ['utf-8'], utf8_encoding_scheme => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-8'} = { ascii8 => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-8', ietf_name => ['utf-8'], mime_name => 'utf-8', utf8_encoding_scheme => '1', 'uri', {'urn:x-suika-fam-cx:charset:utf-8', '1'}, }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8'} = {ascii8 => '1', 
is_block_safe => '1', perl_name => ['utf8'], utf8_encoding_scheme => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf8', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16'} = { ascii16 => 1, bom_allowed => '1', bom_required => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', no_bom_variant16be => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', no_bom_variant16le => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', perl_name => ['utf-16'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.utf-16', '1'}, xml_name => 'UTF-16', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16'} = { ascii16 => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', no_bom_variant16be => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', no_bom_variant16le => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', ietf_name => ['utf-16'], mime_name => 'utf-16', 'uri', {'urn:x-suika-fam-cx:charset:utf-16', '1'}, }; $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16be'} = { ascii16 => 1, ascii16be => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', no_bom_variant16be => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', ietf_name => ['utf-16be'], mime_name => 'utf-16be', 'uri', {'urn:x-suika-fam-cx:charset:utf-16be', '1'}, }; $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:utf-16le'} = { ascii16 => 1, ascii16le => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', no_bom_variant16le => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', ietf_name => ['utf-16le'], mime_name 
=> 'utf-16le', 'uri', {'urn:x-suika-fam-cx:charset:utf-16le', '1'}, }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be'} = { ascii16 => 1, ascii16be => 1, is_block_safe => '1', perl_name => ['utf-16be'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16be', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le'} = { ascii16 => 1, ascii16le => 1, is_block_safe => '1', perl_name => ['utf-16le'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-16le', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-2'} = { ascii16 => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le', no_bom_variant16be => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be', no_bom_variant16le => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le', ietf_name => ['csunicode', 'iso-10646-ucs-2'], mime_name => 'iso-10646-ucs-2', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-2', '1', 'urn:x-suika-fam-cx:charset:iso-10646-ucs-2', '1'}, xml_name => 'ISO-10646-UCS-2', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be'} = { ascii16 => 1, ascii16be => 1, is_block_safe => '1', perl_name => ['ucs-2be'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2be', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le'} = { ascii16 => 1, ascii16le => 1, is_block_safe => '1', perl_name => ['ucs-2le'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.ucs-2le', '1'}}; 
$HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-10646-ucs-4'} = { ascii32 => 1, bom_allowed => '1', no_bom_variant => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le', no_bom_variant32endian1234 => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be', no_bom_variant32endian4321 => 'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le', ietf_name => ['csucs4', 'iso-10646-ucs-4'], mime_name => 'iso-10646-ucs-4', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-10646-ucs-4', '1', 'urn:x-suika-fam-cx:charset:iso-10646-ucs-4', '1'}, xml_name => 'ISO-10646-UCS-4', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be'} = { ascii32 => 1, ascii32endian1234 => 1, is_block_safe => '1', perl_name => ['ucs-4be', 'utf-32be'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32be', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le'} = { ascii32 => 1, ascii32endian4321 => 1, is_block_safe => '1', perl_name => ['ucs-4le', 'utf-32le'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.utf-32le', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso_8859-1:1987'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1'} = {ascii8 => '1', is_block_safe => '1', ietf_name => ['cp819', 'csisolatin1', 'ibm819', 'iso-8859-1', 'iso-8859-1', 'iso-ir-100', 'iso_8859-1', 'iso_8859-1:1987', 'l1', 'latin1'], mime_name => 'iso-8859-1', perl_name => ['iso-8859-1', 'latin1'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-1', '1', 'urn:x-suika-fam-cx:charset:iso_8859-1:1987', '1'}, xml_name => 
'ISO-8859-1', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-2', '1'}, xml_name => 'ISO-8859-2', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-3', '1'}, xml_name => 'ISO-8859-3', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-4', '1'}, xml_name => 'ISO-8859-4', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-5', '1'}, xml_name => 'ISO-8859-5', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-6', '1'}, xml_name => 'ISO-8859-6', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-7', '1'}, xml_name => 'ISO-8859-7', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-8', '1'}, xml_name => 'ISO-8859-8', }; 
$HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-9', '1'}, xml_name => 'ISO-8859-9', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-10', '1'}, xml_name => 'ISO-8859-10', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-11', '1'}, xml_name => 'ISO-8859-11', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-13', '1'}, xml_name => 'ISO-8859-13', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-14', '1'}, xml_name => 'ISO-8859-14', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-15', '1'}, xml_name => 'ISO-8859-15', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16'} = {ascii8 => '1', is_block_safe => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-8859-16', '1'}, xml_name => 'ISO-8859-16', }; 
$HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp'} = {ascii8 => '1', 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.iso-2022-jp', '1'}, xml_name => 'ISO-2022-JP', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:iso-2022-jp'} = {ascii8 => '1', ietf_name => ['csiso2022jp', 'iso-2022-jp', 'iso-2022-jp'], mime_name => 'iso-2022-jp', 'uri', {'urn:x-suika-fam-cx:charset:iso-2022-jp', '1'}, }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp'} = {ascii8 => '1', perl_name => ['iso-2022-jp'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.iso-2022-jp', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:shift_jis'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis'} = {ascii8 => '1', is_block_safe => '1', ietf_name => ['csshiftjis', 'ms_kanji', 'shift_jis', 'shift_jis'], mime_name => 'shift_jis', perl_name => ['shift-jis-1997'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.shift_jis', '1', 'urn:x-suika-fam-cx:charset:shift_jis', '1'}, xml_name => 'Shift_JIS', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis'} = {ascii8 => '1', is_block_safe => '1', perl_name => ['shiftjis', 'sjis'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.shiftjis', '1'}}; $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:euc-jp'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp'} = $HTML::HTML5::Parser::Charset::CharsetDef->{'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese'} = {ascii8 => '1', is_block_safe => '1', ietf_name => ['cseucpkdfmtjapanese', 'euc-jp', 'euc-jp', 
'extended_unix_code_packed_format_for_japanese'], mime_name => 'euc-jp', perl_name => ['euc-jp-1997'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/XML.euc-jp', '1', 'urn:x-suika-fam-cx:charset:euc-jp', '1', 'urn:x-suika-fam-cx:charset:extended_unix_code_packed_format_for_japanese', '1'}, xml_name => 'EUC-JP', }; $HTML::HTML5::Parser::Charset::CharsetDef->{'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp'} = {ascii8 => '1', is_block_safe => '1', perl_name => ['euc-jp', 'ujis'], 'uri', {'http://suika.fam.cx/~wakaba/archive/2004/dis/Charset/Perl.euc-jp', '1'}}; 1; ## $Date: 2008/09/15 07:19:03 $ ��������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/Charset/UnicodeChecker.pm�����������������������������0000644�0001750�0001750�00000024154�12166544311�022670� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::HTML5::Parser::Charset::UnicodeChecker; ## skip Test::Tabs use strict; our $VERSION = '0.301'; ## NOTE: For more information (including rationals of checks performed ## in this module), see ## <http://suika.fam.cx/gate/2005/sw/Unicode%E7%AC%A6%E5%8F%B7%E5%8C%96%E6%96%87%E5%AD%97%E5%88%97%E3%81%AE%E9%81%A9%E5%90%88%E6%80%A7>. ## NOTE: Unicode's definition for character string conformance is ## very, very vague so that it is difficult to determine what error ## level is appropriate for each error. 
## The Unicode Standard abuses
## conformance-criteria-like terms such as "deprecated", "discouraged",
## "preferred", "better", "not encouraged", "should", and so on with no
## clear explanation of their difference (if any) or relationship to
## the conformance.  In fact, that specification does not define the
## conformance class for character strings.

## Create a checker that wraps a character stream handle.
##
##   $_[0]  class name (or anything |bless| accepts as a package)
##   $_[1]  the wrapped handle (a character stream)
##   $_[2]  optional mode: 'default' (used when omitted or false) or
##          'html5'
##
## The mode selects how error types map to levels: in 'html5' mode
## 'nonchar', 'control char' and 'non unicode' are mapped to the
## |must| level, and |replace_non_unicode| is set.
##
## Returns the new blessed object.
sub new_handle ($$;$) {
  my $self = bless {
    queue => [],
    new_queue => [],
    onerror => sub {},
    #onerror_set
    level => {
      unicode_should => 'w',
      unicode_deprecated => 'w', # = unicode_should
      unicode_discouraged => 'w',
      unicode_preferred => 'w',
      ## NOTE: We do some "unification" of levels - for example,
      ## "not encouraged" is unified with "discouraged" and
      ## "better" is unified with "preferred".
      must => 'm',
      warn => 'w',
    },
  }, $_[0];
  $self->{handle} = $_[1]; # char stream
  my $mode = $_[2] || 'default'; # or 'html5'
  $self->{level_map} = {
    ## Unicode errors
    'unicode deprecated' => 'unicode_deprecated',
    'nonchar' => $mode eq 'html5' ? 'must' : 'unicode_should',
        ## NOTE: HTML5 parse error.
    'unicode should' => 'unicode_should',
    'unicode discouraged' => 'unicode_discouraged',
    'unicode not preferred' => 'unicode_preferred',

    ## HTML5 errors (See "text" definition of the spec)
    'control char' => $mode eq 'html5' ? 'must' : 'warn',
        ## NOTE: HTML5 parse error.
    'non unicode' => $mode eq 'html5' ? 'must' : 'warn',
        ## NOTE: In HTML5, replaced by U+FFFD (not a parse error).
  };
  $self->{replace_non_unicode} = ($mode eq 'html5');
  return $self;
} # new_handle

## Code point => error type.  The error type is a key of the
## |level_map| hash built in |new_handle|.
my $etypes = {
  0x0340 => 'unicode deprecated',
  0x0341 => 'unicode deprecated',
  0x17A3 => 'unicode deprecated',
  0x17D3 => 'unicode deprecated',
  0x206A => 'unicode deprecated',
  0x206B => 'unicode deprecated',
  0x206C => 'unicode deprecated',
  0x206D => 'unicode deprecated',
  0x206E => 'unicode deprecated',
  0x206F => 'unicode deprecated',
  0xE0001 => 'unicode deprecated',

  ## The last two code points of each of the seventeen planes
  0xFFFE => 'nonchar', 0xFFFF => 'nonchar',
  0x1FFFE => 'nonchar', 0x1FFFF => 'nonchar',
  0x2FFFE => 'nonchar', 0x2FFFF => 'nonchar',
  0x3FFFE => 'nonchar', 0x3FFFF => 'nonchar',
  0x4FFFE => 'nonchar', 0x4FFFF => 'nonchar',
  0x5FFFE => 'nonchar', 0x5FFFF => 'nonchar',
  0x6FFFE => 'nonchar', 0x6FFFF => 'nonchar',
  0x7FFFE => 'nonchar', 0x7FFFF => 'nonchar',
  0x8FFFE => 'nonchar', 0x8FFFF => 'nonchar',
  0x9FFFE => 'nonchar', 0x9FFFF => 'nonchar',
  0xAFFFE => 'nonchar', 0xAFFFF => 'nonchar',
  0xBFFFE => 'nonchar', 0xBFFFF => 'nonchar',
  0xCFFFE => 'nonchar', 0xCFFFF => 'nonchar',
  0xDFFFE => 'nonchar', 0xDFFFF => 'nonchar',
  0xEFFFE => 'nonchar', 0xEFFFF => 'nonchar',
  0xFFFFE => 'nonchar', 0xFFFFF => 'nonchar',
  0x10FFFE => 'nonchar', 0x10FFFF => 'nonchar',

  0x0344 => 'unicode should', # COMBINING GREEK DIALYTIKA TONOS
  0x03D3 => 'unicode should', # GREEK UPSILON WITH ...
  0x03D4 => 'unicode should', # GREEK UPSILON WITH ...
  0x20A4 => 'unicode should', # LIRA SIGN

  0x2126 => 'unicode should', # OHM SIGN # also, discouraged
  0x212A => 'unicode should', # KELVIN SIGN
  0x212B => 'unicode should', # ANGSTROM SIGN

  ## Styled overlines/underlines in CJK Compatibility Forms
  0xFE49 => 'unicode discouraged',
  0xFE4A => 'unicode discouraged',
  0xFE4B => 'unicode discouraged',
  0xFE4C => 'unicode discouraged',
  0xFE4D => 'unicode discouraged',
  0xFE4E => 'unicode discouraged',
  0xFE4F => 'unicode discouraged',

  0x037E => 'unicode discouraged', # greek punctuations
  0x0387 => 'unicode discouraged', # greek punctuations

  #0x17A3 => 'unicode discouraged', # also, deprecated
  0x17A4 => 'unicode discouraged',
  0x17B4 => 'unicode discouraged',
  0x17B5 => 'unicode discouraged',
  0x17D8 => 'unicode discouraged',

  0x2121 => 'unicode discouraged', # tel
  0x213B => 'unicode discouraged', # fax
  #0x2120 => 'unicode discouraged', # SM (superscript)
  #0x2122 => 'unicode discouraged', # TM (superscript)

  ## inline annotations
  0xFFF9 => 'unicode discouraged',
  0xFFFA => 'unicode discouraged',
  0xFFFB => 'unicode discouraged',

  ## Armenian apostrophe / modifier letter (these code points are in
  ## the Armenian block, not Greek punctuation)
  0x055A => 'unicode not preferred',
  0x0559 => 'unicode not preferred',

  ## degree signs
  0x2103 => 'unicode not preferred',
  0x2109 => 'unicode not preferred',

  ## U+2060 WORD JOINER is strongly preferred over U+FEFF ZERO WIDTH
  ## NO-BREAK SPACE.  This entry used to be keyed 0xFEFE, which is an
  ## unassigned code point.
  ## NOTE(review): if a byte order mark is not removed before the
  ## stream reaches this checker, it will now be reported at the
  ## |unicode_preferred| level - confirm against the decode handle.
  0xFEFF => 'unicode not preferred',
};

$etypes->{$_} = 'unicode deprecated' for 0xE0020 .. 0xE007F;

$etypes->{$_} = 'nonchar' for 0xFDD0 .. 0xFDEF;
  ## ISSUE: U+FDE0-U+FDEF are not excluded in HTML5.

$etypes->{$_} = 'unicode should' for 0xFA30 .. 0xFA6A, 0xFA70 .. 0xFAD9;
$etypes->{$_} = 'unicode should' for 0x2F800 .. 0x2FA1D, 0x239B .. 0x23B3;
$etypes->{$_} = 'unicode should'
    for 0xFB50 .. 0xFBB1, 0xFBD3 .. 0xFD3D, 0xFD50 .. 0xFD8F,
        0xFD92 .. 0xFDC7, 0xFDF0 .. 0xFDFB, 0xFE70 .. 0xFE74,
        0xFE76 .. 0xFEFC;
  ## NOTE: Arabic Presentation Forms-A/B blocks, w/o code points where
  ## no character is assigned, noncharacter code points, and
  ## U+FD3E and U+FD3F, which are explicitly allowed.
for my $cp (0x2153 .. 0x217F) {
  $etypes->{$cp} = 'unicode discouraged';
}

for my $cp (0x0001 .. 0x0008, 0x000B, 0x000E .. 0x001F, 0x007F .. 0x009F) {
  $etypes->{$cp} = 'control char';
}
#0x0000
#$etypes->{$_} = 'control char' for 0xD800 .. 0xDFFF;

## Invoke the |onerror| callback for one character.  |$type| is both
## the reported error type and the |level_map| key that selects the
## level; the position is reported relative to the start of the
## current read.
my $report_char = sub {
  my ($self, $type, $text) = @_;
  $self->{onerror}->(type => $type,
                     text => $text,
                     layer => 'charset',
                     level => $self->{level}->{$self->{level_map}->{$type}},
                     line_diff => $self->{line_diff},
                     column_diff => $self->{column_diff},
                     ($self->{set_column} ? (column => 1) : ()));
};

## Check a single code point, updating the line/column bookkeeping.
## Returns a replacement string if the character has to be replaced
## in the buffer, and nothing otherwise.
my $check_char = sub ($$) {
  my ($self, $code) = @_;

  ## NOTE: A negative code point is not supported.

  ## CARRIAGE RETURN always starts a new line.
  if ($code == 0x000D) {
    $self->{line_diff}++;
    $self->{column_diff} = 0;
    $self->{set_column} = 1;
    $self->{has_cr} = 1;
    return;
  }

  ## LINE FEED starts a new line unless it directly follows a
  ## CARRIAGE RETURN, which has been counted already.
  if ($code == 0x000A) {
    if (not $self->{has_cr}) {
      $self->{line_diff}++;
      $self->{column_diff} = 0;
      $self->{set_column} = 1;
    } else {
      delete $self->{has_cr};
    }
    return;
  }

  $self->{column_diff}++;
  delete $self->{has_cr};

  ## Beyond the Unicode code space
  if ($code > 0x10FFFF) {
    $report_char->($self, 'non unicode', (sprintf 'U-%08X', $code));
    return "\x{FFFD}" if $self->{replace_non_unicode};
    return;
  }

  my $type = $etypes->{$code};
  if (defined $type) {
    $report_char->($self, $type, (sprintf 'U+%04X', $code));
  }

  ## TODO: "khanda ta" should be represented by U+09CE
  ## rather than <U+09A4, U+09CD, U+200D>.

  ## TODO: IDS syntax

  ## TODO: langtag syntax

  return;
}; # $check_char

## Reset the position bookkeeping, then check the |$length|
## characters of the buffer |$_[3]| that start at |$start|.  A
## character for which |$check_char| returns a replacement is
## overwritten in place.
my $check_span = sub {
  my ($self, $start, $length) = @_;

  $self->{line_diff} = 0;
  $self->{column_diff} = -1;
  delete $self->{set_column};
  delete $self->{has_cr};

  for my $pos ($start .. ($start + $length - 1)) {
    my $replacement = $check_char->($self, ord (substr ($_[3], $pos, 1)));
    substr ($_[3], $pos, 1) = $replacement if defined $replacement;
  }
}; # $check_span

## Delegate to the wrapped handle's |read| and check what it stored
## in the caller's buffer.  Returns whatever the wrapped |read|
## returned.
sub read ($$$;$) {
  my $self = shift;
  my $start = $_[2] || 0;
  my $got = $self->{handle}->read (@_);
  $check_span->($self, $start, $got, $_[0]);
  return $got;
} # read

## Same as |read|, but delegates to the wrapped handle's
## |manakai_read_until|.
sub manakai_read_until ($$$;$) {
  #my ($self, $scalar, $pattern, $offset) = @_;
  my $self = shift;
  my $start = $_[2] || 0;
  my $got = $self->{handle}->manakai_read_until (@_);
  $check_span->($self, $start, $got, $_[0]);
  return $got;
} # manakai_read_until

## Put the character with the given code point in front of this
## object's own |queue|.
## NOTE(review): nothing in the visible part of this package reads
## |queue| back - confirm whether this should go to the wrapped handle.
sub ungetc ($$) {
  my ($self, $code) = @_;
  unshift @{$self->{queue}}, chr int ($code or 0);
} # ungetc

## The following methods are forwarded to the wrapped handle as is.

sub close ($) {
  my ($self) = @_;
  $self->{handle}->close;
} # close

sub charset ($) {
  my ($self) = @_;
  $self->{handle}->charset;
} # charset

sub has_bom ($) {
  my ($self) = @_;
  $self->{handle}->has_bom;
} # has_bom

sub input_encoding ($) {
  my ($self) = @_;
  $self->{handle}->input_encoding;
} # input_encoding

## Get or set the error handler.  When called with an argument, a
## defined value is installed on both this object and the wrapped
## handle; undef installs an empty handler on both and clears
## |onerror_set|.
sub onerror ($;$) {
  if (@_ > 1) {
    if (defined $_[1]) {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = $_[1]);
      $_[0]->{onerror_set} = 1;
    } else {
      $_[0]->{handle}->onerror ($_[0]->{onerror} = sub {});
      delete $_[0]->{onerror_set};
    }
  }

  return $_[0]->{onerror_set} ?
$_[0]->{onerror} : undef; } # onerror 1; ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/lib/HTML/HTML5/Parser/Charset/Info.pm���������������������������������������0000644�0001750�0001750�00000127677�12166544311�020726� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������package HTML::HTML5::Parser::Charset::Info; ## skip Test::Tabs use strict; our $VERSION='0.301'; ## TODO: Certain encodings MUST NOT be implemented [HTML5]. ## ISSUE: Should we convert unassigned code point with trivial Unicode ## mapping into U+FFFD? Or, should we return that Unicode character ## with an error? (For example, Windows-1252's 0x81 should be converted ## to U+FFFD or U+0081?) sub UNREGISTERED_CHARSET_NAME () { 0b1 } ## Names for non-standard encodings/implementations for Perl encodings sub REGISTERED_CHARSET_NAME () { 0b10 } ## Names for standard encodings for Perl encodings sub PRIMARY_CHARSET_NAME () { 0b100 } ## "Name:" field for IANA names ## Canonical name for Perl encodings sub PREFERRED_CHARSET_NAME () { 0b1000 } ## "preferred MIME name" for IANA names sub FALLBACK_ENCODING_IMPL () { 0b10000 } ## For Perl encodings: Not a name of the encoding, the encoding ## for the name might be useful as a fallback when the correct ## encoding is not supported. 
sub NONCONFORMING_ENCODING_IMPL () { FALLBACK_ENCODING_IMPL () }
    ## Perl encodings only: the implementation apparently tries to
    ## implement the encoding but does not conform to it.  This flag
    ## shares its value with |FALLBACK_ENCODING_IMPL|.
sub SEMICONFORMING_ENCODING_IMPL () { 0b1000000 }
    ## Perl encodings only: what |get_perl_encoding| returns is not
    ## conforming, while the decode handle returned by
    ## |get_decode_handle| is.
sub ERROR_REPORTING_ENCODING_IMPL () { 0b0100000 }
    ## Perl encodings only: errors are reported through the
    ## |manakai_onerror| handler when the encoding is used via a
    ## decode handle.

## Flags for |iana_status|
sub STATUS_COMMON () { 0b001 }
sub STATUS_LIMITED_USE () { 0b010 }
sub STATUS_OBSOLETE () { 0b100 }

## Flags for |category|
sub CHARSET_CATEGORY_BLOCK_SAFE () { 0b0000001 }
    ## NOTE: I.e. stateless.
sub CHARSET_CATEGORY_EUCJP () { 0b0000010 }
sub CHARSET_CATEGORY_SJIS () { 0b0000100 }
sub CHARSET_CATEGORY_UTF16 () { 0b0001000 }
    ## NOTE: HTML5's "a UTF-16 encoding".
sub CHARSET_CATEGORY_ASCII_COMPAT () { 0b0010000 }
    ## NOTE: HTML5's "superset of US-ASCII (specifically,
    ## ANSI_X3.4-1968) for bytes in the range 0x09-0x0A, 0x0C-0x0D,
    ## 0x20-0x22, 0x26, 0x27, 0x2C-0x3F, 0x41-0x5A, and 0x61-0x7A".
sub CHARSET_CATEGORY_EBCDIC () { 0b0100000 }
    ## NOTE: HTML5's "based on EBCDIC".
sub CHARSET_CATEGORY_MIME_TEXT () { 0b1000000 }
    ## NOTE: Can be used for MIME text.

## ISSUE: Is Shift_JIS a superset of US-ASCII?  Is ISO-2022-JP?

## ISSUE: Should 0x5F (_) be added to the range?

my $Charset; ## TODO: this is obsolete.

our $IANACharset;
    ## NOTE: Every charset name that is used where an IANA charset
    ## name is allowed, whether it is registered or not.

our $HTMLCharset;
    ## NOTE: The names in |$IANACharset| with all ASCII punctuation
    ## removed; names that consist of letters and digits only are not
    ## included.
$Charset->{'us-ascii'} = $IANACharset->{'ansi_x3.4-1968'} = $IANACharset->{'iso-ir-6'} = $IANACharset->{'ansi_x3.4-1986'} = $IANACharset->{'iso_646.irv:1991'} = $IANACharset->{'ascii'} = $IANACharset->{'iso646-us'} = $IANACharset->{'us-ascii'} = $IANACharset->{'us'} = $IANACharset->{'ibm367'} = $IANACharset->{'cp367'} = $IANACharset->{'csascii'} = $HTMLCharset->{'ansix341968'} = $HTMLCharset->{'isoir6'} = $HTMLCharset->{'ansix341986'} = $HTMLCharset->{'iso646irv1991'} = $HTMLCharset->{'iso646us'} = $HTMLCharset->{'usascii'} = __PACKAGE__->new ({ category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT, iana_names => { 'ansi_x3.4-1968' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME, 'iso-ir-6' => REGISTERED_CHARSET_NAME, 'ansi_x3.4-1986' => REGISTERED_CHARSET_NAME, 'iso_646.irv:1991' => REGISTERED_CHARSET_NAME, 'ascii' => REGISTERED_CHARSET_NAME, 'iso646-us' => REGISTERED_CHARSET_NAME, 'us-ascii' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME, 'us' => REGISTERED_CHARSET_NAME, 'ibm367' => REGISTERED_CHARSET_NAME, 'cp367' => REGISTERED_CHARSET_NAME, 'csascii' => REGISTERED_CHARSET_NAME, }, perl_names => { 'web-latin1-us-ascii' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL, 'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution }, fallback => { "\x80" => "\x{20AC}", "\x81" => undef, "\x82" => "\x{201A}", "\x83" => "\x{0192}", "\x84" => "\x{201E}", "\x85" => "\x{2026}", "\x86" => "\x{2020}", "\x87" => "\x{2021}", "\x88" => "\x{02C6}", "\x89" => "\x{2030}", "\x8A" => "\x{0160}", "\x8B" => "\x{2039}", "\x8C" => "\x{0152}", "\x8D" => undef, "\x8E" => "\x{017D}", "\x8F" => undef, "\x90" => undef, "\x91" => "\x{2018}", "\x92" => "\x{2019}", "\x93" => "\x{201C}", "\x94" => "\x{201D}", "\x95" => "\x{2022}", "\x96" => "\x{2013}", "\x97" => "\x{2014}", "\x98" => "\x{02DC}", "\x99" => "\x{2122}", "\x9A" => "\x{0161}", "\x9B" => "\x{203A}", "\x9C" => "\x{0153}", "\x9D" => undef, "\x9E" => 
"\x{017E}", "\x9F" => "\x{0178}", "\xA0" => "\xA0", "\xA1" => "\xA1", "\xA2" => "\xA2", "\xA3" => "\xA3", "\xA4" => "\xA4", "\xA5" => "\xA5", "\xA6" => "\xA6", "\xA7" => "\xA7", "\xA8" => "\xA8", "\xA9" => "\xA9", "\xAA" => "\xAA", "\xAB" => "\xAB", "\xAC" => "\xAC", "\xAD" => "\xAD", "\xAE" => "\xAE", "\xAF" => "\xAF", "\xB0" => "\xB0", "\xB1" => "\xB1", "\xB2" => "\xB2", "\xB3" => "\xB3", "\xB4" => "\xB4", "\xB5" => "\xB5", "\xB6" => "\xB6", "\xB7" => "\xB7", "\xB8" => "\xB8", "\xB9" => "\xB9", "\xBA" => "\xBA", "\xBB" => "\xBB", "\xBC" => "\xBC", "\xBD" => "\xBD", "\xBE" => "\xBE", "\xBF" => "\xBF", "\xC0" => "\xC0", "\xC1" => "\xC1", "\xC2" => "\xC2", "\xC3" => "\xC3", "\xC4" => "\xC4", "\xC5" => "\xC5", "\xC6" => "\xC6", "\xC7" => "\xC7", "\xC8" => "\xC8", "\xC9" => "\xC9", "\xCA" => "\xCA", "\xCB" => "\xCB", "\xCC" => "\xCC", "\xCD" => "\xCD", "\xCE" => "\xCE", "\xCF" => "\xCF", "\xD0" => "\xD0", "\xD1" => "\xD1", "\xD2" => "\xD2", "\xD3" => "\xD3", "\xD4" => "\xD4", "\xD5" => "\xD5", "\xD6" => "\xD6", "\xD7" => "\xD7", "\xD8" => "\xD8", "\xD9" => "\xD9", "\xDA" => "\xDA", "\xDB" => "\xDB", "\xDC" => "\xDC", "\xDD" => "\xDD", "\xDE" => "\xDE", "\xDF" => "\xDF", "\xE0" => "\xE0", "\xE1" => "\xE1", "\xE2" => "\xE2", "\xE3" => "\xE3", "\xE4" => "\xE4", "\xE5" => "\xE5", "\xE6" => "\xE6", "\xE7" => "\xE7", "\xE8" => "\xE8", "\xE9" => "\xE9", "\xEA" => "\xEA", "\xEB" => "\xEB", "\xEC" => "\xEC", "\xED" => "\xED", "\xEE" => "\xEE", "\xEF" => "\xEF", "\xF0" => "\xF0", "\xF1" => "\xF1", "\xF2" => "\xF2", "\xF3" => "\xF3", "\xF4" => "\xF4", "\xF5" => "\xF5", "\xF6" => "\xF6", "\xF7" => "\xF7", "\xF8" => "\xF8", "\xF9" => "\xF9", "\xFA" => "\xFA", "\xFB" => "\xFB", "\xFC" => "\xFC", "\xFD" => "\xFD", "\xFE" => "\xFE", "\xFF" => "\xFF", }, ## NOTE: Treated as |windows-1252|. Properties of this charset ## should be consistent with those of that charset. 
});

## ------ Charset registrations: ISO 8859 family and Japanese charsets ------
##
## Each statement below builds one charset description object and stores
## the same object under several keys:
##   $Charset->{...}      - one key per charset,
##   $IANACharset->{...}  - every lowercase name listed in |iana_names|,
##   $HTMLCharset->{...}  - those names with punctuation removed (the form
##                          computed by |get_by_html_name|).
## |fallback| tables are copied into the decode handle by
## |get_decode_handle|; |undef| entries presumably mark bytes for which no
## replacement character exists (TODO confirm against the decode handle).

$Charset->{'iso-8859-1'}
= $IANACharset->{'iso_8859-1:1987'}
= $IANACharset->{'iso-ir-100'}
= $IANACharset->{'iso_8859-1'}
= $IANACharset->{'iso-8859-1'}
= $IANACharset->{'latin1'}
= $IANACharset->{'l1'}
= $IANACharset->{'ibm819'}
= $IANACharset->{'cp819'}
= $IANACharset->{'csisolatin1'}
= $HTMLCharset->{'iso885911987'}
= $HTMLCharset->{'isoir100'}
= $HTMLCharset->{'iso88591'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso_8859-1:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-100' => REGISTERED_CHARSET_NAME,
    'iso_8859-1' => REGISTERED_CHARSET_NAME,
    'iso-8859-1' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'latin1' => REGISTERED_CHARSET_NAME,
    'l1' => REGISTERED_CHARSET_NAME,
    'ibm819' => REGISTERED_CHARSET_NAME,
    'cp819' => REGISTERED_CHARSET_NAME,
    'csisolatin1' => REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'web-latin1' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
        ERROR_REPORTING_ENCODING_IMPL,
    'cp1252' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
  },
  ## Bytes 0x80-0x9F mapped to the code points shown (see the NOTE below:
  ## the charset is handled as |windows-1252|).
  fallback => {
    "\x80" => "\x{20AC}", "\x81" => undef, "\x82" => "\x{201A}", "\x83" => "\x{0192}",
    "\x84" => "\x{201E}", "\x85" => "\x{2026}", "\x86" => "\x{2020}", "\x87" => "\x{2021}",
    "\x88" => "\x{02C6}", "\x89" => "\x{2030}", "\x8A" => "\x{0160}", "\x8B" => "\x{2039}",
    "\x8C" => "\x{0152}", "\x8D" => undef, "\x8E" => "\x{017D}", "\x8F" => undef,
    "\x90" => undef, "\x91" => "\x{2018}", "\x92" => "\x{2019}", "\x93" => "\x{201C}",
    "\x94" => "\x{201D}", "\x95" => "\x{2022}", "\x96" => "\x{2013}", "\x97" => "\x{2014}",
    "\x98" => "\x{02DC}", "\x99" => "\x{2122}", "\x9A" => "\x{0161}", "\x9B" => "\x{203A}",
    "\x9C" => "\x{0153}", "\x9D" => undef, "\x9E" => "\x{017E}", "\x9F" => "\x{0178}",
  },
  ## NOTE: Treated as |windows-1252|.  Properties of this charset
  ## should be consistent with those of that charset.
});

$Charset->{'iso-8859-2'}
= $IANACharset->{'iso_8859-2:1987'}
= $IANACharset->{'iso-ir-101'}
= $IANACharset->{'iso_8859-2'}
= $IANACharset->{'iso-8859-2'}
= $IANACharset->{'latin2'}
= $IANACharset->{'l2'}
= $IANACharset->{'csisolatin2'}
= $HTMLCharset->{'iso885921987'}
= $HTMLCharset->{'isoir101'}
= $HTMLCharset->{'iso88592'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso_8859-2:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-101' => REGISTERED_CHARSET_NAME,
    'iso_8859-2' => REGISTERED_CHARSET_NAME,
    'iso-8859-2' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'latin2' => REGISTERED_CHARSET_NAME,
    'l2' => REGISTERED_CHARSET_NAME,
    'csisolatin2' => REGISTERED_CHARSET_NAME,
  },
});

$Charset->{'iso-8859-3'}
= $IANACharset->{'iso_8859-3:1988'}
= $IANACharset->{'iso-ir-109'}
= $IANACharset->{'iso_8859-3'}
= $IANACharset->{'iso-8859-3'}
= $IANACharset->{'latin3'}
= $IANACharset->{'l3'}
= $IANACharset->{'csisolatin3'}
= $HTMLCharset->{'iso885931988'}
= $HTMLCharset->{'isoir109'}
= $HTMLCharset->{'iso88593'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso_8859-3:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-109' => REGISTERED_CHARSET_NAME,
    'iso_8859-3' => REGISTERED_CHARSET_NAME,
    'iso-8859-3' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'latin3' => REGISTERED_CHARSET_NAME,
    'l3' => REGISTERED_CHARSET_NAME,
    'csisolatin3' => REGISTERED_CHARSET_NAME,
  },
  ## Overrides the default error level that |get_decode_handle| would
  ## otherwise supply for this error type.
  error_level => {
    'unassigned-code-point-error' => 'iso_shall',
    ## NOTE: I didn't check whether ISO/IEC 8859-3 prohibits the use of
    ## unassigned code points, but ECMA-94:1986 (whose content considered
    ## as equivalent to ISO 8859/1-4) disallows the use of them.
  },
});

$Charset->{'iso-8859-4'}
= $IANACharset->{'iso_8859-4:1988'}
= $IANACharset->{'iso-ir-110'}
= $IANACharset->{'iso_8859-4'}
= $IANACharset->{'iso-8859-4'}
= $IANACharset->{'latin4'}
= $IANACharset->{'l4'}
= $IANACharset->{'csisolatin4'}
= $HTMLCharset->{'iso885941988'}
= $HTMLCharset->{'isoir110'}
= $HTMLCharset->{'iso88594'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso_8859-4:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-110' => REGISTERED_CHARSET_NAME,
    'iso_8859-4' => REGISTERED_CHARSET_NAME,
    'iso-8859-4' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'latin4' => REGISTERED_CHARSET_NAME,
    'l4' => REGISTERED_CHARSET_NAME,
    'csisolatin4' => REGISTERED_CHARSET_NAME,
  },
  error_level => {
    'unassigned-code-point-error' => 'iso_shall',
    ## NOTE: I didn't check whether ISO/IEC 8859-3 prohibits the use of
    ## unassigned code points, but ECMA-94:1986 (whose content considered
    ## as equivalent to ISO 8859/1-4) disallows the use of them.
  },
});

$Charset->{'iso-8859-5'}
= $IANACharset->{'iso_8859-5:1988'}
= $IANACharset->{'iso-ir-144'}
= $IANACharset->{'iso_8859-5'}
= $IANACharset->{'iso-8859-5'}
= $IANACharset->{'cyrillic'}
= $IANACharset->{'csisolatincyrillic'}
= $HTMLCharset->{'iso885951988'}
= $HTMLCharset->{'isoir144'}
= $HTMLCharset->{'iso88595'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso_8859-5:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-144' => REGISTERED_CHARSET_NAME,
    'iso_8859-5' => REGISTERED_CHARSET_NAME,
    'iso-8859-5' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'cyrillic' => REGISTERED_CHARSET_NAME,
    'csisolatincyrillic' => REGISTERED_CHARSET_NAME,
  },
});

$Charset->{'iso-8859-6'}
= $IANACharset->{'iso_8859-6:1987'}
= $IANACharset->{'iso-ir-127'}
= $IANACharset->{'iso_8859-6'}
= $IANACharset->{'iso-8859-6'}
= $IANACharset->{'ecma-114'}
= $IANACharset->{'asmo-708'}
= $IANACharset->{'arabic'}
= $IANACharset->{'csisolatinarabic'}
= $HTMLCharset->{'iso885961987'}
= $HTMLCharset->{'isoir127'}
= $HTMLCharset->{'iso88596'}
= $HTMLCharset->{'ecma114'}
= $HTMLCharset->{'asmo708'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
      ## NOTE: 3/0..3/9 have different semantics from U+0030..0039,
      ## but have same character names (maybe).
      ## NOTE: According to RFC 2046, charset left-hand half of "iso-8859-6"
      ## is same as "us-ascii".
      ## TODO: RFC 1345 def?
  iana_names => {
    'iso_8859-6:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-127' => REGISTERED_CHARSET_NAME,
    'iso_8859-6' => REGISTERED_CHARSET_NAME,
    'iso-8859-6' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'ecma-114' => REGISTERED_CHARSET_NAME,
    'asmo-708' => REGISTERED_CHARSET_NAME,
    'arabic' => REGISTERED_CHARSET_NAME,
    'csisolatinarabic' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

$Charset->{'iso-8859-7'}
= $IANACharset->{'iso_8859-7:1987'}
= $IANACharset->{'iso-ir-126'}
= $IANACharset->{'iso_8859-7'}
= $IANACharset->{'iso-8859-7'}
= $IANACharset->{'elot_928'}
= $IANACharset->{'ecma-118'}
= $IANACharset->{'greek'}
= $IANACharset->{'greek8'}
= $IANACharset->{'csisolatingreek'}
= $HTMLCharset->{'iso885971987'}
= $HTMLCharset->{'isoir126'}
= $HTMLCharset->{'iso88597'}
= $HTMLCharset->{'elot928'}
= $HTMLCharset->{'ecma118'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso_8859-7:1987' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-126' => REGISTERED_CHARSET_NAME,
    'iso_8859-7' => REGISTERED_CHARSET_NAME,
    'iso-8859-7' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'elot_928' => REGISTERED_CHARSET_NAME,
    'ecma-118' => REGISTERED_CHARSET_NAME,
    'greek' => REGISTERED_CHARSET_NAME,
    'greek8' => REGISTERED_CHARSET_NAME,
    'csisolatingreek' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

$Charset->{'iso-8859-8'}
= $IANACharset->{'iso_8859-8:1988'}
= $IANACharset->{'iso-ir-138'}
= $IANACharset->{'iso_8859-8'}
= $IANACharset->{'iso-8859-8'}
= $IANACharset->{'hebrew'}
= $IANACharset->{'csisolatinhebrew'}
= $HTMLCharset->{'iso885981988'}
= $HTMLCharset->{'isoir138'}
= $HTMLCharset->{'iso88598'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso_8859-8:1988' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-138' => REGISTERED_CHARSET_NAME,
    'iso_8859-8' => REGISTERED_CHARSET_NAME,
    'iso-8859-8' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'hebrew' => REGISTERED_CHARSET_NAME,
    'csisolatinhebrew' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

$Charset->{'iso-8859-9'}
= $IANACharset->{'iso_8859-9:1989'}
= $IANACharset->{'iso-ir-148'}
= $IANACharset->{'iso_8859-9'}
= $IANACharset->{'iso-8859-9'}
= $IANACharset->{'latin5'}
= $IANACharset->{'l5'}
= $IANACharset->{'csisolatin5'}
= $HTMLCharset->{'iso885991989'}
= $HTMLCharset->{'isoir148'}
= $HTMLCharset->{'iso88599'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso_8859-9:1989' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-148' => REGISTERED_CHARSET_NAME,
    'iso_8859-9' => REGISTERED_CHARSET_NAME,
    'iso-8859-9' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'latin5' => REGISTERED_CHARSET_NAME,
    'l5' => REGISTERED_CHARSET_NAME,
    'csisolatin5' => REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'web-latin5' => UNREGISTERED_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
        ERROR_REPORTING_ENCODING_IMPL,
    'cp1254' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
  },
  ## Same layout as the iso-8859-1 table, except that 0x8E and 0x9E have
  ## no replacement here.
  fallback => {
    "\x80" => "\x{20AC}", "\x81" => undef, "\x82" => "\x{201A}", "\x83" => "\x{0192}",
    "\x84" => "\x{201E}", "\x85" => "\x{2026}", "\x86" => "\x{2020}", "\x87" => "\x{2021}",
    "\x88" => "\x{02C6}", "\x89" => "\x{2030}", "\x8A" => "\x{0160}", "\x8B" => "\x{2039}",
    "\x8C" => "\x{0152}", "\x8D" => undef, "\x8E" => undef, "\x8F" => undef,
    "\x90" => undef, "\x91" => "\x{2018}", "\x92" => "\x{2019}", "\x93" => "\x{201C}",
    "\x94" => "\x{201D}", "\x95" => "\x{2022}", "\x96" => "\x{2013}", "\x97" => "\x{2014}",
    "\x98" => "\x{02DC}", "\x99" => "\x{2122}", "\x9A" => "\x{0161}", "\x9B" => "\x{203A}",
    "\x9C" => "\x{0153}", "\x9D" => undef, "\x9E" => undef, "\x9F" => "\x{0178}",
  },
  ## NOTE: Treated as |windows-1254|.  Properties of this charset
  ## should be consistent with those of that charset.
});

$Charset->{'iso-8859-10'}
= $IANACharset->{'iso-8859-10'}
= $IANACharset->{'iso-ir-157'}
= $IANACharset->{'l6'}
= $IANACharset->{'iso_8859-10:1992'}
= $IANACharset->{'csisolatin6'}
= $IANACharset->{'latin6'}
= $HTMLCharset->{'iso885910'}
= $HTMLCharset->{'isoir157'}
= $HTMLCharset->{'iso8859101992'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso-8859-10' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-157' => REGISTERED_CHARSET_NAME,
    'l6' => REGISTERED_CHARSET_NAME,
    'iso_8859-10:1992' => REGISTERED_CHARSET_NAME,
    'csisolatin6' => REGISTERED_CHARSET_NAME,
    'latin6' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

$Charset->{'iso_6937-2-add'}
= $IANACharset->{'iso_6937-2-add'}
= $IANACharset->{'iso-ir-142'}
= $IANACharset->{'csisotextcomm'}
= $HTMLCharset->{'iso69372add'}
= $HTMLCharset->{'isoir142'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso_6937-2-add' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-142' => REGISTERED_CHARSET_NAME,
    'csisotextcomm' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

$Charset->{'jis_x0201'}
= $IANACharset->{'jis_x0201'}
= $IANACharset->{'x0201'}
= $IANACharset->{'cshalfwidthkatakana'}
= $HTMLCharset->{'jisx0201'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'jis_x0201' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'x0201' => REGISTERED_CHARSET_NAME,
    'cshalfwidthkatakana' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

$Charset->{'jis_encoding'}
= $IANACharset->{'jis_encoding'}
= $IANACharset->{'csjisencoding'}
= $HTMLCharset->{'jisencoding'}
= __PACKAGE__->new ({
  category => 0,
  iana_names => {
    'jis_encoding' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'csjisencoding' => REGISTERED_CHARSET_NAME,
  },
  ## NOTE: What is this?
});

$Charset->{'shift_jis'}
= $IANACharset->{'shift_jis'}
= $IANACharset->{'ms_kanji'}
= $IANACharset->{'csshiftjis'}
= $HTMLCharset->{'shiftjis'}
= $HTMLCharset->{'mskanji'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
      CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'shift_jis' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'ms_kanji' => REGISTERED_CHARSET_NAME,
    'csshiftjis' => REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'shift-jis-1997' => UNREGISTERED_CHARSET_NAME |
        SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
    shiftjis => PRIMARY_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
        ERROR_REPORTING_ENCODING_IMPL,
        ## NOTE: Unicode mapping is wrong.
  },
  ## TODO: |error_level|
});

$Charset->{'x-sjis'}
= $IANACharset->{'x-sjis'}
= $HTMLCharset->{'xsjis'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
      CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'x-sjis' => UNREGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## TODO: |error_level|
});

$Charset->{shift_jisx0213}
= $IANACharset->{shift_jisx0213}
= $HTMLCharset->{shiftjisx0213}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
      CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    shift_jisx0213 => UNREGISTERED_CHARSET_NAME,
  },
  perl_names => {
    #shift_jisx0213 (non-standard - i don't know its conformance)
    'shift-jis-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
    'shiftjis' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## TODO: |error_level|
});

$Charset->{'euc-jp'}
= $IANACharset->{'extended_unix_code_packed_format_for_japanese'}
= $IANACharset->{'cseucpkdfmtjapanese'}
= $IANACharset->{'euc-jp'}
= $HTMLCharset->{'extendedunixcodepackedformatforjapanese'}
= $HTMLCharset->{'cseucpkdfmtjapanese'}
= $HTMLCharset->{'eucjp'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
      CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'extended_unix_code_packed_format_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'cseucpkdfmtjapanese' => REGISTERED_CHARSET_NAME,
    'euc-jp' => PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'euc-jp-1997' => UNREGISTERED_CHARSET_NAME |
        SEMICONFORMING_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
        ## NOTE: Though the IANA definition references the 1990 version
        ## of EUC-JP, the 1997 version of JIS standard claims that the version
        ## is same coded character set as the 1990 version, such that we
        ## consider the EUC-JP 1990 version is same as the 1997 version.
    'euc-jp' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
        ERROR_REPORTING_ENCODING_IMPL,
        ## NOTE: Unicode mapping is wrong.
  },
  ## TODO: |error_level|
});

$Charset->{'x-euc-jp'}
= $IANACharset->{'x-euc-jp'}
= $HTMLCharset->{'xeucjp'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_EUCJP | CHARSET_CATEGORY_BLOCK_SAFE |
      CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'x-euc-jp' => UNREGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'euc-jp-1997' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
    'euc-jp' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
  },
});

$Charset->{'extended_unix_code_fixed_width_for_japanese'}
= $IANACharset->{'extended_unix_code_fixed_width_for_japanese'}
= $IANACharset->{'cseucfixwidjapanese'}
= $HTMLCharset->{'extendedunixcodefixedwidthforjapanese'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE,
  iana_names => {
    'extended_unix_code_fixed_width_for_japanese' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'cseucfixwidjapanese' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

## TODO: ...
## ------ Charset registrations: EUC-KR, ISO-2022-JP, GB, UTF-8, UTF-16 ------
##
## Each statement stores one charset description object under its
## |$Charset| key, under every lowercase IANA name/alias in |$IANACharset|,
## and under the punctuation-stripped form of those names in
## |$HTMLCharset| (the key form computed by |get_by_html_name|).

$Charset->{'euc-kr'}
= $IANACharset->{'euc-kr'}
= $IANACharset->{'cseuckr'}
= $HTMLCharset->{'euckr'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'euc-kr' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'cseuckr' => REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    ## TODO: We need a parse error generating wrapper for the decoder.
    'cp949' => FALLBACK_ENCODING_IMPL, # part of standard Perl distribution
  },
  ## NOTE: |euc-kr| is handled as |windows-949|, such that properties
  ## should be consistent with that encoding's properties.
});

$Charset->{'iso-2022-jp'}
= $IANACharset->{'iso-2022-jp'}
= $IANACharset->{'csiso2022jp'}
= $IANACharset->{'iso2022jp'}
= $IANACharset->{'junet-code'}
= $HTMLCharset->{'iso2022jp'}
= $HTMLCharset->{'junetcode'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_MIME_TEXT | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso-2022-jp' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'csiso2022jp' => REGISTERED_CHARSET_NAME,
    'iso2022jp' => UNREGISTERED_CHARSET_NAME,
    'junet-code' => UNREGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

$Charset->{'iso-2022-jp-2'}
= $IANACharset->{'iso-2022-jp-2'}
= $IANACharset->{'csiso2022jp2'}
= $HTMLCharset->{'iso2022jp2'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    'iso-2022-jp-2' => PREFERRED_CHARSET_NAME | PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'csiso2022jp2' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

## TODO: ...

## The alias |csiso58gb231280| is declared in |iana_names| below, so it is
## registered in |$IANACharset| as well; otherwise a lookup by that alias
## would create an unrelated placeholder charset.
$IANACharset->{'gb_2312-80'}
= $IANACharset->{'iso-ir-58'}
= $IANACharset->{chinese}
= $IANACharset->{'csiso58gb231280'}
= $HTMLCharset->{gb231280}
= $HTMLCharset->{isoir58}
= __PACKAGE__->new ({
  ## NOTE: What is represented by this charset is unclear...  I don't
  ## understand what RFC 1945 describes...
  category => 0,
  iana_names => {
    'gb_2312-80' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'iso-ir-58' => REGISTERED_CHARSET_NAME,
    'chinese' => REGISTERED_CHARSET_NAME,
    'csiso58gb231280' => REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    ## TODO: GB2312->GBK Parse Error wrapper
    'cp936' => FALLBACK_ENCODING_IMPL,
  },
  ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
  ## consistent.
});

## TODO: ...

$Charset->{'utf-8'}
= $IANACharset->{'utf-8'}
= $IANACharset->{'x-utf-8'}
= $HTMLCharset->{'utf8'}
= $HTMLCharset->{'xutf8'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
      CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    'utf-8' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    ## NOTE: IANA name "utf-8" references RFC 3629.  According to the RFC,
    ## the definitive definition is one specified in the Unicode Standard.
    'x-utf-8' => UNREGISTERED_CHARSET_NAME,
    ## NOTE: We treat |x-utf-8| as an alias of |utf-8|, since unlike
    ## other charset like |x-sjis| or |x-euc-jp|, there is no major
    ## variant for the UTF-8 encoding.
    ## TODO: We might ought to reconsider this policy, since
    ## there are UTF-8 variant in fact, such as
    ## Unicode's UTF-8, ISO/IEC 10646's UTF-8, UTF-8n, and as
    ## such.
  },
  perl_names => {
    'utf-8-strict' => PRIMARY_CHARSET_NAME | SEMICONFORMING_ENCODING_IMPL |
        ERROR_REPORTING_ENCODING_IMPL,
    ## NOTE: It does not support non-Unicode UCS characters (conforming).
    ## It does detect illegal sequences (conforming).
    ## It does not support surrpgate pairs (conforming).
    ## It does not support BOMs (non-conforming).
  },
  ## TODO: |error_level|
  bom_pattern => qr/\xEF\xBB\xBF/,
});

## |get_by_html_name| strips punctuation before consulting |$HTMLCharset|,
## so the lookup key for this charset is |utf8n|.  The historical
## |$HTMLCharset->{'utf-8'}| key can never be produced by that method; it is
## retained only so that code reading the hash directly sees no change.
$Charset->{'utf-8n'}
= $IANACharset->{'utf-8n'}
= $HTMLCharset->{'utf8n'}
= $HTMLCharset->{'utf-8'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
      CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'utf-8n' => UNREGISTERED_CHARSET_NAME,
    ## NOTE: Is there any normative definition for the charset?
    ## What variant of UTF-8 should we use for the charset?
  },
  perl_names => {
    'utf-8-strict' => PRIMARY_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## TODO: |error_level|
});

## TODO: ...

$Charset->{'gbk'}
= $IANACharset->{'gbk'}
= $IANACharset->{'cp936'}
= $IANACharset->{'ms936'}
= $IANACharset->{'windows-936'}
= $HTMLCharset->{'windows936'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    'gbk' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    'cp936' => REGISTERED_CHARSET_NAME,
    'ms936' => REGISTERED_CHARSET_NAME,
    'windows-936' => REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
  iana_status => STATUS_COMMON | STATUS_OBSOLETE,
});

$Charset->{'gb18030'}
= $IANACharset->{'gb18030'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    'gb18030' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
  },
  iana_status => STATUS_COMMON,
  mime_text_suitable => 1,
});

## TODO: ...

$Charset->{'utf-16be'}
= $IANACharset->{'utf-16be'}
= $HTMLCharset->{'utf16be'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
  iana_names => {
    'utf-16be' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

$Charset->{'utf-16le'}
= $IANACharset->{'utf-16le'}
= $HTMLCharset->{'utf16le'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
  iana_names => {
    'utf-16le' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

$Charset->{'utf-16'}
= $IANACharset->{'utf-16'}
= $HTMLCharset->{'utf16'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_UTF16,
  iana_names => {
    'utf-16' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
  },
  ## TODO: |error_level|
});

## TODO: ...
## ------ Charset registrations: windows-31j, gb2312, big5, big5-hkscs ------
##
## Every scope below creates one charset description object and stores that
## same object under each registry key it is known by.

{
  my $windows_31j = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_SJIS | CHARSET_CATEGORY_BLOCK_SAFE |
        CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'windows-31j' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
      'cswindows31j' => REGISTERED_CHARSET_NAME,
    },
    iana_status => STATUS_LIMITED_USE, # maybe
    ## TODO: |error_level|
  });
  $HTMLCharset->{'windows31j'} = $windows_31j;
  $IANACharset->{'cswindows31j'} = $windows_31j;
  $IANACharset->{'windows-31j'} = $windows_31j;
  $Charset->{'windows-31j'} = $windows_31j;
}

{
  my $gb2312 = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT |
        CHARSET_CATEGORY_ASCII_COMPAT,
    iana_names => {
      'gb2312' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
      'csgb2312' => REGISTERED_CHARSET_NAME,
    },
    perl_names => {
      ## TODO: GB2312->GBK Parse Error wrapper
      'cp936' => FALLBACK_ENCODING_IMPL,
    },
    ## NOTE: |gb2312| is handled as |gbk|, such that properties should be
    ## consistent.
  });
  $IANACharset->{'csgb2312'} = $gb2312;
  $IANACharset->{'gb2312'} = $gb2312;
  $Charset->{'gb2312'} = $gb2312;
}

{
  my $big5 = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'big5' => PRIMARY_CHARSET_NAME | PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME,
      'csbig5' => REGISTERED_CHARSET_NAME,
      'x-x-big5' => UNREGISTERED_CHARSET_NAME,
      ## NOTE: In HTML5, |x-x-big5| is defined as an alias of |big5|.
      ## According to that spec, if there is any difference between
      ## input and replacement encodings, the result is parse error.
      ## However, since there is no formal definition for |x-x-big5|
      ## charset, we cannot raise such errors.
    },
    ## TODO: |error_level|
  });
  $HTMLCharset->{xxbig5} = $big5;
  $IANACharset->{'x-x-big5'} = $big5;
  $IANACharset->{'csbig5'} = $big5;
  $IANACharset->{'big5'} = $big5;
  $Charset->{'big5'} = $big5;
}

## TODO: ...

{
  my $big5_hkscs = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      'big5-hkscs' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    },
    ## TODO: |error_level|
  });
  $HTMLCharset->{'big5hkscs'} = $big5_hkscs;
  $IANACharset->{'big5-hkscs'} = $big5_hkscs;
  $Charset->{'big5-hkscs'} = $big5_hkscs;
}

## TODO: ...
## ------ Charset registrations: windows-1252, windows-1253, windows-1254 ------
##
## The three records differ only in the code page number, so they are
## generated from one template.  Each charset gets its own object, stored
## under the |$Charset| and |$IANACharset| key |windows-NNNN| and the
## |$HTMLCharset| key |windowsNNNN|.
for my $code_page (1252, 1253, 1254) {
  my $iana_name = "windows-$code_page";
  my $info = __PACKAGE__->new ({
    category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT |
        CHARSET_CATEGORY_MIME_TEXT,
    iana_names => {
      $iana_name => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
    },
    ## TODO: Check whether use of 0x81 is conforming or not...
  });
  $HTMLCharset->{"windows$code_page"} = $info;
  $IANACharset->{$iana_name} = $info;
  $Charset->{$iana_name} = $info;
}

## TODO: ...
## ------ Charset registrations: Thai charsets and windows-949 ------
##
## |tis-620| and |iso-8859-11| both list |windows-874| as their fallback
## Perl encoding and carry a |fallback| table for bytes 0x80-0x9F.  The
## table is copied into the decode handle by |get_decode_handle|; |undef|
## entries presumably mark bytes without a replacement character (TODO
## confirm against the decode handle implementation).

$Charset->{'tis-620'}
= $IANACharset->{'tis-620'}
= $HTMLCharset->{'tis620'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'tis-620' => PRIMARY_CHARSET_NAME | REGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'web-tis-620' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
    'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## Unlike the |iso-8859-11| table below, this one also lists 0xA0.
  fallback => {
    "\x80" => "\x{20AC}", "\x81" => undef, "\x82" => undef, "\x83" => undef,
    "\x84" => undef, "\x85" => "\x{2026}", "\x86" => undef, "\x87" => undef,
    "\x88" => undef, "\x89" => undef, "\x8A" => undef, "\x8B" => undef,
    "\x8C" => undef, "\x8D" => undef, "\x8E" => undef, "\x8F" => undef,
    "\x90" => undef, "\x91" => "\x{2018}", "\x92" => "\x{2019}", "\x93" => "\x{201C}",
    "\x94" => "\x{201D}", "\x95" => "\x{2022}", "\x96" => "\x{2013}", "\x97" => "\x{2014}",
    "\x98" => undef, "\x99" => undef, "\x9A" => undef, "\x9B" => undef,
    "\x9C" => undef, "\x9D" => undef, "\x9E" => undef, "\x9F" => undef,
    "\xA0" => "\xA0",
  },
  ## NOTE: |tis-620| is treated as |windows-874|, so ensure that
  ## they are consistent.
});

$Charset->{'iso-8859-11'}
= $IANACharset->{'iso-8859-11'}
= $HTMLCharset->{'iso885911'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'iso-8859-11' => UNREGISTERED_CHARSET_NAME,
    ## NOTE: The Web Thai encoding, i.e. windows-874.
  },
  perl_names => {
    'web-thai' => UNREGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
    'windows-874' => FALLBACK_ENCODING_IMPL | ERROR_REPORTING_ENCODING_IMPL,
  },
  fallback => {
    "\x80" => "\x{20AC}", "\x81" => undef, "\x82" => undef, "\x83" => undef,
    "\x84" => undef, "\x85" => "\x{2026}", "\x86" => undef, "\x87" => undef,
    "\x88" => undef, "\x89" => undef, "\x8A" => undef, "\x8B" => undef,
    "\x8C" => undef, "\x8D" => undef, "\x8E" => undef, "\x8F" => undef,
    "\x90" => undef, "\x91" => "\x{2018}", "\x92" => "\x{2019}", "\x93" => "\x{201C}",
    "\x94" => "\x{201D}", "\x95" => "\x{2022}", "\x96" => "\x{2013}", "\x97" => "\x{2014}",
    "\x98" => undef, "\x99" => undef, "\x9A" => undef, "\x9B" => undef,
    "\x9C" => undef, "\x9D" => undef, "\x9E" => undef, "\x9F" => undef,
  },
  ## NOTE: |iso-8859-11| is treated as |windows-874|, so ensure that
  ## they are consistent.
});

$Charset->{'windows-874'}
= $IANACharset->{'windows-874'}
= $HTMLCharset->{'windows874'}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_ASCII_COMPAT,
  iana_names => {
    'windows-874' => UNREGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'windows-874' => REGISTERED_CHARSET_NAME | ERROR_REPORTING_ENCODING_IMPL,
  },
  ## TODO: |error_level|
});

## Registered in |$IANACharset| and |$HTMLCharset| only; this statement
## assigns no |$Charset| key.
$IANACharset->{'windows-949'}
= $HTMLCharset->{windows949}
= __PACKAGE__->new ({
  category => CHARSET_CATEGORY_BLOCK_SAFE | CHARSET_CATEGORY_MIME_TEXT,
  iana_names => {
    'windows-949' => UNREGISTERED_CHARSET_NAME,
  },
  perl_names => {
    'cp949' => PREFERRED_CHARSET_NAME | NONCONFORMING_ENCODING_IMPL |
        ERROR_REPORTING_ENCODING_IMPL,
    ## TODO: Is this implementation conforming?
  },
  ## NOTE: |error_level| is same as default, since we can't find any formal
  ## definition for this charset.
});

## Constructor.  Blesses the given hash reference (the charset description:
## |category|, |iana_names|, |perl_names|, ...) into the invocant class.
## The hash is blessed in place, not copied.
sub new ($$) {
  return bless $_[1], $_[0];
} # new

## NOTE: A class method
##
## Returns the charset object for a charset name as written in HTML.  The
## name is compared ASCII case-insensitively and with ASCII whitespace and
## punctuation removed.  Lookup order: |$HTMLCharset| by stripped name,
## |$IANACharset| by stripped name, then |$IANACharset| by the unstripped
## lowercase name.  If nothing matches, a new charset object carrying only
## the unregistered name is created, remembered under both keys, and
## returned.
sub get_by_html_name ($$) {
  my $name = $_[1];
  $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
  my $iana_name = $name;
  $name =~ s/[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]//g;
      ## NOTE: U+000B is included.

  ## The third alternative keeps an already registered IANA name that
  ## contains punctuation (and has no stripped key) from being overwritten
  ## by a fresh placeholder object.
  my $known = $HTMLCharset->{$name}
      || $IANACharset->{$name}
      || $IANACharset->{$iana_name};
  return $known if $known;

  return $IANACharset->{$iana_name} = $HTMLCharset->{$name}
      = __PACKAGE__->new ({
    iana_names => {
      $iana_name => UNREGISTERED_CHARSET_NAME,
    },
  });
} # get_by_html_name

## NOTE: A class method
##
## Returns the charset object registered under the given IANA name
## (ASCII case-insensitive).  An unknown name yields a new object carrying
## only that unregistered name, which is remembered for later calls.
sub get_by_iana_name ($$) {
  my $name = $_[1];
  $name =~ tr/A-Z/a-z/; ## ASCII case-insensitive
  unless ($IANACharset->{$name}) {
    $IANACharset->{$name} = __PACKAGE__->new ({
      iana_names => {
        $name => UNREGISTERED_CHARSET_NAME,
      },
    });
  }
  return $IANACharset->{$name};
} # get_by_iana_name

## Creates a decode handle for |$byte_stream| in this charset.
##
## Options (|%opt|): |byte_buffer| (reference to a string of already-read
## bytes), |onerror| (code reference), |level| (hash reference of error
## levels), plus the options understood by |get_perl_encoding|.
##
## Returns a two-element list: the handle object and a status bit set, or
## |(undef, 0)| when no usable decoder is found.
sub get_decode_handle ($$;%) {
  my $self = shift;
  my $byte_stream = shift;
  my %opt = @_;

  my $obj = {
    category => $self->{category},
    char_buffer => \(my $s = ''),
    char_buffer_pos => 0,
    character_queue => [],
    filehandle => $byte_stream,
    charset => '', ## TODO: We set a charset name for input_encoding (when we get identify-by-URI nonsense away)
    byte_buffer => $opt{byte_buffer} ? ${$opt{byte_buffer}} : '', ## TODO: ref, instead of value, should be used
    onerror => $opt{onerror} || sub {},
    #onerror_set
    level => $opt{level} || {
      must => 'm',
      charset_variant => 'm',
      charset_fact => 'm',
      iso_shall => 'm',
    },
    error_level => $self->{error_level} || {
      ## HTML5 charset name aliases
      ## NOTE: Use of code points in the variant whose definition differs
      ## from the original charset is a parse error in HTML5.  However,
      ## it does not affect the document conformance; the HTML5 spec
      ## does not define the conformance of the input stream against the
      ## charset in use.
      'fallback-char-error' => 'charset_variant',
      #'fallback-illegal-error' => 'charset_variant',
      'fallback-unassigned-error' => 'charset_variant',

      ## NOTE: An appropriate error level should be set for each charset
      ## (many charset prohibits use of unassigned code points).
      'illegal-octets-error' => 'charset_fact',
      'unassigned-code-point-error' => 'charset_fact',
      'invalid-state-error' => 'charset_fact',
    },
  };

  require HTML::HTML5::Parser::Charset::DecodeHandle;
  ## |Encode::find_encoding| is called below; make sure the module is
  ## loaded even if the optional modules in the |eval| blocks are missing.
  require Encode;

  if ($self->{iana_names}->{'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1978';
    $obj->{state_2442} = 'gl-jis-1983';
    $obj->{state} = 'state_2842';
    {
      ## The modules are optional; do not leak a load failure through |$@|.
      local $@;
      eval {
        require Encode::GLJIS1978;
        require Encode::GLJIS1983;
      };
    }
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return ((bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP'),
              PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
    }
  } elsif ($self->{xml_names}->{'iso-2022-jp'}) {
    $obj->{state_2440} = 'gl-jis-1997-swapped';
    $obj->{state_2442} = 'gl-jis-1997';
    $obj->{state} = 'state_2842';
    {
      local $@;
      eval {
        require Encode::GLJIS1997Swapped;
        require Encode::GLJIS1997;
      };
    }
    if (Encode::find_encoding ($obj->{state_2440}) and
        Encode::find_encoding ($obj->{state_2442})) {
      return ((bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::ISO2022JP'),
              PREFERRED_CHARSET_NAME | REGISTERED_CHARSET_NAME);
    }
  }

  ## Generic path (also reached when the ISO-2022-JP decoders are missing).
  my ($e, $e_status) = $self->get_perl_encoding
      (%opt, allow_semiconforming => 1);
  if ($e) {
    $obj->{perl_encoding_name} = $e->name;
    ## Charset objects created on the fly have no |category|; treat that
    ## as "no category bits set" instead of using an undefined value.
    unless (($self->{category} || 0) & CHARSET_CATEGORY_BLOCK_SAFE) {
      $e_status |= FALLBACK_ENCODING_IMPL;
    }
    $obj->{bom_pattern} = $self->{bom_pattern};
    $obj->{fallback} = $self->{fallback};
    return ((bless $obj, 'HTML::HTML5::Parser::Charset::DecodeHandle::Encode'), $e_status);
  } else {
    return (undef, 0);
  }
} # get_decode_handle

## Selects a Perl |Encode| encoding object for this charset.
##
## Options: |allow_error_reporting|, |allow_semiconforming|,
## |allow_fallback| (all booleans).  Candidates come from |perl_names| and,
## as a last resort when fallback is allowed, from |iana_names|.
##
## Returns |($encoding_object, $status_bits)|, or |(undef, 0)| if no
## candidate is available.
sub get_perl_encoding ($;%) {
  my ($self, %opt) = @_;

  require Encode;
  ## Loads the module implementing a non-core encoding name.  The modules
  ## are optional: a failed load is ignored here, so the subsequent
  ## |find_encoding| simply does not find the encoding and the next
  ## candidate is tried (same policy as the |eval| blocks in
  ## |get_decode_handle|).
  my $load_encode = sub {
    my $name = shift;
    local $@;
    eval {
      if ($name eq 'euc-jp-1997') {
        require Encode::EUCJP1997;
      } elsif ($name eq 'shift-jis-1997') {
        require Encode::ShiftJIS1997;
      } elsif ({'web-latin1' => 1,
                'web-latin1-us-ascii' => 1,
                'web-latin5' => 1}->{$name}) {
        require HTML::HTML5::Parser::Charset::WebLatin1;
      } elsif ($name eq 'web-thai' or $name eq 'web-tis-620') {
        require HTML::HTML5::Parser::Charset::WebThai;
      }
      1;
    };
    return;
  }; # $load_encode

  ## Pass 1: error-reporting, non-fallback implementations.
  if ($opt{allow_error_reporting}) {
    for my $perl_name (keys %{$self->{perl_names} or {}}) {
      my $perl_status = $self->{perl_names}->{$perl_name};
      next unless $perl_status & ERROR_REPORTING_ENCODING_IMPL;
      next if $perl_status & FALLBACK_ENCODING_IMPL;
      next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
          not $opt{allow_semiconforming};

      $load_encode->($perl_name);
      my $e = Encode::find_encoding ($perl_name);
      if ($e and $e->name eq $perl_name) {
        ## NOTE: Don't return $e unless $e eq $perl_name, since
        ## |find_encoding| resolves e.g. |foobarlatin-1| to |iso-8859-1|,
        ## which might return wrong encoding object when a dedicated
        ## implementation not part of the standard Perl distribution is
        ## desired.
        return ($e, $perl_status);
      }
    }
  }

  ## Pass 2: non-error-reporting, non-fallback implementations.
  for my $perl_name (keys %{$self->{perl_names} or {}}) {
    my $perl_status = $self->{perl_names}->{$perl_name};
    next if $perl_status & ERROR_REPORTING_ENCODING_IMPL;
    next if $perl_status & FALLBACK_ENCODING_IMPL;
    next if $perl_status & SEMICONFORMING_ENCODING_IMPL and
        not $opt{allow_semiconforming};

    $load_encode->($perl_name);
    my $e = Encode::find_encoding ($perl_name);
    if ($e) {
      return ($e, $perl_status);
    }
  }

  ## Pass 3: fallback or semi-conforming implementations, then whatever
  ## |Encode| knows under one of the IANA names.
  if ($opt{allow_fallback}) {
    for my $perl_name (keys %{$self->{perl_names} or {}}) {
      my $perl_status = $self->{perl_names}->{$perl_name};
      next unless $perl_status & FALLBACK_ENCODING_IMPL or
          $perl_status & SEMICONFORMING_ENCODING_IMPL;
      ## NOTE: We don't prefer semi-conforming implementations to
      ## non-conforming implementations, since semi-conforming implementations
      ## will never be conforming without assist of the callee, and in such
      ## cases the callee should set the |allow_semiconforming| option upon
      ## the invocation of the method anyway.

      $load_encode->($perl_name);
      my $e = Encode::find_encoding ($perl_name);
      if ($e) {
        return ($e, $perl_status);
      }
    }

    for my $iana_name (keys %{$self->{iana_names} or {}}) {
      $load_encode->($iana_name);
      my $e = Encode::find_encoding ($iana_name);
      if ($e) {
        return ($e, FALLBACK_ENCODING_IMPL);
      }
    }
  }

  return (undef, 0);
} # get_perl_encoding

## Returns one IANA name for this charset: the name flagged preferred if
## there is one, otherwise the primary name, otherwise a registered name,
## otherwise any name.  Names are visited in sorted order so that the
## result does not depend on hash ordering.
sub get_iana_name ($) {
  my $self = shift;

  my $primary;
  my $other;
  for my $iana_name (sort keys %{$self->{iana_names} or {}}) {
    my $name_status = $self->{iana_names}->{$iana_name};
    if ($name_status & PREFERRED_CHARSET_NAME) {
      return $iana_name;
    } elsif ($name_status & PRIMARY_CHARSET_NAME) {
      $primary = $iana_name;
    } elsif ($name_status & REGISTERED_CHARSET_NAME) {
      $other = $iana_name;
    } else {
      $other ||= $iana_name;
    }
  }

  return $primary || $other;
} # get_iana_name

## NOTE: A non-method function
sub is_syntactically_valid_iana_charset_name ($) {
  my $name = shift;
  return $name =~ /\A[\x20-\x7E]{1,40}\z/;

  ## NOTE: According to IANAREG, "The character set names may be up to
# Constructor.  Any key/value arguments are handed straight through to
# HTML::HTML5::Parser::TagSoupParser->new; the resulting tag soup parser
# is stored under the "parser" key next to an initially empty "errors"
# list.
sub new
{
	my ($class, %params) = @_;

	my %state = (
		errors => [],
		parser => HTML::HTML5::Parser::TagSoupParser->new(%params),
	);

	return bless \%state, $class;
}
# TODO: noembed, noframes, noscript

# Context table used by parse_balanced_chunk.  The key is the (lower-cased)
# value of the "within"/"force_within" option.  The value is the chain of
# elements to open before the chunk: the LAST entry is the element the
# chunk is parsed inside of (it receives the marker id), everything before
# it is opened as an ancestor.  A value of undef means the context is not
# supported and parse_balanced_chunk croaks (void elements and the like).
my %within = (
	html        => [qw/html/],
	frameset    => [qw/html frameset/],
	frame       => [qw/html frameset frame/],
	head        => [qw/html head/],
	title       => [qw/html head title/],
	style       => [qw/html head style/],
	(map { $_ => undef }
		qw/base link meta basefont bgsound/),
	body        => [qw/html body/],
	script      => [qw/html body script/],
	div         => [qw/html body div/],
	(map { $_ => [qw/html body div/, $_] }
		qw/a abbr acronym address applet area article aside big blockquote
		button center code details dir dl em fieldset figure font
		footer form h1 h2 h3 h4 h5 h6 header hgroup i iframe listing
		marquee menu nav nobr object ol p plaintext pre ruby s section
		small strike strong tt u ul xmp/),
	(map { $_ => undef }
		qw/br col command datagrid embed hr img input keygen param wbr/),
	dd          => [qw/html body dl dd/],
	# This entry used to be a second "dd" key, which overwrote the entry
	# above and left "dt" undefined.
	dt          => [qw/html body dl dt/],
	figcaption  => [qw/html body figure/],
	li          => [qw/html body ul li/],
	ul__li      => [qw/html body ul li/],
	ol__li      => [qw/html body ol li/],
	optgroup    => [qw/html body form div select/],
	option      => [qw/html body form div select/],
	rp          => [qw/html body div ruby/],
	rt          => [qw/html body div ruby/],
	select      => [qw/html body form div select/],
	summary     => [qw/html body div details/],
	table       => [qw/html body table/],
	(map { $_ => [qw/html body table/, $_] }
		qw/thead tfoot tbody tr caption colgroup/),
	(map { $_ => [qw/html body table tbody tr/, $_] }
		qw/td th/),
	textarea    => [qw/html body form div textarea/],
);
# XML::LibXML-style loader.  May be called as a class or an instance
# method.  Arguments are key/value pairs and/or hashrefs of them; exactly
# one of "location", "string" or "IO" must be supplied (checked in that
# order).  An optional "URI" is stringified and, for string and IO input,
# recorded on the resulting document as its URI.
#
# The URI used to be passed as the second argument of parse_string /
# parse_fh, which expect an options hashref there; under "use strict"
# that died as soon as the options were dereferenced.
sub load_html
{
	my $class_or_self = shift;

	my %args = map { ref($_) eq 'HASH' ? (%$_) : $_ } @_;
	my $URI = delete($args{URI});
	$URI = "$URI" if defined $URI; # stringify in case it is an URI object

	my $parser = ref($class_or_self)
		? $class_or_self
		: $class_or_self->new;

	my $dom;
	my $wants_base_uri = 0;
	if ( defined $args{location} )
	{
		$dom = $parser->parse_file( "$args{location}" );
	}
	elsif ( defined $args{string} )
	{
		$dom = $parser->parse_string( $args{string}, {} );
		$wants_base_uri = 1;
	}
	elsif ( defined $args{IO} )
	{
		$dom = $parser->parse_fh( $args{IO}, {} );
		$wants_base_uri = 1;
	}
	else
	{
		croak("HTML::HTML5::Parser->load_html: specify location, string, or IO");
	}

	# In-memory and filehandle input carry no location of their own, so
	# the caller-supplied URI is attached to the document afterwards.
	if ($wants_base_uri and defined $URI and blessed($dom) and $dom->can('setURI'))
	{
		$dom->setURI($URI);
	}

	return $dom;
}

# Try to load the input as real XML via XML::LibXML->load_xml; if that
# throws or does not yield an object, fall back to load_html with the
# same arguments.
sub load_xml
{
	my $self = shift;
	my $dom;
	eval {
		$dom = XML::LibXML->load_xml(@_);
	};
	return $dom if blessed($dom);
	return $self->load_html(@_);
}

# Catch-all for XML::LibXML parser methods that have no real
# implementation here: unsupported features croak, fixed options report
# their constant value (carping if the caller tries to flip them), and
# anything else produces a warning.
sub AUTOLOAD
{
	our $AUTOLOAD;
	my $self = shift;
	my $func = $AUTOLOAD;
	$func =~ s/.*://;

	# LibXML Push Parser.
	if ($func =~ /^( parse_chunk | start_push | push | finish_push )$/xi)
	{
		croak "Push parser ($func) not implemented by HTML::HTML5::Parser.";
	}

	# Misc LibXML functions with no compatible interface provided.
	if ($func =~ /^( parse_balanced_chunk | parse_xml_chunk |
		process_?xincludes | get_last_error )$/xi)
	{
		croak "$func not implemented by HTML::HTML5::Parser.";
	}

	# Fixed options which are true.
	if ($func =~ /^( recover | recover_silently | expand_entities |
		keep_blanks | no_network )$/xi)
	{
		my $set = shift;
		if ((!$set) && defined $set)
		{
			carp "Option $func cannot be switched off.";
		}
		return 1;
	}

	# Fixed options which are false.  Under /x whitespace is ignored, so
	# every alternative needs its own "|": without one between
	# line_numbers and load_ext_dtd the two fused into a single name and
	# neither option was recognised.
	if ($func =~ /^( validation | pedantic_parser | line_numbers |
		load_ext_dtd | complete_attributes | expand_xinclude |
		load_catalog | base_uri | gdome_dom | clean_namespaces )$/xi)
	{
		my $set = shift;
		if (($set) && defined $set)
		{
			carp "Option $func cannot be switched on.";
		}
		return 0;
	}

	carp "HTML::HTML5::Parser doesn't understand '$func'." if length $func;
}
=item * Via bundling and modifications, removed external dependencies on non-CPAN packages. =back =head2 Constructor =over 8 =item C<new> $parser = HTML::HTML5::Parser->new; # or $parser = HTML::HTML5::Parser->new(no_cache => 1); The constructor does nothing interesting besides take one flag argument, C<no_cache =E<gt> 1>, to disable the global element metadata cache. Disabling the cache is handy for conserving memory if you parse a large number of documents, however, class methods such as C</source_line> will not work, and must be run from an instance of this parser. =back =head2 XML::LibXML-Compatible Methods =over =item C<parse_file>, C<parse_html_file> $doc = $parser->parse_file( $html_file_name [,\%opts] ); This function parses an HTML document from a file or network; C<$html_file_name> can be either a filename or an URL. Options include 'encoding' to indicate file encoding (e.g. 'utf-8') and 'user_agent' which should be a blessed C<LWP::UserAgent> (or L<HTTP::Tiny>) object to be used when retrieving URLs. If requesting a URL and the response Content-Type header indicates an XML-based media type (such as XHTML), XML::LibXML::Parser will be used automatically (instead of the tag soup parser). The XML parser can be told to use a DTD catalogue by setting the option 'xml_catalogue' to the filename of the catalogue. HTML (tag soup) parsing can be forced using the option 'force_html', even when an XML media type is returned. If an options hashref was passed, parse_file will set $options->{'parser_used'} to the name of the class used to parse the URL, to allow the calling code to double-check which parser was used afterwards. If an options hashref was passed, parse_file will set $options->{'response'} to the HTTP::Response object obtained by retrieving the URI. =item C<parse_fh>, C<parse_html_fh> $doc = $parser->parse_fh( $io_fh [,\%opts] ); C<parse_fh()> parses a IOREF or a subclass of C<IO::Handle>. Options include 'encoding' to indicate file encoding (e.g. 
'utf-8'). =item C<parse_string>, C<parse_html_string> $doc = $parser->parse_string( $html_string [,\%opts] ); This function is similar to C<parse_fh()>, but it parses an HTML document that is available as a single string in memory. Options include 'encoding' to indicate file encoding (e.g. 'utf-8'). =item C<load_xml>, C<load_html> Wrappers for the parse_* functions. These should be roughly compatible with the equivalently named functions in L<XML::LibXML>. Note that C<load_xml> first attempts to parse as real XML, falling back to HTML5 parsing; C<load_html> just goes straight for HTML5. =item C<parse_balanced_chunk> $fragment = $parser->parse_balanced_chunk( $string [,\%opts] ); This method is roughly equivalent to XML::LibXML's method of the same name, but unlike XML::LibXML, and despite its name it does not require the chunk to be "balanced". This method is somewhat black magic, but should work, and do the proper thing in most cases. Of course, the proper thing might not be what you'd expect! I'll try to keep this explanation as brief as possible... Consider the following string: <b>Hello</b></td></tr> <i>World</i> What is the proper way to parse that? If it were found in a document like this: <html> <head><title>X</title></head> <body> <div> <b>Hello</b></td></tr> <i>World</i> </div> </body> </html> Then the document would end up equivalent to the following XHTML: <html> <head><title>X</title></head> <body> <div> <b>Hello</b> <i>World</i> </div> </body> </html> The superfluous C<< </td></tr> >> is simply ignored. However, if it were found in a document like this: <html> <head><title>X</title></head> <body> <table><tbody><tr><td> <b>Hello</b></td></tr> <i>World</i> </td></tr></tbody></table> </body> </html> Then the result would be: <html> <head><title>X</title></head> <body> <i>World</i> <table><tbody><tr><td> <b>Hello</b></td></tr> </tbody></table> </body> </html> Yes, C<< <i>World</i> >> gets hoisted up before the C<< <table> >>. 
This is weird, I know, but it's how browsers do it in real life. So what should: $string = q{<b>Hello</b></td></tr> <i>World</i>}; $fragment = $parser->parse_balanced_chunk($string); actually return? Well, you can choose... $string = q{<b>Hello</b></td></tr> <i>World</i>}; $frag1 = $parser->parse_balanced_chunk($string, {within=>'div'}); say $frag1->toString; # <b>Hello</b> <i>World</i> $frag2 = $parser->parse_balanced_chunk($string, {within=>'td'}); say $frag2->toString; # <i>World</i><b>Hello</b> If you don't pass a "within" option, then the chunk is parsed as if it were within a C<< <div> >> element. This is often the most sensible option. If you pass something like C<< { within => "foobar" } >> where "foobar" is not a real HTML element name (as found in the HTML5 spec), then this method will croak; if you pass the name of a void element (e.g. C<< "br" >> or C<< "meta" >>) then this method will croak; there are a handful of other unsupported elements which will croak (namely: C<< "noscript" >>, C<< "noembed" >>, C<< "noframes" >>). Note that the second time around, although we parsed the string "as if it were within a C<< <td> >> element", the C<< <i>World</i> >> bit did not strictly end up within the C<< <td> >> element (not even within the C<< <table> >> element!) yet it still gets returned. We'll call things such as this "outliers". There is a "force_within" option which tells parse_balanced_chunk to ignore outliers: $frag3 = $parser->parse_balanced_chunk($string, {force_within=>'td'}); say $frag3->toString; # <b>Hello</b> There is a boolean option "mark_outliers" which marks each outlier with an attribute (C<< data-perl-html-html5-parser-outlier >>) to indicate its outlier status. Clearly, this is ignored when you use "force_within" because no outliers are returned. Some outliers may be XML::LibXML::Text elements; text nodes don't have attributes, so these will not be marked with an attribute. A last note is to mention what gets returned by this method.
Normally it's an L<XML::LibXML::DocumentFragment> object, but if you call the method in list context, a list of the individual node elements is returned. Alternatively you can request the data to be returned as an L<XML::LibXML::NodeList> object: # Get an XML::LibXML::NodeList my $list = $parser->parse_balanced_chunk($str, {as=>'list'}); The exact implementation of this method may change from version to version, but the long-term goal will be to approach how common desktop browsers parse HTML fragments when implementing the setter for DOM's C<innerHTML> attribute. =back The push parser and SAX-based parser are not supported. Trying to change an option (such as recover_silently) will make HTML::HTML5::Parser carp a warning. (But you can inspect the options.) =head2 Error Handling Error handling is obviously different to XML::LibXML, as errors are (bugs notwithstanding) non-fatal. =over =item C<error_handler> Get/set an error handling function. Must be set to a coderef or undef. The error handling function will be called with a single parameter, a L<HTML::HTML5::Parser::Error> object. =item C<errors> Returns a list of errors that occurred during the last parse. See L<HTML::HTML5::Parser::Error>. =back =head2 Additional Methods The module provides a few methods to obtain additional, non-DOM data from DOM nodes. =over =item C<dtd_public_id> $pubid = $parser->dtd_public_id( $doc ); For an XML::LibXML::Document which has been returned by HTML::HTML5::Parser, using this method will tell you the Public Identifier of the DTD used (if any). =item C<dtd_system_id> $sysid = $parser->dtd_system_id( $doc ); For an XML::LibXML::Document which has been returned by HTML::HTML5::Parser, using this method will tell you the System Identifier of the DTD used (if any). 
=item C<dtd_element> $element = $parser->dtd_element( $doc ); For an XML::LibXML::Document which has been returned by HTML::HTML5::Parser, using this method will tell you the root element declared in the DTD used (if any). That is, if the document has this doctype: <!doctype html> ... it will return "html". This may return the empty string if a DTD was present but did not contain a root element; or undef if no DTD was present. =item C<compat_mode> $mode = $parser->compat_mode( $doc ); Returns 'quirks', 'limited quirks' or undef (standards mode). =item C<charset> $charset = $parser->charset( $doc ); The character set apparently used by the document. =item C<source_line> ($line, $col) = $parser->source_line( $node ); $line = $parser->source_line( $node ); In scalar context, C<source_line> returns the line number of the source code that started a particular node (element, attribute or comment). In list context, returns a tuple: $line, $column, $implicitness. Tab characters count as one column, not eight. $implicitness indicates that the node was not explicitly marked up in the source code, but its existence was inferred by the parser. For example, in the following markup, the HTML, TITLE and P elements are explicit, but the HEAD and BODY elements are implicit. <html> <title>I have an implicit head</title> <p>And an implicit body too!</p> </html> (Note that implicit elements do still have a line number and column number.) The implicitness indicator is a new feature, and I'd appreciate any bug reports where it gets things wrong. L<XML::LibXML::Node> has a C<line_number> method. In general this will always return 0 and HTML::HTML5::Parser has no way of influencing it. However, if you install L<XML::LibXML::Devel::SetLineNumber> on your system, the C<line_number> method will start working (at least for elements). =back =head1 SEE ALSO L<http://suika.fam.cx/www/markup/html/whatpm/Whatpm/HTML.html>.
L<HTML::HTML5::Writer>, L<HTML::HTML5::Builder>, L<XML::LibXML>, L<XML::LibXML::PrettyPrint>, L<XML::LibXML::Devel::SetLineNumber>. =head1 AUTHOR Toby Inkster, E<lt>tobyink@cpan.orgE<gt> =head1 COPYRIGHT AND LICENCE Copyright (C) 2007-2011 by Wakaba Copyright (C) 2009-2012 by Toby Inkster This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 DISCLAIMER OF WARRANTIES THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. ����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/TODO������������������������������������������������������������������������0000644�0001750�0001750�00000000163�11732176064�013155� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������* It would be nice to pass some more of the html5lib tests. (The entities tests in particular should be doable.) 
�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/MANIFEST��������������������������������������������������������������������0000644�0001750�0001750�00000005616�12166545247�013633� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������bin/html2xhtml bin/html5debug Changes COPYRIGHT CREDITS examples/charsets.pl examples/html/iso-8859-15.html examples/html/utf-16.html examples/html/utf-8.html inc/Module/AutoInstall.pm inc/Module/Install.pm inc/Module/Install/AutoInstall.pm inc/Module/Install/AutoManifest.pm inc/Module/Install/Base.pm inc/Module/Install/Can.pm inc/Module/Install/Contributors.pm inc/Module/Install/Fetch.pm inc/Module/Install/Include.pm inc/Module/Install/Makefile.pm inc/Module/Install/Metadata.pm inc/Module/Install/Package.pm inc/Module/Install/Scripts.pm inc/Module/Install/TrustMetaYml.pm inc/Module/Install/Win32.pm inc/Module/Install/WriteAll.pm inc/Module/Package.pm inc/Module/Package/Dist/RDF.pm inc/YAML/Tiny.pm lib/HTML/HTML5/Parser.pm lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm lib/HTML/HTML5/Parser/Charset/Info.pm lib/HTML/HTML5/Parser/Charset/UnicodeChecker.pm lib/HTML/HTML5/Parser/Charset/UniversalCharDet.pm lib/HTML/HTML5/Parser/Charset/WebLatin1.pm lib/HTML/HTML5/Parser/Charset/WebThai.pm lib/HTML/HTML5/Parser/Error.pm lib/HTML/HTML5/Parser/TagSoupParser.pm 
lib/HTML/HTML5/Parser/Tokenizer.pm lib/HTML/HTML5/Parser/UA.pm LICENSE Makefile.PL MANIFEST This list of files META.ttl META.yml meta/changes.ttl meta/doap.ttl meta/makefile.ttl meta/rt-bugs.ttl NEWS README t/01basic.t t/02html5.t t/03html4.t t/04fragments.t t/05origins.t t/06xlxdsln.t t/07ua.t t/08ua-lwp.t t/99html5lib.t t/html5lib-fail/domjs-unsafe.dat t/html5lib-fail/entities02.dat t/html5lib-fail/plain-text-unsafe.dat t/html5lib-fail/tests1.dat t/html5lib-fail/tests11.dat t/html5lib-fail/tests14.dat t/html5lib-fail/tests16.dat t/html5lib-fail/tests21.dat t/html5lib-fail/tests22.dat t/html5lib-fail/tests23.dat t/html5lib-fail/tests5.dat t/html5lib-fail/tests6.dat t/html5lib-fail/tests9.dat t/html5lib-fail/tests_innerHTML_1.dat t/html5lib-pass/adoption01.dat t/html5lib-pass/adoption02.dat t/html5lib-pass/comments01.dat t/html5lib-pass/doctype01.dat t/html5lib-pass/entities01.dat t/html5lib-pass/html5test-com.dat t/html5lib-pass/inbody01.dat t/html5lib-pass/isindex.dat t/html5lib-pass/pending-spec-changes-plain-text-unsafe.dat t/html5lib-pass/pending-spec-changes.dat t/html5lib-pass/scriptdata01.dat t/html5lib-pass/scripted/adoption01.dat t/html5lib-pass/scripted/ark.dat t/html5lib-pass/scripted/webkit01.dat t/html5lib-pass/tables01.dat t/html5lib-pass/tests10.dat t/html5lib-pass/tests12.dat t/html5lib-pass/tests15.dat t/html5lib-pass/tests17.dat t/html5lib-pass/tests18.dat t/html5lib-pass/tests19.dat t/html5lib-pass/tests2.dat t/html5lib-pass/tests20.dat t/html5lib-pass/tests24.dat t/html5lib-pass/tests25.dat t/html5lib-pass/tests26.dat t/html5lib-pass/tests3.dat t/html5lib-pass/tests4.dat t/html5lib-pass/tests7.dat t/html5lib-pass/tests8.dat t/html5lib-pass/tricky01.dat t/html5lib-pass/webkit01.dat t/html5lib-pass/webkit02.dat t/lib/Test/HTTP/Server.pm t/rt-79019.t TODO SIGNATURE Public-key signature (added by MakeMaker) 
������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/SIGNATURE�������������������������������������������������������������������0000664�0001750�0001750�00000020630�12166545250�013753� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������This file contains message digests of all files listed in MANIFEST, signed via the Module::Signature module, version 0.70. To verify the content in this distribution, first make sure you have Module::Signature installed, then type: % cpansign -v It will check each file's integrity, as well as the signature's validity. If "==> Signature verified OK! <==" is not displayed, the distribution may already have been compromised, and you should not run its Makefile.PL or Build.PL. 
-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 SHA1 ecca60615e67156c5d94a16ece995ac1c6bc2e66 COPYRIGHT SHA1 b75971ea5b047b4c393f8ae46b722c8be8d8ac64 CREDITS SHA1 a8fa4559bf87a9a9aa49672d7812a8a62fa0e07d Changes SHA1 6454f6519d74bccf17b5a1bae5871ba56d57b23b LICENSE SHA1 f7b079f72a76998cd046a484a31829cdec9c8159 MANIFEST SHA1 a88932b586b2cfefc3aebed323aa3a2cb127f43b META.ttl SHA1 7bae71cc8c4db8d1b06e54a439c125a04b715610 META.yml SHA1 7150e5e086ef493e1e527a1eeec44a8344b80db6 Makefile.PL SHA1 04a8ce76a4a59fe5bce5dbe21414e8f0b02562c5 NEWS SHA1 5a4655fd72a4f6c594835655dc02f47893ee7db0 README SHA1 867d474416e190e48bc4861861b69063b23d2000 TODO SHA1 469b204f58fe3cc35774b40e017ef5df46d65cec bin/html2xhtml SHA1 1cf426fcf6ddd1e59ff735afa1822de423735e62 bin/html5debug SHA1 8002a7d30f735cdf74fdfd4dc7cf1bc73472e73a examples/charsets.pl SHA1 7afb17f9288ade65fd6feaeb8e0fb10d2060d33b examples/html/iso-8859-15.html SHA1 5f4244551518a85def4539fad6e192ee4f063a4b examples/html/utf-16.html SHA1 f01e6078e5a9fa52a23b0f0c32e1baea16810873 examples/html/utf-8.html SHA1 06c410f05488c1612ed66b06d3a86b2580581e4a inc/Module/AutoInstall.pm SHA1 8a924add836b60fb23b25c8506d45945e02f42f4 inc/Module/Install.pm SHA1 61ab1dd37e33ddbe155907ce51df8a3e56ac8bbf inc/Module/Install/AutoInstall.pm SHA1 c04f94f91fa97b9f8cfb5a36071098ab0e6c78e3 inc/Module/Install/AutoManifest.pm SHA1 2d0fad3bf255f8c1e7e1e34eafccc4f595603ddc inc/Module/Install/Base.pm SHA1 f0e01fff7d73cd145fbf22331579918d4628ddb0 inc/Module/Install/Can.pm SHA1 9e4cc10c7f138a3f2c60d9ee1f34261f0d8c5aae inc/Module/Install/Contributors.pm SHA1 7328966e4fda0c8451a6d3850704da0b84ac1540 inc/Module/Install/Fetch.pm SHA1 66d3d335a03492583a3be121a7d888f63f08412c inc/Module/Install/Include.pm SHA1 b62ca5e2d58fa66766ccf4d64574f9e1a2250b34 inc/Module/Install/Makefile.pm SHA1 1aa925be410bb3bfcd84a16985921f66073cc1d2 inc/Module/Install/Metadata.pm SHA1 3b9281ddf7dd6d6f5de0a9642c69333023193c80 inc/Module/Install/Package.pm SHA1 
4d793c044726e06fe35d8d129b76da2803377f92 inc/Module/Install/Scripts.pm SHA1 41f76ff6b39368a65da86377e43b34bacc2fb0eb inc/Module/Install/TrustMetaYml.pm SHA1 e4196994fa75e98bdfa2be0bdeeffef66de88171 inc/Module/Install/Win32.pm SHA1 c3a6d0d5b84feb3280622e9599e86247d58b0d18 inc/Module/Install/WriteAll.pm SHA1 26d58a041cd6b3d21db98b32e8fd1841aae21204 inc/Module/Package.pm SHA1 5a2f84a7a82eee0d1e5ed3628fe503a608efc10f inc/Module/Package/Dist/RDF.pm SHA1 feb933cefe2e3762e8322bd6071a2499f3440da1 inc/YAML/Tiny.pm SHA1 d295107c44e49ee10359e7a2791d93eb72715486 lib/HTML/HTML5/Parser.pm SHA1 4fac823d3cefa413def8a7fbea608e1b3779665c lib/HTML/HTML5/Parser/Charset/DecodeHandle.pm SHA1 745d72d906b636da9b16ac7dce35a72bede52833 lib/HTML/HTML5/Parser/Charset/Info.pm SHA1 fcad394c577aec2c3aadcf93adc7905dbe116841 lib/HTML/HTML5/Parser/Charset/UnicodeChecker.pm SHA1 e077fe86a0c023c65399725ba3edf43677fb743d lib/HTML/HTML5/Parser/Charset/UniversalCharDet.pm SHA1 e4a13ff73a03511c5f7c3457ee3b695b816994e0 lib/HTML/HTML5/Parser/Charset/WebLatin1.pm SHA1 6ffb54a0a7645cb1361466ffd09a376891c9804e lib/HTML/HTML5/Parser/Charset/WebThai.pm SHA1 4fee058fdfc9945bab9770d700f069b95d50f44c lib/HTML/HTML5/Parser/Error.pm SHA1 f0c555e1985624ade5e049e658a800fc887f6008 lib/HTML/HTML5/Parser/TagSoupParser.pm SHA1 33a3e43f2a22be55e5a6a0ac940ab586b878b047 lib/HTML/HTML5/Parser/Tokenizer.pm SHA1 6dee0e807c205dec740fce8f3f7dd98a0e8ae9f0 lib/HTML/HTML5/Parser/UA.pm SHA1 732b8eee13f701f526754d3588170fc9e65d0113 meta/changes.ttl SHA1 225231ee459879a71bd63497c73a92cf51a6c36d meta/doap.ttl SHA1 45ea4b576b0d89249d196851fcd35dfae5d06abc meta/makefile.ttl SHA1 6423d7e52e69cd4aa7219f415ca619b6474b98dc meta/rt-bugs.ttl SHA1 7857172ce6dd2a878530a09b43b31756f678e9ff t/01basic.t SHA1 9718fe7b9c7974d2dc30900fe6e8573307cc61c4 t/02html5.t SHA1 da9d68d050e78d3a552de95a71ca2d52c586dd99 t/03html4.t SHA1 5a9aca99d75364492622df5da405c59e567dc05b t/04fragments.t SHA1 ae192a1df4667f3d9a8e16f415e8c6bcee3d70c4 t/05origins.t SHA1 
bd427bda11247c67e56be725d8a948fa37bccc98 t/06xlxdsln.t SHA1 c0f4856b9d20dbe16d2853349912a9e03ed279a3 t/07ua.t SHA1 f9304ab4d114621f3648cf36d78482ad6e4d7224 t/08ua-lwp.t SHA1 318ea1622c69eb54ec2b88faefbbc8fbfb53f9a8 t/99html5lib.t SHA1 e6e5a36404f093cdf57d3cf360a96f34412bfd0d t/html5lib-fail/domjs-unsafe.dat SHA1 294f7e6b17472f76dd25f70baf3aeec77fcad92f t/html5lib-fail/entities02.dat SHA1 bfa9e8466f05c3f458ce98103f43b024806e318d t/html5lib-fail/plain-text-unsafe.dat SHA1 1881f98a139798f8f7ba88d61bf3cc83c947c156 t/html5lib-fail/tests1.dat SHA1 e762377e3673e9156342db8a2c39e2dc26f4cf8e t/html5lib-fail/tests11.dat SHA1 b008f25560c67805dac61d4fabe34abfbeffd4b8 t/html5lib-fail/tests14.dat SHA1 5debed1629d20da84213147dd60b47a9db1e7e89 t/html5lib-fail/tests16.dat SHA1 8913ac300703dd2da9559847346ecb1f0e9a62d2 t/html5lib-fail/tests21.dat SHA1 96374cc1c7376924dae67b457c15bb5c4c84fead t/html5lib-fail/tests22.dat SHA1 73b86bc207596a884cfecda33bf76e338fd6baed t/html5lib-fail/tests23.dat SHA1 c52a4e03cf207bfd21db35033319942a4cad8495 t/html5lib-fail/tests5.dat SHA1 5c1e3eca81a421e004868b2be067c59eb64ce0f8 t/html5lib-fail/tests6.dat SHA1 aacc3803e59c2f8dcc596f7d78309e926b1abaae t/html5lib-fail/tests9.dat SHA1 b833dfdb96281a4278d086f2fd6ee8efcf6b816c t/html5lib-fail/tests_innerHTML_1.dat SHA1 69898621c4b7805869227c5e0a582efa08bac9f1 t/html5lib-pass/adoption01.dat SHA1 6327fc294e6f8cc5e3384eabd5631374e9eec9da t/html5lib-pass/adoption02.dat SHA1 09bfabd914faef1da9447645502ffccc45ad0eed t/html5lib-pass/comments01.dat SHA1 25ae1ba311590a0b58d3c74eb5d329c25cf3c65a t/html5lib-pass/doctype01.dat SHA1 b7f94b81a062feccccc320ad0d359d3922ce2527 t/html5lib-pass/entities01.dat SHA1 dc20b67f5dddb1c510bcd0ed897bb4360ef4a5eb t/html5lib-pass/html5test-com.dat SHA1 e726c15952f1cf2b313754eaaaaddb5d89c1bc5a t/html5lib-pass/inbody01.dat SHA1 35150ba0e2a10ee3ec9e90226b1918cc925dc53a t/html5lib-pass/isindex.dat SHA1 2b88098aecd040ffa244db17501b63b1ff3116a8 
t/html5lib-pass/pending-spec-changes-plain-text-unsafe.dat SHA1 d6d7e4a8fcabe248398f38a8851255032c563100 t/html5lib-pass/pending-spec-changes.dat SHA1 3d98f51cd13989a52f0056b1345052a42bc6d322 t/html5lib-pass/scriptdata01.dat SHA1 2785ff3daee2434a396e33ba3a2e0b454a2cad29 t/html5lib-pass/scripted/adoption01.dat SHA1 b09e7cc786c2c38b3c25c8f7e2b9a39ab44c7559 t/html5lib-pass/scripted/ark.dat SHA1 b67d186fd109193406c916bc3a07035db760ef2a t/html5lib-pass/scripted/webkit01.dat SHA1 9ddf73a229514e33af1d15ad8bd444d376f5d9b5 t/html5lib-pass/tables01.dat SHA1 1eb200806dfea18342685ca1e7a2420ef7a8bbb3 t/html5lib-pass/tests10.dat SHA1 a4d6e45d8672bbced4e0263c7d45e430411d3daa t/html5lib-pass/tests12.dat SHA1 aacd3c1b89558aa14aeab622e592e39c400e3407 t/html5lib-pass/tests15.dat SHA1 8ed6c24f5fef57a5235a6dd2063e92262d55d27f t/html5lib-pass/tests17.dat SHA1 267c6ed51c40b0fcdfdc8b3a7fd4bcfb2e5e577b t/html5lib-pass/tests18.dat SHA1 f425496c96b2ad8c9e3448328e299cf2c03bdb6b t/html5lib-pass/tests19.dat SHA1 e41f1c5cb3732712f0866aeb5bcc8e04ccc2892c t/html5lib-pass/tests2.dat SHA1 06e13e6e717fcc691fa4b1b51d6cf6adb6a7ee6c t/html5lib-pass/tests20.dat SHA1 bc2d3e6e8a28acee60e4239db78c5c5516f91fd7 t/html5lib-pass/tests24.dat SHA1 f2d17949f7332395f553db1556cbf1f9ffadac59 t/html5lib-pass/tests25.dat SHA1 4f94c0d875f7fb73fbccd9ec3d01f70b8b4ac246 t/html5lib-pass/tests26.dat SHA1 ba0de70c79bb7d0e75445e84b03022a6dce7ecdc t/html5lib-pass/tests3.dat SHA1 a8c1f8ebf5886b0df5002a8f69ceb2405c60c66a t/html5lib-pass/tests4.dat SHA1 1a23bbac97d6bf0c34b5d67957183ab001a5a7b0 t/html5lib-pass/tests7.dat SHA1 532f635a6b431cbcac37cb9189c1a5a58790b2c0 t/html5lib-pass/tests8.dat SHA1 d5d5e56053a73a71f0e89030191efa86a139e9c8 t/html5lib-pass/tricky01.dat SHA1 202c121b8890c4b1fc4310cb43ba85b17500b732 t/html5lib-pass/webkit01.dat SHA1 42bfb8fbb4aea358f6ab4691c8e143a7bff5357c t/html5lib-pass/webkit02.dat SHA1 c145c4023e4f8a015b0110476aedd8deccac81e2 t/lib/Test/HTTP/Server.pm SHA1 f056047c0e22094abfbafc98e54d6ef6a9332f70 
t/rt-79019.t -----BEGIN PGP SIGNATURE----- Version: GnuPG v1.4.12 (GNU/Linux) iEYEARECAAYFAlHayqgACgkQzr+BKGoqfTlgNwCgqa/2SdqGIeqdTEfHf/6gwzQm vfMAnjYauVm7oZoo6L5ioirESw56pSoZ =U3P9 -----END PGP SIGNATURE----- ��������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/META.ttl��������������������������������������������������������������������0000644�0001750�0001750�00000046036�12166545167�013757� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������@prefix cpan: <http://purl.org/NET/cpan-uri/person/> . @prefix cpant: <http://purl.org/NET/cpan-uri/terms#> . @prefix dbug: <http://ontologi.es/doap-bugs#> . @prefix dc: <http://purl.org/dc/terms/> . @prefix dcs: <http://ontologi.es/doap-changeset#> . @prefix dist: <http://purl.org/NET/cpan-uri/dist/HTML-HTML5-Parser/> . @prefix doap: <http://usefulinc.com/ns/doap#> . @prefix foaf: <http://xmlns.com/foaf/0.1/> . @prefix nfo: <http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#> . @prefix owl: <http://www.w3.org/2002/07/owl#> . @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . @prefix status: <http://purl.org/NET/cpan-uri/rt/status/> . @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . 
dist:project a doap:Project; doap:name "HTML-HTML5-Parser"; cpant:install_script [ nfo:fileName "bin/html2xhtml" ]; cpant:install_script [ nfo:fileName "bin/html5debug" ]; cpant:perl_version_from _:B1; cpant:readme_from _:B1; cpant:requires "HTML::HTML5::Entities 0.002"; cpant:requires "HTTP::Tiny"; cpant:requires "IO::HTML"; cpant:requires "Scalar::Util"; cpant:requires "Try::Tiny"; cpant:requires "URI::file"; cpant:requires "XML::LibXML 1.94"; cpant:requires "XML::LibXML::Devel"; cpant:test_requires "Test::More 0.61"; cpant:version_from _:B1; dbug:issue <http://purl.org/NET/cpan-uri/rt/ticket/55845>; dbug:issue <http://purl.org/NET/cpan-uri/rt/ticket/65517>; dbug:issue <http://purl.org/NET/cpan-uri/rt/ticket/75809>; dbug:issue <http://purl.org/NET/cpan-uri/rt/ticket/79019>; doap:bug-database <http://rt.cpan.org/Dist/Display.html?Queue=HTML-HTML5-Parser>; doap:category [ rdfs:label "Web"@en ]; doap:category [ rdfs:label "HTML"@en ]; doap:category [ rdfs:label "HTML5"@en ]; doap:created "2009-11-26"^^xsd:date; doap:developer dist:dev-wakaba; doap:developer <http://tobyinkster.co.uk/#i>; doap:documenter <http://tobyinkster.co.uk/#i>; doap:download-page <https://metacpan.org/release/HTML-HTML5-Parser>; doap:homepage <https://metacpan.org/release/HTML-HTML5-Parser>; doap:license <http://dev.perl.org/licenses/>; doap:maintainer <http://tobyinkster.co.uk/#i>; doap:programming-language "Perl"; doap:release dist:v_0-00_01; doap:release dist:v_0-01; doap:release dist:v_0-02; doap:release dist:v_0-03; doap:release dist:v_0-04; doap:release dist:v_0-100; doap:release dist:v_0-101; doap:release dist:v_0-102; doap:release dist:v_0-103; doap:release dist:v_0-104; doap:release dist:v_0-105; doap:release dist:v_0-106; doap:release dist:v_0-107; doap:release dist:v_0-108; doap:release dist:v_0-109; doap:release dist:v_0-110; doap:release dist:v_0-200; doap:release dist:v_0-202; doap:release dist:v_0-204; doap:release dist:v_0-206; doap:release dist:v_0-208; doap:release 
dist:v_0-300; doap:release dist:v_0-301; doap:repository [ a doap:SVNRepository; doap:browse <http://goddamn.co.uk/svn-web/perlmods/browse/HTML-HTML5-Parser/>; ]; doap:shortdesc "parse HTML reliably"@en; doap:tester <http://tobyinkster.co.uk/#i>. dist:v_0-00_01 a doap:Version; rdfs:label "Developer preview"@en; dc:issued "2009-12-01"^^xsd:date; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.00_01.tar.gz>; doap:revision "0.00_01"^^xsd:string. dist:v_0-01 a doap:Version; rdfs:label "Original version"@en; dc:issued "2009-12-03"^^xsd:date; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.01.tar.gz>; doap:revision "0.01"^^xsd:string. dist:v_0-02 a doap:Version; dc:issued "2009-12-16"^^xsd:date; dcs:changeset [ dcs:item [ rdfs:label "Replace Inline::Python encoding detection with weaker, but native Perl HTML::Encoding package."@en; ]; dcs:item [ a dcs:Addition; a dcs:Packaging; rdfs:label "Bundle the html2xhtml tool."@en; ]; dcs:versus dist:v_0-01; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.02.tar.gz>; doap:revision "0.02"^^xsd:string. dist:v_0-03 a doap:Version; dc:issued "2010-01-15"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Module didn't use URI::file properly."@en; dcs:blame [ foaf:nick "shellac" ]; ]; dcs:item [ a dcs:Packaging; a dcs:Update; rdfs:label "Upgrade distribution to my new packaging regime (auto-generated changelogs, etc)"@en; ]; dcs:item [ a dcs:Documentation; a dcs:Update; rdfs:label "Copyright 2010."@en; ]; dcs:versus dist:v_0-02; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.03.tar.gz>; doap:revision "0.03"^^xsd:string. 
dist:v_0-04 a doap:Version; dc:issued "2010-04-21"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Update; rdfs:label "Catch up to revision cf2c0df8a6dfb50fee923dfb21b14c83f282ccdc (2010-02-28) upstream."@en; ]; dcs:versus dist:v_0-03; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.04.tar.gz>; doap:revision "0.04"^^xsd:string. dist:v_0-100 a doap:Version; dc:issued "2010-06-23"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Minor bugfixes."@en ]; dcs:versus dist:v_0-04; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.100.tar.gz>; doap:revision "0.100"^^xsd:string. dist:v_0-101 a doap:Version; dc:issued "2010-06-30"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "UTF-8 fix."@en; dcs:fixes [ rdfs:label "Wide characters in DOM tree."@en; dbug:reporter cpan:gwilliams; ]; ]; dcs:versus dist:v_0-100; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.101.tar.gz>; doap:revision "0.101"^^xsd:string. dist:v_0-102 a doap:Version; dc:issued "2011-01-19"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Fix source_line method."@en; ]; dcs:item [ a dcs:Update; rdfs:label "Catch up to revision f2c921a886ab0b3dfb8d21b82525e98a4a921ad4 (2010-10-11) upstream."@en; ]; dcs:item [ a dcs:Addition; rdfs:label "Allow <object> element to appear in <head> if document has an HTML4 doctype. This is a willful violation of the HTML5 parsing algorithm. (The <object> may have <param> elements as children, as well as any children that would normally be allowed in the <head> of the document, such as <meta>; any other content is treated as the beginning of the <body>, and thus closes <object> and <head>. 
That's slightly looser than the HTML 4 spec which says only <param> should be used, but stricter than the HTML 4 DTD which allows pretty much anything in there!)"@en; ]; dcs:item [ a dcs:Addition; rdfs:label "Support <figcaption> element."@en; ]; dcs:item [ a dcs:Addition; rdfs:label "Support <summary> element."@en; ]; dcs:versus dist:v_0-101; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.102.tar.gz>; doap:revision "0.102"^^xsd:string. dist:v_0-103 a doap:Version; dc:issued "2011-02-09"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Documentation; a dcs:Update; rdfs:label "Copyright 2011."@en; ]; dcs:item [ a dcs:Bugfix; rdfs:label "TagSoupParser.pm called a method that is renamed between this distribution and upstream using its upstream name."@en; ]; dcs:versus dist:v_0-101; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.103.tar.gz>; doap:revision "0.103"^^xsd:string. dist:v_0-104 a doap:Version; dc:issued "2011-09-22"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Update; rdfs:label "Catch up to revision d81fcb920a1a3c351149cd66a64bf1b8ae14a172 (2011-08-21) upstream."@en; ]; dcs:item [ a dcs:Addition; rdfs:label "Support <track> element."@en; ]; dcs:item [ a dcs:Addition; rdfs:label "Some error handling stuff."@en; ]; dcs:versus dist:v_0-103; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.104.tar.gz>; doap:revision "0.104"^^xsd:string. 
dist:v_0-105 a doap:Version; dc:issued "2011-10-07"^^xsd:date; dcs:changeset [ dcs:item [ rdfs:label "HTML::HTML5::Parser::Error overloads stringification."@en; ]; dcs:item [ a dcs:Packaging; rdfs:label "Module::Package::RDF."@en; ]; dcs:item [ a dcs:Addition; rdfs:label "Bundle 'html5debug' script."@en; ]; dcs:item [ rdfs:label "use HTML::HTML5::Entities"@en ]; dcs:versus dist:v_0-104; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.105.tar.gz>; doap:revision "0.105"^^xsd:string. dist:v_0-106 a doap:Version; dc:issued "2011-10-10"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Tokenizer.pm was still trying to require NamedEntityList.pm."@en; ]; dcs:versus dist:v_0-105; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.106.tar.gz>; doap:revision "0.106"^^xsd:string. dist:v_0-107 a doap:Version; dc:issued "2011-10-20"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "parse_file wasn't accepting relative file names"@en; ]; dcs:item [ a dcs:Addition; rdfs:label "html2xhtml now reads from STDIN by default."@en; ]; dcs:item [ a dcs:Addition; rdfs:label "html2xhtml can output to a file."@en; ]; dcs:versus dist:v_0-106; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.107.tar.gz>; doap:revision "0.107"^^xsd:string. dist:v_0-108 a doap:Version; dc:issued "2012-01-18"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Crashed when generating an error message about some missing closing tags."@en; dcs:blame cpan:audreyt; ]; dcs:item [ a dcs:Documentation; a dcs:Update; rdfs:label "Copyright 2012."@en; ]; dcs:item [ a dcs:Update; rdfs:label "Provide load_xml and load_html methods for compatibility with XML::LibXML 1.70 and newer."@en; ]; dcs:versus dist:v_0-107; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.108.tar.gz>; doap:revision "0.108"^^xsd:string. 
dist:v_0-109 a doap:Version; rdfs:label "Yes, they are come, but they are not past."@en; dc:issued "2012-03-15"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Use correct case for MathML's definitionURL attribute."@en; ]; dcs:item [ a dcs:Bugfix; rdfs:label "Fix several fatal errors on infrequently used code paths in TagSoupParser.pm."@en; ]; dcs:item [ a dcs:Packaging; rdfs:label "Bundle test cases from html5lib."@en; ]; dcs:item [ a dcs:Addition; rdfs:label "Provide parse_balanced_chunk to parse HTML fragments."@en; ]; dcs:item [ a dcs:Addition; rdfs:label "Provide dtd_element method to get more information about the DTD."@en; ]; dcs:item [ rdfs:label "Make source_line method work more reliably. This requires XML::LibXML::Devel, and thus a fairly recent version of XML-LibXML."@en; ]; dcs:item [ a dcs:Addition; rdfs:label "Nodes now have an implictness flag (returned by source_line called in a list context)."@en; ]; dcs:versus dist:v_0-108; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.109.tar.gz>; doap:revision "0.109"^^xsd:string. dist:v_0-110 a doap:Version; dc:issued "2012-03-20"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Removed a use of smart match which was breaking Perl 5.8.x compatibility."@en; dcs:fixes <http://purl.org/NET/cpan-uri/rt/ticket/75809>; ]; dcs:item [ rdfs:label "use XML::LibXML::Devel::SetLineNumber if it is available."; ]; dcs:item [ rdfs:label "Text nodes should keep line numbers and column numbers too."; ]; dcs:item [ rdfs:label "Comment nodes should now keep their line numbers and column numbers."; ]; dcs:versus dist:v_0-109; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.110.tar.gz>; doap:revision "0.110"^^xsd:string. 
dist:v_0-200 a doap:Version; dc:issued "2012-06-13"^^xsd:date; dcs:changeset [ dcs:item [ rdfs:label "Suppress warnings about invalid Unicode code points."; ]; dcs:item [ rdfs:label "Drop dependency on Error.pm; use Try::Tiny instead."; ]; dcs:item [ rdfs:label "Drop dependency on HTML::Encoding; use IO::HTML instead."; ]; dcs:item [ rdfs:label "Passing a couple more of the html5lib test suite files."; ]; dcs:versus dist:v_0-109; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.200.tar.gz>; doap:revision "0.200"^^xsd:string. dist:v_0-202 a doap:Version; dc:issued "2012-06-27"^^xsd:date; dcs:changeset [ dcs:item [ rdfs:label "Drop dependency on LWP::UserAgent in favour of HTTP::Tiny which is part of core since 5.14."; ]; dcs:versus dist:v_0-200; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.202.tar.gz>; doap:revision "0.202"^^xsd:string. dist:v_0-204 a doap:Version; dc:issued "2012-06-29"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Stop using defined-or operator in HTML::HTML5::Parser::UA, as it doesn't work in Perl 5.8."; ]; dcs:versus dist:v_0-202; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.204.tar.gz>; doap:revision "0.204"^^xsd:string. dist:v_0-206 a doap:Version; dc:issued "2012-06-30"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Remove 'use 5.010' from t/07ua.t."; ]; dcs:item [ a dcs:Bugfix; rdfs:label "Disable t/07ua.t and t/08ua-lwp.t on Windows because Test::HTTP::Server doesn't work on that platform."; rdfs:seeAlso <https://rt.cpan.org/Ticket/Display.html?id=78118>; ]; dcs:versus dist:v_0-204; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.206.tar.gz>; doap:revision "0.206"^^xsd:string. 
dist:v_0-208 a doap:Version; dc:issued "2012-12-06"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Minor fix re LWP-independence."; ]; dcs:item [ a dcs:Bugfix; rdfs:label "If two <html> tags were in the same file, attributes on the second <html> element could cause crashes."@en; dcs:fixes <http://purl.org/NET/cpan-uri/rt/ticket/79019>; dcs:thanks [ foaf:name "Luben Karavelov" ]; ]; dcs:versus dist:v_0-206; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.208.tar.gz>; doap:revision "0.208"^^xsd:string. dist:v_0-300 a doap:Version; dc:issued "2013-07-06"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Fix many major memory leaks."; dcs:blame [ foaf:name "Dorian Taylor"; foaf:nick "DORIAN"; ]; dcs:fixes <tdb:2013:https://github.com/tobyink/p5-html-html5-parser/pull/1>; ]; dcs:item [ a dcs:Change; rdfs:label "Memory leak fix necessitated some API changes; in particular some methods which were available as class methods are now object methods only."@en; ]; dcs:item [ a dcs:Packaging; rdfs:label "t/99html5lib.t now uses Moo instead of Mo."@en; ]; dcs:versus dist:v_0-208; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.300.tar.gz>; doap:revision "0.300"^^xsd:string. dist:v_0-301 a doap:Version; dc:issued "2013-07-08"^^xsd:date; dcs:changeset [ dcs:item [ a dcs:Bugfix; rdfs:label "Fix t/99html5lib.t still using Mo in one place."@en; dcs:fixes <http://purl.org/NET/cpan-uri/rt/ticket/86774>; dcs:thanks [ foaf:name "Fitz Elliott"; foaf:mbox <mailto:fitz.elliott@gmail.com>; ]; ]; dcs:versus dist:v_0-300; ]; doap:file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.301.tar.gz>; doap:revision "0.301"^^xsd:string. dist:dev-wakaba a foaf:Person; foaf:name "Wakaba"; foaf:page <http://suika.fam.cx/>. 
<http://tobyinkster.co.uk/#i> a foaf:Person; foaf:name "Toby Inkster"; foaf:homepage <http://tobyinkster.co.uk/>; foaf:mbox <mailto:tobyink@cpan.org>; foaf:page <https://metacpan.org/author/TOBYINK>; owl:sameAs cpan:tobyink. cpan:audreyt foaf:nick "audreyt". <http://purl.org/NET/cpan-uri/rt/ticket/55845> a dbug:Issue; rdfs:label "Warning produced when run under perl -w"; dbug:assignee _:B2; dbug:id "55845"^^xsd:string; dbug:page <https://rt.cpan.org/Ticket/Display.html?id=55845>; dbug:reporter _:B3; dbug:status status:resolved; dc:created "2010-03-23T16:50:39"^^xsd:dateTime; dc:modified "2010-04-23T13:00:02"^^xsd:dateTime. <http://purl.org/NET/cpan-uri/rt/ticket/65517> a dbug:Issue; rdfs:label "Please don't require Module::Signature"; dbug:assignee _:B2; dbug:id "65517"^^xsd:string; dbug:page <https://rt.cpan.org/Ticket/Display.html?id=65517>; dbug:reporter _:B3; dbug:status status:resolved; dc:created "2011-02-07T07:25:30"^^xsd:dateTime; dc:modified "2012-01-18T13:14:13"^^xsd:dateTime. <http://purl.org/NET/cpan-uri/rt/ticket/75809> a dbug:Issue; rdfs:label "LibXML version check uses smart match"; dbug:assignee _:B2; dbug:id "75809"^^xsd:string; dbug:page <https://rt.cpan.org/Ticket/Display.html?id=75809>; dbug:reporter [ a foaf:Agent; foaf:mbox <mailto:mail@tobyinkster.co.uk>; ]; dbug:status status:resolved; dc:created "2012-03-16T10:30:03"^^xsd:dateTime; dc:modified "2012-03-19T09:12:28"^^xsd:dateTime. <http://purl.org/NET/cpan-uri/rt/ticket/79019> a dbug:Issue; rdfs:label "Failure mode of TagSoupParser"; dbug:assignee _:B2; dbug:id "79019"^^xsd:string; dbug:page <https://rt.cpan.org/Ticket/Display.html?id=79019>; dbug:reporter [ a foaf:Agent; foaf:mbox <mailto:karavelov@mail.bg>; ]; dbug:status status:open; dc:created "2012-08-16T15:47:33"^^xsd:dateTime; dc:modified "2012-08-18T15:56:56"^^xsd:dateTime. _:B1 nfo:fileName "lib/HTML/HTML5/Parser.pm". _:B3 a foaf:Agent; foaf:mbox <mailto:DOUGDUDE@cpan.org>. _:B2 a foaf:Agent; foaf:nick "TOBYINK". 
��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/CREDITS���������������������������������������������������������������������0000644�0001750�0001750�00000000622�12166545170�013505� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������Maintainer: - Toby Inkster <mailto:tobyink@cpan.org> Contributor: - shellac - audreyt (cpan:AUDREYT) <mailto:audreyt@cpan.org> - TOBYINK - Wakaba - Dorian Taylor Thanks: - Anon <mailto:DOUGDUDE@cpan.org> - Anon <mailto:karavelov@mail.bg> - Luben Karavelov - Anon <mailto:mail@tobyinkster.co.uk> - Fitz Elliott <mailto:fitz.elliott@gmail.com> - GWILLIAMS (cpan:GWILLIAMS) <mailto:gwilliams@cpan.org> ��������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/LICENSE���������������������������������������������������������������������0000644�0001750�0001750�00000043774�12166545155�013514� 0����������������������������������������������������������������������������������������������������ustar 
�tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������This software is copyright (c) 2013 by Toby Inkster <tobyink@cpan.org>, Wakaba. This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. Terms of the Perl programming language system itself a) the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version, or b) the "Artistic License" --- The GNU General Public License, Version 1, February 1989 --- This software is Copyright (c) 2013 by Toby Inkster <tobyink@cpan.org>, Wakaba. This is free software, licensed under: The GNU General Public License, Version 1, February 1989 GNU GENERAL PUBLIC LICENSE Version 1, February 1989 Copyright (C) 1989 Free Software Foundation, Inc. 51 Franklin St, Suite 500, Boston, MA 02110-1335 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The license agreements of most software companies try to keep users at the mercy of those companies. By contrast, our General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. The General Public License applies to the Free Software Foundation's software and to any other program whose authors commit to using it. You can use it for your programs, too. When we speak of free software, we are referring to freedom, not price. 
Specifically, the General Public License is designed to make sure that you have the freedom to give away or sell copies of free software, that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of a such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must tell them their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any work containing the Program or a portion of it, either verbatim or with modifications. 
Each licensee is addressed as "you". 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this General Public License and to the absence of any warranty; and give any other recipients of the Program a copy of this General Public License along with the Program. You may charge a fee for the physical act of transferring a copy. 2. You may modify your copy or copies of the Program or any portion of it, and copy and distribute such modifications under the terms of Paragraph 1 above, provided that you also do the following: a) cause the modified files to carry prominent notices stating that you changed the files and the date of any change; and b) cause the whole of any work that you distribute or publish, that in whole or in part contains the Program or any part thereof, either with or without modifications, to be licensed at no charge to all third parties under the terms of this General Public License (except that you may choose to grant warranty protection to some or all third parties, at your option). c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the simplest and most usual way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this General Public License. d) You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 
Mere aggregation of another independent work with the Program (or its derivative) on a volume of a storage or distribution medium does not bring the other work under the scope of these terms. 3. You may copy and distribute the Program (or a portion or derivative of it, under Paragraph 2) in object code or executable form under the terms of Paragraphs 1 and 2 above provided that you also do one of the following: a) accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Paragraphs 1 and 2 above; or, b) accompany it with a written offer, valid for at least three years, to give any third party free (except for a nominal charge for the cost of distribution) a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Paragraphs 1 and 2 above; or, c) accompany it with the information you received as to where the corresponding source code may be obtained. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form alone.) Source code for a work means the preferred form of the work for making modifications to it. For an executable file, complete source code means all the source code for all modules it contains; but, as a special exception, it need not include source code for modules which are standard libraries that accompany the operating system on which the executable file runs, or for standard header files or definitions files that accompany that operating system. 4. You may not copy, modify, sublicense, distribute or transfer the Program except as expressly provided under this General Public License. Any attempt otherwise to copy, modify, sublicense, distribute or transfer the Program is void, and will automatically terminate your rights to use the Program under this License. 
However, parties who have received copies, or rights to use copies, from you under this General Public License will not have their licenses terminated so long as such parties remain in full compliance. 5. By copying, distributing or modifying the Program (or any work based on the Program) you indicate your acceptance of this license to do so, and all its terms and conditions. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. 7. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of the license which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the license, you may choose any version ever published by the Free Software Foundation. 8. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 9. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 10. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS Appendix: How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to humanity, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. 
<one line to give the program's name and a brief idea of what it does.> Copyright (C) 19yy <name of author> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) 19xx name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (a program to direct compilers to make passes at assemblers) written by James Hacker. <signature of Ty Coon>, 1 April 1989 Ty Coon, President of Vice That's all there is to it! 
--- The Artistic License 1.0 --- This software is Copyright (c) 2013 by Toby Inkster <tobyink@cpan.org>, Wakaba. This is free software, licensed under: The Artistic License 1.0 The Artistic License Preamble The intent of this document is to state the conditions under which a Package may be copied, such that the Copyright Holder maintains some semblance of artistic control over the development of the package, while giving the users of the package the right to use and distribute the Package in a more-or-less customary fashion, plus the right to make reasonable modifications. Definitions: - "Package" refers to the collection of files distributed by the Copyright Holder, and derivatives of that collection of files created through textual modification. - "Standard Version" refers to such a Package if it has not been modified, or has been modified in accordance with the wishes of the Copyright Holder. - "Copyright Holder" is whoever is named in the copyright or copyrights for the package. - "You" is you, if you're thinking about copying or distributing this Package. - "Reasonable copying fee" is whatever you can justify on the basis of media cost, duplication charges, time of people involved, and so on. (You will not be required to justify it to the Copyright Holder, but only to the computing community at large as a market that must bear the fee.) - "Freely Available" means that no fee is charged for the item itself, though there may be fees involved in handling the item. It also means that recipients of the item may redistribute it under the same conditions they received it. 1. You may make and give away verbatim copies of the source form of the Standard Version of this Package without restriction, provided that you duplicate all of the original copyright notices and associated disclaimers. 2. You may apply bug fixes, portability fixes and other modifications derived from the Public Domain or from the Copyright Holder. 
A Package modified in such a way shall still be considered the Standard Version. 3. You may otherwise modify your copy of this Package in any way, provided that you insert a prominent notice in each changed file stating how and when you changed that file, and provided that you do at least ONE of the following: a) place your modifications in the Public Domain or otherwise make them Freely Available, such as by posting said modifications to Usenet or an equivalent medium, or placing the modifications on a major archive site such as ftp.uu.net, or by allowing the Copyright Holder to include your modifications in the Standard Version of the Package. b) use the modified Package only within your corporation or organization. c) rename any non-standard executables so the names do not conflict with standard executables, which must also be provided, and provide a separate manual page for each non-standard executable that clearly documents how it differs from the Standard Version. d) make other distribution arrangements with the Copyright Holder. 4. You may distribute the programs of this Package in object code or executable form, provided that you do at least ONE of the following: a) distribute a Standard Version of the executables and library files, together with instructions (in the manual page or equivalent) on where to get the Standard Version. b) accompany the distribution with the machine-readable source of the Package with your modifications. c) accompany any non-standard executables with their corresponding Standard Version executables, giving the non-standard executables non-standard names, and clearly documenting the differences in manual pages (or equivalent), together with instructions on where to get the Standard Version. d) make other distribution arrangements with the Copyright Holder. 5. You may charge a reasonable copying fee for any distribution of this Package. You may charge any fee you choose for support of this Package. 
You may not charge a fee for this Package itself. However, you may distribute this Package in aggregate with other (possibly commercial) programs as part of a larger (possibly commercial) software distribution provided that you do not advertise this Package as a product of your own. 6. The scripts and library files supplied as input to or produced as output from the programs of this Package do not automatically fall under the copyright of this Package, but belong to whomever generated them, and may be sold commercially, and may be aggregated with this Package. 7. C or perl subroutines supplied by you and linked into this Package shall not be considered part of this Package. 8. The name of the Copyright Holder may not be used to endorse or promote products derived from this software without specific prior written permission. 9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
The End ����HTML-HTML5-Parser-0.301/examples/�������������������������������������������������������������������0000755�0001750�0001750�00000000000�12166545247�014310� 5����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/examples/html/��������������������������������������������������������������0000755�0001750�0001750�00000000000�12166545247�015254� 5����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/examples/html/utf-16.html���������������������������������������������������0000644�0001750�0001750�00000000114�11734122437�017150� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<�m�e�t�a� �c�h�a�r�s�e�t�=�"�u�t�f�-�1�6�"�>� �<�p�>� 1�0�0�<�/�p�>� 
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/examples/html/iso-8859-15.html����������������������������������������������0000644�0001750�0001750�00000000051�11734121170�017547� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<meta charset="iso-8859-15"> <p>100</p> ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/examples/html/utf-8.html����������������������������������������������������0000644�0001750�0001750�00000000045�11734121200�017057� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������<meta charset="utf-8"> 
<p>€100</p> �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/examples/charsets.pl��������������������������������������������������������0000644�0001750�0001750�00000001211�11734122271�016440� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������use utf8::all; use HTML::HTML5::Parser; my $U = HTML::HTML5::Parser->load_html(location => 'examples/html/utf-8.html'); my $X = HTML::HTML5::Parser->load_html(location => 'examples/html/utf-16.html'); my $W = HTML::HTML5::Parser->load_html(location => 'examples/html/iso-8859-15.html'); print "UTF-8... ", $U->getElementsByTagName('p')->[0]->textContent, "\t", HTML::HTML5::Parser->charset($U), "\n"; print "UTF-16.. ", $X->getElementsByTagName('p')->[0]->textContent, "\t", HTML::HTML5::Parser->charset($X), "\n"; print "Western... 
", $W->getElementsByTagName('p')->[0]->textContent, "\t", HTML::HTML5::Parser->charset($W), "\n"; ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/Makefile.PL�����������������������������������������������������������������0000644�0001750�0001750�00000000052�11766045517�014441� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������use inc::Module::Package 'RDF:standard'; ��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/meta/�����������������������������������������������������������������������0000755�0001750�0001750�00000000000�12166545247�013420� 5����������������������������������������������������������������������������������������������������ustar 
�tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/meta/makefile.ttl�����������������������������������������������������������0000644�0001750�0001750�00000001540�11772523121�015707� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������# This file provides instructions for packaging. @prefix : <http://purl.org/NET/cpan-uri/terms#> . <http://purl.org/NET/cpan-uri/dist/HTML-HTML5-Parser/project> :perl_version_from _:main ; :version_from _:main ; :readme_from _:main ; :test_requires "Test::More 0.61" ; :requires "Try::Tiny" ; :requires "IO::HTML" ; :requires "HTML::HTML5::Entities 0.002" ; :requires "HTTP::Tiny" ; :requires "XML::LibXML 1.94" ; :requires "XML::LibXML::Devel" ; :requires "Scalar::Util" ; :requires "URI::file" ; :install_script _:html5debug , _:html2xhtml . _:main <http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#fileName> "lib/HTML/HTML5/Parser.pm" . _:html5debug <http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#fileName> "bin/html5debug" . _:html2xhtml <http://www.semanticdesktop.org/ontologies/2007/03/22/nfo#fileName> "bin/html2xhtml" . 
����������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/meta/doap.ttl���������������������������������������������������������������0000644�0001750�0001750�00000003316�12143760140�015055� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������@prefix : <http://usefulinc.com/ns/doap#> . @prefix author: <http://purl.org/NET/cpan-uri/person/> . @prefix dbug: <http://ontologi.es/doap-bugs#> . @prefix dcs: <http://ontologi.es/doap-changeset#> . @prefix dc: <http://purl.org/dc/terms/> . @prefix foaf: <http://xmlns.com/foaf/0.1/> . @prefix my: <http://purl.org/NET/cpan-uri/dist/HTML-HTML5-Parser/> . @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . @prefix toby: <http://tobyinkster.co.uk/#> . @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . my:project a :Project ; :name "HTML-HTML5-Parser" ; :shortdesc "parse HTML reliably"@en ; :programming-language "Perl" ; :homepage <https://metacpan.org/release/HTML-HTML5-Parser> ; :download-page <https://metacpan.org/release/HTML-HTML5-Parser> ; :bug-database <http://rt.cpan.org/Dist/Display.html?Queue=HTML-HTML5-Parser> ; :repository [ a :SVNRepository ; :browse <http://goddamn.co.uk/svn-web/perlmods/browse/HTML-HTML5-Parser/> ] ; :maintainer toby:i ; :developer toby:i , my:dev-wakaba ; :documenter toby:i ; :tester toby:i ; :created "2009-11-26"^^xsd:date ; :license <http://dev.perl.org/licenses/> ; :category [ rdfs:label "Web"@en ] , [ rdfs:label "HTML"@en ] , [ rdfs:label "HTML5"@en ] . 
toby:i a foaf:Person ; foaf:name "Toby Inkster" ; foaf:homepage <http://tobyinkster.co.uk/> ; foaf:page <https://metacpan.org/author/TOBYINK> ; foaf:mbox <mailto:tobyink@cpan.org> ; <http://www.w3.org/2002/07/owl#sameAs> author:tobyink . my:dev-wakaba a foaf:Person ; foaf:name "Wakaba" ; foaf:page <http://suika.fam.cx/>. ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/meta/rt-bugs.ttl������������������������������������������������������������0000644�0001750�0001750�00000005110�12014016467�015512� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������@prefix dbug: <http://ontologi.es/doap-bugs#> . @prefix dc: <http://purl.org/dc/terms/> . @prefix doap: <http://usefulinc.com/ns/doap#> . @prefix foaf: <http://xmlns.com/foaf/0.1/> . @prefix prio: <http://purl.org/NET/cpan-uri/rt/priority/> . @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . @prefix rt: <http://purl.org/NET/cpan-uri/rt/ticket/> . @prefix status: <http://purl.org/NET/cpan-uri/rt/status/> . @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . _:r1345330486r0 a foaf:Agent ; foaf:mbox <mailto:DOUGDUDE@cpan.org> . _:r1345330486r1 a foaf:Agent ; foaf:nick "TOBYINK" . _:r1345330487r2 a foaf:Agent ; foaf:mbox <mailto:mail@tobyinkster.co.uk> . _:r1345330487r3 a foaf:Agent ; foaf:mbox <mailto:karavelov@mail.bg> . 
<http://purl.org/NET/cpan-uri/dist/HTML-HTML5-Parser/project> dbug:issue <http://purl.org/NET/cpan-uri/rt/ticket/55845>, <http://purl.org/NET/cpan-uri/rt/ticket/65517>, <http://purl.org/NET/cpan-uri/rt/ticket/75809>, <http://purl.org/NET/cpan-uri/rt/ticket/79019> . <http://purl.org/NET/cpan-uri/rt/ticket/55845> dbug:assignee _:r1345330486r1 ; dbug:id "55845"^^xsd:string ; dbug:page <https://rt.cpan.org/Ticket/Display.html?id=55845> ; dbug:reporter _:r1345330486r0 ; dbug:status status:resolved ; dc:created "2010-03-23T16:50:39"^^xsd:dateTime ; dc:modified "2010-04-23T13:00:02"^^xsd:dateTime ; a dbug:Issue ; rdfs:label "Warning produced when run under perl -w" . <http://purl.org/NET/cpan-uri/rt/ticket/65517> dbug:assignee _:r1345330486r1 ; dbug:id "65517"^^xsd:string ; dbug:page <https://rt.cpan.org/Ticket/Display.html?id=65517> ; dbug:reporter _:r1345330486r0 ; dbug:status status:resolved ; dc:created "2011-02-07T07:25:30"^^xsd:dateTime ; dc:modified "2012-01-18T13:14:13"^^xsd:dateTime ; a dbug:Issue ; rdfs:label "Please don't require Module::Signature" . <http://purl.org/NET/cpan-uri/rt/ticket/75809> dbug:assignee _:r1345330486r1 ; dbug:id "75809"^^xsd:string ; dbug:page <https://rt.cpan.org/Ticket/Display.html?id=75809> ; dbug:reporter _:r1345330487r2 ; dbug:status status:resolved ; dc:created "2012-03-16T10:30:03"^^xsd:dateTime ; dc:modified "2012-03-19T09:12:28"^^xsd:dateTime ; a dbug:Issue ; rdfs:label "LibXML version check uses smart match" . <http://purl.org/NET/cpan-uri/rt/ticket/79019> dbug:assignee _:r1345330486r1 ; dbug:id "79019"^^xsd:string ; dbug:page <https://rt.cpan.org/Ticket/Display.html?id=79019> ; dbug:reporter _:r1345330487r3 ; dbug:status status:open ; dc:created "2012-08-16T15:47:33"^^xsd:dateTime ; dc:modified "2012-08-18T15:56:56"^^xsd:dateTime ; a dbug:Issue ; rdfs:label "Failure mode of TagSoupParser" . 
��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/meta/changes.ttl������������������������������������������������������������0000644�0001750�0001750�00000034004�12166544656�015561� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������@prefix : <http://usefulinc.com/ns/doap#> . @prefix author: <http://purl.org/NET/cpan-uri/person/> . @prefix dbug: <http://ontologi.es/doap-bugs#> . @prefix dcs: <http://ontologi.es/doap-changeset#> . @prefix dc: <http://purl.org/dc/terms/> . @prefix foaf: <http://xmlns.com/foaf/0.1/> . @prefix my: <http://purl.org/NET/cpan-uri/dist/HTML-HTML5-Parser/> . @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> . @prefix toby: <http://tobyinkster.co.uk/#> . @prefix xsd: <http://www.w3.org/2001/XMLSchema#> . my:project :release my:v_0-00_01 . my:v_0-00_01 a :Version ; dc:issued "2009-12-01"^^xsd:date ; :revision "0.00_01"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.00_01.tar.gz> ; rdfs:label "Developer preview"@en . my:project :release my:v_0-01 . my:v_0-01 a :Version ; dc:issued "2009-12-03"^^xsd:date ; :revision "0.01"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.01.tar.gz> ; rdfs:label "Original version"@en . 
my:project :release my:v_0-02 . my:v_0-02 a :Version ; dc:issued "2009-12-16"^^xsd:date ; :revision "0.02"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.02.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-01 ; dcs:item [ rdfs:label "Replace Inline::Python encoding detection with weaker, but native Perl HTML::Encoding package."@en ] , [ rdfs:label "Bundle the html2xhtml tool."@en ; a dcs:Addition , dcs:Packaging ] ] . my:project :release my:v_0-03 . my:v_0-03 a :Version ; dc:issued "2010-01-15"^^xsd:date ; :revision "0.03"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.03.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-02 ; dcs:item [ rdfs:label "Module didn't use URI::file properly."@en ; a dcs:Bugfix ; dcs:blame [ foaf:nick "shellac" ] ] , [ rdfs:label "Upgrade distribution to my new packaging regime (auto-generated changelogs, etc)"@en ; a dcs:Update , dcs:Packaging ] , [ rdfs:label "Copyright 2010."@en ; a dcs:Update , dcs:Documentation ] ] . my:project :release my:v_0-04 . my:v_0-04 a :Version ; dc:issued "2010-04-21"^^xsd:date ; :revision "0.04"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.04.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-03 ; dcs:item [ rdfs:label "Catch up to revision cf2c0df8a6dfb50fee923dfb21b14c83f282ccdc (2010-02-28) upstream."@en ; a dcs:Update ] ] . my:project :release my:v_0-100 . my:v_0-100 a :Version ; dc:issued "2010-06-23"^^xsd:date ; :revision "0.100"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.100.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-04 ; dcs:item [ rdfs:label "Minor bugfixes."@en ; a dcs:Bugfix ] ] . my:project :release my:v_0-101 . 
my:v_0-101 a :Version ; dc:issued "2010-06-30"^^xsd:date ; :revision "0.101"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.101.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-100 ; dcs:item [ rdfs:label "UTF-8 fix."@en ; a dcs:Bugfix ; dcs:fixes [ rdfs:label "Wide characters in DOM tree."@en ; dbug:reporter author:gwilliams ] ] ] . my:project :release my:v_0-102 . my:v_0-102 a :Version ; dc:issued "2011-01-19"^^xsd:date ; :revision "0.102"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.102.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-101 ; dcs:item [ rdfs:label "Fix source_line method."@en ; a dcs:Bugfix ] , [ rdfs:label "Catch up to revision f2c921a886ab0b3dfb8d21b82525e98a4a921ad4 (2010-10-11) upstream."@en ; a dcs:Update ] , [ rdfs:label "Allow <object> element to appear in <head> if document has an HTML4 doctype. This is a willful violation of the HTML5 parsing algorithm. (The <object> may have <param> elements as children, as well as any children that would normally be allowed in the <head> of the document, such as <meta>; any other content is treated as the beginning of the <body>, and thus closes <object> and <head>. That's slightly looser than the HTML 4 spec which says only <param> should be used, but stricter than the HTML 4 DTD which allows pretty much anything in there!)"@en ; a dcs:Addition ] , [ rdfs:label "Support <figcaption> element."@en ; a dcs:Addition ] , [ rdfs:label "Support <summary> element."@en ; a dcs:Addition ] ] . my:project :release my:v_0-103 . 
my:v_0-103 a :Version ; dc:issued "2011-02-09"^^xsd:date ; :revision "0.103"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.103.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-102 ; dcs:item [ rdfs:label "Copyright 2011."@en ; a dcs:Update , dcs:Documentation ] , [ rdfs:label "TagSoupParser.pm called a method that is renamed between this distribution and upstream using its upstream name."@en ; a dcs:Bugfix ] ] . my:project :release my:v_0-104 . 
my:v_0-107 a :Version ; dc:issued "2011-10-20"^^xsd:date ; :revision "0.107"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.107.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-106 ; dcs:item [ rdfs:label "parse_file wasn't accepting relative file names"@en ; a dcs:Bugfix ] ; dcs:item [ rdfs:label "html2xhtml now reads from STDIN by default."@en ; a dcs:Addition ] ; dcs:item [ rdfs:label "html2xhtml can output to a file."@en ; a dcs:Addition ] ] . my:project :release my:v_0-108 . my:v_0-108 a :Version ; dc:issued "2012-01-18"^^xsd:date ; :revision "0.108"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.108.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-107 ; dcs:item [ rdfs:label "Crashed when generating an error message about some missing closing tags."@en ; a dcs:Bugfix ; dcs:blame <http://purl.org/NET/cpan-uri/person/audreyt> ] ; dcs:item [ rdfs:label "Copyright 2012."@en ; a dcs:Update , dcs:Documentation ] ; dcs:item [ rdfs:label "Provide load_xml and load_html methods for compatibility with XML::LibXML 1.70 and newer."@en ; a dcs:Update ] ] . <http://purl.org/NET/cpan-uri/person/audreyt> foaf:nick "audreyt" . my:project :release my:v_0-109 . 
my:v_0-109 a :Version ; dc:issued "2012-03-15"^^xsd:date ; :revision "0.109"^^xsd:string ; rdfs:label "Yes, they are come, but they are not past."@en ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.109.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-108 ; dcs:item [ rdfs:label "Use correct case for MathML's definitionURL attribute."@en ; a dcs:Bugfix ] ; dcs:item [ rdfs:label "Fix several fatal errors on infrequently used code paths in TagSoupParser.pm."@en ; a dcs:Bugfix ] ; dcs:item [ rdfs:label "Bundle test cases from html5lib."@en ; a dcs:Packaging ] ; dcs:item [ rdfs:label "Provide parse_balanced_chunk to parse HTML fragments."@en ; a dcs:Addition ] ; dcs:item [ rdfs:label "Provide dtd_element method to get more information about the DTD."@en ; a dcs:Addition ] ; dcs:item [ rdfs:label "Make source_line method work more reliably. This requires XML::LibXML::Devel, and thus a fairly recent version of XML-LibXML."@en ]; dcs:item [ rdfs:label "Nodes now have an implicitness flag (returned by source_line called in a list context)."@en ; a dcs:Addition ] ] . my:project :release my:v_0-110 . my:v_0-110 a :Version ; dc:issued "2012-03-20"^^xsd:date ; :revision "0.110"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.110.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-109 ; dcs:item [ a dcs:Bugfix; rdfs:label "Removed a use of smart match which was breaking Perl 5.8.x compatibility."@en ; dcs:fixes <http://purl.org/NET/cpan-uri/rt/ticket/75809> ] ; dcs:item [ rdfs:label "use XML::LibXML::Devel::SetLineNumber if it is available." ] ; dcs:item [ rdfs:label "Text nodes should keep line numbers and column numbers too." ] ; dcs:item [ rdfs:label "Comment nodes should now keep their line numbers and column numbers." ] ] . my:project :release my:v_0-200 . 
my:v_0-200 a :Version ; dc:issued "2012-06-13"^^xsd:date ; :revision "0.200"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.200.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-110 ; dcs:item [ rdfs:label "Suppress warnings about invalid Unicode code points." ] ; dcs:item [ rdfs:label "Drop dependency on Error.pm; use Try::Tiny instead." ] ; dcs:item [ rdfs:label "Drop dependency on HTML::Encoding; use IO::HTML instead." ] ; dcs:item [ rdfs:label "Passing a couple more of the html5lib test suite files." ] ] . my:project :release my:v_0-202 . my:v_0-202 a :Version ; dc:issued "2012-06-27"^^xsd:date ; :revision "0.202"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.202.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-200 ; dcs:item [ rdfs:label "Drop dependency on LWP::UserAgent in favour of HTTP::Tiny which is part of core since 5.14." ] ] . my:project :release my:v_0-204 . my:v_0-204 a :Version ; dc:issued "2012-06-29"^^xsd:date ; :revision "0.204"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.204.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-202 ; dcs:item [ rdfs:label "Stop using defined-or operator in HTML::HTML5::Parser::UA, as it doesn't work in Perl 5.8."; a dcs:Bugfix ] ] . my:project :release my:v_0-206 . my:v_0-206 a :Version ; dc:issued "2012-06-30"^^xsd:date ; :revision "0.206"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.206.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-204 ; dcs:item [ rdfs:label "Remove 'use 5.010' from t/07ua.t."; a dcs:Bugfix ]; dcs:item [ rdfs:label "Disable t/07ua.t and t/08ua-lwp.t on Windows because Test::HTTP::Server doesn't work on that platform."; a dcs:Bugfix; rdfs:seeAlso <https://rt.cpan.org/Ticket/Display.html?id=78118> ] ] . my:project :release my:v_0-208 . 
my:v_0-208 a :Version ; dc:issued "2012-12-06"^^xsd:date ; :revision "0.208"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.208.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-206 ; dcs:item [ rdfs:label "Minor fix re LWP-independence."; a dcs:Bugfix ]; dcs:item [ a dcs:Bugfix; rdfs:label "If two <html> tags were in the same file, attributes on the second <html> element could cause crashes."@en; dcs:fixes <http://purl.org/NET/cpan-uri/rt/ticket/79019>; dcs:thanks [ foaf:name "Luben Karavelov" ]; ]; ] . my:project :release my:v_0-300 . my:v_0-300 a :Version ; dc:issued "2013-07-06"^^xsd:date ; :revision "0.300"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.300.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-208 ; dcs:item [ a dcs:Bugfix; rdfs:label "Fix many major memory leaks."; dcs:blame [ foaf:name "Dorian Taylor"; foaf:nick "DORIAN" ]; dcs:fixes <tdb:2013:https://github.com/tobyink/p5-html-html5-parser/pull/1>; ]; dcs:item [ a dcs:Change; rdfs:label "Memory leak fix necessitated some API changes; in particular some methods which were available as class methods are now object methods only."@en; ]; dcs:item [ a dcs:Packaging; rdfs:label "t/99html5lib.t now uses Moo instead of Mo."@en; ]; ] . my:project :release my:v_0-301 . my:v_0-301 a :Version ; dc:issued "2013-07-08"^^xsd:date ; :revision "0.301"^^xsd:string ; :file-release <http://backpan.cpan.org/authors/id/T/TO/TOBYINK/HTML-HTML5-Parser-0.301.tar.gz> ; dcs:changeset [ dcs:versus my:v_0-300 ; dcs:item [ a dcs:Bugfix; rdfs:label "Fix t/99html5lib.t still using Mo in one place."@en; dcs:thanks [ foaf:name "Fitz Elliott"; foaf:mbox <mailto:fitz.elliott@gmail.com> ]; dcs:fixes <http://purl.org/NET/cpan-uri/rt/ticket/86774>; ]; ] . 
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/META.yml��������������������������������������������������������������������0000644�0001750�0001750�00000002133�12166545203�013732� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������--- abstract: 'parse HTML reliably' author: - 'Toby Inkster <tobyink@cpan.org>' - Wakaba build_requires: ExtUtils::MakeMaker: 6.59 Test::More: 0.61 configure_requires: ExtUtils::MakeMaker: 6.59 distribution_type: module dynamic_config: 0 generated_by: 'Module::Install version 1.06' keywords: - HTML - HTML5 - Web license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 module_name: HTML::HTML5::Parser name: HTML-HTML5-Parser no_index: directory: - examples - inc - t - xt requires: HTML::HTML5::Entities: 0.002 HTTP::Tiny: 0 IO::HTML: 0 Scalar::Util: 0 Try::Tiny: 0 URI::file: 0 XML::LibXML: 1.94 XML::LibXML::Devel: 0 perl: 5.8.1 resources: bugtracker: http://rt.cpan.org/Dist/Display.html?Queue=HTML-HTML5-Parser homepage: https://metacpan.org/release/HTML-HTML5-Parser license: http://dev.perl.org/licenses/ repository: http://goddamn.co.uk/svn-web/perlmods/browse/HTML-HTML5-Parser/ version: 0.301 x_contributors: - shellac - 
'audreyt <audreyt@cpan.org>' - TOBYINK - Wakaba - 'Dorian Taylor' �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser-0.301/Changes���������������������������������������������������������������������0000644�0001750�0001750�00000012105�12166545155�013762� 0����������������������������������������������������������������������������������������������������ustar �tai�����������������������������tai��������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������HTML-HTML5-Parser ================= Created: 2009-11-26 Home page: <https://metacpan.org/release/HTML-HTML5-Parser> Bug tracker: <http://rt.cpan.org/Dist/Display.html?Queue=HTML-HTML5-Parser> Maintainer: Toby Inkster <mailto:tobyink@cpan.org> 0.301 2013-07-08 - (Bugfix) Fix t/99html5lib.t still using Mo in one place. Fixes RT#86774 ++"Fitz Elliott" 0.300 2013-07-06 - (Bugfix) Fix many major memory leaks. ++$DORIAN - (Packaging) t/99html5lib.t now uses Moo instead of Mo. - Memory leak fix necessitated some API changes; in particular some methods which were available as class methods are now object methods only. 0.208 2012-12-06 - (Bugfix) If two <html> tags were in the same file, attributes on the second <html> element could cause crashes. Fixes RT#79019 ++"Luben Karavelov" - (Bugfix) Minor fix re LWP-independence. 0.206 2012-06-30 - (Bugfix) Disable t/07ua.t and t/08ua-lwp.t on Windows because Test::HTTP::Server doesn't work on that platform. - (Bugfix) Remove 'use 5.010' from t/07ua.t. 
0.204 2012-06-29 - (Bugfix) Stop using defined-or operator in HTML::HTML5::Parser::UA, as it doesn't work in Perl 5.8. 0.202 2012-06-27 - Drop dependency on LWP::UserAgent in favour of HTTP::Tiny which is part of core since 5.14. 0.200 2012-06-13 - Drop dependency on Error.pm; use Try::Tiny instead. - Drop dependency on HTML::Encoding; use IO::HTML instead. - Passing a couple more of the html5lib test suite files. - Suppress warnings about invalid Unicode code points. 0.110 2012-03-20 - (Bugfix) Removed a use of smart match which was breaking Perl 5.8.x compatibility. Fixes RT#75809 - Comment nodes should now keep their line numbers and column numbers. - Text nodes should keep line numbers and column numbers too. - use XML::LibXML::Devel::SetLineNumber if it is available. 0.109 2012-03-15 # Yes, they are come, but they are not past. - (Addition) Nodes now have an implictness flag (returned by source_line called in a list context). - (Addition) Provide dtd_element method to get more information about the DTD. - (Addition) Provide parse_balanced_chunk to parse HTML fragments. - (Bugfix) Fix several fatal errors on infrequently used code paths in TagSoupParser.pm. - (Bugfix) Use correct case for MathML's definitionURL attribute. - (Packaging) Bundle test cases from html5lib. - Make source_line method work more reliably. This requires XML::LibXML::Devel, and thus a fairly recent version of XML-LibXML. 0.108 2012-01-18 - (Bugfix) Crashed when generating an error message about some missing closing tags. ++$audreyt - (Documentation Update) Copyright 2012. - (Update) Provide load_xml and load_html methods for compatibility with XML::LibXML 1.70 and newer. 0.107 2011-10-20 - (Addition) html2xhtml can output to a file. - (Addition) html2xhtml now reads from STDIN by default. - (Bugfix) parse_file wasn't accepting relative file names 0.106 2011-10-10 - (Bugfix) Tokenizer.pm was still trying to require NamedEntityList.pm. 
0.105 2011-10-07 - (Addition) Bundle 'html5debug' script. - (Packaging) Module::Package::RDF. - HTML::HTML5::Parser::Error overloads stringification. - use HTML::HTML5::Entities 0.104 2011-09-22 - (Addition) Some error handling stuff. - (Addition) Support <track> element. - (Update) Catch up to revision d81fcb920a1a3c351149cd66a64bf1b8ae14a172 (2011-08-21) upstream. 0.103 2011-02-09 - (Bugfix) TagSoupParser.pm called a method that is renamed between this distribution and upstream using its upstream name. - (Documentation Update) Copyright 2011. 0.102 2011-01-19 - (Addition) Allow <object> element to appear in <head> if document has an HTML4 doctype. This is a willful violation of the HTML5 parsing algorithm. (The <object> may have <param> elements as children, as well as any children that would normally be allowed in the <head> of the document, such as <meta>; any other content is treated as the beginning of the <body>, and thus closes <object> and <head>. That's slightly looser than the HTML 4 spec which says only <param> should be used, but stricter than the HTML 4 DTD which allows pretty much anything in there!) - (Addition) Support <figcaption> element. - (Addition) Support <summary> element. - (Bugfix) Fix source_line method. - (Update) Catch up to revision f2c921a886ab0b3dfb8d21b82525e98a4a921ad4 (2010-10-11) upstream. 0.101 2010-06-30 - (Bugfix) UTF-8 fix. 0.100 2010-06-23 - (Bugfix) Minor bugfixes. 0.04 2010-04-21 - (Update) Catch up to revision cf2c0df8a6dfb50fee923dfb21b14c83f282ccdc (2010-02-28) upstream. 0.03 2010-01-15 - (Bugfix) Module didn't use URI::file properly. ++$shellac - (Documentation Update) Copyright 2010. - (Packaging Update) Upgrade distribution to my new packaging regime (auto-generated changelogs, etc) 0.02 2009-12-16 - (Addition Packaging) Bundle the html2xhtml tool. - Replace Inline::Python encoding detection with weaker, but native Perl HTML::Encoding package. 
0.01 2009-12-03 # Original version 0.00_01 2009-12-01 # Developer preview ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������
�����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������