HTML-Encoding-0.61/0000700000175300017630000000000011447000702012702 5ustar bjoerncpanHTML-Encoding-0.61/Changes0000700000175300017630000000257511447000561014214 0ustar bjoerncpanRevision history for Perl extension HTML::Encoding. 0.61 Fri Sep 24 00:00:00 2010 - Fixed https://rt.cpan.org/Ticket/Display.html?id=61603 0.60 Sun Jun 30 09:55:00 2008 - minor bugfixes 0.59 Sun Jun 29 23:55:00 2008 - De-listed URI as dependency, not my bug - added eg/detector.pl - added t/98podsyn.t - added t/99podcov.t - added 'dist' options in Makefile.PL - marked private subs with an underscore 0.58 Sun Jun 29 19:55:00 2008 - List URI as dependency to prevent test failure - Added LICENCE => 'perl' to Makefile.PL 0.57 Wed Apr 9 03:55:00 2008 - updated the pitfyful README file - fixed a performance problem in my_decode reported by Ville Skytta 0.56 Wed Dec 6 06:25:00 2007 - updated changes file - fixed line end issues in test 0.55 Wed Dec 5 16:25:00 2007 - added basic test suite 0.54 Wed Dec 5 14:25:00 2007 - http://www.w3.org/mid/64584787-9995-43F3-A34E-D010ED9811D6@w3.org fixed 0.53 Tue May 22 10:25:00 2007 - http://rt.cpan.org/Public/Bug/Display.html?id=16351 fixed 0.52 Tue Dec 14 09:25:00 2004 - encoding_from_http_message encoding defaulting did not work as intended, fixed. 0.51 Tue Dec 14 08:45:00 2004 - encoding_from_html_document returned a list in scalar context if the document had a XML declaration, fixed. 0.50 Tue Nov 2 03:10:15 2004 - rewrite from scratch HTML-Encoding-0.61/eg/0000700000175300017630000000000011447000702013275 5ustar bjoerncpanHTML-Encoding-0.61/eg/detector.pl0000700000175300017630000000060111032110716015440 0ustar bjoerncpan#!/usr/bin/perl -w use strict; use warnings; use HTML::Encoding 'encoding_from_http_message'; use LWP::UserAgent; if (@ARGV != 1) { printf "Usage: %s http://www.example.org/\n", $0; exit; } my $resp = LWP::UserAgent->new->get('http://www.example.org'); my $enco = encoding_from_http_message($resp); printf "%s is probably %s-encoded\n", $resp->request->uri, $enco; HTML-Encoding-0.61/lib/0000700000175300017630000000000011447000702013450 5ustar bjoerncpanHTML-Encoding-0.61/lib/HTML/0000700000175300017630000000000011447000702014214 5ustar bjoerncpanHTML-Encoding-0.61/lib/HTML/Encoding.pm0000700000175300017630000010020511447000464016306 0ustar bjoerncpanpackage HTML::Encoding; use strict; use warnings; use HTML::Parser qw(); use HTTP::Headers::Util qw(split_header_words); use Encode qw(); use base qw(Exporter); our $VERSION = '0.61'; our @EXPORT_OK = qw/ &encoding_from_meta_element &xml_declaration_from_octets &encoding_from_first_chars &encoding_from_xml_declaration &encoding_from_byte_order_mark &encoding_from_content_type &encoding_from_xml_document &encoding_from_html_document &encoding_from_http_message /; our $DEFAULT_ENCODINGS = [qw/ ISO-8859-1 UTF-16LE UTF-16BE UTF-32LE UTF-32BE UTF-8 /]; our %MAP = ( BM => "\x{FEFF}", CR => "\x{000D}", LF => "\x{000A}", SP => "\x{0020}", TB => "\x{0009}", QS => "\x{003F}", NL => "\x{0085}", LS => "\x{2028}", LT => "<", # fixme GT => ">", # fixme ); sub _my_encode { my $seq; eval { $seq = Encode::encode($_[0], $_[1], $_[2]); }; return $seq unless $@; return; } sub _my_decode { my $str; eval { $str = Encode::decode($_[0], $_[1], $_[2]); }; return $str unless $@; return; } sub _make_character_map { my $encoding = shift; my %data; foreach my $sym (keys %MAP) { my $seq = _my_encode($encoding, "$MAP{$sym}", Encode::FB_CROAK); $data{$sym} = $seq if defined $seq; } \%data; } # cache for U+XXXX octet sequences our %CHARACTER_MAP_CACHE = (); sub _get_character_map { my $encoding = shift; # read from cache return $CHARACTER_MAP_CACHE{$encoding} if exists $CHARACTER_MAP_CACHE{$encoding}; # new cache entry my $map = _make_character_map($encoding); $CHARACTER_MAP_CACHE{$encoding} = $map; # return new entry return $map; } sub encoding_from_meta_element { my $text = shift; my $enco = shift; return unless defined $text; return unless length $text; return unless defined $enco; return unless length $enco; my $pars = HTML::Parser->new ( api_version => 3, @_ ); my $meta = []; my $leng = length $text; my $size = 8192; my $data = ''; my $utf8 = ''; my $i = 0; # todo: should finish when or logically body//* $pars->report_tags(qw/meta head/); $pars->handler(start => $meta, "tagname,attr"); $pars->handler ( end => sub { $_[0]->eof if $_[1] eq "head" }, "self,tagname" ); $pars->parse(sub { return if $i > $leng; $data .= substr $text, $i, $size; $i += $size; _my_decode($enco, $data, Encode::FB_QUIET); }); my @resu; foreach (grep { $_->[0] eq "meta" } @$meta) { my %hash = %{$_->[1]}; next unless defined $hash{'content'}; next unless exists $hash{'http-equiv'}; next unless lc $hash{'http-equiv'} eq "content-type"; my $char = encoding_from_content_type($hash{'content'}); push @resu, $char if defined $char and length $char; } return unless @resu; return wantarray ? @resu : $resu[0]; } sub xml_declaration_from_octets { my $text = shift; my %o = @_; my $encodings = $o{encodings} || $DEFAULT_ENCODINGS; my %resu; return unless defined $text; return unless length $text; foreach my $e (@$encodings) { my $map = _get_character_map($e); # search for > my $end = index $text, $map->{GT}; # search for {LT} . $map->{QS}; # skip this encoding unless ... next unless $end > 0 and $str >= 0 and $end > $str; # extract tentative XML declaration my $decl = substr $text, $str, $end - $str + 1; # decode XML declaration my $deco = _my_decode($e, $decl, Encode::FB_CROAK); # skip encoding if decoding failed next unless defined $deco; $resu{$deco}++; } # No XML declarations found return unless keys %resu; # sort by number of matches, most match first my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu; # in array context return all encodings, # in scalar context return best match. return wantarray ? @sort : $sort[0]; } sub encoding_from_first_chars { my $text = shift; my %o = @_; my $encodings = $o{encodings} || $DEFAULT_ENCODINGS; my $whitespace = $o{whitespace} || [qw/CR LF TB SP/]; return unless defined $text; return unless length $text; my %resu; foreach my $e (@$encodings) { my $m = _get_character_map($e); my $i = index $text, $m->{LT}; next unless $i >= 0; my $t = substr $text, 0, $i; my @y; # construct \xXX\xXX string from octets, might make sense to # have this in the map construction process push@y,"(?:".join("",map{sprintf"\\x%02x",ord}split//,$m->{$_}).")" foreach grep defined, @$whitespace; my $x = join "|", @y; $t =~ s/^($x)+//g; $resu{$e} = $i + length $m->{LT} unless length $t; } # ... return unless keys %resu; # sort by match length, longest match first my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu; # in array context return all encodings, # in scalar context return best match. return wantarray ? @sort : $sort[0]; } sub encoding_from_xml_declaration { my $decl = shift; return unless defined $decl; return unless length $decl; # todo: move this to some better place... my $ws = qr/[\x09\x85\x20\x0d\x0a\x{2028}]*/; # skip if not an XML declaration return unless $decl =~ /^<\?xml$ws/i; # attempt to extract encoding pseudo attribute return unless $decl =~ /encoding$ws=$ws'([^']+)'/i or $decl =~ /encoding$ws=$ws"([^"]+)"/i; # no encoding pseudo-attribute return unless defined $1; my $enco = $1; # strip leading/trailing whitespace/quotes $enco =~ s/^[\s'"]+|[\s'"]+$//g; # collapse white-space $enco =~ s/\s+/ /g; # treat empty charset as if it were unspecified return unless length $enco; return $enco; } sub encoding_from_byte_order_mark { my $text = shift; my %o = @_; my $encodings = $o{encodings} || $DEFAULT_ENCODINGS; my %resu; return unless defined $text; return unless length $text; foreach my $e (@$encodings) { my $map = _get_character_map($e); my $bom = $map->{BM}; # encoding cannot encode U+FEFF next unless defined $bom; # remember match length $resu{$e} = length $bom if $text =~ /^(\Q$bom\E)/; } # does not start with BOM return unless keys %resu; # sort by match length, longest match first my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu; # in array context return all encodings, # in scalar context return best match. return wantarray ? @sort : $sort[0]; } sub encoding_from_content_type { my $text = shift; # nothing to do... return unless defined $text and length $text; # downgrade Unicode strings $text = Encode::encode_utf8($text) if Encode::is_utf8($text); # split parameters, only look at the first set my %data = @{(split_header_words($text))[0]}; # extract first charset parameter if any my $char; foreach my $param (keys %data) { $char = $data{$param} and last if 'charset' eq lc $param; } # no charset parameter return unless defined $char; # there are no special escapes so just remove \s $char =~ tr/\\//d; # strip leading/trailing whitespace/quotes $char =~ s/^[\s'"]+|[\s'"]+$//g; # collapse white-space $char =~ s/\s+/ /g; # treat empty charset as if it were unspecified return unless length $char; return $char } sub encoding_from_xml_document { my $text = shift; my %o = @_; my $encodings = $o{encodings} || $DEFAULT_ENCODINGS; my %resu; return unless defined $text; return unless length $text; my @boms = encoding_from_byte_order_mark($text, encodings => $encodings); # BOM determines encoding return wantarray ? (bom => \@boms) : $boms[0] if @boms; # no BOM my @decls = xml_declaration_from_octets($text, encodings => $encodings); foreach my $decl (@decls) { my $enco = encoding_from_xml_declaration($decl); $resu{$enco}++ if defined $enco and length $enco; } return unless keys %resu; my @sort = sort { $resu{$b} <=> $resu{$a} } keys %resu; # in array context return all encodings, # in scalar context return best match. return wantarray ? (xml => \@sort) : $sort[0]; } sub encoding_from_html_document { my $text = shift; my %o = @_; my $encodings = $o{encodings} || $DEFAULT_ENCODINGS; my $popts = $o{parser_options} || {}; my $xhtml = exists $o{xhtml} ? $o{xhtml} : 1; return unless defined $text; return unless length $text; if ($xhtml) { my @xml = wantarray ? encoding_from_xml_document($text, encodings => $encodings) : scalar encoding_from_xml_document($text, encodings => $encodings); return wantarray ? @xml : $xml[0] if @xml and defined $xml[0]; } else { my @boms = encoding_from_byte_order_mark($text, encodings => $encodings); # BOM determines encoding return wantarray ? (bom => \@boms) : $boms[0] if @boms; } # no BOM my @resu; # sanity check to exclude e.g. UTF-32 my @first = encoding_from_first_chars($text, encodings => $encodings); # fall back to provided encoding list @first = @$encodings unless @first; foreach my $try (@first) { push @resu, encoding_from_meta_element($text, $try, %$popts); } return unless @resu; return wantarray ? (meta => \@resu) : $resu[0]; } sub encoding_from_http_message { my $mess = shift; my %o = @_; my $encodings = $o{encodings} || $DEFAULT_ENCODINGS; my $is_html = $o{is_html} || qr{^text/html$}i; my $is_xml = $o{is_xml} || qr{^.+/(?:.+\+)?xml$}i; my $is_t_xml = $o{is_text_xml} || qr{^text/(?:.+\+)?xml$}i; my $html_d = $o{html_default} || "ISO-8859-1"; my $xml_d = $o{xml_default} || "UTF-8"; my $txml = $o{text_xml_default}; my $xhtml = exists $o{xhtml} ? $o{xhtml} : 1; my $default = exists $o{default} ? $o{default} : 1; my $type = $mess->header('Content-Type'); my $charset = encoding_from_content_type($type); if ($mess->content_type =~ $is_xml) { return wantarray ? (protocol => $charset) : $charset if defined $charset; # special case for text/xml at user option return wantarray ? (protocol_default => $txml) : $txml if defined $txml and $mess->content_type =~ $is_t_xml; if (wantarray) { my @xml = encoding_from_xml_document($mess->content, encodings => $encodings); return @xml if @xml; } else { my $xml = scalar encoding_from_xml_document($mess->content, encodings => $encodings); return $xml if defined $xml; } return wantarray ? (default => $xml_d) : $xml_d if defined $default; } if ($mess->content_type =~ $is_html) { return wantarray ? (protocol => $charset) : $charset if defined $charset; if (wantarray) { my @html = encoding_from_html_document($mess->content, encodings => $encodings, xhtml => $xhtml); return @html if @html; } else { my $html = scalar encoding_from_html_document($mess->content, encodings => $encodings, xhtml => $xhtml); return $html if defined $html; } return wantarray ? (default => $html_d) : $html_d if defined $default; } return } 1; __END__ =pod =head1 NAME HTML::Encoding - Determine the encoding of HTML/XML/XHTML documents =head1 SYNOPSIS use HTML::Encoding 'encoding_from_http_message'; use LWP::UserAgent; use Encode; my $resp = LWP::UserAgent->new->get('http://www.example.org'); my $enco = encoding_from_http_message($resp); my $utf8 = decode($enco => $resp->content); =head1 WARNING The interface and implementation are guranteed to change before this module reaches version 1.00! Please send feedback to the author of this module. =head1 DESCRIPTION HTML::Encoding helps to determine the encoding of HTML and XML/XHTML documents... =head1 DEFAULT ENCODINGS Most routines need to know some suspected character encodings which can be provided through the C option. This option always defaults to the $HTML::Encoding::DEFAULT_ENCODINGS array reference which means the following encodings are considered by default: * ISO-8859-1 * UTF-16LE * UTF-16BE * UTF-32LE * UTF-32BE * UTF-8 If you change the values or pass custom values to the routines note that L must support them in order for this module to work correctly. =head1 ENCODING SOURCES C, C, and C return in list context the encoding source and the encoding name, possible encoding sources are * protocol (Content-Type: text/html;charset=encoding) * bom (leading U+FEFF) * xml () * meta ( to extract the charset parameter from the C header value and returns its value or C (or an empty list in list context) if there is no such value. Only the first component will be examined (HTTP/1.1 only allows for one component), any backslash escapes in strings will be unescaped, all leading and trailing quote marks and white-space characters will be removed, all white-space will be collapsed to a single space, empty charset values will be ignored and no case folding is performed. Examples: +-----------------------------------------+-----------+ | encoding_from_content_type(...) | returns | +-----------------------------------------+-----------+ | "text/html" | undef | | "text/html,text/plain;charset=utf-8" | undef | | "text/html;charset=" | undef | | "text/html;charset=\"\\u\\t\\f\\-\\8\"" | 'utf-8' | | "text/html;charset=utf\\-8" | 'utf\\-8' | | "text/html;charset='utf-8'" | 'utf-8' | | "text/html;charset=\" UTF-8 \"" | 'UTF-8' | +-----------------------------------------+-----------+ If you pass a string with the UTF-8 flag turned on the string will be converted to bytes before it is passed to L. The return value will thus never have the UTF-8 flag turned on (this might change in future versions). =item encoding_from_byte_order_mark($octets [, %options]) Takes a sequence of octets and attempts to read a byte order mark at the beginning of the octet sequence. It will go through the list of $options{encodings} or the list of default encodings if no encodings are specified and match the beginning of the string against any byte order mark octet sequence found. The result can be ambiguous, for example qq(\xFF\xFE\x00\x00) could be both, a complete BOM in UTF-32LE or a UTF-16LE BOM followed by a U+0000 character. It is also possible that C<$octets> starts with something that looks like a byte order mark but actually is not. encoding_from_byte_order_mark sorts the list of possible encodings by the length of their BOM octet sequence and returns in scalar context only the encoding with the longest match, and all encodings ordered by length of their BOM octet sequence in list context. Examples: +-------------------------+------------+-----------------------+ | Input | Encodings | Result | +-------------------------+------------+-----------------------+ | "\xFF\xFE\x00\x00" | default | qw(UTF-32LE) | | "\xFF\xFE\x00\x00" | default | qw(UTF-32LE UTF-16LE) | | "\xEF\xBB\xBF" | default | qw(UTF-8) | | "Hello World!" | default | undef | | "\xDD\x73\x66\x73" | default | undef | | "\xDD\x73\x66\x73" | UTF-EBCDIC | qw(UTF-EBCDIC) | | "\x2B\x2F\x76\x38\x2D" | default | undef | | "\x2B\x2F\x76\x38\x2D" | UTF-7 | qw(UTF-7) | +-------------------------+------------+-----------------------+ Note however that for UTF-7 it is in theory possible that the U+FEFF combines with other characters in which case such detection would fail, for example consider: +--------------------------------------+-----------+-----------+ | Input | Encodings | Result | +--------------------------------------+-----------+-----------+ | "\x2B\x2F\x76\x38\x41\x39\x67\x2D" | default | undef | | "\x2B\x2F\x76\x38\x41\x39\x67\x2D" | UTF-7 | undef | +--------------------------------------+-----------+-----------+ This might change in future versions, although this is not very relevant for most applications as there should never be need to use UTF-7 in the encoding list for existing documents. If no BOM can be found it returns C in scalar context and an empty list in list context. This routine should not be used with strings with the UTF-8 flag turned on. =item encoding_from_xml_declaration($declaration) Attempts to extract the value of the encoding pseudo-attribute in an XML declaration or text declaration in the character string $declaration. If there does not appear to be such a value it returns nothing. This would typically be used with the return values of xml_declaration_from_octets. Normalizes whitespaces like encoding_from_content_type. Examples: +-------------------------------------------+---------+ | encoding_from_xml_declaration(...) | Result | +-------------------------------------------+---------+ | "" | 'utf-8' | | "" | 'utf-8' | | "" | 'utf-8' | | "" | 'utf-8' | | "" | 'a' | | "" | 'a b' | | "" | undef | | " " | undef | | "" | 'utf-8' | | "" | undef | | "" | 'a' | +-------------------------------------------+---------+ Note that encoding_from_xml_declaration() determines the encoding even if the XML declaration is not well-formed or violates other requirements of the relevant XML specification as long as it can find an encoding pseudo-attribute in the provided string. This means XML processors must apply further checks to determine whether the entity is well-formed, etc. =item xml_declaration_from_octets($octets [, %options]) Attempts to find a ">" character in the byte string $octets using the encodings in $encodings and upon success attempts to find a preceding "<" character. Returns all the strings found this way in the order of number of successful matches in list context and the best match in scalar context. Should probably be combined with the only user of this routine, encoding_from_xml_declaration... You can modify the list of suspected encodings using $options{encodings}; =item encoding_from_first_chars($octets [, %options]) Assuming that documents start with "<" optionally preceded by whitespace characters, encoding_from_first_chars attempts to determine an encoding by matching $octets against something like /^[@{$options{whitespace}}]* elements using encoding_from_meta_element. $options{whitespace} defaults to qw/CR LF SP TB/. Returns nothing if unsuccessful. Returns the matching encodings in order of the number of octets matched in list context and the best match in scalar context. Examples: +---------------+----------+---------------------+ | String | Encoding | Result | +---------------+----------+---------------------+ | '" | UTF-8 | ISO-8859-1 or UTF-8 | +---------------+----------+---------------------+ =item encoding_from_meta_element($octets, $encname [, %options]) Attempts to find elements in the document using HTML::Parser. It will attempt to decode chunks of the byte string using $encname to characters before passing the data to HTML::Parser. An optional %options hash can be provided which will be passed to the HTML::Parser constructor. It will stop processing the document if it encounters * * encoding errors * the end of the input * ... (see todo) If relevant elements, i.e. something like are found, uses encoding_from_content_type to extract the charset parameter. It returns all such encodings it could find in document order in list context or the first encoding in scalar context (it will currently look for others regardless of calling context) or nothing if that fails for some reason. Note that there are many edge cases where this does not yield in "proper" results depending on the capabilities of the HTML::Parser version and the options you pass for it, for example, ]>

...

This would likely not detect the C value if HTML::Parser does not resolve the entity. This should however only be a concern for documents specifically crafted to break the encoding detection. =item encoding_from_xml_document($octets, [, %options]) Uses encoding_from_byte_order_mark to detect the encoding using a byte order mark in the byte string and returns the return value of that routine if it succeeds. Uses xml_declaration_from_octets and encoding_from_xml_declaration and returns the encoding for which the latter routine found most matches in scalar context, and all encodings ordered by number of occurences in list context. It does not return a value of neither byte order mark not inbound declarations declare a character encoding. Examples: +----------------------------+----------+-----------+----------+ | Input | Encoding | Encodings | Result | +----------------------------+----------+-----------+----------+ | "" | UTF-16 | default | UTF-16BE | | "" | UTF-16LE | default | undef | | "" | UTF-16LE | default | utf-8 | | "" | UTF-16 | default | UTF-16BE | | "" | CP37 | default | undef | | "" | CP37 | CP37 | cp37 | +----------------------------+----------+-----------+----------+ Lacking a return value from this routine and higher-level protocol information (such as protocol encoding defaults) processors would be required to assume that the document is UTF-8 encoded. Note however that the return value depends on the set of suspected encodings you pass to it. For example, by default, EBCDIC encodings would not be considered and thus for this routine would return the undefined value. You can modify the list of suspected encodings using $options{encodings}. =item encoding_from_html_document($octets, [, %options]) Uses encoding_from_xml_document and encoding_from_meta_element to determine the encoding of HTML documents. If $options{xhtml} is set to a false value uses encoding_from_byte_order_mark and encoding_from_meta_element to determine the encoding. The xhtml option is on by default. The $options{encodings} can be used to modify the suspected encodings and $options{parser_options} can be used to modify the HTML::Parser options in encoding_from_meta_element (see the relevant documentation). Returns nothing if no declaration could be found, the winning declaration in scalar context and a list of encoding source and encoding name in list context, see ENCODING SOURCES. ... Other problems arise from differences between HTML and XHTML syntax and encoding detection rules, for example, the input could be Content-Type: text/html

...

This is a perfectly legal HTML 4.01 document and implementations might be expected to consider the document ISO-8859-2 encoded as XML rules for encoding detection do not apply to HTML documents. This module attempts to avoid making decisions which rules apply for a specific document and would thus by default return 'utf-8' for this input. On the other hand, if the input omits the encoding declaration, Content-Type: text/html

...

It would return 'iso-8859-2'. Similar problems would arise from other differences between HTML and XHTML, for example consider Content-Type: text/html ... ... If this is processed using HTML rules, the first > will end the processing instruction and the XHTML document type declaration would be the relevant declaration for the document, if it is processed using XHTML rules, the ?> will end the processing instruction and the HTML document type declaration would be the relevant declaration. IOW, an application would need to assume a certain character encoding (family) to process enough of the document to determine whether it is XHTML or HTML and the result of this detection would depend on which processing rules are assumed in order to process it. It is thus in essence not possible to write a "perfect" detection algorithm, which is why this routine attempts to avoid making any decisions on this matter. =item encoding_from_http_message($message [, %options]) Determines the encoding of HTML / XML / XHTML documents enclosed in HTTP message. $message is an object compatible to L, e.g. a L object. %options is a hash with the following possible entries: =over 2 =item encodings array references of suspected character encodings, defaults to C<$HTML::Encoding::DEFAULT_ENCODINGS>. =item is_html Regular expression matched against the content_type of the message to determine whether to use HTML rules for the entity body, defaults to C. =item is_xml Regular expression matched against the content_type of the message to determine whether to use XML rules for the entity body, defaults to C. =item is_text_xml Regular expression matched against the content_type of the message to determine whether to use text/html rules for the message, defaults to C. This will only be checked if is_xml matches aswell. =item html_default Default encoding for documents determined (by is_html) as HTML, defaults to C. =item xml_default Default encoding for documents determined (by is_xml) as XML, defaults to C. =item text_xml_default Default encoding for documents determined (by is_text_xml) as text/xml, defaults to C in which case the default is ignored. This should be set to C if desired as this module is by default inconsistent with RFC 3023 which requires that for text/xml documents without a charset parameter in the HTTP header C is assumed. This requirement is inconsistent with RFC 2616 (HTTP/1.1) which requires to assume C, has been widely ignored and is thus disabled by default. =item xhtml Whether the routine should look for an encoding declaration in the XML declaration of the document (if any), defaults to C<1>. =item default Whether the relevant default value should be returned when no other information can be determined, defaults to C<1>. =back This is furhter possibly inconsistent with XML MIME types that differ in other ways from application/xml, for example if the MIME Type does not allow for a charset parameter in which case applications might be expected to ignore the charset parameter if erroneously provided. =back =head1 EBCDIC SUPPORT By default, this module does not support EBCDIC encodings. To enable support for EBCDIC encodings you can either change the $HTML::Encodings::DEFAULT_ENCODINGS array reference or pass the encodings to the routines you use using the encodings option, for example my @try = qw/UTF-8 UTF-16LE cp500 posix-bc .../; my $enc = encoding_from_xml_document($doc, encodings => \@try); Note that there are some subtle differences between various EBCDIC encodings, for example C is mapped to 0x5A in C and to 0x4F in C; these differences might affect processing in yet undetermined ways. =head1 TODO * bundle with test suite * optimize some routines to give up once successful * avoid transcoding for HTML::Parser if e.g. ISO-8859-1 * consider adding a "HTML5" modus of operation? =head1 SEE ALSO * http://www.w3.org/TR/REC-xml/#charencoding * http://www.w3.org/TR/REC-xml/#sec-guessing * http://www.w3.org/TR/xml11/#charencoding * http://www.w3.org/TR/xml11/#sec-guessing * http://www.w3.org/TR/html4/charset.html#h-5.2.2 * http://www.w3.org/TR/xhtml1/#C_9 * http://www.ietf.org/rfc/rfc2616.txt * http://www.ietf.org/rfc/rfc2854.txt * http://www.ietf.org/rfc/rfc3023.txt * perlunicode * Encode * HTML::Parser =head1 AUTHOR / COPYRIGHT / LICENSE Copyright (c) 2004-2008 Bjoern Hoehrmann . This module is licensed under the same terms as Perl itself. =cut HTML-Encoding-0.61/Makefile.PL0000600000175300017630000000144511032002136014653 0ustar bjoerncpanuse 5.008; use ExtUtils::MakeMaker; WriteMakefile( NAME => 'HTML::Encoding', VERSION_FROM => 'lib/HTML/Encoding.pm', # finds $VERSION PREREQ_PM => { Encode => 0, # HTML::Parser => 0, # todo: figure out proper version HTTP::Headers::Util => 0, # HTTP::Response => 0, # }, ($] >= 5.005 ? ## Add these new keywords supported since 5.005 (ABSTRACT_FROM => 'lib/HTML/Encoding.pm', # retrieve abstract from module AUTHOR => 'Bjoern Hoehrmann ') : ()), LICENSE => 'perl', 'dist' => { PREOP => 'chmod 600 Makefile.PL', TARFLAGS => '--group=cpan --owner=bjoern -cvf', }, ); HTML-Encoding-0.61/MANIFEST0000700000175300017630000000156311032111110014025 0ustar bjoerncpanChanges Makefile.PL MANIFEST README t/00basic.t t/01http.t t/98podsyn.t t/99podcov.t eg/detector.pl lib/HTML/Encoding.pm tinput/http/01 tinput/http/02 tinput/http/03 tinput/http/04 tinput/http/05 tinput/http/06 tinput/http/07 tinput/http/08 tinput/http/09 tinput/http/10 tinput/http/11 tinput/http/12 tinput/http/13 tinput/http/14 tinput/http/15 tinput/http/16 tinput/http/17 tinput/http/18 tinput/http/19 tinput/http/20 tinput/http/21 tinput/http/22 tinput/http/23 tinput/http/24 tinput/http/25 tinput/http/26 tinput/http/27 tinput/http/28 tinput/http/29 tinput/http/30 tinput/http/31 tinput/http/32 tinput/http/33 tinput/http/34 tinput/http/35 tinput/http/36 tinput/http/37 tinput/http/38 tinput/http/39 tinput/http/40 tinput/http/41 tinput/http/42 META.yml Module meta-data (added by MakeMaker) HTML-Encoding-0.61/META.yml0000700000175300017630000000126611447000702014163 0ustar bjoerncpan--- #YAML:1.0 name: HTML-Encoding version: 0.61 abstract: Determine the encoding of HTML/XML/XHTML documents author: - Bjoern Hoehrmann license: perl distribution_type: module configure_requires: ExtUtils::MakeMaker: 0 build_requires: ExtUtils::MakeMaker: 0 requires: Encode: 0 HTML::Parser: 0 HTTP::Headers::Util: 0 HTTP::Response: 0 no_index: directory: - t - inc generated_by: ExtUtils::MakeMaker version 6.55_02 meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: 1.4 HTML-Encoding-0.61/README0000700000175300017630000000124110777022120013566 0ustar bjoerncpanHTML-Encoding ============= HTML::Encoding helps to determine the encoding of HTML and XML/XHTML documents... INSTALLATION To install this module type the following: perl Makefile.PL make make test make install DEPENDENCIES This module requires these other modules and libraries: Encode HTML::Parser HTTP::Headers::Util COPYRIGHT AND LICENCE Copyright (C) 2004-2008 by Bjoern Hoehrmann This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.2 or, at your option, any later version of Perl 5 you may have available. HTML-Encoding-0.61/t/0000700000175300017630000000000011447000702013145 5ustar bjoerncpanHTML-Encoding-0.61/t/00basic.t0000700000175300017630000000075410141514050014560 0ustar bjoerncpan# Before `make install' is performed this script should be runnable with # `make test'. After `make install' it should work as `perl HTML-Encoding.t' ######################### # change 'tests => 1' to 'tests => last_test_to_print'; use Test::More tests => 1; BEGIN { use_ok('HTML::Encoding') }; ######################### # Insert your test code below, the Test::More module is use()ed here so read # its man page ( perldoc Test::More ) for help writing this test script. HTML-Encoding-0.61/t/01http.t0000700000175300017630000000202410725636160014466 0ustar bjoerncpanuse Test::More tests => 43; use IO::File; use File::Spec qw(); use HTTP::Response; BEGIN { use_ok('HTML::Encoding'); }; while () { my ($name, $expect) = split ' ', $_, 2; $expect =~ s/[\r\n]+$//g; my $path = File::Spec->catfile('tinput', 'http', $name); my $data = do { local $/; IO::File->new('<' . $path)->getline }; my $message = HTTP::Response->parse($data); my $charset = HTML::Encoding::encoding_from_http_message($message); $charset = '' unless defined $charset; is($charset, $expect); } __DATA__ 01 utf-8 02 utf-8 03 utf-8 04 utf-8 05 utf-8 06 utf-8 07 utf-8 08 utf-8 09 utf-8 10 utf-8 11 utf-8 12 utf-8 13 iso-8859-1 14 iso-8859-1 15 iso-8859-1 16 ISO-8859-1 17 iso-8859-1 18 utf-8 19 utf-8 20 utf-8 21 utf-8 22 iso-8859-1 23 iso-8859-1 24 utf-8 25 utf-8 26 utf-8 27 utf-8 28 utf-8 29 utf-8 30 utf-8 31 utf-8 32 utf-8 33 CESU-8 34 CESU-8 35 {CE}SU-8 36 xCESU-8 37 PC-Multilingual-850+euro 38 ISO_8859-16:2001 39 utf-8 40 ISO-8859-1 41 42 utf-8 HTML-Encoding-0.61/t/98podsyn.t0000700000175300017630000000033611032001302015017 0ustar bjoerncpan# 99pod.t -- Minimally check POD for problems. # # $Id$ use strict; use warnings; use Test::More; eval "use Test::Pod 1.00"; plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; all_pod_files_ok(); HTML-Encoding-0.61/t/99podcov.t0000700000175300017630000000043111032001274015002 0ustar bjoerncpan# 99pod.t -- Minimally check POD for code coverage. # # $Id$ use strict; use warnings; use Test::More; eval "use Test::Pod::Coverage"; plan skip_all => "Test::Pod::Coverage required for testing pod coverage" if $@; plan tests => 1; pod_coverage_ok('HTML::Encoding'); HTML-Encoding-0.61/tinput/0000700000175300017630000000000011447000702014225 5ustar bjoerncpanHTML-Encoding-0.61/tinput/http/0000700000175300017630000000000011447000702015204 5ustar bjoerncpanHTML-Encoding-0.61/tinput/http/010000700000175300017630000000060010124652330015347 0ustar bjoerncpanHTTP/1.1 200 OK Date: Thu, 23 Sep 2004 21:57:05 GMT Server: Apache/1.3.31 (Unix) PHP/4.3.8 P3P: policyref="http://www.w3.org/2001/05/P3P/p3p.xml" Cache-Control: max-age=600 Expires: Thu, 23 Sep 2004 22:07:05 GMT Last-Modified: Tue, 21 Sep 2004 18:10:14 GMT ETag: "41506e86" Accept-Ranges: bytes Content-Length: 0 Connection: close Content-Type: text/html; charset=utf-8 HTML-Encoding-0.61/tinput/http/020000700000175300017630000000007310124652350015356 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html; charset=utf-8 HTML-Encoding-0.61/tinput/http/030000700000175300017630000000007610124652366015371 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html ; charset = utf-8 HTML-Encoding-0.61/tinput/http/040000700000175300017630000000010610124652406015357 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html ; charset = utf-8 HTML-Encoding-0.61/tinput/http/050000700000175300017630000000010610124653060015355 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version=3.0;charset=utf-8 HTML-Encoding-0.61/tinput/http/060000700000175300017630000000007510124653102015360 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html; charset='utf-8' HTML-Encoding-0.61/tinput/http/070000700000175300017630000000007510124653112015362 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html; charset="utf-8" HTML-Encoding-0.61/tinput/http/080000700000175300017630000000011010124653410015352 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version='3.0';charset=utf-8 HTML-Encoding-0.61/tinput/http/090000700000175300017630000000011210124653420015356 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version='3.0';charset='utf-8' HTML-Encoding-0.61/tinput/http/100000700000175300017630000000011210124653432015351 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version='3.0';charset="utf-8" HTML-Encoding-0.61/tinput/http/110000700000175300017630000000011210124653444015355 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version="3.0";charset='utf-8' HTML-Encoding-0.61/tinput/http/120000700000175300017630000000011210124653460015354 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version="3.0";charset="utf-8" HTML-Encoding-0.61/tinput/http/130000700000175300017630000000014010124653526015361 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset='iso-8859-1';version="3.0';charset='utf-8'" HTML-Encoding-0.61/tinput/http/140000700000175300017630000000014010124653542015360 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset="iso-8859-1";version="3.0';charset='utf-8'" HTML-Encoding-0.61/tinput/http/150000700000175300017630000000014010124653572015364 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version="3.0';charset='utf-8'";charset='iso-8859-1' HTML-Encoding-0.61/tinput/http/160000700000175300017630000000013010124653620015356 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html Content-Type: text/html;charset=iso-8859-1 HTML-Encoding-0.61/tinput/http/170000700000175300017630000000014610124653642015372 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset=iso-8859-1 Content-Type: text/html;charset=utf-8 HTML-Encoding-0.61/tinput/http/180000700000175300017630000000012610124653670015372 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;note="charset='iso-8859-1'";charset=utf-8 HTML-Encoding-0.61/tinput/http/190000700000175300017630000000010010124653724015363 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html ;charset= utf-8 HTML-Encoding-0.61/tinput/http/200000700000175300017630000000007610124653770015370 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html ; charset = utf-8 HTML-Encoding-0.61/tinput/http/210000700000175300017630000000007410124654016015361 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset="utf-8' HTML-Encoding-0.61/tinput/http/220000700000175300017630000000011510124654044015357 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version="3.0";charset=iso-8859-1 HTML-Encoding-0.61/tinput/http/230000700000175300017630000000010710124654134015361 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset=" iso-8859-1 " HTML-Encoding-0.61/tinput/http/240000700000175300017630000000017610125062720015363 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version="\"charset=iso-8859-1\"";charset="utf-8";version="\"charset=iso-8859-2\"" HTML-Encoding-0.61/tinput/http/250000700000175300017630000000020210125062716015357 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version="\"charset=\"iso-8859-1\"\"";charset="utf-8";version="\"charset=iso-8859-2\"" HTML-Encoding-0.61/tinput/http/260000700000175300017630000000020210125062714015356 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version="\"charset=iso-8859-1\"";charset="utf-8";version="\"charset=\"iso-8859-2\"\"" HTML-Encoding-0.61/tinput/http/270000700000175300017630000000020610125062712015361 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version="\"charset=\"iso-8859-1\"\"";charset="utf-8";version="\"charset=\"iso-8859-2\"\"" HTML-Encoding-0.61/tinput/http/280000700000175300017630000000020410125062726015365 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version="\"charset=\"iso-8859-1\"\"";charset=utf-8;version="\"charset=\"iso-8859-2\"\"" HTML-Encoding-0.61/tinput/http/290000700000175300017630000000007610125051600015362 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset="\utf-\8" HTML-Encoding-0.61/tinput/http/300000700000175300017630000000010110125051614015344 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset="\u\t\f\-\8" HTML-Encoding-0.61/tinput/http/310000700000175300017630000000007610125051646015365 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset=\"utf-8\" HTML-Encoding-0.61/tinput/http/320000700000175300017630000000007710125051676015372 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset=\u\t\f\-\8 HTML-Encoding-0.61/tinput/http/330000700000175300017630000000007610125052050015355 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset="\CESU-8" HTML-Encoding-0.61/tinput/http/340000700000175300017630000000007710125052060015360 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset="\C\ESU-8" HTML-Encoding-0.61/tinput/http/350000700000175300017630000000010010125052100015337 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset="\{CE}SU-8" HTML-Encoding-0.61/tinput/http/360000700000175300017630000000007710125052112015360 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset="\xCESU-8" HTML-Encoding-0.61/tinput/http/370000700000175300017630000000011510125052512015356 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset=PC-Multilingual-850+euro HTML-Encoding-0.61/tinput/http/380000700000175300017630000000010510125052550015360 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;charset=ISO_8859-16:2001 HTML-Encoding-0.61/tinput/http/390000700000175300017630000000020610125063666015375 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html;version=";x=1;y=2;charset=iso-8859-1";charset=utf-8;version=";x=1;y=2;charset=iso-8859-2" HTML-Encoding-0.61/tinput/http/400000700000175300017630000000005410125124634015357 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html HTML-Encoding-0.61/tinput/http/410000700000175300017630000000002310125124652015354 0ustar bjoerncpanHTTP/1.1 200 OK HTML-Encoding-0.61/tinput/http/420000700000175300017630000000007310725465300015366 0ustar bjoerncpanHTTP/1.1 200 OK Content-Type: text/html; CharSet=utf-8