libxml-um-perl-0.01.orig/0042775000175000001500000000000007344723242013035 5ustar ardolibxml-um-perl-0.01.orig/MANIFEST0100664000175000001500000000006207344722436014163 0ustar ardoChanges Makefile.PL MANIFEST README test.pl UM.pm libxml-um-perl-0.01.orig/Changes0100664000175000001500000000017407344722436014331 0ustar ardoRevision history for Perl extension XML::UM. 0.01 Sun Aug 26 11:10:26 2001 - Moved from XML-DOM to separate distribution libxml-um-perl-0.01.orig/Makefile.PL0100664000175000001500000000041707344722436015010 0ustar ardouse ExtUtils::MakeMaker; # See lib/ExtUtils/MakeMaker.pm for details of how to influence # the contents of the Makefile that is written. WriteMakefile( 'NAME' => 'XML::UM', 'VERSION_FROM' => 'UM.pm', # finds $VERSION 'PREREQ_PM' => {XML::Encoding => 0}, ); libxml-um-perl-0.01.orig/README0100664000175000001500000000142507344722436013716 0ustar ardoXML::UM version 0.01 ==================== DESCRIPTION This module provides methods to convert UTF-8 strings to any XML encoding that XML::Encoding supports. It creates mapping routines from the .xml files that can be found in the maps/ directory in the XML::Encoding distribution. Note that the XML::Encoding distribution does install the .enc files in your perl directory, but not the .xml files they were created from. That's why you have to specify $ENCDIR as in the SYNOPSIS. INSTALLATION To install this module type the following: perl Makefile.PL make make test make install COPYRIGHT AND LICENCE Copyright (c) 1999,2000 Enno Derksen All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. libxml-um-perl-0.01.orig/UM.pm0100664000175000001500000002735007344722436013722 0ustar ardo# # TO DO: # # - Implement SlowMappers for expat builtin encodings (for which there # are no .enc files), e.g. UTF-16, US-ASCII, ISO-8859-1. # - Instead of parsing the .xml file with XML::Encoding, we should use XS. 
#   If this will not be implemented for a while, we could try reading the
#   .enc file directly, instead of the .xml file.
#   I started writing XML::UM::EncParser to do this (see EOF), but got stuck.
#

use strict;

package XML::UM::SlowMapper;
use Carp;
use XML::Encoding;

use vars qw{ $VERSION $ENCDIR %DEFAULT_ASCII_MAPPINGS };
$VERSION = '0.01';

# Regex source that matches exactly one UTF-8 encoded character, selected by
# the value of its first byte: 1 byte for 00-BF, 2 bytes for C0-DF, 3 bytes
# for E0-EF and 4 bytes for F0-FF. It must be used with the /s modifier so
# that '.' also matches a newline byte.
my $UTFCHAR = '[\\x00-\\xBF]|[\\xC0-\\xDF].|[\\xE0-\\xEF]..|[\\xF0-\\xFF]...';

#
# The directory that contains the .xml files that come with XML::Encoding.
# Include the terminating '\' or '/' !!
#
$ENCDIR = "/home1/enno/perlModules/XML-Encoding-1.01/maps/";
#$ENCDIR = "c:\\src\\perl\\xml\\XML-Encoding-1.01\\maps\\";

#
# From xmlparse.h in expat distribution:
#
# Expat places certain restrictions on the encodings that are supported
# using this mechanism.
#
# 1. Every ASCII character that can appear in a well-formed XML document,
#    other than the characters
#
#    $@\^`{}~
#
#    must be represented by a single byte, and that byte must be the
#    same byte that represents that character in ASCII.
#
# [end of excerpt]

#?? Which 'ASCII characters can appear in a well-formed XML document ??

# All ASCII codes 0 - 127, excl. 36,64,92,94,96,123,125,126 i.e. $@\^`{}~
%DEFAULT_ASCII_MAPPINGS = map { (chr($_), chr($_)) }
    (0 .. 35, 37 .. 63, 65 .. 91, 93, 95, 97 .. 122, 124, 127);

# Constructor.
# Takes name/value pairs (the factory in this file passes Encoding and
# Factory), blesses them into $class and immediately loads the mapping for
# $self->{Encoding} via read_encoding_file.
# Returns the new mapper object.
sub new
{
    my ($class, %hash) = @_;
    my $self = bless \%hash, $class;
    $self->read_encoding_file;
    return $self;
}

# Detaches this mapper from its factory so the mapper <-> factory reference
# cycle can be garbage collected.
#
# The original code called $factory->dispose_mapper, a method that does not
# exist anywhere in this file, so dispose() always died. The factory's real
# clean-up entry point is dispose_encoding(), keyed by encoding name.
sub dispose
{
    my $self = shift;

    my $factory = $self->{Factory};
    $factory->dispose_encoding ($self->{Encoding}) if defined $factory;

    delete $self->{Factory};
    delete $self->{Encode};
}

# Reads the XML file that contains the encoding definition.
# These files come with XML::Encoding.
#
# On return $self->{Map} is a hash reference that maps a UTF-8 byte sequence
# (one character) to its byte sequence in the target encoding, and
# $self->{EncMapName} holds the value returned by XML::Encoding's parsefile
# (or 'US-ASCII' for the built-in case).
sub read_encoding_file
{
    #?? This should parse the .enc files (the .xml files are not installed) !!
    my ($self) = @_;
    my $encoding = $self->{Encoding};

    # There is no .enc (or .xml) file for US-ASCII, but the mapping is simple
    # so here it goes...
    if ($encoding eq 'US-ASCII')
    {
        $self->{EncMapName} = 'US-ASCII';
        $self->{Map} = \%DEFAULT_ASCII_MAPPINGS;    # I hope this is right
        return;
    }

    my $file = $self->find_encoding_file ($encoding);

    # Start from the ASCII defaults; the parser callbacks below add to and
    # override them.
    my %uni = %DEFAULT_ASCII_MAPPINGS;

    # Bytes pushed by PushPrefixFcn and not yet popped; they are prepended
    # to every byte mapped by RangeSetFcn while they are active.
    my $prefix = "";
    my $DIR = "file:$ENCDIR";

    my $enc = XML::Encoding->new (
        Handlers => {
            Init => sub { shift->base ($DIR); },
        },

        PushPrefixFcn => sub {
            $prefix .= chr (shift);
            undef;
        },

        PopPrefixFcn => sub {
            chop $prefix;
            undef;
        },

        # Maps $len consecutive Unicode indexes starting at $uni to
        # consecutive byte values starting at $byte.
        RangeSetFcn => sub {
            my ($byte, $uni, $len) = @_;
            while ($len-- > 0)
            {
                $uni{XML::UM::unicode_to_utf8 ($uni++)} = $prefix . chr ($byte++);
            }
            undef;
        },
    );

    $self->{EncMapName} = $enc->parsefile ($file);

    #print "Parsed Encoding " . $self->{Encoding} . " MapName=" . $self->{EncMapName} . "\n";

    $self->{Map} = \%uni;
}

# Returns the path of the .xml map file for encoding name $enc:
# $ENCDIR followed by the lower-cased name and ".xml".
sub find_encoding_file
{
    my ($self, $enc) = @_;

    return "$ENCDIR\L$enc\E.xml";    # .xml filename is lower case
}

# Returns a closure (method) that converts a UTF-8 encoded string to an
# encoded byte sequence.
#
# Options (name/value pairs):
#   EncodeUnmapped - code reference called with the UTF-8 bytes of each
#                    character that has no entry in the map; its return value
#                    is substituted. Defaults to
#                    \&XML::UM::encode_unmapped_dec.
#
# The returned sub takes one string and returns the converted copy.
sub get_encode
{
    my ($self, %hash) = @_;
    my $MAP = $self->{Map};
    my $ENCODE_UNMAPPED = $hash{EncodeUnmapped} || \&XML::UM::encode_unmapped_dec;

    # Compile the character matcher once; the returned closure reuses it on
    # every call. (This replaces the former string-eval code generation.)
    my $utf8_char = qr/($UTFCHAR)/s;

    return sub {
        my $str = shift;
        $str =~ s/$utf8_char/defined $MAP->{$1} ? $MAP->{$1} : $ENCODE_UNMAPPED->($1)/eg;
        $str;
    };
}

#
# Optimized version for when the encoding is UTF-8.
# (In that case no conversion takes place.)
#
package XML::UM::SlowMapper::UTF8;
use vars qw{ @ISA };
@ISA = qw{ XML::UM::SlowMapper };

# UTF-8 needs no mapping table, so there is nothing to load.
sub read_encoding_file
{
    # ignore it
}

# Returns the identity converter; any options passed are ignored.
sub get_encode
{
    return \&dont_convert;
}

sub dont_convert    # static
{
    shift    # return argument unchanged
}

package XML::UM::SlowMapperFactory;

# Constructor. Any name/value pairs given become the object's initial fields.
sub new
{
    my ($class, %hash) = @_;
    return bless \%hash, $class;
}

# Returns a conversion sub for $options{Encoding}.
# All options are passed on unchanged to the mapper's get_encode method.
# Croaks when the Encoding option is missing, instead of trying to load a
# map for an undefined encoding name.
sub get_encode
{
    my ($self, %options) = @_;
    my $encoding = $options{Encoding};

    Carp::croak ("XML::UM::SlowMapperFactory::get_encode: missing required 'Encoding' option")
        unless defined $encoding;

    my $mapper = $self->get_mapper ($encoding);
    return $mapper->get_encode (%options);
}

# Returns the mapper for $encoding, creating it on first use and caching it
# in $self->{Mapper}. "UTF-8" gets the pass-through mapper class; every other
# name gets an XML::UM::SlowMapper.
sub get_mapper
{
    my ($self, $encoding) = @_;

    my $cache = $self->{Mapper} ||= {};
    return $cache->{$encoding} if $cache->{$encoding};

    my $class = $encoding eq "UTF-8"
        ? 'XML::UM::SlowMapper::UTF8'
        : 'XML::UM::SlowMapper';

    # The mapper keeps a reference back to this factory; dispose_encoding
    # breaks that cycle again.
    return $cache->{$encoding} =
        $class->new (Encoding => $encoding, Factory => $self);
}

#
# Prepare for garbage collection (remove circular refs)
#
sub dispose_encoding
{
    my ($self, $encoding) = @_;
    my $mapper = $self->{Mapper}->{$encoding};
    return unless defined $mapper;

    delete $mapper->{Factory};
    delete $self->{Mapper}->{$encoding};
}

package XML::UM;
use Carp;

use vars qw{ $FACTORY %XML_MAPPING_CRITERIA $ENCDIR };

# The module documentation tells users to set $XML::UM::ENCDIR, but the
# variable that is actually read lives in XML::UM::SlowMapper. Alias the two
# so that the documented name and the internal name are one and the same
# scalar.
*ENCDIR = \$XML::UM::SlowMapper::ENCDIR;

$FACTORY = XML::UM::SlowMapperFactory->new;

# Forwards to $FACTORY->get_encode; see the factory method for the options.
sub get_encode    # static
{
    $FACTORY->get_encode (@_);
}

# Forwards to $FACTORY->dispose_encoding ($encoding_name).
sub dispose_encoding    # static
{
    $FACTORY->dispose_encoding (@_);
}

# Convert UTF-8 byte sequence to Unicode index; then to '&#xNN;' string
sub encode_unmapped_hex    # static
{
    my $n = utf8_to_unicode (shift);
    sprintf ("&#x%X;", $n);
}

# Convert UTF-8 byte sequence to Unicode index; then to '&#NN;' string
sub encode_unmapped_dec    # static
{
    my $n = utf8_to_unicode (shift);
    "&#$n;"
}

# Converts a UTF-8 byte sequence that represents one character,
# to its Unicode index.
sub utf8_to_unicode    # static
{
    # Parameter: a string of 1 to 4 bytes holding one UTF-8 encoded character.
    # Returns:   the Unicode index (code point) as an integer.
    # Croaks on an empty, undefined or over-long sequence. Continuation bytes
    # are not validated, only masked.
    my $str = shift;
    $str = '' unless defined $str;    # undef is reported like an empty sequence
    my $len = length ($str);

    # A single byte is returned as-is.
    return ord ($str) if $len == 1;

    if ($len >= 2 && $len <= 4)
    {
        my ($lead, @tail) = unpack "C$len", $str;

        # The lead byte carries 5, 4 or 3 payload bits for a 2-, 3- or
        # 4-byte sequence: masks 0x1f, 0x0f and 0x07 respectively.
        my $n = $lead & (0xff >> ($len + 1));

        # Every continuation byte contributes its low 6 bits.
        $n = ($n << 6) | ($_ & 0x3f) for @tail;
        return $n;
    }

    # The original called an undefined helper hb() here, so the intended
    # message was never produced; unpack "H*" renders the hex dump directly.
    croak "bad UTF8 sequence [$str] hex=" . unpack ("H*", $str);
}

# Converts a Unicode character index to the byte sequence
# that represents that character in UTF-8.
#
# Parameter: a non-negative integer below 0x110000.
# Returns:   a string of 1 to 4 bytes.
# Croaks when the number is negative or too large for Unicode.
sub unicode_to_utf8    # static
{
    my $n = shift;

    croak "number [$n] is negative in \&unicode_to_utf8" if $n < 0;

    return chr ($n) if $n < 0x80;

    return pack ("CC",
                 (($n >> 6) | 0xc0),
                 (($n & 0x3f) | 0x80))
        if $n < 0x800;

    return pack ("CCC",
                 (($n >> 12) | 0xe0),
                 ((($n >> 6) & 0x3f) | 0x80),
                 (($n & 0x3f) | 0x80))
        if $n < 0x10000;

    return pack ("CCCC",
                 (($n >> 18) | 0xf0),
                 ((($n >> 12) & 0x3f) | 0x80),
                 ((($n >> 6) & 0x3f) | 0x80),
                 (($n & 0x3f) | 0x80))
        if $n < 0x110000;

    croak "number [$n] is too large for Unicode in \&unicode_to_utf8";
}

#?? The following package is unfinished.
#?? It should parse the .enc file and create an array that maps
#?? Unicode-index to encoded-str. I got stuck...
# package XML::UM::EncParser; # # sub new # { # my ($class, %hash) = @_; # my $self = bless \%hash, $class; # $self; # } # # sub parse # { # my ($self, $filename) = @_; # open (FILE, $filename) || die "can't open .enc file $filename"; # binmode (FILE); # # my $buf; # read (FILE, $buf, 4 + 40 + 2 + 2 + 1024); # # my ($magic, $name, $pfsize, $bmsize, @map) = unpack ("NA40nnN256", $buf); # printf "magic=%04x name=$name pfsize=$pfsize bmsize=$bmsize\n", $magic; # # if ($magic != 0xFEEBFACE) # { # close FILE; # die sprintf ("bad magic number [0x%08X] in $filename, expected 0xFEEBFACE", $magic); # } # # for (my $i = 0; $i < 256; $i++) # { # printf "[%d]=%d ", $i, $map[$i]; # print "\n" if ($i % 8 == 7); # } # # for (my $i = 0; $i < $pfsize; $i++) # { # print "----- PrefixMap $i ----\n"; # read (FILE, $buf, 2 + 2 + 32 + 32); # my ($min, $len, $bmap_start, @ispfx) = unpack ("CCnC64", $buf); # my (@ischar) = splice @ispfx, 32, 32, (); # #?? could use b256 instead of C32 for bitvector a la vec() # # print "ispfx=@ispfx\n"; # print "ischar=@ischar\n"; # $len = 256 if $len == 0; # # print " min=$min len=$len bmap_start=$bmap_start\n"; # } # # close FILE; # } 1; # package return code __END__ =head1 NAME XML::UM - Convert UTF-8 strings to any encoding supported by XML::Encoding =head1 SYNOPSIS use XML::UM; # Set directory with .xml files that comes with XML::Encoding distribution # Always include the trailing slash! $XML::UM::ENCDIR = '/home1/enno/perlModules/XML-Encoding-1.01/maps/'; # Create the encoding routine my $encode = XML::UM::get_encode ( Encoding => 'ISO-8859-2', EncodeUnmapped => \&XML::UM::encode_unmapped_dec); # Convert a string from UTF-8 to the specified Encoding my $encoded_str = $encode->($utf8_str); # Remove circular references for garbage collection XML::UM::dispose_encoding ('ISO-8859-2'); =head1 DESCRIPTION This module provides methods to convert UTF-8 strings to any XML encoding that L<XML::Encoding> supports. 
It creates mapping routines from the .xml files that can be found in the maps/ directory in the L<XML::Encoding> distribution. Note that the XML::Encoding distribution does install the .enc files in your perl directory, but not the .xml files they were created from. That's why you have to specify $ENCDIR as in the SYNOPSIS. This implementation uses the XML::Encoding class to parse the .xml file and creates a hash that maps UTF-8 characters (each consisting of up to 4 bytes) to their equivalent byte sequence in the specified encoding. Note that large mappings may consume a lot of memory! Future implementations may parse the .enc files directly, or do the conversions entirely in XS (i.e. C code.) =head1 get_encode (Encoding => STRING, EncodeUnmapped => SUB) The central entry point to this module is the XML::UM::get_encode() method. It forwards the call to the global $XML::UM::FACTORY, which is defined as an instance of XML::UM::SlowMapperFactory by default. Override this variable to plug in your own mapper factory. The XML::UM::SlowMapperFactory creates an instance of XML::UM::SlowMapper (and caches it for subsequent use) that reads in the .xml encoding file and creates a hash that maps UTF-8 characters to encoded characters. The get_encode() method of XML::UM::SlowMapper is called, finally, which generates an anonymous subroutine that uses the hash to convert multi-character UTF-8 blocks to the proper encoding. =head1 dispose_encoding ($encoding_name) Call this to free the memory used by the SlowMapper for a specific encoding. Note that in order to free the big conversion hash, the user should no longer have references to the subroutines generated by get_encode(). The parameters to the get_encode() method (defined as name/value pairs) are: =over 4 =item * Encoding The name of the desired encoding, e.g. 'ISO-8859-2' =item * EncodeUnmapped (Default: \&XML::UM::encode_unmapped_dec) Defines how Unicode characters not found in the mapping file (of the specified encoding) are printed. 
By default, they are converted to decimal entity references, like '&#123;' Use \&XML::UM::encode_unmapped_hex for hexadecimal constants, like '&#xAB;' =back =head1 CAVEATS I'm not exactly sure about which Unicode characters in the range (0 .. 127) should be mapped to themselves. See comments in XML/UM.pm near %DEFAULT_ASCII_MAPPINGS. The encodings that expat supports by default are currently not supported, (e.g. UTF-16, ISO-8859-1), because there are no .enc files available for these encodings. This module needs some more work. If you have the time, please help! =head1 AUTHOR Original Author is Enno Derksen. Send bug reports, hints, tips, suggestions to T.J Mather at >. =cut libxml-um-perl-0.01.orig/test.pl0100664000175000001500000000074707344722436014360 0ustar ardo# Before `make install' is performed this script should be runnable with # `make test'. After `make install' it should work as `perl test.pl' ######################### # change 'tests => 1' to 'tests => last_test_to_print'; use Test; BEGIN { plan tests => 1 }; use XML::UM; ok(1); # If we made it this far, we're ok. ######################### # Insert your test code below, the Test module is use()ed here so read # its man page ( perldoc Test ) for help writing this test script.