Unicode-MapUTF8-1.11/0000775000076400007640000000000010322472527015720 5ustar snowharesnowhare00000000000000Unicode-MapUTF8-1.11/lib/0000775000076400007640000000000010322472527016466 5ustar snowharesnowhare00000000000000Unicode-MapUTF8-1.11/lib/Unicode/0000775000076400007640000000000010322472527020054 5ustar snowharesnowhare00000000000000Unicode-MapUTF8-1.11/lib/Unicode/MapUTF8.pm0000664000076400007640000004622010322472527021602 0ustar snowharesnowhare00000000000000package Unicode::MapUTF8; use strict; use Carp qw(confess croak carp); use Unicode::String; use Unicode::Map; use Unicode::Map8; use Jcode; use vars qw ($VERSION @EXPORT @EXPORT_OK @EXPORT_TAGS @ISA); use subs qw (utf8_supported_charset to_utf8 from_utf8 utf8_charset_alias _init_charsets); require Exporter; BEGIN { @ISA = qw(Exporter); @EXPORT = qw (); @EXPORT_OK = qw (utf8_supported_charset to_utf8 from_utf8 utf8_charset_alias); @EXPORT_TAGS = qw (); $VERSION = "1.11"; } ############################ # File level package globals (class variables) my $_Supported_Charsets; my $_Charset_Names; my $_Charset_Aliases; _init_charsets; ############## sub utf8_charset_alias { if ($#_ == -1) { my $aliases = {}; %$aliases = %$_Charset_Aliases; return $aliases; } my $parms; my @parms_list = @_; if (($#parms_list == 0) && (ref ($parms_list[0]) eq 'HASH')) { _set_utf8_charset_alias($parms_list[0]); return; } elsif (($#parms_list > 0) && (($#parms_list % 2) == 1)) { _set_utf8_charset_alias({ @parms_list }); return; } elsif ($#parms_list == 0) { my $lc_charset = lc($parms_list[0]); my $result = $_Charset_Aliases->{$lc_charset}; return $result; } croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::utf8_charset_alias() - invalid parameters passed\n"); } ###################################################################### # Sets (or clears ;-) ) a runtime character set alias. 
sub _set_utf8_charset_alias {
    # Worker for utf8_charset_alias(): takes one hash reference of
    # alias => charset pairs.  A pair whose value is undef removes the
    # alias.  Croaks when the target is not a known charset name; warns
    # (carp) when the alias shadows a base charset name.  Alias and
    # charset names are compared and stored lower-cased.
    my ($parms) = @_;

    my @alias_names = keys %$parms;
    foreach my $alias (@alias_names) {
        my $lc_alias = lc ($alias);
        my $charset  = $parms->{$alias};

        # An undef target means "clear this alias".
        if (! defined $charset) {
            if (exists ($_Charset_Aliases->{$lc_alias})) {
                delete $_Charset_Aliases->{$lc_alias};
            }
            next;
        }

        # Only names present in $_Charset_Names may be alias targets, so
        # an alias can never point at another alias.
        my $lc_charset = lc ($charset);
        if (! exists ($_Charset_Names->{$lc_charset})) {
            croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::utf8_charset_alias() - attempted to set alias '$alias' to point to unknown charset encoding of '$charset'\n");
        }
        if (exists ($_Charset_Names->{$lc_alias})) {
            carp('[' . localtime(time) . '] [warning] ' . __PACKAGE__ . "::utf8_charset_alias() - Aliased base defined charset name '$alias' to '$charset'.");
        }
        $_Charset_Aliases->{$lc_alias} = $lc_charset;
    }
}

####

# utf8_supported_charset($charset_name)
#
# With no arguments in list context: returns the sorted list of all
# supported charset names plus any runtime aliases.
# With a charset name: returns 1 if it is a known charset or alias
# (case-insensitive), 0 otherwise.  Croaks if the name is undefined.
sub utf8_supported_charset {
    if ($#_ == -1 && wantarray) {
        my %all_charsets = (%$_Supported_Charsets, %$_Charset_Aliases);
        my @charsets = sort keys %all_charsets;
        return @charsets;
    }

    my $charset = shift;
    if (not defined $charset) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::utf8_supported_charset() - no character set specified\n");
    }

    my $lc_charset = lc($charset);
    return 1 if (exists ($_Charset_Names->{$lc_charset}));
    return 1 if (exists ($_Charset_Aliases->{$lc_charset}));
    return 0;
}

####

# to_utf8({ -string => $string, -charset => $source_charset })
#
# Converts $string from the named charset to UTF8.  Accepts either one
# reference or a flat list of key/value pairs.  Croaks on malformed
# parameters, a missing '-string' key, an undefined '-charset', or an
# unsupported charset.  An undefined string is treated as ''.
sub to_utf8 {
    my @parm_list = @_;

    my $parms = {};
    if (($#parm_list > 0) && (($#parm_list % 2) == 1)) {
        $parms = { @parm_list };
    } elsif ($#parm_list == 0) {
        $parms = $parm_list[0];
        if (! ref($parms)) {
            croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::to_utf8() - invalid parameters passed\n");
        }
    } else {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::to_utf8() - bad parameters passed\n");
    }

    if (! (exists $parms->{-string})) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::to_utf8() - missing '-string' parameter\n");
    }
    my $string  = $parms->{-string};
    my $charset = $parms->{-charset};
    if (! defined ($charset)) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::to_utf8() - missing '-charset' parameter value\n");
    }

    # Resolve a runtime alias first, then map the lower-cased name back
    # to the canonical key used in $_Supported_Charsets.
    my $lc_charset    = lc ($charset);
    my $alias_charset = $_Charset_Aliases->{$lc_charset};
    my $true_charset  = defined($alias_charset) ? $_Charset_Names->{$alias_charset} : $_Charset_Names->{$lc_charset};
    if (! defined $true_charset) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::to_utf8() - character set '$charset' is not supported\n");
    }

    $string = '' if (! defined ($string));

    # Dispatch on the backend registered for this charset.
    my $converter = $_Supported_Charsets->{$true_charset};
    if ($converter eq 'map8') {
        return _unicode_map8_to_utf8 ($string,$true_charset);
    } elsif ($converter eq 'unicode-map') {
        return _unicode_map_to_utf8 ($string,$true_charset);
    } elsif ($converter eq 'string') {
        return _unicode_string_to_utf8 ($string,$true_charset);
    } elsif ($converter eq 'jcode') {
        return _jcode_to_utf8 ($string,$true_charset);
    } else {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::to_utf8() - charset '$charset' is not supported\n");
    }
}

####

# from_utf8({ -string => $string, -charset => $target_charset })
#
# Converts $string from UTF8 to the named charset.  Parameter handling
# and error conditions mirror to_utf8().
sub from_utf8 {
    my @parm_list = @_;

    my $parms;
    if (($#parm_list > 0) && (($#parm_list % 2) == 1)) {
        $parms = { @parm_list };
    } elsif ($#parm_list == 0) {
        $parms = $parm_list[0];
        if (! ref($parms)) {
            croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::from_utf8() - invalid parameters passed\n");
        }
    } else {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::from_utf8() - bad parameters passed\n");
    }

    if (! (exists $parms->{-string})) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::from_utf8() - missing '-string' parameter\n");
    }
    my $string  = $parms->{-string};
    my $charset = $parms->{-charset};
    if (! defined ($charset)) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::from_utf8() - missing '-charset' parameter value\n");
    }

    # Resolve a runtime alias first, then map the lower-cased name back
    # to the canonical key used in $_Supported_Charsets.
    my $lc_charset    = lc ($charset);
    my $alias_charset = $_Charset_Aliases->{$lc_charset};
    my $true_charset  = defined($alias_charset) ? $_Charset_Names->{$alias_charset} : $_Charset_Names->{$lc_charset};
    if (! defined $true_charset) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::from_utf8() - character set '$charset' is not supported\n");
    }

    $string = '' if (! defined ($string));

    # Dispatch on the backend registered for this charset.
    my $converter = $_Supported_Charsets->{$true_charset};
    my $result;
    if ($converter eq 'map8') {
        $result = _unicode_map8_from_utf8 ($string,$true_charset);
    } elsif ($converter eq 'unicode-map') {
        $result = _unicode_map_from_utf8 ($string,$true_charset);
    } elsif ($converter eq 'string') {
        $result = _unicode_string_from_utf8 ($string,$true_charset);
    } elsif ($converter eq 'jcode') {
        $result = _jcode_from_utf8 ($string,$true_charset);
    } else {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::from_utf8() - charset '$charset' is not supported\n");
    }
    return $result;
}

######################################################################
#
# _unicode_map_from_utf8($string,$target_charset);
#
# Returns the string converted from UTF8 to the specified target
# multibyte charset.  The text is first converted to UCS2 via
# from_utf8() and then handed to Unicode::Map.
#
sub _unicode_map_from_utf8 {
    my ($string,$target_charset) = @_;

    if (! defined $target_charset) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . '::_unicode_map_from_utf8() - (line ' . __LINE__ . ") No target character set specified\n");
    }

    my $ucs2   = from_utf8 ({ -string => $string, -charset => 'ucs2' });
    my $target = Unicode::Map->new($target_charset);
    if (! defined $target) {
        confess( '[' . localtime(time) . '] ' . __PACKAGE__ . '::_unicode_map_from_utf8() - (line ' . __LINE__ . ") failed to instantate Unicode::Map object for charset '$target_charset': $!\n");
    }
    my $result = $target->from_unicode($ucs2);
    return $result;
}

######################################################################
#
# _unicode_map_to_utf8($string,$source_charset);
#
# Returns the string converted from the specified source multibyte
# charset to UTF8.  Unicode::Map produces UCS2, which is then converted
# with to_utf8().
#
sub _unicode_map_to_utf8 {
    my ($string,$source_charset) = @_;

    if (! defined $source_charset) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . '::_unicode_map_to_utf8() - (line ' . __LINE__ . ") No source character set specified\n");
    }

    my $source = Unicode::Map->new($source_charset);
    if (! defined $source) {
        confess('[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_map_to_utf8() - (line " . __LINE__ . ") failed to instantate a Unicode::Map object: $!\n");
    }
    my $ucs2   = $source->to_unicode($string);
    my $result = to_utf8({ -string => $ucs2, -charset => 'ucs2' });
    return $result;
}

######################################################################
#
# _unicode_map8_from_utf8($string,$target_charset);
#
# Returns the string converted from UTF8 to the specified target 8bit
# charset.
#
sub _unicode_map8_from_utf8 {
    my ($string,$target_charset) = @_;

    if (! defined $target_charset) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . '::_unicode_map8_from_utf8() - (line ' . __LINE__ . ") No target character set specified\n");
    }

    my $u = Unicode::String::utf8($string);
    if (! defined $u) {
        confess( '[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_map8_from_utf8() - (line " . __LINE__ . ") failed to instantate Unicode::String::utf8 object: $!\n");
    }

    # If ord() reports 0xFFFE for the start of the string, swap the
    # byte order before extracting UCS2.
    my $ordering = $u->ord;
    $u->byteswap if (defined($ordering) && ($ordering == 0xFFFE));
    my $ucs2_string = $u->ucs2;

    my $target = Unicode::Map8->new($target_charset);
    if (! defined $target) {
        # Diagnostic previously read "ailed to ..." (first letter lost).
        confess( '[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_map8_from_utf8() - (line " . __LINE__ . ") failed to instantate Unicode::Map8 object for character set '$target_charset': $!\n");
    }
    my $result = $target->to8($ucs2_string);
    return $result;
}

######################################################################
#
# _unicode_map8_to_utf8($string,$source_charset);
#
# Returns the string converted from the specified source 8bit charset
# to UTF8.
#
sub _unicode_map8_to_utf8 {
    my ($string,$source_charset) = @_;

    # Same guard as the three sibling Map/Map8 helpers; without it an
    # undefined charset was interpolated into the confess() message below.
    if (! defined $source_charset) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . '::_unicode_map8_to_utf8() - (line ' . __LINE__ . ") No source character set specified\n");
    }

    my $source = Unicode::Map8->new($source_charset);
    if (! defined $source) {
        confess('[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_map8_to_utf8() - (line " . __LINE__ . ") failed to instantate a Unicode::Map8 object for character set '$source_charset': $!\n");
    }

    my $ucs2_string = $source->tou($string);
    if (! defined $ucs2_string) {
        confess('[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_map8_to_utf8() - (line " . __LINE__ . ") failed to instantate a Unicode::String::utf16 object: $!\n");
    }
    my $utf8_string = $ucs2_string->utf8;
    return $utf8_string;
}

######################################################################
#
# _unicode_string_from_utf8($string,$target_charset);
#
# Returns the string converted from UTF8 to the specified unicode
# encoding ('utf8', 'ucs2', 'ucs4', 'utf16' or 'utf7').  Croaks for any
# other charset name.
#
sub _unicode_string_from_utf8 {
    my ($string,$target_charset) = @_;

    $target_charset = lc ($target_charset);

    # UTF8 to UTF8 is a pass-through.
    return $string if ($target_charset eq 'utf8');

    # For the remaining encodings the Unicode::String accessor has the
    # same name as the charset, so one code path serves all four.
    my %is_unicode_string_encoding = map { $_ => 1 } qw(ucs2 ucs4 utf16 utf7);
    if (! $is_unicode_string_encoding{$target_charset}) {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_string_from_utf8() - charset '$target_charset' is not supported\n");
    }

    my $u = Unicode::String::utf8($string);
    my $ordering = $u->ord;
    $u->byteswap if (defined($ordering) && ($ordering == 0xFFFE));
    my $final = $u->$target_charset;
    return $final;
}

######################################################################
#
# _unicode_string_to_utf8($string,$source_charset);
#
# Returns the string converted from the specified unicode encoding to UTF8.
#
sub _unicode_string_to_utf8 {
    # Accepts 'utf8' (pass-through), 'ucs2', 'ucs4', 'utf16' and 'utf7';
    # croaks for any other charset name.  'ucs2' input is decoded with
    # the utf16 constructor, exactly like 'utf16'.
    my ($string,$source_charset) = @_;

    $source_charset = lc ($source_charset);
    my $final;
    if ($source_charset eq 'utf8') {
        $final = $string;

    } elsif ($source_charset eq 'ucs2') {
        my $u = Unicode::String::utf16($string);
        if (! defined $u) {
            confess('[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_string_to_utf8() - (line " . __LINE__ . ") failed to instantate a Unicode::String::utf16 object: $!\n");
        }
        # If ord() reports 0xFFFE for the start of the string, swap the
        # byte order before re-encoding.
        my $ordering = $u->ord;
        $u->byteswap if (defined($ordering) && ($ordering == 0xFFFE));
        $final = $u->utf8;

    } elsif ($source_charset eq 'ucs4') {
        my $u = Unicode::String::ucs4($string);
        if (! defined $u) {
            confess('[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_string_to_utf8() - (line " . __LINE__ . ") failed to instantate a Unicode::String::ucs4 object: $!\n");
        }
        my $ordering = $u->ord;
        $u->byteswap if (defined($ordering) && ($ordering == 0xFFFE));
        $final = $u->utf8;

    } elsif ($source_charset eq 'utf16') {
        my $u = Unicode::String::utf16($string);
        if (! defined $u) {
            confess('[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_string_to_utf8() - (line " . __LINE__ . ") failed to instantate a Unicode::String::utf16 object: $!\n");
        }
        my $ordering = $u->ord;
        $u->byteswap if (defined($ordering) && ($ordering == 0xFFFE));
        $final = $u->utf8;

    } elsif ($source_charset eq 'utf7') {
        my $u = Unicode::String::utf7($string);
        if (! defined $u) {
            confess('[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_string_to_utf8() - (line " . __LINE__ . ") failed to instantate a Unicode::String::utf7 object: $!\n");
        }
        my $ordering = $u->ord;
        $u->byteswap if (defined($ordering) && ($ordering == 0xFFFE));
        $final = $u->utf8;

    } else {
        # Diagnostic previously had a stray space after '::'.
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::_unicode_string_to_utf8() - charset '$source_charset' is not supported\n");
    }

    return $final;
}

######################################################################
#
# _jcode_from_utf8($string,$target_charset);
#
# Returns the string converted from UTF8 to the specified Jcode
# encoding.  Accepts the same charset spellings as _jcode_to_utf8():
# iso-2022-jp / iso_2022_jp, every registered Shift-JIS spelling
# (sjis, s-jis, s_jis, shiftjis, shift-jis, shift_jis), euc-jp and jis.
# Croaks for any other charset name.
#
sub _jcode_from_utf8 {
    my ($string,$target_charset) = @_;

    my $j = Jcode->new($string,'utf8');

    $target_charset = lc ($target_charset);
    my $final;
    if ($target_charset =~ m/^iso[-_]2022[-_]jp$/) {
        $final = $j->iso_2022_jp;
    } elsif ($target_charset =~ m/^(s[-_]?jis|shift[-_]?jis)$/) {
        # Previously only the literal 'sjis' matched here, so from_utf8()
        # croaked for the other Shift-JIS names that _init_charsets
        # registers and _jcode_to_utf8() accepts.
        $final = $j->sjis;
    } elsif ($target_charset eq 'euc-jp') {
        $final = $j->euc;
    } elsif ($target_charset eq 'jis') {
        $final = $j->jis;
    } else {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::_jcode_from_utf8() - charset '$target_charset' is not supported\n");
    }
    return $final;
}

######################################################################
#
# _jcode_to_utf8($string,$source_charset);
#
# Returns the string converted from the specified Jcode encoding to
# UTF8.  Croaks for a charset name it does not recognise.
#
sub _jcode_to_utf8 {
    my ($string,$source_charset) = @_;

    $source_charset = lc ($source_charset);
    my $final;
    if ($source_charset =~ m/^iso[-_]2022[-_]jp$/) {
        # Only this branch applies Jcode's h2z before re-encoding.
        my $j = Jcode->new($string,'jis')->h2z;
        $final = $j->utf8;
    } elsif ($source_charset =~m/^(s[-_]?jis|shift[-_]?jis)$/) {
        my $j = Jcode->new($string,'sjis');
        $final = $j->utf8;
    } elsif ($source_charset eq 'euc-jp') {
        my $j = Jcode->new($string,'euc');
        $final = $j->utf8;
    } elsif ($source_charset eq 'jis') {
        my $j = Jcode->new($string,'jis');
        $final = $j->utf8;
    } else {
        croak( '[' . localtime(time) . '] ' . __PACKAGE__ . "::_jcode_to_utf8() - charset '$source_charset' is not supported\n");
    }
    return $final;
}

#######################################################################
#
# Character set handlers maps
#
# Populates the three file-level tables:
#   $_Supported_Charsets  canonical charset name => backend tag
#                         ('string', 'jcode', 'map8' or 'unicode-map')
#   $_Charset_Names       lower-cased name       => canonical name
#   $_Charset_Aliases     runtime aliases (reset to empty here)
# Built-in names take precedence over Unicode::Map8 ids, which in turn
# take precedence over Unicode::Map ids.
#
sub _init_charsets {

    $_Charset_Aliases = {};
    $_Supported_Charsets = {
        'utf8'        => 'string',
        'ucs2'        => 'string',
        'ucs4'        => 'string',
        'utf7'        => 'string',
        'utf16'       => 'string',
        'sjis'        => 'jcode',
        's-jis'       => 'jcode',
        's_jis'       => 'jcode',
        'shiftjis'    => 'jcode',
        'shift-jis'   => 'jcode',
        'shift_jis'   => 'jcode',
        'iso-2022-jp' => 'jcode',
        'iso_2022_jp' => 'jcode',
        'jis'         => 'jcode',
        'euc-jp'      => 'jcode',
    };
    $_Charset_Names = { map { lc ($_) => $_ } keys %$_Supported_Charsets };

    # All the Unicode::Map8 charsets
    {
        # Called with explicit parentheses rather than the '&name;' form,
        # which would implicitly pass along this sub's own @_.
        my @map_ids = _list_unicode_map8_charsets();
        foreach my $id (@map_ids) {
            my $lc_id = lc ($id);
            next if (exists ($_Charset_Names->{$lc_id}));
            $_Supported_Charsets->{$id} = 'map8';
            $_Charset_Names->{$lc_id}   = $id;
        }
    }

    # The loop above keeps $_Charset_Names in step with
    # $_Supported_Charsets, so no rebuild of the name table is needed
    # before the next stage.

    # Add any charsets not already listed from Unicode::Map
    {
        my $unicode_map = Unicode::Map->new;
        my @map_ids     = $unicode_map->ids;
        foreach my $id (@map_ids) {
            my $lc_id = lc ($id);
            next if (exists ($_Charset_Names->{$lc_id}));
            $_Supported_Charsets->{$id} = 'unicode-map';
            $_Charset_Names->{$lc_id}   = $id;
        }
    }
}

######################################################################
#
# Code taken and modified from the 'usr/bin/umap' code distributed
# with Unicode::Map8. It wouldn't be necessary if Unicode::Map8
# had a direct method for this....
# sub _list_unicode_map8_charsets { my %set = ( ucs4 => {}, ucs2 => {utf16 => 1}, utf7 => {}, utf8 => {}, ); if (opendir(DIR, $Unicode::Map8::MAPS_DIR)) { my @files = grep(!/^\.\.?$/,readdir(DIR)); foreach my $f (@files) { next unless -f "$Unicode::Map8::MAPS_DIR/$f"; $f =~ s/\.(?:bin|txt)$//; my $supported = $set{$f} = {} if Unicode::Map8->new($f); } } my $avoid_warning = keys %Unicode::Map8::ALIASES; while ( my($alias, $charset) = each %Unicode::Map8::ALIASES) { if (exists $set{$charset}) { $set{$charset}{$alias} = 1; } } my %merged_set = (); foreach my $encoding (keys %set) { $merged_set{$encoding} = 1; my $set_item = $set{$encoding}; while (my ($key,$value) = each (%$set_item)) { $merged_set{$key} = $value; } } my @final_charsets = sort keys %merged_set; return @final_charsets; } ###################################################################### 1; Unicode-MapUTF8-1.11/lib/Unicode/MapUTF8.pod0000664000076400007640000002053310322472527021747 0ustar snowharesnowhare00000000000000 =head1 NAME Unicode::MapUTF8 - Conversions to and from arbitrary character sets and UTF8 =head1 SYNOPSIS use Unicode::MapUTF8 qw(to_utf8 from_utf8 utf8_supported_charset); # Convert a string in 'ISO-8859-1' to 'UTF8' my $output = to_utf8({ -string => 'An example', -charset => 'ISO-8859-1' }); # Convert a string in 'UTF8' encoding to encoding 'ISO-8859-1' my $other = from_utf8({ -string => 'Other text', -charset => 'ISO-8859-1' }); # List available character set encodings my @character_sets = utf8_supported_charset; # Add a character set alias utf8_charset_alias({ 'ms-japanese' => 'sjis' }); # Convert between two arbitrary (but largely compatible) charset encodings # (SJIS to EUC-JP) my $utf8_string = to_utf8({ -string =>$sjis_string, -charset => 'sjis'}); my $euc_jp_string = from_utf8({ -string => $utf8_string, -charset => 'euc-jp' }) # Verify that a specific character set is supported if (utf8_supported_charset('ISO-8859-1') { # Yes } =head1 DESCRIPTION Provides an adapter layer 
between core routines for converting to and from UTF8 and other encodings. In essence, a way to give multiple existing Unicode modules a single common interface so you don't have to know the underlying implementations to do simple UTF8 to-from other character set encoding conversions. As such, it wraps the Unicode::String, Unicode::Map8, Unicode::Map and Jcode modules in a standardized and simple API. This also provides general character set conversion operation based on UTF8 - it is possible to convert between any two compatible and supported character sets via a simple two step chaining of conversions. As with most things Perlish - if you give it a few big chunks of text to chew on instead of lots of small ones it will handle many more characters per second. By design, it can be easily extended to encompass any new charset encoding conversion modules that arrive on the scene. This module is intended to provide good Unicode support to versions of Perl prior to 5.8. If you are using Perl 5.8.0 or later, you probably want to be using the Encode module instead. This module B<does> work with Perl 5.8, but Encode is the preferred method in that environment. =head1 CHANGES 1.11 2005.10.10 Documentation changes. Addition of Build.PL support. Added various build tests, LICENSE, Artistic_License.txt, GPL_License.txt. Split documentation into separate .pod file. Added Japanese translation of POD. 1.10 2005.05.22 - Fixed bug in conversion of ISO-2022-JP to UTF-8. Problem and fix found by Masahiro HONMA. Similar bugs in conversions of shift_jis and euc-jp to UTF-8 fixed as well. 1.09 2001.08.22 - Fixed multiple typo occurrences of 'uft' where 'utf' was meant in code. Problem affected utf16 and utf7 encodings. Problem found by devon smith 1.08 2000.11.06 - Added 'utf8_charset_alias' function to allow for runtime setting of character set aliases. Added several alternate names for 'sjis' (shiftjis, shift-jis, shift_jis, s-jis, and s_jis).
Corrected 'croak' messages for 'from_utf8' functions to appropriate function name. Tightened up initialization encapsulation. Corrected fatal problem in jcode from unicode internals. Problem and fix found by Brian Wisti. 1.07 2000.11.01 - Added 'croak' to use Carp declaration to fix error messages. Problem and fix found by Brian Wisti. 1.06 2000.10.30 - Fix to handle change in stringification of overloaded objects between Perl 5.005 and 5.6. Problem noticed by Brian Wisti. 1.05 2000.10.23 - Error in conversions from UTF8 to multibyte encodings corrected 1.04 2000.10.23 - Additional diagnostic messages added for internal error conditions 1.03 2000.10.22 - Bug fix for load time autodetection of Unicode::Map8 encodings 1.02 2000.10.22 - Added load time autodetection of Unicode::Map8 supported character set encodings. Fixed internal calling error for some character sets with 'from_utf8'. Thanks go to Ilia Lobsanov for reporting this problem. 1.01 2000.10.02 - Fixed handling of empty strings and added more identification for error messages. 1.00 2000.09.29 - Pre-release version =head1 FUNCTIONS =over 4 =item utf8_charset_alias({ $alias => $charset }); Used for runtime assignment of character set aliases. Called with no parameters, returns a reference to a hash of defined aliases and the character sets they map to. Example: my $aliases = utf8_charset_alias; my @alias_names = keys %$aliases; If called with ONE parameter, returns the name of the 'real' charset if the alias is defined. Returns undef if it is not found in the aliases. Example: if (! utf8_charset_alias('VISCII')) { # No alias for this } If called with a list of 'alias' => 'charset' pairs, defines those aliases for use. Example: utf8_charset_alias({ 'japanese' => 'sjis', 'japan' => 'sjis' }); Note: It will croak if a passed pair does not map to a character set defined in the predefined set of character encodings. It is NOT allowed to alias something to another alias.
Multiple character set aliases can be set with a single call. To clear an alias, pass a character set mapping of undef. Example: utf8_charset_alias({ 'japanese' => undef }); While an alias is set, the 'utf8_supported_charset' function will return the alias as if it were a predefined charset. Overriding a base defined character encoding with an alias will generate a warning message to STDERR. =back =over 4 =item utf8_supported_charset($charset_name); Returns true if the named charset is supported (including user defined aliases). Returns false if it is not. Example: if (! utf8_supported_charset('VISCII')) { # No support yet } If called in a list context with no parameters, it will return a list of all supported character set names (including user defined aliases). Example: my @charsets = utf8_supported_charset; =back =over 4 =item to_utf8({ -string => $string, -charset => $source_charset }); Returns the string converted to UTF8 from the specified source charset. =back =over 4 =item from_utf8({ -string => $string, -charset => $target_charset}); Returns the string converted from UTF8 to the specified target charset. =back =head1 VERSION 1.11 2005.10.10 =head1 TODO Regression tests for Jcode, 2-byte encodings and encoding aliases =head1 SEE ALSO L<Unicode::String> L<Unicode::Map8> L<Unicode::Map> L<Jcode> L<Encode> =head1 COPYRIGHT Copyright 2000-2005, Benjamin Franz. All rights reserved. =head1 AUTHOR Benjamin Franz =head1 LICENSE This program is free software; you can redistribute it and/or modify it under the same terms and conditions as Perl itself. This means that you can, at your option, redistribute it and/or modify it under either the terms of the GNU Public License (GPL) version 1 or later, or under the Perl Artistic License. See http://dev.perl.org/licenses/ =head1 DISCLAIMER THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
Use of this software in any way or in any form, source or binary, is not allowed in any country which prohibits disclaimers of any implied warranties of merchantability or fitness for a particular purpose or any disclaimers of a similar nature. IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE =cut Unicode-MapUTF8-1.11/t/0000775000076400007640000000000010322472527016163 5ustar snowharesnowhare00000000000000Unicode-MapUTF8-1.11/t/97_distribution.t0000664000076400007640000000062010322472527021404 0ustar snowharesnowhare00000000000000use strict; use lib ('./blib','../blib', './lib', '../lib'); eval { require Test::More; }; if ($@) { $|++; print "1..0 # Skipped: Test::More required for testing distribution\n"; exit; } eval { require Test::Distribution; }; if ($@) { Test::More::plan( skip_all => 'Test::Distribution not installed' ); } Test::Distribution->import('only' => [qw(prereq sig description)]); Unicode-MapUTF8-1.11/t/98_pod_coverage.t0000664000076400007640000000110310322472527021320 0ustar snowharesnowhare00000000000000use strict; use lib ('./blib','../blib', './lib', '../lib'); eval { require Test::More; }; if ($@) { $|++; print "1..0 # Skipped: Test::More required for testing POD coverage\n"; exit; } eval { require Test::Pod::Coverage; }; if ($@ or (not defined $Test::Pod::Coverage::VERSION) or ($Test::Pod::Coverage::VERSION < 1.06)) { Test::More::plan (skip_all => "Test::Pod::Coverage 1.06 required for testing POD coverage"); exit; } Test::More::plan (tests => 1); Test::Pod::Coverage::pod_coverage_ok( 'Unicode::MapUTF8', { also_private => ['DEBUG'] }); Unicode-MapUTF8-1.11/t/01_unicode_maputf8.t0000664000076400007640000002146510322472527021752 0ustar snowharesnowhare00000000000000#!/usr/bin/perl -w use strict; use lib 
('./blib','../blib','../lib','./lib'); #use bytes; use Unicode::MapUTF8 qw(utf8_supported_charset to_utf8 from_utf8 utf8_charset_alias); # General info for writing test modules: # # When running as 'make test' the default # working directory is the one _above_ the # 't/' directory. my @do_tests=(1..5); my $test_subs = { 1 => { -code => \&test1, -desc => ' eight-bit ' }, 2 => { -code => \&test2, -desc => ' unicode ' }, 3 => { -code => \&test3, -desc => ' multi-byte ' }, 4 => { -code => \&test4, -desc => ' jcode ' }, 5 => { -code => \&test5, -desc => ' charset aliases ' }, # 6 => { -code => \&big5_with_embedded_ascii, -desc => ' big5 embedded ascii ' }, }; my @charsets = utf8_supported_charset; print $do_tests[0],'..',$do_tests[$#do_tests],"\n"; print STDERR "\n"; my $n_failures = 0; foreach my $test (@do_tests) { my $sub = $test_subs->{$test}->{-code}; my $desc = $test_subs->{$test}->{-desc}; my $failure = ''; eval { $failure = &$sub; }; if ($@) { $failure = $@; } if ($failure ne '') { chomp $failure; print "not ok $test\n"; print STDERR " $desc - $failure\n"; $n_failures++; } else { print "ok $test\n"; print STDERR " $desc - ok\n"; } } print "END\n"; exit; ######################################## # Eight bit conversions # ######################################## sub test1 { my $charset = 'ISO-8859-1'; my $source_string = 'Hello World'; my $utf8_string = 'Hello World'; my $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); return $result if ($result ne ''); $source_string = ''; $utf8_string = ''; $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); return $result if ($result ne ''); return ''; } ######################################## # Unicode conversions # ######################################## sub test2 { my $charset = 'UCS2'; my $source_string = "\x00H\x00e\x00l\x00l\x00o\x00 \x00W\x00o\x00r\x00l\x00d"; my $utf8_string = 'Hello World'; my $result = test_general({ 
-charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); return $result if ($result ne ''); $source_string = ''; $utf8_string = ''; $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); return $result if ($result ne ''); return ''; } ######################################## # Multibyte conversions # ######################################## sub test3 { return ''; } ######################################## # Japanese (Jcode) conversions # ######################################## sub test4 { my $charset = 'euc-jp'; my $source_string = "Hello World"; my $utf8_string = 'Hello World'; my $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); return $result if ($result ne ''); $source_string = ''; $utf8_string = ''; $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); return $result if ($result ne ''); return ''; } ######################################## # Charset aliases # ######################################## sub test5 { my $charset='ISO-8859-1'; my $alias ='latin-1_sort_of'; eval { utf8_charset_alias({ $alias => $charset }); }; if ($@) { return "$@" } eval { my $aliased = utf8_charset_alias($alias); if ((! 
defined $aliased) || (lc($charset) ne lc($aliased))) { die("Alias crosscheck for '$alias' -> '$charset' returned a *different* charset of '$aliased'"); } }; if ($@) { return "Failed to alias character set '$charset' to '$alias': $@" } $charset = $alias; my $source_string = 'Hello World'; my $utf8_string = 'Hello World'; my $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); return $result if ($result ne ''); $source_string = ''; $utf8_string = ''; $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); return $result if ($result ne ''); eval { utf8_charset_alias({ $alias => undef }); }; if ($@) { return "$@" } $source_string = 'Hello World'; $utf8_string = 'Hello World'; eval { my $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); }; if (! defined $@) { return "Failed to catch use of non-aliased charset"; } return ''; } ######################################## # Test Big5 with embedded ASCII # ######################################## sub big5_with_embedded_ascii { my $charset = 'big5'; my @errors = (); { my $source_string = "\xa5\x40\xa5\x41\x30"; my $utf8_string = to_utf8({ -charset => "ucs2", -string => "\x4e\x16\x4e\x15\x00\x30"}); my $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); push(@errors,$result) if ($result ne ''); } { my $source_string = "\xa5\x40\xa5\x41\x30\xa5\x30\x41\xa5\x40"; my $utf8_string = to_utf8({ -charset => "ucs2", -string => "\x4e\x16\x4e\x15\x00\x30\x00\x41\x4e\x16"}); my $result = test_general({ -charset => $charset, -source => $source_string, -utf8 => $utf8_string, }); push(@errors,$result) if ($result ne ''); } if (0 < @errors) { return join('',@errors); } return ''; } ######################################## # Generalized test framework # ######################################## sub test_general { my ($parms) = shift; my $source_charset = 
$parms->{-charset}; my $source_string = $parms->{-source}; my $utf8_string = $parms->{-utf8}; eval { my $result_string = to_utf8({ -string => $source_string, -charset => $source_charset }); if ($utf8_string ne $result_string) { die ('(line ' . __LINE__ . ") conversion from '$source_charset' to UTF8 resulted in unexpected output.\nExpected '" . hexout($utf8_string) . "' but got '" . hexout($result_string) . "'\n"); } }; if ($@) { return "Failed to convert UTF8 text to $source_charset:\n$@" } eval { my $result_string = from_utf8({ '-string' => $utf8_string, '-charset' => $source_charset, }); if ($source_string ne $result_string) { die ("conversion from UTF8 to '$source_charset' resulted in unexpected output.\nExpected '" . hexout($source_string) . "' but got '" . hexout($result_string) . "'\n"); } }; if ($@) { return "Failed to convert '$source_charset' text to UTF8: $@" } eval { my $result_string = from_utf8({ -string => $source_string, -charset => $source_charset, }); if ($source_string ne to_utf8({ -string => $result_string, -charset => $source_charset })) { die ("input and output strings differed"); } }; if ($@) { return "Round trip conversion of '$source_charset' to UTF8 failed: $@" } return ''; } sub hexout { my ($string) = @_; $string =~ s/([\x00-\xff])/unpack("H",$1).unpack("h",$1)/egos; return $string; } Unicode-MapUTF8-1.11/t/99_pod.t0000664000076400007640000000057410322472527017461 0ustar snowharesnowhare00000000000000eval { require Test::More; }; if ($@) { $|++; print "1..0 # Skipped: Test::More required for testing POD. 
Skipping.\n"; exit; } eval { require Test::Pod; }; if ($@ or (not defined $Test::Pod::VERSION) or ($Test::Pod::VERSION < 1.00)) { Test::More::plan (skip_all => "Test::Pod 1.00 required for testing POD"); exit; } Test::Pod::all_pod_files_ok(); Unicode-MapUTF8-1.11/META.yml0000664000076400007640000000062010322472527017167 0ustar snowharesnowhare00000000000000--- name: Unicode-MapUTF8 version: 1.11 author: - Benjamin Franz abstract: Conversions to and from arbitrary character sets and UTF8 license: perl requires: Carp: 0 File::Copy: 0 Jcode: 0 Unicode::Map: 0 Unicode::Map8: 0 Unicode::String: 0 provides: Unicode::MapUTF8: file: lib/Unicode/MapUTF8.pm version: 1.11 generated_by: Module::Build version 0.261 Unicode-MapUTF8-1.11/Changes0000664000076400007640000000521010322472527017211 0ustar snowharesnowhare00000000000000Unicode::MapUTF8 - Conversions to and from arbitrary character sets and UTF8 1.11 2005.10.10 Documentation changes. Addition of Build.PL support. Added various build tests, LICENSE, Artistic_License.txt, GPL_License.txt. Split documentation into seperate .pod file. Added Japanese translation of POD. 1.10 2005.05.22 - Fixed bug in conversion of ISO-2022-JP to UTF-8. Problem and fix found by Masahiro HONMA . Similar bugs in conversions of shift_jis and euc-jp to UTF-8 corrected as well. 1.09 2001.08.22 - Fixed multiple typo occurances of 'uft' where 'utf' was meant in code. Problem affected utf16 and utf7 encodings. Problem found by devon smith 1.08 2000.11.06 Added 'utf8_charset_alias' function to allow for runtime setting of character set aliases. Added several alternate names for 'sjis' (shiftjis, shift-jis, shift_jis, s-jis, and s_jis). Corrected 'croak' messages for 'from_utf8' functions to appropriate function name. Corrected fatal problem in jcode-unicode internals. Problem and fix found by Brian Wisti . 1.07 2000.11.01 Added 'croak' to use Carp declaration to fix error messages. Problem and fix found by . 
1.06 2000.10.30 Fix to handle change in stringification of overloaded objects between Perl 5.005 and 5.6. Problem noticed by Brian Wisti . 1.05 2000.10.23 Error in conversions from UTF8 to multibyte encodings corrected 1.04 2000.10.23 Additional diagnostic error messages added for internal errors 1.03 2000.10.22 Bug fix for load time Unicode::Map encoding detection 1.02 2000.10.22 Bug fix to 'from_utf8' method and load time detection of Unicode::Map8 supported character set encodings 1.01 2000.10.02 Initial public release Provides an adapter layer between core routines for converting to and from UTF8 and other encodings. In essence, a way to give multiple existing Unicode modules a single common interface so you don't have to know the underlaying implementations to do simple UTF8 to-from other character set string conversions. As such, it wraps the Unicode::String, Unicode::Map8, Unicode::Map and Jcode modules in a standardized and simple API. To install: perl Makefile.PL make make test make install Unicode-MapUTF8-1.11/MANIFEST0000664000076400007640000000037110322472527017052 0ustar snowharesnowhare00000000000000Build.PL Changes MANIFEST Makefile.PL README lib/Unicode/MapUTF8.pm lib/Unicode/MapUTF8.pod pod/MapUTF8.ja_JP.utf8.pod pod/MapUTF8.ja_JP.eucjp.pod pod/MapUTF8.en.pod t/01_unicode_maputf8.t t/97_distribution.t t/98_pod_coverage.t t/99_pod.t META.yml Unicode-MapUTF8-1.11/Build.PL0000664000076400007640000000231710322472527017217 0ustar snowharesnowhare00000000000000use Module::Build; use File::Copy qw(copy); my $lang = defined($ENV{'LANG'}) ? 
$ENV{'LANG'} : 'en';

# File::Spec is used below but is not among this script's own 'use' lines,
# so load it explicitly.
require File::Spec;

# Select the POD translation that matches the user's locale and install it
# as lib/Unicode/MapUTF8.pod. Anything that is not a recognised Japanese
# locale gets the English documentation.
my $target_pod = File::Spec->catfile('lib','Unicode','MapUTF8.pod');
my $source_pod;
# The dots are escaped so that they only match a literal '.' in the locale.
if ($lang =~ m/^(?:ja|ja_JP|ja_JP\.utf-8|ja_JP\.utf8|ja\.utf8|ja\.utf-8)$/i) {
    $source_pod = File::Spec->catfile('pod','MapUTF8.ja_JP.utf8.pod');
}
elsif ($lang =~ m/^(?:ja_JP\.eucjp|ja_JP\.euc|ja_euc|ja_eucjp)$/i) {
    $source_pod = File::Spec->catfile('pod','MapUTF8.ja_JP.eucjp.pod');
}
else {
    $source_pod = File::Spec->catfile('pod','MapUTF8.en.pod');
}
# A failed copy is reported but does not abort the build: the step stays
# best-effort, it is just no longer silent.
copy($source_pod, $target_pod)
    or warn "Could not copy '$source_pod' to '$target_pod': $!\n";

my $build = Module::Build->new
      ( module_name    => 'Unicode::MapUTF8',
        dist_author    => 'Benjamin Franz ',
        dist_abstract  => 'Conversions to and from arbitrary character sets and UTF8',
        license        => 'perl',
        requires       => {
            'Carp'            => 0,
            'File::Copy'      => 0,
            'Unicode::Map'    => 0,
            'Unicode::String' => 0,
            'Unicode::Map8'   => 0,
            'Jcode'           => 0,
        },
        build_requires => { },
      )->create_build_script;
Unicode-MapUTF8-1.11/Makefile.PL0000664000076400007640000000340510322472527017674 0ustar snowharesnowhare00000000000000use ExtUtils::MakeMaker;
use File::Spec;
use File::Copy qw (copy);

# See lib/ExtUtils/MakeMaker.pm for details of how to influence
# the contents of the Makefile that is written.

my $lang = defined($ENV{'LANG'}) ?
$ENV{'LANG'} : 'en';

# Select the POD translation that matches the user's locale and install it
# as lib/Unicode/MapUTF8.pod. Anything that is not a recognised Japanese
# locale gets the English documentation.
my $target_pod = File::Spec->catfile('lib','Unicode','MapUTF8.pod');
my $source_pod;
# The dots are escaped so that they only match a literal '.' in the locale.
if ($lang =~ m/^(?:ja|ja_JP|ja_JP\.utf-8|ja_JP\.utf8|ja\.utf8|ja\.utf-8)$/i) {
    $source_pod = File::Spec->catfile('pod','MapUTF8.ja_JP.utf8.pod');
}
elsif ($lang =~ m/^(?:ja_JP\.eucjp|ja_JP\.euc|ja_euc|ja_eucjp)$/i) {
    $source_pod = File::Spec->catfile('pod','MapUTF8.ja_JP.eucjp.pod');
}
else {
    $source_pod = File::Spec->catfile('pod','MapUTF8.en.pod');
}
# A failed copy is reported but does not abort Makefile generation: the
# step stays best-effort, it is just no longer silent.
copy($source_pod, $target_pod)
    or warn "Could not copy '$source_pod' to '$target_pod': $!\n";

WriteMakefile(
    'NAME'         => 'Unicode::MapUTF8',
    'VERSION_FROM' => 'lib/Unicode/MapUTF8.pm',
    'linkext'      => { LINKTYPE=>'' },   # no link needed
    'dist'         => {'COMPRESS'=>'gzip -9f', 'SUFFIX' => 'gz',
                       'ZIP'=>'/usr/bin/zip','ZIPFLAGS'=>'-rl'},
    'PREREQ_PM'    => {
        'Carp'            => 0,
        'File::Copy'      => 0,
        'Unicode::Map'    => 0,
        'Unicode::String' => 0,
        'Unicode::Map8'   => 0,
        'Jcode'           => 0,
    },
    ($] >= 5.005 ?    ## Add these new keywords supported since 5.005
      (ABSTRACT_FROM => 'lib/Unicode/MapUTF8.pod', # retrieve abstract from module
       AUTHOR        => 'Benjamin Franz ') : ()),
    ($] >= 5.800 ? 
## Add these new keywords supported since 5.8 (NO_META => 1) : ()), PL_FILES => {}, ); Unicode-MapUTF8-1.11/pod/0000775000076400007640000000000010322472527016502 5ustar snowharesnowhare00000000000000Unicode-MapUTF8-1.11/pod/MapUTF8.ja_JP.utf8.pod0000664000076400007640000002266210322472527022251 0ustar snowharesnowhare00000000000000 =head1 名前 Unicode::MapUTF8 - 任意の文字セットから/へのUTF8の変換 =head1 概要 use Unicode::MapUTF8 qw(to_utf8 from_utf8 utf8_supported_charset); # 'ISO-8859-1'の文字列を 'UTF8'に変換する my $output = to_utf8({ -string => 'An example', -charset => 'ISO-8859-1' }); # 'UTF8'エンコーディングの文字列をエンコーディング 'ISO-8859-1'へ変換する my $other = from_utf8({ -string => 'Other text', -charset => 'ISO-8859-1' }); # 利用できる文字セットエンコーディングのリスト my @character_sets = utf8_supported_charset; # 文字セット別名の追加 utf8_charset_alias({ 'ms-japanese' => 'sjis' }); # 2つの任意の(しかし大きく互換性のある)文字セットエンコーディングでの変換 # (SJIS to EUC-JP) my $utf8_string = to_utf8({ -string =>$sjis_string, -charset => 'sjis'}); my $euc_jp_string = from_utf8({ -string => $utf8_string, -charset => 'euc-jp' }) # 特定の文字セットがサポートされているかを確認 if (utf8_supported_charset('ISO-8859-1') { # Yes } =head1 説明 UTF8とその他のエンコーディングから/への変換のための中核となるルーチンの間のアダプタ層を提供します。本質的には、複数存在する Unicodeモジュールへの1つの共通のインターフェース、下敷きになっている実装を知ることなく、簡単にUTF8から/へ他の文字セットエンコーディ ング変換を簡単におこないます。そのため、これはUnicode::String、Unicode::Map8、Unicode::Map、Jcodeモ ジュールを標準化された簡単なAPIの中に包みます。 またこれはUTF-8をベースに一般的な文字セット変換も提供します−これは2段階の変換つなげることにより、2つの互換性があり、サポートされている文字セットで可能です。 ほとんどのことがPerl的なので−かみ砕く対象として数多くの小さいなものの代わりに2、3の大きな固まりを与えると、1秒間により多くの文字を扱います。 設計では、登場するいかなる新しい文字セット・エンコーディング変換モジュールも取り込むように簡単に拡張することができます。 =head1 変更点 (原文のまま) 1.11 2005.10.10 Documentation changes. Addition of Build.PL support. Added various build tests, LICENSE, Artistic_License.txt, GPL_License.txt. Split documentation into seperate .pod file. Added Japanese translation of POD. 1.10 2005.05.22 - Fixed bug in conversion of ISO-2022-JP to UTF-8. Problem and fix found by Masahiro HONMA . 
Similar bugs in conversions of shift_jis and euc-jp to UTF-8 fixed as well. 1.09 2001.08.22 - Fixed multiple typo occurances of 'uft' where 'utf' was meant in code. Problem affected utf16 and utf7 encodings. Problem found by devon smith 1.08 2000.11.06 - Added 'utf8_charset_alias' function to allow for runtime setting of character set aliases. Added several alternate names for 'sjis' (shiftjis, shift-jis, shift_jis, s-jis, and s_jis). Corrected 'croak' messages for 'from_utf8' functions to appropriate function name. Tightened up initialization encapsulation Corrected fatal problem in jcode from unicode internals. Problem and fix found by Brian Wisti . 1.07 2000.11.01 - Added 'croak' to use Carp declaration to fix error messages. Problem and fix found by Brian Wisti . 1.06 2000.10.30 - Fix to handle change in stringification of overloaded objects between Perl 5.005 and 5.6. Problem noticed by Brian Wisti . 1.05 2000.10.23 - Error in conversions from UTF8 to multibyte encodings corrected 1.04 2000.10.23 - Additional diagnostic messages added for internal error conditions 1.03 2000.10.22 - Bug fix for load time autodetction of Unicode::Map8 encodings 1.02 2000.10.22 - Added load time autodetection of Unicode::Map8 supported character set encodings. Fixed internal calling error for some character sets with 'from_utf8'. Thanks goes to Ilia Lobsanov for reporting this problem. 1.01 2000.10.02 - Fixed handling of empty strings and added more identification for error messages. 1.00 2000.09.29 - Pre-release version =head1 関数 =over =item utf8_charset_alias({ $alias => $charset }); 文字セット別名の実行時の代入に使われます。 引数なしで呼ばれると、定義されている別名と、それにマップされる文字セットのハッシュを返します。 例:: my $aliases = utf8_charset_alias; my @alias_names = keys %$aliases; 1つのパラメータ付きで呼ばれると、もしその別名が定義されていれば、'本当の'文字セットの名前を返します。それが別名に見つからなければundefを返します。 例: if (! 
utf8_charset_alias('VISCII')) { # No alias for this } もし'alias' => 'charset'の組のリストで呼ばれれば、それらの別名が使えるように定義します。 例: utf8_charset_alias({ 'japanese' => 'sjis', 'japan' => 'sjis' }); 注意:渡された組が、予め定義されている文字セット・エンコーディングの集合に定義されている文字セットにマップされなければ、croakします。これは他の別名への別名を許してはいません。 複数の文字セットを1回の呼出しで設定することができます。 別名をクリーンするためには、undefの文字セットマッピングを渡します。 例: utf8_charset_alias({ 'japanese' => undef }); 別名が設定されている間、もし予め定義されている文字セットであれば、'utf8_supported_charset' 関数は別名を返します。 基本の定義された文字エンコーディングを別名でオーバーライドすると、標準エラー(STDERR)への警告メッセージを出します。 =back =over =item utf8_supported_charset($charset_name); (ユーザが定義した別名も含めて)名づけられた文字セットがサポートされていればtrueを返します。

そうでなければfalseを返します。 例: if (! utf8_supported_charset('VISCII')) { # まだサポートされていません } パラメータなしで、リスト・コンテキストで呼ばれると、(ユーザが定義した別名も含めて)サポートされているすべての文字セット名のリストを返します。 例: my @charsets = utf8_supported_charset; =back =over =item to_utf8({ -string => $string, -charset => $source_charset }); 指定された元の文字セット(source charset)からUTF8に変換された文字列を返します。 =back =over =item from_utf8({ -string => $string, -charset => $target_charset}); UTF8から指定されたターゲットの文字セット(target charset)に変換された文字列を返します。 =back =head1 バージョン 1.11 - 2005.10.10 =head1 やるべきこと Jcode、2バイト・エンコーディングそしてエンコーディング別名のための逆行テスト =head1 参考資料 L L L L =head1 著作権 Copyright 2000-2005, Benjamin Franz. All rights reserved. =head1 作者 Benjamin Franz =head1 クレジット 川合孝典 "Kawai,Takanori" - 邦訳 =head1 ライセンス Perl と同じライセンス( Artistic License と GPL のデュアルライセンス) (原文のまま) This program is free software; you can redistribute it and/or modify it under the same terms and conditions as Perl itself. This means that you can, at your option, redistribute it and/or modify it under either the terms the GNU Public License (GPL) version 1 or later, or under the Perl Artistic License. See http://dev.perl.org/licenses/ =head1 DISCLAIMER (原文のまま) THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Use of this software in any way or in any form, source or binary, is not allowed in any country which prohibits disclaimers of any implied warranties of merchantability or fitness for a particular purpose or any disclaimers of a similar nature. 
IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE =cut Unicode-MapUTF8-1.11/pod/MapUTF8.en.pod0000664000076400007640000002053310322472527020776 0ustar snowharesnowhare00000000000000 =head1 NAME Unicode::MapUTF8 - Conversions to and from arbitrary character sets and UTF8 =head1 SYNOPSIS use Unicode::MapUTF8 qw(to_utf8 from_utf8 utf8_supported_charset utf8_charset_alias); # Convert a string in 'ISO-8859-1' to 'UTF8' my $output = to_utf8({ -string => 'An example', -charset => 'ISO-8859-1' }); # Convert a string in 'UTF8' encoding to encoding 'ISO-8859-1' my $other = from_utf8({ -string => 'Other text', -charset => 'ISO-8859-1' }); # List available character set encodings my @character_sets = utf8_supported_charset; # Add a character set alias utf8_charset_alias({ 'ms-japanese' => 'sjis' }); # Convert between two arbitrary (but largely compatible) charset encodings # (SJIS to EUC-JP) my $utf8_string = to_utf8({ -string =>$sjis_string, -charset => 'sjis'}); my $euc_jp_string = from_utf8({ -string => $utf8_string, -charset => 'euc-jp' }); # Verify that a specific character set is supported if (utf8_supported_charset('ISO-8859-1')) { # Yes } =head1 DESCRIPTION Provides an adapter layer between core routines for converting to and from UTF8 and other encodings. In essence, a way to give multiple existing Unicode modules a single common interface so you don't have to know the underlying implementations to do simple UTF8 to-from other character set encoding conversions. As such, it wraps the Unicode::String, Unicode::Map8, Unicode::Map and Jcode modules in a standardized and simple API. 
This also provides a general character set conversion operation based on UTF8 - it is possible to convert between any two compatible and supported character sets via a simple two-step chaining of conversions. As with most things Perlish - if you give it a few big chunks of text to chew on instead of lots of small ones it will handle many more characters per second. By design, it can be easily extended to encompass any new charset encoding conversion modules that arrive on the scene. This module is intended to provide good Unicode support to versions of Perl prior to 5.8. If you are using Perl 5.8.0 or later, you probably want to be using the Encode module instead. This module B<does> work with Perl 5.8, but Encode is the preferred method in that environment. =head1 CHANGES 1.11 2005.10.10 Documentation changes. Addition of Build.PL support. Added various build tests, LICENSE, Artistic_License.txt, GPL_License.txt. Split documentation into separate .pod file. Added Japanese translation of POD. 1.10 2005.05.22 - Fixed bug in conversion of ISO-2022-JP to UTF-8. Problem and fix found by Masahiro HONMA. Similar bugs in conversions of shift_jis and euc-jp to UTF-8 fixed as well. 1.09 2001.08.22 - Fixed multiple typo occurrences of 'uft' where 'utf' was meant in code. Problem affected utf16 and utf7 encodings. Problem found by devon smith. 1.08 2000.11.06 - Added 'utf8_charset_alias' function to allow for runtime setting of character set aliases. Added several alternate names for 'sjis' (shiftjis, shift-jis, shift_jis, s-jis, and s_jis). Corrected 'croak' messages for 'from_utf8' functions to appropriate function name. Tightened up initialization encapsulation. Corrected fatal problem in jcode from unicode internals. Problem and fix found by Brian Wisti. 1.07 2000.11.01 - Added 'croak' to use Carp declaration to fix error messages. Problem and fix found by Brian Wisti. 1.06 2000.10.30 - Fix to handle change in stringification of overloaded objects between Perl 5.005 and 5.6. 
Problem noticed by Brian Wisti. 1.05 2000.10.23 - Error in conversions from UTF8 to multibyte encodings corrected 1.04 2000.10.23 - Additional diagnostic messages added for internal error conditions 1.03 2000.10.22 - Bug fix for load time autodetection of Unicode::Map8 encodings 1.02 2000.10.22 - Added load time autodetection of Unicode::Map8 supported character set encodings. Fixed internal calling error for some character sets with 'from_utf8'. Thanks go to Ilia Lobsanov for reporting this problem. 1.01 2000.10.02 - Fixed handling of empty strings and added more identification for error messages. 1.00 2000.09.29 - Pre-release version =head1 FUNCTIONS =over 4 =item utf8_charset_alias({ $alias => $charset }); Used for runtime assignment of character set aliases. Called with no parameters, returns a reference to a hash of defined aliases and the character sets they map to. Example: my $aliases = utf8_charset_alias; my @alias_names = keys %$aliases; If called with ONE parameter, returns the name of the 'real' charset if the alias is defined. Returns undef if it is not found in the aliases. Example: if (! utf8_charset_alias('VISCII')) { # No alias for this } If called with a list of 'alias' => 'charset' pairs, defines those aliases for use. Example: utf8_charset_alias({ 'japanese' => 'sjis', 'japan' => 'sjis' }); Note: It will croak if a passed pair does not map to a character set defined in the predefined set of character encodings. It is NOT allowed to alias something to another alias. Multiple character set aliases can be set with a single call. To clear an alias, pass a character set mapping of undef. Example: utf8_charset_alias({ 'japanese' => undef }); While an alias is set, the 'utf8_supported_charset' function will treat the alias as if it were a predefined charset. Overriding a base defined character encoding with an alias will generate a warning message to STDERR. 
=back =over 4 =item utf8_supported_charset($charset_name); Returns true if the named charset is supported (including user-defined aliases). Returns false if it is not. Example: if (! utf8_supported_charset('VISCII')) { # No support yet } If called in a list context with no parameters, it will return a list of all supported character set names (including user-defined aliases). Example: my @charsets = utf8_supported_charset; =back =over 4 =item to_utf8({ -string => $string, -charset => $source_charset }); Returns the string converted to UTF8 from the specified source charset. =back =over 4 =item from_utf8({ -string => $string, -charset => $target_charset}); Returns the string converted from UTF8 to the specified target charset. =back =head1 VERSION 1.11 2005.10.10 =head1 TODO Regression tests for Jcode, 2-byte encodings and encoding aliases =head1 SEE ALSO L<Unicode::String> L<Unicode::Map8> L<Unicode::Map> L<Jcode> L<Encode> =head1 COPYRIGHT Copyright 2000-2005, Benjamin Franz. All rights reserved. =head1 AUTHOR Benjamin Franz =head1 LICENSE This program is free software; you can redistribute it and/or modify it under the same terms and conditions as Perl itself. This means that you can, at your option, redistribute it and/or modify it under either the terms the GNU Public License (GPL) version 1 or later, or under the Perl Artistic License. See http://dev.perl.org/licenses/ =head1 DISCLAIMER THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Use of this software in any way or in any form, source or binary, is not allowed in any country which prohibits disclaimers of any implied warranties of merchantability or fitness for a particular purpose or any disclaimers of a similar nature. 
IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE =cut Unicode-MapUTF8-1.11/pod/MapUTF8.ja_JP.eucjp.pod0000664000076400007640000002027210322472527022464 0ustar snowharesnowhare00000000000000 =head1 ̾ Unicode::MapUTF8 - ǤդʸåȤ顿ؤUTF8Ѵ =head1 use Unicode::MapUTF8 qw(to_utf8 from_utf8 utf8_supported_charset); # 'ISO-8859-1'ʸ 'UTF8'Ѵ my $output = to_utf8({ -string => 'An example', -charset => 'ISO-8859-1' }); # 'UTF8'󥳡ǥ󥰤ʸ򥨥󥳡ǥ 'ISO-8859-1'Ѵ my $other = from_utf8({ -string => 'Other text', -charset => 'ISO-8859-1' }); # ѤǤʸåȥ󥳡ǥ󥰤Υꥹ my @character_sets = utf8_supported_charset; # ʸå̾ɲ utf8_charset_alias({ 'ms-japanese' => 'sjis' }); # ĤǤդΡʤ礭ߴΤʸåȥ󥳡ǥ󥰤ǤѴ # (SJIS to EUC-JP) my $utf8_string = to_utf8({ -string =>$sjis_string, -charset => 'sjis'}); my $euc_jp_string = from_utf8({ -string => $utf8_string, -charset => 'euc-jp' }) # ʸåȤݡȤƤ뤫ǧ if (utf8_supported_charset('ISO-8859-1') { # Yes } =head1 UTF8Ȥ¾Υ󥳡ǥ󥰤顿ؤѴΤˤȤʤ롼δ֤Υץؤ󶡤ޤܼŪˤϡʣ¸ߤ Unicode⥸塼ؤΣĤζ̤Υ󥿡եߤˤʤäƤΤ뤳ȤʤñUTF8顿¾ʸåȥ󥳡ǥ ѴñˤʤޤΤᡢUnicode::StringUnicode::Map8Unicode::MapJcode 塼ɸಽ줿ñAPIߤޤ ޤUTF-8١˰ŪʸåѴ󶡤ޤݤϣʳѴĤʤ뤳ȤˤꡢĤθߴꡢݡȤƤʸåȤDzǽǤ ۤȤɤΤȤPerlŪʤΤǡݤߺդоݤȤƿ¿ξʤΤˣ礭ʸǤޤͿȡ1ô֤ˤ¿ʸ򰷤ޤ ߷פǤϡо줹뤤ʤ뿷ʸåȡ󥳡ǥѴ⥸塼褦˴ñ˳ĥ뤳ȤǤޤ =head1 ѹ ʸʸΤޤޡ 1.11 2005.10.10 Documentation changes. Addition of Build.PL support. Added various build tests, LICENSE, Artistic_License.txt, GPL_License.txt. Split documentation into seperate .pod file. Added Japanese translation of POD. 1.10 2005.05.22 - Fixed bug in conversion of ISO-2022-JP to UTF-8. Problem and fix found by Masahiro HONMA . Similar bugs in conversions of shift_jis and euc-jp to UTF-8 fixed as well. 1.09 2001.08.22 - Fixed multiple typo occurances of 'uft' where 'utf' was meant in code. Problem affected utf16 and utf7 encodings. 
Problem found by devon smith 1.08 2000.11.06 - Added 'utf8_charset_alias' function to allow for runtime setting of character set aliases. Added several alternate names for 'sjis' (shiftjis, shift-jis, shift_jis, s-jis, and s_jis). Corrected 'croak' messages for 'from_utf8' functions to appropriate function name. Tightened up initialization encapsulation Corrected fatal problem in jcode from unicode internals. Problem and fix found by Brian Wisti . 1.07 2000.11.01 - Added 'croak' to use Carp declaration to fix error messages. Problem and fix found by Brian Wisti . 1.06 2000.10.30 - Fix to handle change in stringification of overloaded objects between Perl 5.005 and 5.6. Problem noticed by Brian Wisti . 1.05 2000.10.23 - Error in conversions from UTF8 to multibyte encodings corrected 1.04 2000.10.23 - Additional diagnostic messages added for internal error conditions 1.03 2000.10.22 - Bug fix for load time autodetction of Unicode::Map8 encodings 1.02 2000.10.22 - Added load time autodetection of Unicode::Map8 supported character set encodings. Fixed internal calling error for some character sets with 'from_utf8'. Thanks goes to Ilia Lobsanov for reporting this problem. 1.01 2000.10.02 - Fixed handling of empty strings and added more identification for error messages. 1.00 2000.09.29 - Pre-release version =head1 ؿ =over =item utf8_charset_alias({ $alias => $charset }); ʸå̾μ¹Ի˻Ȥޤ ʤǸƤФȡƤ̾ȡ˥ޥåפʸåȤΥϥå֤ޤ 㡧: my $aliases = utf8_charset_alias; my @alias_names = keys %$aliases; ĤΥѥ᡼դǸƤФȡ⤷̾ƤС''ʸåȤ֤̾ޤ줬̾˸Ĥʤundef֤ޤ 㡧 if (! utf8_charset_alias('VISCII')) { # No alias for this } ⤷'alias' => 'charset'ȤΥꥹȤǸƤФС̾Ȥ褦ޤ 㡧 utf8_charset_alias({ 'japanese' => 'sjis', 'japan' => 'sjis' }); աϤ줿ȤͽƤʸåȡ󥳡ǥ󥰤νƤʸåȤ˥ޥåפʤСcroakޤ¾̾ؤ̾ƤϤޤ ʣʸåȤ򣱲θƽФꤹ뤳ȤǤޤ ̾򥯥꡼󤹤뤿ˤϡundefʸåȥޥåԥ󥰤Ϥޤ 㡧 utf8_charset_alias({ 'japanese' => undef }); ̾ꤵƤ֡⤷ͽƤʸåȤǤС'utf8_supported_charset' ؿ֤̾ޤ ܤ줿ʸ󥳡ǥ󥰤̾ǥС饤ɤȡɸ२顼STDERR)ؤηٹåФޤ =back =over =item utf8_supported_charset($charset_name); ʥ桼̾ޤơ̾Ť줿ʸåȤݡȤƤtrue֤ޤ

Ǥʤfalse֤ޤ 㡧 if (! utf8_supported_charset('VISCII')) { # ޤݡȤƤޤ } ѥ᡼ʤǡꥹȡƥȤǸƤФȡʥ桼̾ޤơ˥ݡȤƤ뤹٤Ƥʸå̾ΥꥹȤ֤ޤ 㡧 my @charsets = utf8_supported_charset; =back =over =item to_utf8({ -string => $string, -charset => $source_charset }); ꤵ줿ʸå(source charset)UTF8Ѵ줿ʸ֤ޤ =back =over =item from_utf8({ -string => $string, -charset => $target_charset}); UTF8ꤵ줿åȤʸå(target charset)Ѵ줿ʸ֤ޤ =back =head1 С 1.11 - 2005.10.10 =head1 ٤ Jcode2Хȡ󥳡ǥ󥰤ƥ󥳡ǥ̾Τεչԥƥ =head1 ͻ L L L L =head1  Copyright 2000-2005, Benjamin Franz. All rights reserved. =head1 Benjamin Franz =head1 쥸å 繧ŵ "Kawai,Takanori" - ˮ =head1 饤 Perl Ʊ饤󥹡 Artistic License GPL Υǥ奢饤󥹡 ʸʸΤޤޡ This program is free software; you can redistribute it and/or modify it under the same terms and conditions as Perl itself. This means that you can, at your option, redistribute it and/or modify it under either the terms the GNU Public License (GPL) version 1 or later, or under the Perl Artistic License. See http://dev.perl.org/licenses/ =head1 DISCLAIMER ʸʸΤޤޡ THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. Use of this software in any way or in any form, source or binary, is not allowed in any country which prohibits disclaimers of any implied warranties of merchantability or fitness for a particular purpose or any disclaimers of a similar nature. IN NO EVENT SHALL I BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF I HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE =cut Unicode-MapUTF8-1.11/README0000664000076400007640000000156610322472527016610 0ustar snowharesnowhare00000000000000Unicode::MapUTF8 - Conversions to and from arbitrary character sets and UTF8 Provides an adapter layer between core routines for converting to and from UTF8 and other encodings. 
In essence, a way to give multiple existing Unicode modules a single common interface so you don't have to know the underlying implementations to do simple UTF8 to-from other character set string conversions. As such, it wraps the Unicode::String, Unicode::Map8, Unicode::Map and Jcode modules in a standardized and simple API. Mainly intended for use with Perl 5.6 and 5.0, since starting with Perl 5.8 the Encode module is the preferred way of handling character set encodings. To install: perl Makefile.PL make make test make install Alternatively, if you have Module::Build installed, perl Build.PL ./Build ./Build test ./Build install See 'perldoc Unicode::MapUTF8' for the documentation.