Unicode-Japanese-0.47/ 0040755 0001761 0000144 00000000000 11311614444 013261 5 ustar hio users Unicode-Japanese-0.47/t/ 0040755 0001761 0000144 00000000000 11311614444 013524 5 ustar hio users Unicode-Japanese-0.47/t/hirakata.t 0100644 0001761 0000144 00000002315 11046013305 015465 0 ustar hio users
## convert hiragana <-> katakana
#
# hiragana : (A) U+3042, (I) U+3044, (U) U+3046
# e3.81.82 e3.81.84 e3.81.86
#
# katakana : (A) U+30a2, (I) U+30a4, (U) U+30a6
# e3.82.a2 e3.82.a4 e3.82.a6
#
use strict;
use Test;
use Carp;
#no I18N::Japanese;
use Unicode::Japanese qw(no_I18N_Japanese);
use lib 't';
require 'esc.pl';
Unicode::Japanese->new();
print "[$Unicode::Japanese::xs_loaderror]\n";
BEGIN { plan tests => 2*2 }
my $string;
$]>=5.008 and eval('use bytes'), $@ && die $@;
my $kata_AIU = "\xe3\x82\xa2\xe3\x82\xa4\xe3\x82\xa6";
my $hira_AIU = "\xe3\x81\x82\xe3\x81\x84\xe3\x81\x86";
# hiragana(A I U) -> katakana(A I U)
# (xs)
$string = Unicode::Japanese->new($hira_AIU);
$string->hira2kata();
ok(escfull($string->utf8()), escfull($kata_AIU));
# (pp)
$string = Unicode::Japanese::PurePerl->new($hira_AIU);
$string->hira2kata();
ok(escfull($string->utf8()), escfull($kata_AIU));
# katakana(A I U) -> hiragana(A I U)
# (xs)
$string = Unicode::Japanese->new($kata_AIU);
$string->kata2hira();
ok(escfull($string->utf8()), escfull($hira_AIU));
# (pp)
$string = Unicode::Japanese::PurePerl->new($kata_AIU);
$string->kata2hira();
ok(escfull($string->utf8()), escfull($hira_AIU));
Unicode-Japanese-0.47/t/jis.t 0100644 0001761 0000144 00000007401 11046013305 014467 0 ustar hio users ## ----------------------------------------------------------------------------
# t/jis.t
# -----------------------------------------------------------------------------
# $Id: jis.t 4635 2006-06-14 07:13:04Z hio $
# -----------------------------------------------------------------------------
use strict;
use Test;
BEGIN { plan tests => 20, };
# -----------------------------------------------------------------------------
# load module
use Unicode::Japanese;
use lib 't';
require 'esc.pl';
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
sub jisToUtf8_xs($){ tt($xs->set($_[0],'jis')->utf8()); }
sub jisToUtf8_pp($){ tt($pp->set($_[0],'jis')->utf8()); }
sub jisToSjis_xs($){ tt($xs->set($_[0],'jis')->sjis()); }
sub jisToSjis_pp($){ tt($pp->set($_[0],'jis')->sjis()); }
sub jisToJis_xs($){ tt($xs->set($_[0],'jis')->jis()); }
sub jisToJis_pp($){ tt($pp->set($_[0],'jis')->jis()); }
sub tt($){ escfull($_[0]) }
sub bin($){ escfull(pack("H*",join('',split(' ',$_[0])))); }
{
# ASCII : \e(B
#
my $test = "\e(B123ABC\e(B123";
my $correct = tt("123ABC123");
ok(jisToUtf8_xs($test),$correct,"escape to ASCII (xs)");
ok(jisToUtf8_pp($test),$correct,"escape to ASCII (pp)");
}
{
# jis.roman : \e(J
#
my $test = "\e(J123ABC\e(B123";
my $correct = tt("123ABC123");
ok(jisToUtf8_xs($test),$correct,"escape to jis.roman (xs)");
ok(jisToUtf8_pp($test),$correct,"escape to jis.roman (pp)");
}
{
# jis.kana : \e(I
#
my $test = "\e(I123ABC\e(B123";
my $correct_utf8 = bin("ef bd b1 ef bd b2 ef bd b3 ef be 81 ef be 82 ef be 83 31 32 33");
my $correct_sjis = bin("b1 b2 b3 c1 c2 c3 31 32 33");
ok(jisToSjis_xs($test),$correct_sjis,"escape to jis.kana (xs/sjis)");
ok(jisToSjis_pp($test),$correct_sjis,"escape to jis.kana (pp/sjis)");
ok(jisToUtf8_xs($test),$correct_utf8,"escape to jis.kana (xs/utf8)");
ok(jisToUtf8_pp($test),$correct_utf8,"escape to jis.kana (pp/utf8)");
}
{
# jis.kana(so/si)
#
my $test = "\x0e123ABC\x0f123";
my $correct = bin("ef bd b1 ef bd b2 ef bd b3 ef be 81 ef be 82 ef be 83 31 32 33");
#skip("so/si not supported yet",jisToUtf8_xs($test),$correct,"escape to jis.roman (xs)");
#skip("so/si not supported yet",jisToUtf8_pp($test),$correct,"escape to jis.roman (pp)");
}
{
# jis-c-6226-1979(old-JIS) : \e$@
# jis-x-0208-1983(new-JIS) : \e$B
# jis-x-0208-1990 : \e&@\e$B
my $test_old_jis = "\e\$\@!!\e(B";
my $test_new_jis = "\e\$B!!\e(B";
my $test_jis1990 = "\e&\@\e\$B!!\e(B";
my $correct = tt("\x81\x40");
ok(jisToSjis_xs($test_old_jis),$correct,"old-jis to sjis (xs)");
ok(jisToSjis_pp($test_old_jis),$correct,"old-jis to sjis (pp)");
ok(jisToSjis_xs($test_new_jis),$correct,"new-jis to sjis (xs)");
ok(jisToSjis_pp($test_new_jis),$correct,"new-jis to sjis (pp)");
ok(jisToSjis_xs($test_jis1990),$correct,"jis1990 to sjis (xs)");
ok(jisToSjis_pp($test_jis1990),$correct,"jis1990 to sjis (pp)");
}
{
# jis-x-0212-1990: \e$(D
#skip("jis-x-0212 not ready");
#skip("jis-x-0212 not ready");
my $test = "\e\$(D!!\e(B";
my $correct = tt("\x81\xac");
ok(jisToSjis_xs($test),$correct,"jis0212 to sjis (xs)");
ok(jisToSjis_pp($test),$correct,"jis0212 to sjis (pp)");
}
{
# resume to ascii on newline. : \e(B
# JIS X 0208-1983 \e$B
my $test1 = "\e\$B!!\n!!!";
my $correct1_sjis = tt("\x81\x40\n!!!");
my $correct1_jis = tt("\e\$B!!\e(B\n!!!");
ok(jisToSjis_xs($test1),$correct1_sjis,"resume to ASCII (xs)");
ok(jisToSjis_pp($test1),$correct1_sjis,"resume to ASCII (pp)");
ok(jisToJis_xs($test1), $correct1_jis, "resume to ASCII (xs)");
ok(jisToJis_pp($test1), $correct1_jis, "resume to ASCII (pp)");
}
# -----------------------------------------------------------------------------
# End Of File.
# -----------------------------------------------------------------------------
Unicode-Japanese-0.47/t/pod.t 0100644 0001761 0000144 00000000340 11046013305 014457 0 ustar hio users #! perl -w
use Test::More;
eval "use Test::Pod 1.14;";
plan skip_all => "Test::Pod 1.14 required for testing POD" if $@;
eval "require Encode;";
plan skip_all => "Encode required for testing POD" if $@;
all_pod_files_ok();
Unicode-Japanese-0.47/t/z2h.t 0100644 0001761 0000144 00000004740 11046013305 014410 0 ustar hio users
use strict;
use Test;
BEGIN { plan tests => 8*2; }
use Unicode::Japanese;
use lib 't';
require 'esc.pl';
#Unicode::Japanese->new();
#$Unicode::Japanese::xs_loaderror and print STDERR "$Unicode::Japanese::xs_loaderror\n";
# -----------------------------------------------------------------------------
# h2z convert
#
my $string = Unicode::Japanese->new();
my $ppstring = Unicode::Japanese::PurePerl->new();
my ($set,$expected);
# h2z num
$set = "0129";
$expected = "\xef\xbc\x90\xef\xbc\x91\xef\xbc\x92\xef\xbc\x99";
ok($string->set($set)->h2z()->utf8(),$expected);
ok($ppstring->set($set)->h2z()->utf8(),$expected);
# h2z alpha
$set = "abzABZ";
$expected = "\xef\xbd\x81\xef\xbd\x82\xef\xbd\x9a\xef\xbc\xa1\xef\xbc\xa2\xef\xbc\xba";
ok($string->set($set)->h2z()->utf8(),$expected);
ok($ppstring->set($set)->h2z()->utf8(),$expected);
# h2z symbol
$set = "!#^*(-+~{]>?_";
$expected = "\xef\xbc\x81\xef\xbc\x83\xef\xbc\xbe\xef\xbc\x8a\xef\xbc\x88\xef\xbc\x8d\xef\xbc\x8b\xef\xbd\x9e\xef\xbd\x9b\xef\xbc\xbd\xef\xbc\x9e\xef\xbc\x9f\xef\xbc\xbf";
ok($string->set($set)->h2z()->utf8(),$expected);
ok($ppstring->set($set)->h2z()->utf8(),$expected);
# h2z kana / KUTEN KATA-SMALL-O HIRA-SMALL-O KANA-VU
$set = "\xef\xbd\xa1\xef\xbd\xab\xe3\x81\x89\xef\xbd\xb3\xef\xbe\x9e";
$expected = "\xe3\x80\x82\xe3\x82\xa9\xe3\x81\x89\xe3\x83\xb4";
ok($string->set($set)->h2z()->utf8(),$expected);
ok($ppstring->set($set)->h2z()->utf8(),$expected);
# -----------------------------------------------------------------------------
# z2h convert
#
# z2h num
$set = "\xef\xbc\x90\xef\xbc\x91\xef\xbc\x92\xef\xbc\x99";
$expected = "0129";
ok($string->set($set)->z2h()->utf8(),$expected);
ok($ppstring->set($set)->z2h()->utf8(),$expected);
# z2h alpha
$set = "\xef\xbd\x81\xef\xbd\x82\xef\xbd\x9a\xef\xbc\xa1\xef\xbc\xa2\xef\xbc\xba";
$expected = "abzABZ";
ok($string->set($set)->z2h()->utf8(),$expected);
ok($ppstring->set($set)->z2h()->utf8(),$expected);
# z2h symbol
$set = "\xef\xbc\x81\xef\xbc\x83\xef\xbc\xbe\xef\xbc\x8a\xef\xbc\x88\xef\xbc\x8d\xef\xbc\x8b\xef\xbd\x9e\xef\xbd\x9b\xef\xbc\xbd\xef\xbc\x9e\xef\xbc\x9f";
$expected = "!#^*(-+~{]>?";
ok($string->set($set)->z2h()->utf8(),$expected);
ok($ppstring->set($set)->z2h()->utf8(),$expected);
# z2h kana, HIRAGANA LETTER SMALL O is kept.
$set = "\xe3\x80\x82\xe3\x82\xa9\xe3\x81\x89\xe3\x83\xb4";
$expected = "\xef\xbd\xa1\xef\xbd\xab\xe3\x81\x89\xef\xbd\xb3\xef\xbe\x9e";
ok($string->set($set)->z2h()->utf8(),$expected);
ok($ppstring->set($set)->z2h()->utf8(),$expected);
Unicode-Japanese-0.47/t/0.loadxs.t 0100644 0001761 0000144 00000001744 11046013305 015336 0 ustar hio users ## ----------------------------------------------------------------------------
# t/0.loadxs.t
# -----------------------------------------------------------------------------
# $Id: 0.loadxs.t 5236 2008-01-16 09:47:26Z hio $
# -----------------------------------------------------------------------------
use strict;
use Test;
BEGIN { plan tests => 3 }
# -----------------------------------------------------------------------------
# load module
require Unicode::Japanese;
ok(1,1,'require');
import Unicode::Japanese;
ok(1,1,'import');
# -----------------------------------------------------------------------------
# check XS was loaded.
# xs is loaded in first invocation of `new'.
Unicode::Japanese->new();
# to avoid used-only-once warning, read twice.
my $err = ($Unicode::Japanese::xs_loaderror,$Unicode::Japanese::xs_loaderror)[0];
if( !-e 't/pureperl.flag' )
{
print "# load xs\n";
ok($err,'');
}else
{
print "# pure perl\n";
ok($err =~ /Can't locate loadable object/);
}
Unicode-Japanese-0.47/t/wavedash.t 0100644 0001761 0000144 00000002466 11046013305 015512 0 ustar hio users ## ----------------------------------------------------------------------------
# t/wavedash.t
# -----------------------------------------------------------------------------
# Mastering programed by YAMASHINA Hio
#
# Copyright YMIRLINK,Inc.
# -----------------------------------------------------------------------------
# $Id: wavedash.t 4528 2004-11-04 06:08:06Z hio $
# -----------------------------------------------------------------------------
use strict;
use Test::More tests => 4;
use Unicode::Japanese;
# -----------------------------------------------------------------------------
# U+301C(WAVE DASH) => CP912:8160
#
is(Unicode::Japanese->new("\xe3\x80\x9c")->sjis(),
"\x81\x60",
"U+301C => CP912:8160");
is(Unicode::Japanese::PurePerl->new("\xe3\x80\x9c")->sjis(),
"\x81\x60",
"U+301C => CP912:8160");
# -----------------------------------------------------------------------------
# U+301C(WAVE DASH) =(z2h)=> ASCII:7E (TILDE)
#
is(Unicode::Japanese->new("\xe3\x80\x9c")->z2h()->sjis(),
"~",
"U+301C => CP912:8160");
is(Unicode::Japanese::PurePerl->new("\xe3\x80\x9c")->z2h()->sjis(),
"~",
"U+301C => CP912:8160");
# -----------------------------------------------------------------------------
# End Of File.
# -----------------------------------------------------------------------------
Unicode-Japanese-0.47/t/v045_imodexs.t 0100644 0001761 0000144 00000001475 11046013305 016135 0 ustar hio users ## ----------------------------------------------------------------------------
# t/v045_imodexs.t
# -----------------------------------------------------------------------------
# $Id: 0.loadxs.t 5236 2008-01-16 09:47:26Z hio $
# -----------------------------------------------------------------------------
use strict;
use Test::More;
use Unicode::Japanese;
# xs is loaded in first invocation of `new'.
my $xs = Unicode::Japanese->new();
# to avoid used-only-once warning, read twice.
my $err = ($Unicode::Japanese::xs_loaderror,$Unicode::Japanese::xs_loaderror)[0];
if( $err =~ /Can't locate loadable object/ )
{
plan skip_all => 'no xs module';
}
plan tests => 1;
# imode, EXT-1.
$xs->set("\xf9\xb1", 'sjis-imode1');
my $u8 = $xs->utf8;
is(unpack("H*", $u8), unpack("H*", "?"), "imode-ext1 with imode1 will be '?'");
Unicode-Japanese-0.47/t/outrange.t 0100644 0001761 0000144 00000001530 11046013305 015523 0 ustar hio users
use Test;
use Unicode::Japanese;
BEGIN { plan tests => 6 }
## check from utf8 convert
## U+2665 BLACK HEART SUIT (in Miscellaneous Symbols) into some charsets.
my $string;
# sjis
$string = new Unicode::Japanese "\xe2\x99\xa5";
ok($string->sjis, "♥", "U+2665 (9829) => sjis");
# euc
$string = new Unicode::Japanese "\xe2\x99\xa5";
ok($string->euc, "♥", "U+2665 (9829) => eucjp");
# jis(iso-2022-jp)
$string = new Unicode::Japanese "\xe2\x99\xa5";
ok($string->jis, "♥", "U+2665 (9829) => jis");
# imode
$string = new Unicode::Japanese "\xe2\x99\xa5";
ok($string->sjis_imode, "?", "U+2665 (9829) => imode");
# dot-i
$string = new Unicode::Japanese "\xe2\x99\xa5";
ok($string->sjis_doti, "?", "U+2665 (9829) => doti");
# j-sky
$string = new Unicode::Japanese "\xe2\x99\xa5";
ok($string->sjis_jsky, "?", "U+2665 (9829) => jsky");
Unicode-Japanese-0.47/t/emoji.t 0100644 0001761 0000144 00000034157 11046013305 015015 0 ustar hio users ## ----------------------------------------------------------------------------
# t/emoji.t
# -----------------------------------------------------------------------------
# $Id: emoji.t 5221 2008-01-16 06:56:15Z hio $
# -----------------------------------------------------------------------------
use strict;
use Test::More tests => 24 + 25 *22 +6*4 + 17*2;
# -----------------------------------------------------------------------------
# load module
use Unicode::Japanese qw(no_I18N_Japanese);
use lib 't';
require 'esc.pl';
use vars qw($STR $PPSTR);
$STR = Unicode::Japanese->new();
$PPSTR = Unicode::Japanese::PurePerl->new();
if( !-e 't/pureperl.flag' && $Unicode::Japanese::xs_loaderror )
{
print STDERR "xs load error : [$Unicode::Japanese::xs_loaderror]\n";
}
{
# emoji(SUNSHINE) in sjis-imode, sjis-vodafone, sjis-icon-au
#
my $imode = "\xf8\x9f";
my $jsky = "\e\$" . "Gj" . "\x0f";
my $au = '
';
# imode.
is( $STR->set($imode, 'sjis-imode')->ucs4,
"\x00\x0f".$imode,
'imode => ucs4 (xs)',
);
is( $STR->set($imode, 'sjis-imode')->sjis_imode,
$imode,
'imode => imode (xs)',
);
is( $PPSTR->set($imode, 'sjis-imode')->ucs4,
"\x00\x0f".$imode,
'imode => ucs4 (pp)',
);
is( $PPSTR->set($imode, 'sjis-imode')->sjis_imode,
$imode,
'imode => imode (pp)',
);
# jsky.
is( $STR->set($jsky, 'sjis-jsky')->ucs4,
"\x00\x0f"."\xFD"."j", # G=>\xFD.
'jsky => ucs4 (pp)',
);
is( $STR->set($jsky, 'sjis-jsky')->sjis_jsky,
$jsky,
'jsky => jsky (pp)',
);
is( $PPSTR->set($jsky, 'sjis-jsky')->ucs4,
"\x00\x0f"."\xFD"."j", # G=>\xFD.
'jsky => ucs4 (pp)',
);
is( $PPSTR->set($jsky, 'sjis-jsky')->sjis_jsky,
$jsky,
'jsky => jsky (pp)',
);
# au.
is( $STR->set($au, 'sjis-icon-au')->ucs4,
"\x00\x0f"."\xE0".chr(44),
'au => ucs4 (pp)',
);
is( $STR->set($au, 'sjis-icon-au')->sjis_icon_au,
$au,
'au => au (pp)',
);
is( $PPSTR->set($au, 'sjis-icon-au')->ucs4,
"\x00\x0f"."\xE0".chr(44),
'au => ucs4 (pp)',
);
is( $PPSTR->set($au, 'sjis-icon-au')->sjis_icon_au,
$au,
'au => au (pp)',
);
# imode <=> jsky
#
is($STR->set($imode, 'sjis-imode')->sjis_jsky, $jsky, 'imode => jsky (xs)');
is($STR->set($jsky, 'sjis-jsky')->sjis_imode, $imode, 'jsky => imode (xs)');
is($PPSTR->set($imode, 'sjis-imode')->sjis_jsky, $jsky, 'imode => jsky (pp)');
is($PPSTR->set($jsky, 'sjis-jsky')->sjis_imode, $imode, 'jsky => imode (pp)');
# jsky <=> au
#
is($STR->set($jsky, 'sjis-jsky')->sjis_icon_au, $au, 'jsky => au (xs)');
is($STR->set($au, 'sjis-icon-au')->sjis_jsky, $jsky, 'au => jsky (xs)');
is($PPSTR->set($jsky, 'sjis-jsky')->sjis_icon_au, $au, 'jsky => au (pp)');
is($PPSTR->set($au, 'sjis-icon-au')->sjis_jsky, $jsky, 'au => jsky (pp)');
# au <=> imode
#
is($STR->set($au, 'sjis-icon-au')->sjis_imode, $imode, 'au => imode (xs)');
is($STR->set($imode, 'sjis-imode')->sjis_icon_au, $au, 'imode => au (xs)');
is($PPSTR->set($au, 'sjis-icon-au')->sjis_imode, $imode, 'au => imode (pp)');
is($PPSTR->set($imode, 'sjis-imode')->sjis_icon_au, $au, 'imode => au (pp)');
}
# -----------------------------------------------------------------------------
# test(type, ucs4, sjis
# imode1, imode2, doti, jsky1, jsky2 );
# type: imode1/imode2/doti/jsky1/doti2
# ucs4: 0x0fxxxx
#
# 14 tests at one test() call.
# 7 tests, ucs4,sjis,imode1,imode2,doti,jsky1, and jsky2 are
# by XS and PurePerl.
#
# (ja:) 一度の test() 呼び出しで, 22のテスト
# (ja:) (ucs4,sjis,imode1,imode2,doti,jsky1,jsky2,au1,au2,au1-icon,au2-icon
# の11種類を XS と PurePerl で)
#
# jsky-escape
sub je
{
"\e\$".join('',@_)."\x0f";
}
# au-escape
sub ae
{
"\e\$B" . join('', @_) . "\e\(B";
}
# au-icon
sub ai
{
'
';
}
# -----------------------------------------------------------------------------
# sunrise (jsky2 only, jsky1 compat)
#
# jsky2-sunrise: jsky1 compat.
$STR->set("\x00\x0f\xfc\xe9",'ucs4');
is(escfull($STR->ucs4()),escfull("\0\x0f\xfc\xe9"));
is(escfull($STR->sjis_jsky2()),escfull(je("\x50\x69")));
is(escfull($STR->sjis_jsky1()),escfull(je("\x47\x6d")));
$PPSTR->set("\x00\x0f\xfc\xe9",'ucs4');
is(escfull($PPSTR->ucs4()),escfull("\0\x0f\xfc\xe9"));
is(escfull($PPSTR->sjis_jsky2()),escfull(je("\x50\x69")));
is(escfull($PPSTR->sjis_jsky1()),escfull(je("\x47\x6d")));
# jsky1-sunrise: jsky2 kept.
$STR->set("\x00\x0f\xfd\x6d",'ucs4');
is(escfull($STR->ucs4()),escfull("\0\x0f\xfd\x6d"));
is(escfull($STR->sjis_jsky2()),escfull(je("\x47\x6d")));
is(escfull($STR->sjis_jsky1()),escfull(je("\x47\x6d")));
$PPSTR->set("\x00\x0f\xfd\x6d",'ucs4');
is(escfull($PPSTR->ucs4()),escfull("\0\x0f\xfd\x6d"));
is(escfull($PPSTR->sjis_jsky2()),escfull(je("\x47\x6d")));
is(escfull($PPSTR->sjis_jsky1()),escfull(je("\x47\x6d")));
# -----------------------------------------------------------------------------
# dollar bag (imode2 only)
# imode2.$袋 => imode1.袋
#
# imode2-dollar bag: imode1 compat.
$STR->set("\x00\x0f\xf9\xba",'ucs4');
is(escfull($STR->ucs4()),escfull("\0\x0f\xf9\xba"));
is(escfull($STR->sjis_imode2()),escfull("\xf9\xba"));
is(escfull($STR->sjis_imode1()),escfull("\xf9\x51"));
$PPSTR->set("\x00\x0f\xf9\xba",'ucs4');
is(escfull($PPSTR->ucs4()),escfull("\0\x0f\xf9\xba"));
is(escfull($PPSTR->sjis_imode2()),escfull("\xf9\xba"));
is(escfull($PPSTR->sjis_imode1()),escfull("\xf9\x51"));
# imode1-dollar bag: imode2 kept.
$STR->set("\x00\x0f\xf9\x51",'ucs4');
is(escfull($STR->ucs4()),escfull("\0\x0f\xf9\x51"));
is(escfull($STR->sjis_imode2()),escfull("\xf9\x51"));
is(escfull($STR->sjis_imode1()),escfull("\xf9\x51"));
$PPSTR->set("\x00\x0f\xf9\x51",'ucs4');
is(escfull($PPSTR->ucs4()),escfull("\0\x0f\xf9\x51"));
is(escfull($PPSTR->sjis_imode2()),escfull("\xf9\x51"));
is(escfull($PPSTR->sjis_imode1()),escfull("\xf9\x51"));
# -----------------------------------------------------------------------------
# the sun
# 晴れ,F89F,,F0E5,476A,,002C
#
test( 'sjis-imode1', 0x0FF89F, '?',
"\xF8\x9F", "\xF8\x9F", "\xF0\xE5", je("\x47\x6a"), je("\x47\x6a"),
ae("\x75\x41"), ae("\x75\x41"), ai(44), ai(44));
test( 'sjis-imode2', 0x0FF89F, '?',
"\xF8\x9F", "\xF8\x9F", "\xF0\xE5", je("\x47\x6a"), je("\x47\x6a"),
ae("\x75\x41"), ae("\x75\x41"), ai(44), ai(44));
test( 'sjis-doti', 0x0FF0E5, '?',
"\xF8\x9F", "\xF8\x9F", "\xF0\xE5", je("\x47\x6a"), je("\x47\x6a"),
ae("\x75\x41"), ae("\x75\x41"), ai(44), ai(44));
test( 'sjis-jsky1', 0x0FFD6A, '?',
"\xF8\x9F", "\xF8\x9F", "\xF0\xE5", je("\x47\x6a"), je("\x47\x6a"),
ae("\x75\x41"), ae("\x75\x41"), ai(44), ai(44));
test( 'sjis-jsky2', 0x0FFD6A, '?',
"\xF8\x9F", "\xF8\x9F", "\xF0\xE5", je("\x47\x6a"), je("\x47\x6a"),
ae("\x75\x41"), ae("\x75\x41"), ai(44), ai(44));
test( 'jis-au1', 0x0FE02C, '?',
"\xF8\x9F", "\xF8\x9F", "\xF0\xE5", je("\x47\x6a"), je("\x47\x6a"),
ae("\x75\x41"), ae("\x75\x41"), ai(44), ai(44));
# -----------------------------------------------------------------------------
# rainy (umbrella/rain cloud)
# 雨(傘),F8A1,,F1BA,476B,,005F
# 雨(雨雲),=F8A1,,F0E7,=476B,,=005F
#
test( 'sjis-imode1', 0x0FF8A1, '?',
"\xF8\xA1", "\xF8\xA1", "\xF1\xBA", je("\x47\x6b"), je("\x47\x6b"),
ae("\x75\x45"), ae("\x75\x45"), ai(95), ai(95));
test( 'sjis-imode2', 0x0FF8A1, '?',
"\xF8\xA1", "\xF8\xA1", "\xF1\xBA", je("\x47\x6b"), je("\x47\x6b"),
ae("\x75\x45"), ae("\x75\x45"), ai(95), ai(95));
test( 'sjis-doti', 0x0FF1BA, '?',
"\xF8\xA1", "\xF8\xA1", "\xF1\xBA", je("\x47\x6b"), je("\x47\x6b"),
ae("\x75\x45"), ae("\x75\x45"), ai(95), ai(95));
test( 'sjis-jsky1', 0x0FFD6B, '?',
"\xF8\xA1", "\xF8\xA1", "\xF1\xBA", je("\x47\x6b"), je("\x47\x6b"),
ae("\x75\x45"), ae("\x75\x45"), ai(95), ai(95));
test( 'sjis-jsky2', 0x0FFD6B, '?',
"\xF8\xA1", "\xF8\xA1", "\xF1\xBA", je("\x47\x6b"), je("\x47\x6b"),
ae("\x75\x45"), ae("\x75\x45"), ai(95), ai(95));
test( 'jis-au1', 0x0FE05F, '?',
"\xF8\xA1", "\xF8\xA1", "\xF1\xBA", je("\x47\x6b"), je("\x47\x6b"),
ae("\x75\x45"), ae("\x75\x45"), ai(95), ai(95));
#
test( 'sjis-doti', 0x0FF0E7, '?',
"\xF8\xA1", "\xF8\xA1", "\xF0\xE7", je("\x47\x6b"), je("\x47\x6b"),
ae("\x75\x45"), ae("\x75\x45"), ai(95), ai(95));
# -----------------------------------------------------------------------------
# digit 0, (normal, framed+bgcolored, framed)
# 0,=F990,,F040,=4645,,=0145
# [0](色地),=F990,,F2B2,4645,,0145
# [0](白地),F990,,F2B5,=4645,,=0145
#
test( 'sjis-doti', 0x0FF040, '?',
"\xf9\x90", "\xf9\x90", "\xf0\x40", je("\x46\x45"), je("\x46\x45"),
ae("\x78\x4b"), ae("\x78\x4b"), ai(325), ai(325) );
#
test( 'sjis-doti', 0x0FF2B2, '?',
"\xf9\x90", "\xf9\x90", "\xf2\xb2", je("\x46\x45"), je("\x46\x45"),
ae("\x78\x4b"), ae("\x78\x4b"), ai(325), ai(325) );
test( 'sjis-jsky1', 0x0FFC45, '?',
"\xf9\x90", "\xf9\x90", "\xf2\xb2", je("\x46\x45"), je("\x46\x45"),
ae("\x78\x4b"), ae("\x78\x4b"), ai(325), ai(325) );
test( 'sjis-jsky2', 0x0FFC45, '?',
"\xf9\x90", "\xf9\x90", "\xf2\xb2", je("\x46\x45"), je("\x46\x45"),
ae("\x78\x4b"), ae("\x78\x4b"), ai(325), ai(325) );
test( 'jis-au1', 0x0FE145, '?',
"\xf9\x90", "\xf9\x90", "\xf2\xb2", je("\x46\x45"), je("\x46\x45"),
ae("\x78\x4b"), ae("\x78\x4b"), ai(325), ai(325) );
#
test( 'sjis-imode1', 0x0FF990, '?',
"\xf9\x90", "\xf9\x90", "\xf2\xb5", je("\x46\x45"), je("\x46\x45"),
ae("\x78\x4b"), ae("\x78\x4b"), ai(325), ai(325) );
test( 'sjis-imode2', 0x0FF990, '?',
"\xf9\x90", "\xf9\x90", "\xf2\xb5", je("\x46\x45"), je("\x46\x45"),
ae("\x78\x4b"), ae("\x78\x4b"), ai(325), ai(325) );
test( 'sjis-doti', 0x0FF2B5, '?',
"\xf9\x90", "\xf9\x90", "\xf2\xb5", je("\x46\x45"), je("\x46\x45"),
ae("\x78\x4b"), ae("\x78\x4b"), ai(325), ai(325) );
# -----------------------------------------------------------------------------
# bell
# ベル,,F9B8,,,4F45,0030
#
test( 'sjis-imode2', 0x0FF9B8, '?',
'?', "\xf9\xb8", '?', '?', je("\x4f\x45"),
ae("\x76\x6d"), ae("\x76\x6d"), ai(48), ai(48));
test( 'sjis-jsky2', 0x0FFBC5, '?',
'?', "\xf9\xb8", '?', '?', je("\x4f\x45"),
ae("\x76\x6d"), ae("\x76\x6d"), ai(48), ai(48));
test( 'jis-au1', 0x0FE030, '?',
'?', "\xf9\xb8", '?', '?', je("\x4f\x45"),
ae("\x76\x6d"), ae("\x76\x6d"), ai(48), ai(48));
# -----------------------------------------------------------------------------
# カップ,F8D1,,F0B4,4765,,005D,
#
test( 'jis-au2', 0x0FE05D, '?',
"\xf8\xd1", "\xf8\xd1", "\xf0\xb4", je("\x47\x65"), je("\x47\x65"),
ae("\x78\x36"), ae("\x78\x36"), ai(93), ai(93));
# -----------------------------------------------------------------------------
# ☆ WHITE STAR
# U+2606, SJIS:8199
#
{
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
#print STDERR "# white star (sjis)\n";
my $s = "\x81\x99";
my $j = Unicode::Japanese->new($s,'sjis')->jis();
my $u = "\x26\x06";
foreach my $code (qw(sjis sjis-imode1 sjis-imode2 sjis-doti sjis-jsky1 sjis-jsky2 sjis-au1 sjis-au2 sjis-icon-au1 sjis-icon-au2))
{
is(escfull($xs->set($s,$code)->ucs2),escfull($u),"WHITE STAR: $code:ucs2");
is(escfull($xs->set($u,"ucs2")->conv($code)),escfull($s),"WHITE STAR: ucs2:$code");
}
#print STDERR "# white star (jis)\n";
foreach my $code (qw(jis jis-jsky1 jis-jsky2 jis-au1 jis-au2 jis-icon-au1 jis-icon-au2))
{
is(escfull($xs->set($j,$code)->ucs2),escfull($u),"WHITE STAR: $code:ucs2");
is(escfull($xs->set($u,"ucs2")->conv($code)),escfull($j),"WHITE STAR: ucs2:$code");
}
}
# -----------------------------------------------------------------------------
# test method.
sub test
{
my ($code,$ucs4,$sjis) = splice(@_,0,3);
my ($imode1,$imode2,$doti,$jsky1,$jsky2,$au1,$au2,$au1i,$au2i) = splice(@_,0,9);
$ucs4 = pack('N',$ucs4);
if( $code !~ /^(sjis-imode[12]|sjis-doti|sjis-jsky[12]|jis-au[12]|sjis-au[12]i)$/ )
{
die "code invalid [$code]";
}
my $shortcode = $code;
$shortcode =~ s/^s?jis\-//;
$shortcode =~ s/^icon\-(.*)/$1i/;
my $src = eval "\$$shortcode";
$@ and die $@;
my $str = Unicode::Japanese->new($src,$code);
my $pp = Unicode::Japanese::PurePerl->new($src,$code);
if( $code =~ /jsky/ && $src =~ /^\e\$(.*)\x0f$/ )
{
$src = "$code#je(".uc(unpack('H*',$1)).')';
}else
{
$src = "$code#".uc(unpack('H*',$src));
}
my ($pkg,$file,$line) = caller();
my $caller = "$file at $line";
foreach($ucs4,$sjis,$imode1,$imode2,$doti,$jsky1,$jsky2,$au1,$au2,$au1i,$au2i)
{
$_ = escfull($_);
}
# input value => ucs4
is(escfull($str->ucs4()),$ucs4,"$src=>ucs4 (xs), $caller");
is(escfull($pp ->ucs4()),$ucs4,"$src=>ucs4 (pp), $caller");
# ucs4 => others
is(escfull($str->sjis()), $sjis, "$src=>ucs4=>sjis (xs), $caller" );
is(escfull($pp ->sjis()), $sjis, "$src=>ucs4=>sjis (pp), $caller" );
is(escfull($str->sjis_imode1()),$imode1,"$src=>ucs4=>imode1 (xs), $caller");
is(escfull($pp ->sjis_imode1()),$imode1,"$src=>ucs4=>imode1 (pp), $caller");
is(escfull($str->sjis_imode2()),$imode2,"$src=>ucs4=>imode2 (xs), $caller");
is(escfull($pp ->sjis_imode2()),$imode2,"$src=>ucs4=>imode2 (pp), $caller");
is(escfull($str->sjis_doti()), $doti, "$src=>ucs4=>doti (xs), $caller" );
is(escfull($pp ->sjis_doti()), $doti, "$src=>ucs4=>doti (pp), $caller" );
is(escfull($str->sjis_jsky1()), $jsky1, "$src=>ucs4=>jsky1 (xs), $caller" );
is(escfull($pp ->sjis_jsky1()), $jsky1, "$src=>ucs4=>jsky1 (pp), $caller" );
is(escfull($str->sjis_jsky2()), $jsky2, "$src=>ucs4=>jsky2 (xs), $caller" );
is(escfull($pp ->sjis_jsky2()), $jsky2, "$src=>ucs4=>jsky2 (pp), $caller" );
is(escfull($str->jis_au1()), $au1, "$src=>ucs4=>au1 (xs), $caller" );
is(escfull($pp ->jis_au1()), $au1, "$src=>ucs4=>au1 (pp), $caller" );
is(escfull($str->jis_au2()), $au2, "$src=>ucs4=>au2 (xs), $caller" );
is(escfull($pp ->jis_au2()), $au2, "$src=>ucs4=>au2 (pp), $caller" );
is(escfull($str->sjis_icon_au1()), $au1i, "$src=>ucs4=>au1i (xs), $caller" );
is(escfull($pp ->sjis_icon_au1()), $au1i, "$src=>ucs4=>au1i (pp), $caller" );
is(escfull($str->sjis_icon_au2()), $au2i, "$src=>ucs4=>au2i (xs), $caller" );
is(escfull($pp ->sjis_icon_au2()), $au2i, "$src=>ucs4=>au2i (pp), $caller" );
}
Unicode-Japanese-0.47/t/verify_sjis_ucs2.pl 0100644 0001761 0000144 00000010001 11046013305 017330 0 ustar hio users #!/usr/bin/perl -w
#
# t/verify_sjis_ucs2.pl
#
# sjis=>ucs2とucs2=>sjisの全文字テスト
# XS側だけのてすと….
#
# $ sh runtest.sh t/verify_sjis_ucs2.pl
#
# all sjis(0x0000-0xFFFF) => ucs2
# all ucs2(0x0000-0xFFFF) => sjis
#
use strict;
#BEGIN{$Unicode::Japanese::PurePerl = 1;}
use Unicode::Japanese;
use IO::File;
print "loading Uni::Jp\n";
Unicode::Japanese->new('');
my $msg = $Unicode::Japanese::xs_loaderror;
print "xs-load-message : [".(defined($msg)?$msg:'')."]".(!defined($msg)?' (undef)':$msg eq ''?' (empty)':'')."\n";
my $tablefh = new IO::File 'jcode/CP932.TXT'
or die "cannot open 'jcode/CP932.TXT'";
print "reading 'jcode/CP932.TXT'...\n";
my(%s2u,%u2s);
while(<$tablefh>)
{
next if(m/^#/);
next if(m/^$/);
chomp;
m/^0x([0-9a-fA-F]+)\s+(?:0x([0-9a-fA-F]+))?/ or die $_;
next if(!defined($2));
$s2u{hex($1)} = hex($2);
# CP932 Unicode
}
%u2s = reverse(%s2u);
$| = 1;
# --------------------------------------------------------------------
# 不一致時に出力する用
sub dumpstr($$)
{
my($hdr,$str)=@_;
my $line = $hdr.sprintf(" : [len:%d]",length($str));
for( my $i=0; $iucs2...\n";
test_sjis_ucs2();
sub upack
{
pack('n',shift);
}
sub sjis_ucs2
{
my $code = shift;
my $str = $code<=0xFF?pack("C",$code):pack('n',$code);
exists($s2u{$code}) ? upack($s2u{$code}) :
$code<=0xFF ? "\0?" :
$str =~ /^[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC]$/ ? "\0?" :
(
(exists($s2u{$code>>8}) ? upack($s2u{$code>>8}) : "\0?").
(exists($s2u{$code&255}) ? upack($s2u{$code&255}) : "\0?")
)
}
sub test_sjis_ucs2
{
printf "[0x%#04x]",0;
for( my $i=0x0; $i<=0xffff; ++$i )
{
if( ($i&0xFF)==0 && $i)
{
if( ($i&0x3FFF)==0 )
{
printf "\n[%#06x]",$i;
}else
{
print '.';
}
}
my $src = pack($i<=0xff?'c':'n',$i);
my $code = $i;
my $xs = Unicode::Japanese->new($src,'sjis')->ucs2();
my $test = sjis_ucs2($code);
if( $xs ne $test )
{
print STDERR "\n";
print STDERR "<utf8>>\n";
print STDERR "i : $i\n";
dumpstr('sjis',$src);
dumpstr('xs ',$xs);
dumpstr('test',$test);
exit;
}
}
print "\n";
}
# --------------------------------------------------------------------
# tests ucs2 to sjis
print "Testing ucs2=>sjis...\n";
test_ucs2_sjis();
sub spack
{
my $code = shift;
$code <= 0xFF ? pack('C',$code) : pack('n',$code);
}
sub ucs2_sjis
{
my $code = shift;
exists($u2s{$code}) ? spack($u2s{$code}) :
$code<=0x7F ? chr($code) :
''.$code.';';
}
sub test_ucs2_sjis
{
printf "[0x%#04x]",0;
for( my $i=0x0; $i<=0xffff; ++$i )
{
if( ($i&0xFF)==0 && $i)
{
if( ($i&0x3FFF)==0 )
{
printf "\n[%#06x]",$i;
}else
{
print '.';
}
}
my $code = $i;
my $ucs2 = pack('n',$code);
my $xs = Unicode::Japanese->new($ucs2,'ucs2')->sjis();
my $test = ucs2_sjis($code);
if( $xs ne $test )
{
print STDERR "\n";
print STDERR "<sjis>>\n";
printf STDERR "i : 0x%04x\n",$i;
dumpstr('ucs2',$ucs2);
dumpstr('xs ',$xs);
dumpstr('test',$test);
exit;
}
}
print "\n";
}
# --------------------------------------------------------------------
# done
print "done\n";
# --------------------------------------------------------------------
# PurePerl code, copy from String.pl
#
use vars qw(@U2T);
sub _ucs2_utf8 {
my $this = shift;
my $str = shift;
if(!defined($str))
{
return '';
}
my $result = '';
for my $uc (unpack("n*", $str))
{
$result .= $U2T[$uc] ? $U2T[$uc] :
($U2T[$uc] = ($uc < 0x80) ? chr($uc) :
($uc < 0x800) ? chr(0xC0 | ($uc >> 6)) . chr(0x80 | ($uc & 0x3F)) :
chr(0xE0 | ($uc >> 12)) . chr(0x80 | (($uc >> 6) & 0x3F)) .
chr(0x80 | ($uc & 0x3F)));
}
$result;
}
Unicode-Japanese-0.47/t/getcode.t 0100644 0001761 0000144 00000003560 11046013305 015316 0 ustar hio users ## ----------------------------------------------------------------------------
# t/getcode.t
# -----------------------------------------------------------------------------
# $Id: getcode.t 4675 2007-08-30 09:20:04Z hio $
# -----------------------------------------------------------------------------
use strict;
use Test::More tests => 20*2;
# -----------------------------------------------------------------------------
# load module
use Unicode::Japanese qw(no_I18N_Japanese);
# wake lazy-loader
Unicode::Japanese->new();
print "xs status : [$Unicode::Japanese::xs_loaderror]\n";
my $code;
test("\x00\x00\xfe\xff",'utf32');
test("\xff\xfe\x00\x00",'utf32');
test("\xfe\xff",'utf16');
test("\xff\xfe",'utf16');
test("\x00\x00\x61\x1b",'utf32-be');
test("\x1b\x61\x00\x00",'utf32-le');
test("love", 'ascii');
test("\x1b\x24\x42\x30\x26\x1b\x28\x42",'jis');
test("\e\$Bx4u0u1vE\x7a\x78\x7b\x50\x7a\x70\e(B",'jis-au');
test("\x88\xa4\e\$EE\x0f",'sjis-jsky');
test("\xf6\x63",'sjis-au');
test("\x1b\x24\x42\x30\x26\x1b\x28\x42\e\$EE\x0f",'jis-jsky');
test("\xb0\xa6",'euc');
test("\x88\xa4",'sjis');
test("\x88\xa4\xf8\xdf", 'sjis-imode');
test("\x88\xa4\xf1\xb5",'sjis-doti');
test("\xe6\x84\x9b",'utf8');
test("\xcd\x10\x89\x01",'unknown');
test("\xf3\x40",'sjis-au');
test("\x81\xf3\x40\x41",'sjis');
# -----------------------------------------------------------------------------
# test($str,$charset)
# test if $str is Charset $charset.
# test both xs and purperl.
#
sub test
{
my $src = shift;
my $icode = shift;
my ($pkg,$file,$line) = caller();
my $caller = "$file at $line";
my $code = Unicode::Japanese->getcode($src);
is($code, $icode, "$icode(xs)") or diag('src:'.unpack('H*',$src)." (xs) $caller");
$code = Unicode::Japanese::PurePerl->getcode($src);
is($code, $icode, "$icode(pp)") or diag('src:'.unpack('H*',$src)." (pp) $caller");
}
Unicode-Japanese-0.47/t/ucs2_utf8.pl 0100644 0001761 0000144 00000005004 11046013305 015671 0 ustar hio users #!/usr/bin/perl
#
# ucs2 <=> utf8 全文字チェック
# ucs2(0x0000..0xFFFF) => utf8
# utf8(0x000000..0xFFFFFF) => ucs2
#
use strict;
use Unicode::Japanese;
$| = 1;
# ucs2 => utf8
print "ucs2 => utf8\n";
print "[0x000000]";
for( my $i=0; $i<=0xFFFF; ++$i )
{
if( ($i&0xFF)==0 && $i )
{
if( ($i&0x3FFF)==0 )
{
printf "\n[%#08x]",$i;
}else
{
print ".";
}
}
my $src = pack('n',$i);
my $str = Unicode::Japanese->new($src,'ucs2');
my $xs = $str->utf8();
my $orig = _ucs2_utf8($str,$src);
if( $xs ne $orig )
{
$src = unpack('H*',$src);
$xs = unpack('H*',$xs);
$orig = unpack('H*',$orig);
print "\n";
die "not match, src:[$src], xs:[$xs] != orig:[$orig]";
}
}
print "\n";
# utf8 => ucs2
print "utf8 => ucs2\n";
print "[0x000000]";
for( my $i=0; $i<=0xFFFFFF; ++$i )
{
if( ($i&0xFF)==0 && $i )
{
if( ($i&0x3FFF)==0 )
{
printf "\n[%#08x]",$i;
}else
{
print ".";
}
}
my $src = pack('N',$i);
$src =~ s/^\0+//;
my $str = Unicode::Japanese->new($src,'utf8');
my $xs = $str->ucs2();
my $orig = _utf8_ucs2($str,$src);
if( $xs ne $orig )
{
$src = unpack('H*',$src);
$xs = unpack('H*',$xs);
$orig = unpack('H*',$orig);
print "\n";
die "not match, src:[$src], xs:[$xs] != orig:[$orig]";
}
}
print "\n";
# ----------------------------------------------------------------------
my @U2T;
my %T2U;
sub _ucs2_utf8 {
my $this = shift;
my $str = shift;
if(!defined($str))
{
return '';
}
my $result = '';
for my $uc (unpack("n*", $str))
{
$result .= $U2T[$uc] ? $U2T[$uc] :
($U2T[$uc] = ($uc < 0x80) ? chr($uc) :
($uc < 0x800) ? chr(0xC0 | ($uc >> 6)) . chr(0x80 | ($uc & 0x3F)) :
chr(0xE0 | ($uc >> 12)) . chr(0x80 | (($uc >> 6) & 0x3F)) .
chr(0x80 | ($uc & 0x3F)));
}
$result;
}
sub _utf8_ucs2 {
my $this = shift;
my $str = shift;
if(!defined($str))
{
return '';
}
my $c1;
my $c2;
my $c3;
$str =~ s/([\x00-\x7f]|[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf]{2}|[\xf0-\xf7][\x80-\xbf]{3}|[\xf8-\xfb][\x80-\xbf]{4}|[\xfc-\xfd][\x80-\xbf]{5}|(.))/
defined($2)?"\0$2":
$T2U{$1}
or ($T2U{$1}
= ((length($1) == 1) ? pack("n", unpack("C", $1)) :
(length($1) == 2) ? (($c1,$c2) = unpack("C2", $1),
pack("n", (($c1 & 0x1F)<<6)|($c2 & 0x3F))) :
(length($1) == 3) ? (($c1,$c2,$c3) = unpack("C3", $1),
pack("n", (($c1 & 0x0F)<<12)|(($c2 & 0x3F)<<6)|($c3 & 0x3F))) : "\0?"))
/eg;
$str;
}
Unicode-Japanese-0.47/t/utf8flag.t 0100644 0001761 0000144 00000001403 11046013305 015416 0 ustar hio users ## ----------------------------------------------------------------------------
# t/utf8flag.t
# -----------------------------------------------------------------------------
# $Id: utf8flag.t 4504 2002-11-05 07:44:57Z hio $
# -----------------------------------------------------------------------------
use strict;
use Test;
BEGIN { plan tests => 1; }
use Unicode::Japanese;
my $string;
if( $]<5.008 )
{
skip("your perl(v$]) maybe not support utf-8.",0,1);
}else
{
my $CODE=<<'CODE';
# ---------------------------------------------------------------------------
# check utf-8 flag
# h2z num
$string = Unicode::Japanese->new("0129");
$string->h2z();
ok( $string->getu(), "\x{ff10}\x{ff11}\x{ff12}\x{ff19}");
CODE
eval $CODE;
$@ and die $@;
}
Unicode-Japanese-0.47/t/base.t 0100644 0001761 0000144 00000002227 11046013305 014615 0 ustar hio users ## ----------------------------------------------------------------------------
# t/base.t
# -----------------------------------------------------------------------------
# $Id: base.t 4499 2002-10-31 07:48:02Z hio $
# -----------------------------------------------------------------------------
use strict;
use Test;
BEGIN { plan tests => 8 }
# -----------------------------------------------------------------------------
# load module
require Unicode::Japanese;
ok(1);
import Unicode::Japanese;
ok(1);
# -----------------------------------------------------------------------------
# check new and set/get
my $string;
$string = new Unicode::Japanese;
ok($string);
$string = new Unicode::Japanese 'abcde';
ok($string->get, 'abcde');
$string = new Unicode::Japanese;
$string->set('abcde');
ok($string->get, 'abcde');
# -----------------------------------------------------------------------------
# check new and set/get *PurePerl*
$string = new Unicode::Japanese::PurePerl;
ok($string);
$string = new Unicode::Japanese::PurePerl 'abcde';
ok($string->get, 'abcde');
$string = new Unicode::Japanese::PurePerl;
$string->set('abcde');
ok($string->get, 'abcde');
Unicode-Japanese-0.47/t/allchar_ucs4.t 0100644 0001761 0000144 00000010645 11046013305 016252 0 ustar hio users #!/usr/bin/perl
#
# ucs2 <=> utf8 全文字チェック
# ucs2(0x0000..0xFFFF) => utf8
# utf8(0x000000..0xFFFFFF) => ucs2
#
use Test::More;
BEGIN
{
if( !$ENV{ALLCHAR_TEST} )
{
plan skip_all => "no ALLCHAR_TEST";
exit;
}
plan 'no_plan'; #tests => 0x0010_FFFF * 4;
}
use strict;
use Unicode::Japanese;
use lib '.', 't';
require 'esc.pl';
test1();
test2();
test3();
test4();
test5();
sub test1
{
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
for my $i (0..0x7f)
{
my $hex = sprintf('%02x', $i);
my $ucs4 = pack("N", $i);
my $utf8 = pack("C*", $i);
$xs->set($utf8, 'utf8');
$pp->set($utf8, 'utf8');
is(escfull($xs->ucs4), escfull($ucs4), "[1/$hex] utf8->ucs4 (xs)");
is(escfull($pp->ucs4), escfull($ucs4), "[1/$hex] utf8->ucs4 (pp)");
$xs->set($ucs4, 'ucs4');
$pp->set($ucs4, 'ucs4');
is(escfull($xs->utf8), escfull($utf8), "[1/$hex] ucs4->utf8 (xs)");
is(escfull($pp->utf8), escfull($utf8), "[1/$hex] ucs4->utf8 (pp)");
}
}
sub test2
{
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
my $min = 0x80;
my $max = 0x800-1;
my $wholetest = $ENV{ALLCHAR_TEST}>=2;
my $_max = $wholetest ? $max : $min+1;
for my $_i ($min..$_max)
{
my $i = $wholetest ? $_i : ($_i==$min ? $min : $max);
my $hex = sprintf('%02x', $i);
my $ucs4 = pack("N", $i);
my $utf8 = pack("C*", 0xc0+($i>>6), map{(($i>>$_)&0x3f)^0x80} (0));
$xs->set($utf8, 'utf8');
$pp->set($utf8, 'utf8');
#diag(escfull($ucs4).', '.escfull($utf8));
is(escfull($xs->ucs4), escfull($ucs4), "[2/$hex] utf8->ucs4 (xs)");
is(escfull($pp->ucs4), escfull($ucs4), "[2/$hex] utf8->ucs4 (pp)");
$xs->set($ucs4, 'ucs4');
$pp->set($ucs4, 'ucs4');
is(escfull($xs->utf8), escfull($utf8), "[2/$hex] ucs4->utf8 (xs)");
is(escfull($pp->utf8), escfull($utf8), "[2/$hex] ucs4->utf8 (pp)");
}
}
sub test3
{
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
my $min = 0x800;
my $max = 0x1_0000-1;
my $wholetest = $ENV{ALLCHAR_TEST}>=2;
my $_max = $wholetest ? $max : $min+1;
for my $_i ($min..$_max)
{
my $i = $wholetest ? $_i : ($_i==$min ? $min : $max);
my $hex = sprintf('%02x', $i);
my $ucs4 = pack("N", $i);
my $utf8 = pack("C*", 0xe0+($i>>12), map{(($i>>$_)&0x3f)^0x80} (6, 0));
$xs->set($utf8, 'utf8');
$pp->set($utf8, 'utf8');
#diag(escfull($ucs4).', '.escfull($utf8));
is(escfull($xs->ucs4), escfull($ucs4), "[3/$hex] utf8->ucs4 (xs)");
is(escfull($pp->ucs4), escfull($ucs4), "[3/$hex] utf8->ucs4 (pp)");
$xs->set($ucs4, 'ucs4');
$pp->set($ucs4, 'ucs4');
is(escfull($xs->utf8), escfull($utf8), "[3/$hex] ucs4->utf8 (xs)");
is(escfull($pp->utf8), escfull($utf8), "[3/$hex] ucs4->utf8 (pp)");
}
}
sub test4
{
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
my $min = 0x1_0000;
my $max = 0x11_0000-1;
my $wholetest = $ENV{ALLCHAR_TEST}>=2;
my $_max = $wholetest ? $max : $min+1;
for my $_i ($min..$_max)
{
my $i = $wholetest ? $_i : ($_i==$min ? $min : $max);
my $hex = sprintf('%02x', $i);
my $ucs4 = pack("N", $i);
my $utf8 = pack("C*", 0xf0+($i>>18), map{(($i>>$_)&0x3f)^0x80} (12, 6, 0));
$xs->set($utf8, 'utf8');
$pp->set($utf8, 'utf8');
#diag(escfull($ucs4).', '.escfull($utf8));
is(escfull($xs->ucs4), escfull($ucs4), "[4/$hex] utf8->ucs4 (xs)");
is(escfull($pp->ucs4), escfull($ucs4), "[4/$hex] utf8->ucs4 (pp)");
$xs->set($ucs4, 'ucs4');
$pp->set($ucs4, 'ucs4');
is(escfull($xs->utf8), escfull($utf8), "[4/$hex] ucs4->utf8 (xs)");
is(escfull($pp->utf8), escfull($utf8), "[4/$hex] ucs4->utf8 (pp)");
}
}
sub test5
{
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
for my $i (0x11_0000)
{
my $hex = sprintf('%02x', $i);
my $ucs4 = pack("N", $i);
my $utf8 = pack("C*", 0xf0+($i>>18), map{(($i>>$_)&0x3f)^0x80} (12, 6, 0));
$xs->set($utf8, 'utf8');
$pp->set($utf8, 'utf8');
#diag(escfull($ucs4).', '.escfull($utf8));
is(escfull($xs->ucs4), escfull("\0\0\0?"), "[5/$hex] utf8->ucs4='?' (xs)");
is(escfull($pp->ucs4), escfull("\0\0\0?"), "[5/$hex] utf8->ucs4='?' (pp)");
$xs->set($ucs4, 'ucs4');
$pp->set($ucs4, 'ucs4');
is(escfull($xs->utf8), escfull('?'), "[5/$hex] ucs4->utf8='?' (xs)");
is(escfull($pp->utf8), escfull('?'), "[5/$hex] ucs4->utf8='?' (pp)");
}
}
Unicode-Japanese-0.47/t/random.pl 0100644 0001761 0000144 00000002427 11046013305 015335 0 ustar hio users #!/usr/bin/perl
#
# t/random.pl
#
# ランダムに作成した文字列(バイナリ列)をいろいろ変換.
#
# $ sh runtest.sh t/random.pl > random.out
#
# 異常終了時には
# $ sh runtest.sh t/random_redo.pl
# でリトライできます.
#
use strict;
use Unicode::Japanese;
my $maxlen = 512;
our @charcodes = (
'jis', 'sjis', 'euc',
'sjis-imode', 'sjis-doti', 'sjis-jsky',
);
our $count = 0;
my $file = 'random.dat';
open(FILE,">$file") or die "cannot open [$file]";
select((select(FILE),$|=1)[0]);
$| = 1;
print "[0x000000]";
for(;; ++$count)
{
if( ($count&0xFF)==0 && $count )
{
if( ($count&0x3FFF)==0 )
{
printf "\n[%#08x]",$count;
}else
{
print ".";
}
}
my $len = int(rand($maxlen-4))+4;
my $src = '';
for( my $i=0; $i<$len; ++$i )
{
$src .= pack('C',int(rand(0x256)));
}
seek FILE,0,0;
print FILE pack('N',$count);
print FILE pack('N',length($src));
print FILE $src;
truncate FILE,length($src)+8;
# ------------------------------------
# utf8 => jis/eucjp/etc.
#
my $str = Unicode::Japanese->new($src,'utf8');
foreach my $ocode ( @charcodes )
{
$str->conv($ocode);
}
# ------------------------------------
# jis/eucjp/etc. => utf8
foreach my $icode ( @charcodes )
{
Unicode::Japanese->new($src,$icode);
}
}
Unicode-Japanese-0.47/t/esc.pl 0100644 0001761 0000144 00000002251 11046013305 014622 0 ustar hio users ## ----------------------------------------------------------------------------
# esc.pl
# -----------------------------------------------------------------------------
# require 'esc.pl'
# (used from t/*.t)
# (ja:) (t/*.t сゃc)
# -----------------------------------------------------------------------------
# escapes coltroll characters.
# esc() effects only 0x00-0x7F.
# escfull() effect all chats includes utf-8 char which will be in \x{hh} format.
# (ja:) 九勝絖鴻宴若.
# (ja:) 0x80篁ラ罧 esc ,
# (ja:) 鴻宴若 escfull .
# -----------------------------------------------------------------------------
sub esc
{
my $str = shift;
$str =~ s/\\/\\\\/g;
$str =~ s/\n/\\n/g;
$str =~ s/\e/\\e/g;
$str =~ s/\r/\\r/g;
$str =~ s/\0/\\0/g;
$str =~ s/([\x00-\x1f\x7f])/sprintf('\x%02x',ord($1))/ge;
$str;
}
sub escfull
{
my $str = shift;
$str =~ s/\\/\\\\/g;
$str =~ s/\n/\\n/g;
$str =~ s/\e/\\e/g;
$str =~ s/\r/\\r/g;
$str =~ s/\0/\\0/g;
$str =~ s/([\x00-\x1f\x7f-\xff])/sprintf('\x%02x',ord($1))/ge;
$str =~ s/([^\x00-\xff])/sprintf('\x{%02x}',ord($1))/ge;
$str;
}
1;
Unicode-Japanese-0.47/t/allchar_eucjp_sjis.t 0100644 0001761 0000144 00000004370 11046013305 017530 0 ustar hio users #!/usr/bin/perl -w
use Test::More;
BEGIN
{
if( !$ENV{ALLCHAR_TEST} )
{
plan skip_all => "no ALLCHAR_TEST";
exit;
}
plan
tests =>
0x100 # 1byte
+ (0xfe-0xa1+1)**2 # EUCJP_0212
+ (0xfe-0xa1+1)**2 # EUCJP_C
+ (0xdf-0xa1+1); # EUCJP_KANA
}
use strict;
use Unicode::Japanese;
my %RE =
(
ASCII => '[\x00-\x7f]',
EUC_0212 => '\x8f[\xa1-\xfe][\xa1-\xfe]',
EUC_C => '[\xa1-\xfe][\xa1-\xfe]',
EUC_KANA => '\x8e[\xa1-\xdf]',
);
my $RE = join('|',values(%RE));
Unicode::Japanese->new();
for( my $i=0; $i<0x100; ++$i )
{
my $ch = pack('C',$i);
my $valid = $ch;
my $res = Unicode::Japanese->_e2s($ch);
is($res,$valid,sprintf("ascii:0x%02x",$i) );
if( $valid ne $res )
{
out('1byte-char',$ch,$res,$valid);
}
}
# EUCJP_0212
for( my $c1 = 0xa1; $c1<=0xfe; ++$c1 )
{
for( my $c2 = 0xa1; $c2<=0xfe; ++$c2 )
{
my $ch = "\x8f".pack("CC",$c1,$c2);
my $valid = "\x81\xac"; # udnef-sjis
my $res = Unicode::Japanese->_e2s($ch);
is($res,$valid,sprintf("eucjp_0212:0x%02x%02x",$c1,$c2));
if( $res ne $valid )
{
out('EUCJP_0212',$ch,$res,$valid);
}
}
}
# EUCJP_C
for( my $c1 = 0xa1; $c1<=0xfe; ++$c1 )
{
for( my $c2 = 0xa1; $c2<=0xfe; ++$c2 )
{
my $ch = pack("CC",$c1,$c2);
my $valid = conv($ch);
my $res = Unicode::Japanese->_e2s($ch);
is($res,$valid,sprintf("eucjp_0212:0x%02x%02x",$c1,$c2));
if( $res ne $valid )
{
out('EUCJP_C',$ch,$res,$valid);
}
}
}
# EUCJP_KANA
for( my $c1 = 0xa1; $c1<=0xdf; ++$c1 )
{
my $ch = "\x8e".pack("C",$c1);
my $valid = pack("C",$c1);
my $res = Unicode::Japanese->_e2s($ch);
is($res,$valid,sprintf("eucjp_kana:0x%02x",$c1));
if( $res ne $valid )
{
out('EUCJP_KANA',$ch,$res,$valid);
}
}
sub conv
{
my $ch = shift;
#`echo -n '$ch'|nkf -E -s`;
use Jcode;
#Jcode::euc_sjis($ch);
Jcode->new($ch,"euc")->sjis;
}
sub out
{
my $where = shift;
my $ch = shift;
my $res = shift;
my $valid = shift;
if(0)
{
print STDERR "[$where]\n";
print STDERR "char :", (map{sprintf(" %02x",$_)} unpack('C*',$ch)),"\n";
print STDERR "res :", (map{sprintf(" %02x",$_)} unpack('C*',$res)),"\n";
print STDERR "valid:", (map{sprintf(" %02x",$_)} unpack('C*',$valid)),"\n";
}
#exit;
}
Unicode-Japanese-0.47/t/v045_getcode_doti.t 0100644 0001761 0000144 00000001624 11046013305 017112 0 ustar hio users ## ----------------------------------------------------------------------------
# t/v045_getcode.t
# -----------------------------------------------------------------------------
# $Id: 0.loadxs.t 5236 2008-01-16 09:47:26Z hio $
# -----------------------------------------------------------------------------
use strict;
use Test::More;
use Unicode::Japanese;
# xs is loaded in first invocation of `new'.
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
# to avoid used-only-once warning, read twice.
my $err = ($Unicode::Japanese::xs_loaderror,$Unicode::Japanese::xs_loaderror)[0];
if( $err =~ /Can't locate loadable object/ )
{
plan skip_all => 'no xs module';
}
plan tests => 2;
# f340 is available on both au and doti.
# But f040 is available on only doti.
my $str = "\xf3\x40\xf0\x40";
is($xs->getcode($str), "sjis-doti", "xs");
is($pp->getcode($str), "sjis-doti", "pp");
Unicode-Japanese-0.47/t/illegal.t 0100644 0001761 0000144 00000016303 11046013305 015314 0 ustar hio users ## ----------------------------------------------------------------------------
# t/illegal.t
# -----------------------------------------------------------------------------
# Mastering programed by YAMASHINA Hio
#
# Copyright YMIRLINK,Inc.
# -----------------------------------------------------------------------------
# $Id: illegal.t 4631 2006-04-14 05:18:55Z pho $
# -----------------------------------------------------------------------------
use strict;
use Test::More tests => 72;
use Unicode::Japanese;
my $Z1 = "\0"; # U+0000 in 1 byte.
my $Z2 = "\xc0\x80"; # U+0000 in 2 bytes.
my $Z3 = "\xe0\x80\x80"; # U+0000 in 3 bytes.
my $Z4 = "\xf0\x80\x80\x80"; # U+0000 in 4 bytes.
my $Z5 = "\xf8\x80\x80\x80\x80"; # U+0000 in 5 bytes.
my $Z6 = "\xfc\x80\x80\x80\x80\x80"; # U+0000 in 6 bytes.
sub u{ unpack("H*",$_[0]) }
# -----------------------------------------------------------------------------
# internal data
#
{
my $d = "internal data / \\x00";
my $U = Unicode::Japanese->new();
my $PPU = Unicode::Japanese::PurePerl->new();
is(u($U->set($Z1)->{str}), u("\x00"), "$d (1 byte)");
is(u($U->set($Z2)->{str}), u("?"), "$d (2 bytes)");
is(u($U->set($Z3)->{str}), u("?"), "$d (3 bytes)");
is(u($U->set($Z4)->{str}), u("?"), "$d (4 bytes)");
is(u($U->set($Z5)->{str}), u("?"), "$d (5 bytes)");
is(u($U->set($Z6)->{str}), u("?"), "$d (6 bytes)");
is(u($PPU->set($Z1)->{str}), u("\x00"), "$d (1 byte) [PP]");
is(u($PPU->set($Z2)->{str}), u("?"), "$d (2 bytes) [PP]");
is(u($PPU->set($Z3)->{str}), u("?"), "$d (3 bytes) [PP]");
is(u($PPU->set($Z4)->{str}), u("?"), "$d (4 bytes) [PP]");
is(u($PPU->set($Z5)->{str}), u("?"), "$d (5 bytes) [PP]");
is(u($PPU->set($Z6)->{str}), u("?"), "$d (6 bytes) [PP]");
}
# -----------------------------------------------------------------------------
# sjis
#
{
my $d = "sjis / \\x00";
my $U = Unicode::Japanese->new();
my $PPU = Unicode::Japanese::PurePerl->new();
$U->{str}=$Z1; is(u($U->sjis()), u("\x00"), "$d (1 byte)");
$U->{str}=$Z2; is(u($U->sjis()), u("?"), "$d (2 bytes)");
$U->{str}=$Z3; is(u($U->sjis()), u("?"), "$d (3 bytes)");
$U->{str}=$Z4; is(u($U->sjis()), u("?"), "$d (4 bytes)");
$U->{str}=$Z5; is(u($U->sjis()), u("?"), "$d (5 bytes)");
$U->{str}=$Z6; is(u($U->sjis()), u("?"), "$d (6 bytes)");
$PPU->{str}=$Z1; is(u($PPU->sjis()), u("\x00"), "$d (1 byte) [PP]");
$PPU->{str}=$Z2; is(u($PPU->sjis()), u("?"), "$d (2 bytes) [PP]");
$PPU->{str}=$Z3; is(u($PPU->sjis()), u("?"), "$d (3 bytes) [PP]");
$PPU->{str}=$Z4; is(u($PPU->sjis()), u("?"), "$d (4 bytes) [PP]");
$PPU->{str}=$Z5; is(u($PPU->sjis()), u("?"), "$d (5 bytes) [PP]");
$PPU->{str}=$Z6; is(u($PPU->sjis()), u("?"), "$d (6 bytes) [PP]");
}
# -----------------------------------------------------------------------------
# utf8
#
{
my $d = "utf8 / \\x00";
my $U = Unicode::Japanese->new();
my $PPU = Unicode::Japanese::PurePerl->new();
$U->{str}=$Z1; is(u($U->utf8()), u("\x00"), "$d (1 byte)");
$U->{str}=$Z2; is(u($U->utf8()), u("?"), "$d (2 bytes)");
$U->{str}=$Z3; is(u($U->utf8()), u("?"), "$d (3 bytes)");
$U->{str}=$Z4; is(u($U->utf8()), u("?"), "$d (4 bytes)");
$U->{str}=$Z5; is(u($U->utf8()), u("?"), "$d (5 bytes)");
$U->{str}=$Z6; is(u($U->utf8()), u("?"), "$d (6 bytes)");
$PPU->{str}=$Z1; is(u($PPU->utf8()), u("\x00"), "$d (1 byte) [PP]");
$PPU->{str}=$Z2; is(u($PPU->utf8()), u("?"), "$d (2 bytes) [PP]");
$PPU->{str}=$Z3; is(u($PPU->utf8()), u("?"), "$d (3 bytes) [PP]");
$PPU->{str}=$Z4; is(u($PPU->utf8()), u("?"), "$d (4 bytes) [PP]");
$PPU->{str}=$Z5; is(u($PPU->utf8()), u("?"), "$d (5 bytes) [PP]");
$PPU->{str}=$Z6; is(u($PPU->utf8()), u("?"), "$d (6 bytes) [PP]");
}
# -----------------------------------------------------------------------------
# ucs2
#
{
my $d = "ucs2 / \\x00";
my $U = Unicode::Japanese->new();
my $PPU = Unicode::Japanese::PurePerl->new();
$U->{str}=$Z1; is(u($U->ucs2()), u("\x00\x00"), "$d (1 byte)");
$U->{str}=$Z2; is(u($U->ucs2()), u("\x00?"), "$d (2 bytes)");
$U->{str}=$Z3; is(u($U->ucs2()), u("\x00?"), "$d (3 bytes)");
$U->{str}=$Z4; is(u($U->ucs2()), u("\x00?"), "$d (4 bytes)");
$U->{str}=$Z5; is(u($U->ucs2()), u("\x00?"), "$d (5 bytes)");
$U->{str}=$Z6; is(u($U->ucs2()), u("\x00?"), "$d (6 bytes)");
$PPU->{str}=$Z1; is(u($PPU->ucs2()), u("\x00\x00"), "$d (1 byte) [PP]");
$PPU->{str}=$Z2; is(u($PPU->ucs2()), u("\x00?"), "$d (2 bytes) [PP]");
$PPU->{str}=$Z3; is(u($PPU->ucs2()), u("\x00?"), "$d (3 bytes) [PP]");
$PPU->{str}=$Z4; is(u($PPU->ucs2()), u("\x00?"), "$d (4 bytes) [PP]");
$PPU->{str}=$Z5; is(u($PPU->ucs2()), u("\x00?"), "$d (5 bytes) [PP]");
$PPU->{str}=$Z6; is(u($PPU->ucs2()), u("\x00?"), "$d (6 bytes) [PP]");
}
# -----------------------------------------------------------------------------
# ucs4
#
{
my $d = "ucs4 / \\x00";
my $U = Unicode::Japanese->new();
my $PPU = Unicode::Japanese::PurePerl->new();
$U->{str}=$Z1; is(u($U->ucs4()), u("\x00\x00\x00\x00"), "$d (1 byte)");
$U->{str}=$Z2; is(u($U->ucs4()), u("\x00\x00\x00?"), "$d (2 bytes)");
$U->{str}=$Z3; is(u($U->ucs4()), u("\x00\x00\x00?"), "$d (3 bytes)");
$U->{str}=$Z4; is(u($U->ucs4()), u("\x00\x00\x00?"), "$d (4 bytes)");
$U->{str}=$Z5; is(u($U->ucs4()), u("\x00\x00\x00?"), "$d (5 bytes)");
$U->{str}=$Z6; is(u($U->ucs4()), u("\x00\x00\x00?"), "$d (6 bytes)");
$PPU->{str}=$Z1; is(u($PPU->ucs4()), u("\x00\x00\x00\x00"), "$d (1 byte) [PP]");
$PPU->{str}=$Z2; is(u($PPU->ucs4()), u("\x00\x00\x00?"), "$d (2 bytes) [PP]");
$PPU->{str}=$Z3; is(u($PPU->ucs4()), u("\x00\x00\x00?"), "$d (3 bytes) [PP]");
$PPU->{str}=$Z4; is(u($PPU->ucs4()), u("\x00\x00\x00?"), "$d (4 bytes) [PP]");
$PPU->{str}=$Z5; is(u($PPU->ucs4()), u("\x00\x00\x00?"), "$d (5 bytes) [PP]");
$PPU->{str}=$Z6; is(u($PPU->ucs4()), u("\x00\x00\x00?"), "$d (6 bytes) [PP]");
}
# -----------------------------------------------------------------------------
# utf16
#
{
my $d = "utf16 / \\x00";
my $U = Unicode::Japanese->new();
my $PPU = Unicode::Japanese::PurePerl->new();
$U->{str}=$Z1; is(u($U->utf16()), u("\x00\x00"), "$d (1 byte)");
$U->{str}=$Z2; is(u($U->utf16()), u("\x00?"), "$d (2 bytes)");
$U->{str}=$Z3; is(u($U->utf16()), u("\x00?"), "$d (3 bytes)");
$U->{str}=$Z4; is(u($U->utf16()), u("\x00?"), "$d (4 bytes)");
$U->{str}=$Z5; is(u($U->utf16()), u("\x00?"), "$d (5 bytes)");
$U->{str}=$Z6; is(u($U->utf16()), u("\x00?"), "$d (6 bytes)");
$PPU->{str}=$Z1; is(u($PPU->utf16()), u("\x00\x00"), "$d (1 byte) [PP]");
$PPU->{str}=$Z2; is(u($PPU->utf16()), u("\x00?"), "$d (2 bytes) [PP]");
$PPU->{str}=$Z3; is(u($PPU->utf16()), u("\x00?"), "$d (3 bytes) [PP]");
$PPU->{str}=$Z4; is(u($PPU->utf16()), u("\x00?"), "$d (4 bytes) [PP]");
$PPU->{str}=$Z5; is(u($PPU->utf16()), u("\x00?"), "$d (5 bytes) [PP]");
$PPU->{str}=$Z6; is(u($PPU->utf16()), u("\x00?"), "$d (6 bytes) [PP]");
}
# -----------------------------------------------------------------------------
# End Of File.
# -----------------------------------------------------------------------------
Unicode-Japanese-0.47/t/v038_guess_imode2.t 0100644 0001761 0000144 00000003074 11046013305 017051 0 ustar hio users #! /usr/bin/perl -w
## ----------------------------------------------------------------------------
# t/v038_guess_imode2.t
# -----------------------------------------------------------------------------
# Mastering programmed by YAMASHINA Hio
#
# Copyright 2006 YAMASHINA Hio
# -----------------------------------------------------------------------------
# $Id: v038_guess_imode2.t 4670 2007-01-16 01:01:08Z hio $
# -----------------------------------------------------------------------------
use strict;
use strict;
use Test::More tests => 1+76*2;
use Unicode::Japanese;
&check();
&test_guess_imode2();
# -----------------------------------------------------------------------------
# check.
#
sub check
{
#diag("Unicode::Japanese [$Unicode::Japanese::VERSION]");
Unicode::Japanese->new();
my $xs_loaderror = $Unicode::Japanese::xs_loaderror;
defined($xs_loaderror) or $xs_loaderror = '{undef}';
is($xs_loaderror, '', "load success");
}
# -----------------------------------------------------------------------------
# test_guess_imode2.
#
sub test_guess_imode2
{
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
foreach my $i (1..76)
{
my $data = "\x82\xb3 \xf9".pack("C",0xb0+$i);
is($xs->getcode($data), 'sjis-imode', "[guess_imode2] imode-pictgram extend $i (xs)");
is($pp->getcode($data), 'sjis-imode', "[guess_imode2] imode-pictgram extend $i (pp)");
}
}
# -----------------------------------------------------------------------------
# End of File.
# -----------------------------------------------------------------------------
Unicode-Japanese-0.47/t/v046_earlycall.t 0100644 0001761 0000144 00000001075 11055722707 016447 0 ustar hio users ## ----------------------------------------------------------------------------
# t/v046_earlycall.t
# -----------------------------------------------------------------------------
# $Id$
# -----------------------------------------------------------------------------
use strict;
use Test::More;
use Unicode::Japanese;
plan tests => 3;
is($Unicode::Japanese::xs_loaderror, undef, "xsubs is not loaded yet");
eval{ Unicode::Japanese->getcode(""); };
my $err = $@;
is($err, '', "getcode success");
is($Unicode::Japanese::xs_loaderror, '', "xsubs is loaded successfully");
Unicode-Japanese-0.47/t/emoji-imode-utf8.t 0100644 0001761 0000144 00000010216 11046013305 016762 0 ustar hio users
use strict;
#use warnings;
use Unicode::Japanese;
use Test::More tests => (176 + 76) * 4;
&test;
sub test
{
my $xs = Unicode::Japanese->new();
my @data = ;
my $conv;
foreach $conv ('imode1', 'imode2')
{
#diag "test $conv\n";
foreach(@data)
{
chomp;
#/^\w+$/ and print("$_\n");
/^\w+$/ and next;
$_ or exit;
my ($sjis_hex, $ucs2_hex) = split(' ', $_);
# set utf8-imode.
my $ucs2_imode = pack("H*", $ucs2_hex);
my $u8_imode = $xs->set($ucs2_imode, "ucs2")->utf8;
my $u8_from_utf8 = $xs->set($u8_imode, "utf8-$conv")->utf8;
my $u8hex_from_utf8 = uc unpack("H*", $u8_from_utf8);
# set sjis-imode.
my $sjis = pack("H*", $sjis_hex);
my $u8_from_sjis = $xs->set($sjis, "sjis-$conv")->utf8;
my $u8hex_from_sjis = uc unpack("H*", $u8_from_sjis);
#print "$sjis_hex => $u8hex_from_sjis\n";
#print "$ucs2_hex => $u8hex_from_utf8 (($u8_from_utf8))\n";
is($u8hex_from_utf8, $u8hex_from_sjis, "set utf8-$conv S+$sjis_hex/U+$ucs2_hex - ($u8hex_from_sjis)");
#
my $u8_imode_hex = uc unpack("H*", $xs->utf8 ne '?' ? $u8_imode : '?');
my $meth = "utf8_$conv";
my $out = $xs->$meth();
my $out_hex = uc unpack("H*", $out);
#print "$sjis_hex => $u8hex_from_sjis\n";
#print "$ucs2_hex => $u8hex_from_utf8 (($u8_from_utf8))\n";
is($out_hex, $u8_imode_hex, "get utf8-$conv S+$sjis_hex/U+$ucs2_hex - ($out_hex)") or die "TEST";
}
}
}
# http://www.nttdocomo.co.jp/service/imode/make/content/pictograph/basic/index.html
# http://www.nttdocomo.co.jp/service/imode/make/content/pictograph/extention/index.html
__DATA__
BASIC
F89F E63E
F8A0 E63F
F8A1 E640
F8A2 E641
F8A3 E642
F8A4 E643
F8A5 E644
F8A6 E645
F8A7 E646
F8A8 E647
F8A9 E648
F8AA E649
F8AB E64A
F8AC E64B
F8AD E64C
F8AE E64D
F8AF E64E
F8B0 E64F
F8B1 E650
F8B2 E651
F8B3 E652
F8B4 E653
F8B5 E654
F8B6 E655
F8B7 E656
F8B8 E657
F8B9 E658
F8BA E659
F8BB E65A
F8BC E65B
F8BD E65C
F8BE E65D
F8BF E65E
F8C0 E65F
F8C1 E660
F8C2 E661
F8C3 E662
F8C4 E663
F8C5 E664
F8C6 E665
F8C7 E666
F8C8 E667
F8C9 E668
F8CA E669
F8CB E66A
F8CC E66B
F8CD E66C
F8CE E66D
F8CF E66E
F8D0 E66F
F8D1 E670
F8D2 E671
F8D3 E672
F8D4 E673
F8D5 E674
F8D6 E675
F8D7 E676
F8D8 E677
F8D9 E678
F8DA E679
F8DB E67A
F8DC E67B
F8DD E67C
F8DE E67D
F8DF E67E
F8E0 E67F
F8E1 E680
F8E2 E681
F8E3 E682
F8E4 E683
F8E5 E684
F8E6 E685
F8E7 E686
F8E8 E687
F8E9 E688
F8EA E689
F8EB E68A
F8EC E68B
F8ED E68C
F8EE E68D
F8EF E68E
F8F0 E68F
F8F1 E690
F8F2 E691
F8F3 E692
F8F4 E693
F8F5 E694
F8F6 E695
F8F7 E696
F8F8 E697
F8F9 E698
F8FA E699
F8FB E69A
F8FC E69B
F940 E69C
F941 E69D
F942 E69E
F943 E69F
F944 E6A0
F945 E6A1
F946 E6A2
F947 E6A3
F948 E6A4
F949 E6A5
F972 E6CE
F973 E6CF
F974 E6D0
F975 E6D1
F976 E6D2
F977 E6D3
F978 E6D4
F979 E6D5
F97A E6D6
F97B E6D7
F97C E6D8
F97D E6D9
F97E E6DA
F980 E6DB
F981 E6DC
F982 E6DD
F983 E6DE
F984 E6DF
F985 E6E0
F986 E6E1
F987 E6E2
F988 E6E3
F989 E6E4
F98A E6E5
F98B E6E6
F98C E6E7
F98D E6E8
F98E E6E9
F98F E6EA
F990 E6EB
F9B0 E70B
F991 E6EC
F992 E6ED
F993 E6EE
F994 E6EF
F995 E6F0
F996 E6F1
F997 E6F2
F998 E6F3
F999 E6F4
F99A E6F5
F99B E6F6
F99C E6F7
F99D E6F8
F99E E6F9
F99F E6FA
F9A0 E6FB
F9A1 E6FC
F9A2 E6FD
F9A3 E6FE
F9A4 E6FF
F9A5 E700
F9A6 E701
F9A7 E702
F9A8 E703
F9A9 E704
F9AA E705
F9AB E706
F9AC E707
F9AD E708
F9AE E709
F9AF E70A
F950 E6AC
F951 E6AD
F952 E6AE
F955 E6B1
F956 E6B2
F957 E6B3
F95B E6B7
F95C E6B8
F95D E6B9
F95E E6BA
EXTERNSION
F9B1 E70C
F9B2 E70D
F9B3 E70E
F9B4 E70F
F9B5 E710
F9B6 E711
F9B7 E712
F9B8 E713
F9B9 E714
F9BA E715
F9BB E716
F9BC E717
F9BD E718
F9BE E719
F9BF E71A
F9C0 E71B
F9C1 E71C
F9C2 E71D
F9C3 E71E
F9C4 E71F
F9C5 E720
F9C6 E721
F9C7 E722
F9C8 E723
F9C9 E724
F9CA E725
F9CB E726
F9CC E727
F9CD E728
F9CE E729
F9CF E72A
F9D0 E72B
F9D1 E72C
F9D2 E72D
F9D3 E72E
F9D4 E72F
F9D5 E730
F9D6 E731
F9D7 E732
F9D8 E733
F9D9 E734
F9DA E735
F9DB E736
F9DC E737
F9DD E738
F9DE E739
F9DF E73A
F9E0 E73B
F9E1 E73C
F9E2 E73D
F9E3 E73E
F9E4 E73F
F9E5 E740
F9E6 E741
F9E7 E742
F9E8 E743
F9E9 E744
F9EA E745
F9EB E746
F9EC E747
F9ED E748
F9EE E749
F9EF E74A
F9F0 E74B
F9F1 E74C
F9F2 E74D
F9F3 E74E
F9F4 E74F
F9F5 E750
F9F6 E751
F9F7 E752
F9F8 E753
F9F9 E754
F9FA E755
F9FB E756
F9FC E757
Unicode-Japanese-0.47/t/v041_sjisau.t 0100755 0001761 0000144 00000002771 11046013305 015762 0 ustar hio users #! /usr/bin/perl -w
## ----------------------------------------------------------------------------
# t/v041_sjisau.t
# -----------------------------------------------------------------------------
# Mastering programmed by SANO Taku (SAWATARI Mikage)
#
# Copyright 2007 SANO Taku (SAWATARI Mikage)
# -----------------------------------------------------------------------------
# $Id: v041_sjisau.t 4683 2007-09-03 07:29:10Z mikage $
# -----------------------------------------------------------------------------
use strict;
use strict;
use Test::More tests => 1+2;
use Unicode::Japanese;
&check();
&test_sjis_au();
# -----------------------------------------------------------------------------
# check.
#
sub check
{
#diag("Unicode::Japanese [$Unicode::Japanese::VERSION]");
Unicode::Japanese->new();
my $xs_loaderror = $Unicode::Japanese::xs_loaderror;
defined($xs_loaderror) or $xs_loaderror = '{undef}';
is($xs_loaderror, '', "load success");
}
# -----------------------------------------------------------------------------
# test_sjis_au.
#
sub test_sjis_au
{
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
is($xs->set($xs->set("羃若")->sjis_au, "sjis-au")->get, "羃若", "[sjis_au] check (xs)");
is($pp->set($pp->set("羃若")->sjis_au, "sjis-au")->get, "羃若", "[sjis_au] check (pp)");
}
# -----------------------------------------------------------------------------
# End of File.
# -----------------------------------------------------------------------------
Unicode-Japanese-0.47/t/illlet.t 0100644 0001761 0000144 00000002166 11046013305 015172 0 ustar hio users
use Test;
use Unicode::Japanese;
use lib 't';
require 'esc.pl';
BEGIN { plan tests => 6 }
## convert an illustrated letter between different types
## (ja:井腮腟究絖紊)
my $string;
# dot-i/j-sky to imode
$string = new Unicode::Japanese "\xf3\xbf\x81\x88\xf3\xbf\x8e\x8e";
ok(escfull($string->sjis_imode), escfull("\xf9\x8e\x82\xd2"));
$string = new Unicode::Japanese "\xf3\xbf\xb0\xb2\xf3\xbf\xb1\x84";
ok(escfull($string->sjis_imode), escfull("\xf9\x82\xf9\x8f"));
# imode/j-sky to dot-i
$string = new Unicode::Japanese "\xf3\xbf\xa2\xa8";
ok(escfull($string->sjis_doti), escfull("\xf0\x76"));
# 0ffc32.0ffc44 (jsky1.4632(NEW).jsky1.4644(篁医鐚峨]))
# f4a8.f055
$string = new Unicode::Japanese "\xf3\xbf\xb0\xb2\xf3\xbf\xb1\x84";
ok(escfull($string->sjis_doti), escfull("\xf4\xa8\xf0\x55"));
# imode(0ff8a8) to j-sky
$string = new Unicode::Japanese "\xf3\xbf\xa2\xa8";
ok(escfull($string->sjis_jsky), escfull("\x1b\x24\x46\x60\x0f"));
# U+0FF048 U+0FF38E
$string = new Unicode::Japanese "\xf3\xbf\x81\x88\xf3\xbf\x8e\x8e";
ok(escfull($string->sjis_jsky), escfull("\x1b\x24\x46\x43\x0f\x82\xd2"));
Unicode-Japanese-0.47/t/random_redo.pl 0100644 0001761 0000144 00000001435 11046013305 016344 0 ustar hio users #!/usr/bin/perl
use strict;
use Unicode::Japanese;
our @charcodes = (
'jis', 'sjis', 'euc',
'sjis-imode', 'sjis-doti', 'sjis-jsky',
);
my $file = 'random.dat';
open(FILE,"<$file") or die "cannot open [$file]";
my $dat;
read FILE,$dat,8;
my ($count,$len) = unpack('NN',$dat);
printf "[%#08x] len:%d\n",$count,$len;
read FILE,$dat,$len;
{
my $src = $dat;
# ------------------------------------
# utf8 => jis/eucjp/etc.
#
my $str = Unicode::Japanese->new($src,'utf8');
foreach my $ocode ( @charcodes )
{
print "utf8=>$ocode...\n";
$str->conv($ocode);
}
# ------------------------------------
# jis/eucjp/etc. => utf8
foreach my $icode ( @charcodes )
{
print "$icode=>utf8...\n";
Unicode::Japanese->new($src,$icode);
}
}
print "done\n";
Unicode-Japanese-0.47/t/00-load.t 0100644 0001761 0000144 00000000257 11046013305 015040 0 ustar hio users use Test::More tests => 1;
BEGIN {
use_ok( 'Unicode::Japanese' );
}
diag( "Testing Unicode::Japanese $Unicode::Japanese::VERSION, Perl $], /usr/local/ymir/perl/bin/perl" );
Unicode-Japanese-0.47/t/emoji-jsky-chopped.t 0100644 0001761 0000144 00000002216 11046013305 017402 0 ustar hio users
use strict;
#use warnings;
use Test::More tests => 14;
# -----------------------------------------------------------------------------
# load module
#
use Unicode::Japanese;
sub xs { _conv('Unicode::Japanese', @_); }
sub pp { _conv('Unicode::Japanese::PurePerl', @_); }
sub _conv
{
my $pkg = shift;
my $str = shift;
my $icode = shift or die "no icode";
my $out = $pkg->new($str, $icode)->utf8;
esc($out);
}
sub esc
{
my $out = shift;
$out =~ s/\\/\\\\/g;
$out =~ s/\e/\\e/g;
$out =~ s/\$/\\\$/g;
$out =~ s/([^ -~])/"\\x".unpack("H*",$1)/ge;
$out;
}
# -----------------------------------------------------------------------------
# run tests.
#
&test;
sub test
{
foreach my $icode (
'sjis-jsky',
'sjis-jsky1',
'sjis-jsky2',
'jis-jsky',
'jis-jsky1',
'jis-jsky2',
)
{
is(xs("\e\$G\x21", $icode), esc("\xf3\xbf\xb4\xa1"), "(xs) $icode"),
is(pp("\e\$G\x21", $icode), esc("\xf3\xbf\xb4\xa1"), "(pp) $icode"),
}
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
is($xs->getcode("\e\$G\x21"), "sjis-jsky", "(xs) getcode");
is($pp->getcode("\e\$G\x21"), "sjis-jsky", "(pp) getcode");
}
Unicode-Japanese-0.47/t/util.t 0100644 0001761 0000144 00000001136 11046013305 014656 0 ustar hio users
use Test;
use Unicode::Japanese;
BEGIN { plan tests => 4 }
## Util method
my $string;
# strlen (KATAKANA-AIU)
$string = new Unicode::Japanese "\xe3\x82\xa2\xe3\x82\xa4\xe3\x82\xa6";
ok($string->strlen, 6);
# strcut (KATAKANA-AIU)
$string = new Unicode::Japanese "\xe3\x82\xa2\xe3\x82\xa4\xe3\x82\xa6";
ok($string->strcut(5)->[0], "\xe3\x82\xa2\xe3\x82\xa4");
# join_csv
$string = new Unicode::Japanese;
$string->join_csv([1, 2, 'abc', '"123"']);
ok($string->get, '1,2,abc,"""123"""' . "\n");
# split_csv
$string = new Unicode::Japanese '1,2,abc,"""123"""';
ok($string->split_csv->[3], '"123"');
Unicode-Japanese-0.47/t/utf16.t 0100644 0001761 0000144 00000003631 11046013305 014650 0 ustar hio users #!/usr/bin/perl
#
# utf16 check.
# * surrogate pairs.
#
use strict;
use Test;
BEGIN { plan tests => 10, };
# -----------------------------------------------------------------------------
# load module
#
use Unicode::Japanese;
my $xs = Unicode::Japanese->new();
my $pp = Unicode::Japanese::PurePerl->new();
sub utf16ToUtf8_xs($){ tt($xs->set($_[0],'utf16')->utf8()); }
sub utf16ToUtf8_pp($){ tt($pp->set($_[0],'utf16')->utf8()); }
sub utf16ToUcs4_xs($){ tt($xs->set($_[0],'utf16')->ucs4()); }
sub utf16ToUcs4_pp($){ tt($pp->set($_[0],'utf16')->ucs4()); }
sub tt($){ join(' ',map{unpack("H*",$_)}split(//,$_[0])); }
sub bin($){ $_[0]; }
# -----------------------------------------------------------------------------
# run.
#
$| = 1;
{
# surrogate pair.(first one, U+01.0000)
#
my $test = "\xD8\x00\xDC\x00";
my $correct_ucs4 = tt("\x00\x01\x00\x00");
my $correct_utf8 = tt("\xf0\x90\x80\x80");
ok(utf16ToUtf8_xs($test),$correct_utf8,"surrogate pair (xs/utf8)");
ok(utf16ToUtf8_pp($test),$correct_utf8,"surrogate pair (pp/utf8)");
ok(utf16ToUcs4_xs($test),$correct_ucs4,"surrogate pair (xs/ucs4)");
ok(utf16ToUcs4_pp($test),$correct_ucs4,"surrogate pair (pp/ucs4)");
}
{
# surrogate pair.(sample)
# Western Musical Symbols, (U+01D100..)
# U+0x01D11E, MUSICAL SYMBOL G CLEF (活)
#
my $test = "\xD8\x3C\xDD\x1E";
my $correct_ucs4 = tt("\x00\x01\xF1\x1E");
my $correct_utf8 = tt("\xF0\x9F\x84\x9E");
ok(utf16ToUtf8_xs($test),$correct_utf8,"surrogate pair (xs)");
ok(utf16ToUtf8_pp($test),$correct_utf8,"surrogate pair (pp)");
ok(utf16ToUcs4_xs($test),$correct_ucs4,"surrogate pair (xs)");
ok(utf16ToUcs4_pp($test),$correct_ucs4,"surrogate pair (pp)");
}
{
# surrogate pair.(last one, U+10.FFFF)
#
my $test = "\xDB\xFF\xDF\xFF";
my $correct = tt("\x00\x10\xFF\xFF");
ok(utf16ToUcs4_xs($test),$correct,"surrogate pair (xs)");
ok(utf16ToUcs4_pp($test),$correct,"surrogate pair (pp)");
}
Unicode-Japanese-0.47/t/fromutf8.t 0100644 0001761 0000144 00000001511 11046013305 015450 0 ustar hio users
use Test;
use Unicode::Japanese;
BEGIN { plan tests => 7 }
## check from utf8 convert
my $string;
# sjis
$string = new Unicode::Japanese "\xe6\x84\x9b";
ok($string->sjis, "\x88\xa4");
# euc
$string = new Unicode::Japanese "\xe6\x84\x9b";
ok($string->euc, "\xb0\xa6");
# jis(iso-2022-jp)
$string = new Unicode::Japanese "\xe6\x84\x9b";
ok($string->jis, "\x1b\x24\x42\x30\x26\x1b\x28\x42");
# imode
$string = new Unicode::Japanese "\xf3\xbf\xa2\xa8";
ok($string->sjis_imode, "\xf8\xa8");
# dot-i
$string = new Unicode::Japanese "\xf3\xbf\x81\x88\xf3\xbf\x8e\x8e";
ok($string->sjis_doti, "\xf0\x48\xf3\x8e");
# j-sky
$string = new Unicode::Japanese "\xf3\xbf\xb0\xb2";
ok($string->sjis_jsky, "\e\$F2\x0f");
# j-sky(packed)
$string = new Unicode::Japanese "\xf3\xbf\xb0\xb2\xf3\xbf\xb1\x84";
ok($string->sjis_jsky, "\e\$F2D\x0f");
Unicode-Japanese-0.47/t/emoji-jsky-utf8.t 0100644 0001761 0000144 00000005725 11046013305 016656 0 ustar hio users ## ----------------------------------------------------------------------------
# t/emoji-jsky-utf8.t
# -----------------------------------------------------------------------------
# $Id: emoji-jsky-utf8.t 5220 2008-01-16 06:55:51Z hio $
# -----------------------------------------------------------------------------
use strict;
#use warnings;
use Test::More tests => 4 * 6 * 2 + 2 * 6 * 2 + 4;
# -----------------------------------------------------------------------------
# load module
#
use Unicode::Japanese;
&test_set_get;
sub xs { _conv('Unicode::Japanese', @_); }
sub pp { _conv('Unicode::Japanese::PurePerl', @_); }
sub _conv
{
my $pkg = shift;
my $str = shift;
my $icode = shift || 'utf8-jsky';
$pkg->new($str, $icode)->utf8;
#unpack("H*",$pkg->new($str, $icode)->utf8);
}
sub xsj { _conv('Unicode::Japanese', shift, "sjis-jsky"); }
sub ppj { _conv('Unicode::Japanese::PurePerl', shift, "sjis-jsky"); }
sub utf8 { shift }
sub test_set_get
{
foreach my $spec (
# G=>U+E001-U+E05a (ee8081-ee819a)
["\xee\x80\x80", undef, "e000: out of range"],
["\xee\x80\x81", "G!", "e001: "],
["\xee\x81\x9a", "Gz", "e05a: "],
["\xee\x81\x9b", undef, "e05b: out of range"],
# E=>U+E101-U+E15a (ee8481-ee859a)
["\xee\x84\x80", undef, "e100: out of range"],
["\xee\x84\x81", "E!", "e101: "],
["\xee\x85\x9a", "Ez", "e15a: "],
["\xee\x85\x9b", undef, "e15b: out of range"],
# F=>U+E201-U+E25a (ee8881-ee899a)
["\xee\x88\x80", undef, "e200: out of range"],
["\xee\x88\x81", "F!", "e201: "],
["\xee\x89\x9a", "Fz", "e25a: "],
["\xee\x89\x9b", undef, "e25b: out of range"],
# O=>U+E301-U+E34D (ee8c81-ee8d8d)
["\xee\x8c\x80", undef, "e300: out of range"],
["\xee\x8c\x81", "O!", "e301: "],
["\xee\x8d\x8d", "Om", "e34d: "],
["\xee\x8d\x8e", undef, "e34e: out of range"],
# P=>U+E401-U+E44C (ee9081-ee918c)
["\xee\x90\x80", undef, "e400: out of range"],
["\xee\x90\x81", "P!", "e401: "],
["\xee\x91\x8c", "Pl", "e44c: "],
["\xee\x91\x8d", undef, "e44d: out of range"],
# Q=>U+E501-U+E537 (ee9481-ee94b7)
["\xee\x94\x80", undef, "e500: out of range"],
["\xee\x94\x81", "Q!", "e501: "],
["\xee\x94\xb7", "QW", "e537: "],
["\xee\x94\xb8", undef, "e538: out of range"],
["\xee\x94\xb9", undef, "e539: out of range"],
["\xee\x94\xba", undef, "e53a: out of range"],
)
{
my ($u8, $out_src, $note) = @$spec;
my $out = $out_src ? xsj("\e\$$out_src\x0f") : utf8($u8);
is(xs($u8), $out, "(xs/set) $note");
is(pp($u8), $out, "(pp/set) $note");
if( $out_src )
{
is(Unicode::Japanese->new("\e\$$out_src\x0f","sjis-jsky")->utf8_jsky, $u8, "(xs/get) $note");
is(Unicode::Japanese::PurePerl->new("\e\$$out_src\x0f","sjis-jsky")->utf8_jsky, $u8, "(xs/get) $note");
}
}
}
# -----------------------------------------------------------------------------
# End of File.
# -----------------------------------------------------------------------------
Unicode-Japanese-0.47/t/allchar_sjis_eucjp.t 0100644 0001761 0000144 00000003140 11046013305 017522 0 ustar hio users #!/usr/bin/perl -w
use Test::More;
BEGIN
{
if( !$ENV{ALLCHAR_TEST} )
{
plan skip_all => "no ALLCHAR_TEST";
exit;
}
plan
tests =>
0x100 # 1byte, SJIS_KANA
+ (0x9f-0x81+1+0xef-0xe0+1)*(0x7e-0x40+1+0xfc-0x80+1); # SJIS_C
}
use strict;
use Unicode::Japanese;
my %RE =
(
ASCII => '[\x00-\x7f]',
SJIS_C => '[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]',
SJIS_KANA => '[\xa1-\xdf]',
);
my $RE = join('|',values(%RE));
Unicode::Japanese->new();
for( my $i=0; $i<0x100; ++$i )
{
my $ch = pack('C',$i);
my $kana = $ch=~/^($RE{SJIS_KANA})$/;
my $valid = $kana ? "\x8e".$ch : $ch ;
my $res = Unicode::Japanese->_s2e($ch);
is($res,$valid);
if( $valid ne $res )
{
my $where = $kana ? 'SJIS_KANA' : '1byte-char';
out($where,$ch,$res,$valid);
}
}
# SJIS_C
for( my $c1 = 0x81; $c1<=0xef; ++$c1 )
{
$c1 = 0xe0 if( $c1==0xa0 );
for( my $c2 = 0x40; $c2<=0xfc; ++$c2 )
{
$c2 = 0x80 if( $c2==0x7f );
my $ch = pack("CC",$c1,$c2);
my $valid = conv($ch);
my $res = Unicode::Japanese->_s2e($ch);
is($res,$valid);
if( $res ne $valid )
{
out('SJIS_C',$ch,$res,$valid);
}
}
}
sub conv
{
my $ch = shift;
#`echo -n '$ch'|nkf -S -e`;
use Jcode;
Jcode::sjis_euc($ch);
}
sub out
{
my $where = shift;
my $ch = shift;
my $res = shift;
my $valid = shift;
print STDERR "[$where]\n";
print STDERR "char :", (map{sprintf(" %02x",$_)} unpack('C*',$ch)),"\n";
print STDERR "res :", (map{sprintf(" %02x",$_)} unpack('C*',$res)),"\n";
print STDERR "valid:", (map{sprintf(" %02x",$_)} unpack('C*',$valid)),"\n";
exit;
}
Unicode-Japanese-0.47/t/toutf8.t 0100644 0001761 0000144 00000002577 11046013305 015144 0 ustar hio users
use Test;
use strict;
use Unicode::Japanese;
#print STDERR $Unicode::Japanese::PurePerl?"PurePerl mode\n":"XS mode\n";
BEGIN { plan tests => 9 }
## check to utf8 convert
$]>=5.008 and eval 'use bytes', $@ && die $@;
my $string;
use lib 't';
require 'esc.pl';
# sjis
$string = new Unicode::Japanese "\x88\xa4", 'sjis';
ok($string->utf8(), "\xe6\x84\x9b");
# euc
$string = new Unicode::Japanese "\xb0\xa6", 'euc';
ok($string->utf8(), "\xe6\x84\x9b");
# jis(iso-2022-jp)
$string = new Unicode::Japanese "\x1b\x24\x42\x30\x26\x1b\x28\x42", 'jis';
ok($string->utf8(), "\xe6\x84\x9b");
# imode
$string = new Unicode::Japanese "\xf8\xa8", 'sjis-imode';
ok($string->utf8(), "\xf3\xbf\xa2\xa8", 'sjis-imode');
# dot-i
$string = new Unicode::Japanese "\xf0\x48\xf3\x8e", 'sjis-doti';
ok($string->utf8(), "\xf3\xbf\x81\x88\xf3\xbf\x8e\x8e");
# j-sky (4632 ==> 0ffc32)
$string = new Unicode::Japanese::PurePerl "\e\$F2\x0f", 'sjis-jsky';
ok(escfull($string->utf8()), escfull("\xf3\xbf\xb0\xb2"));
$string = new Unicode::Japanese "\e\$F2\x0f", 'sjis-jsky';
ok(escfull($string->utf8()), escfull("\xf3\xbf\xb0\xb2"));
# j-sky(packed) (4632 4644 ==> 0ffc32 0ffc44)
$string = new Unicode::Japanese::PurePerl "\e\$F2D\x0f", 'sjis-jsky';
ok($string->utf8(), "\xf3\xbf\xb0\xb2\xf3\xbf\xb1\x84");
$string = new Unicode::Japanese "\e\$F2D\x0f", 'sjis-jsky';
ok($string->utf8(), "\xf3\xbf\xb0\xb2\xf3\xbf\xb1\x84");
Unicode-Japanese-0.47/t/pod-coverage.t 0100644 0001761 0000144 00000000345 11046013305 016255 0 ustar hio users #! perl -w
use Test::More;
eval "use Test::Pod::Coverage 1.04";
plan skip_all => "Test::Pod::Coverage 1.04 required for testing POD coverage" if $@;
my $trustme = { trustme => ['^(?:load_xs)$'] };
all_pod_coverage_ok($trustme);
Unicode-Japanese-0.47/t/sjis_extras.t 0100644 0001761 0000144 00000000713 11046013305 016237 0 ustar hio users #! /usr/bin/perl -w
use strict;
#use warnings;
use Test::More tests => 8;
use Unicode::Japanese;
&test_extras;
sub test_extras
{
my $re = '^[^?&].?$';
$re = ($]>=5.005 && eval "qr/$re/") || $re;
foreach my $code (qw(00a2 00a3 00a5 00ac 2016 203e 2212 301c))
{
my $sjis = Unicode::Japanese->new(pack("H*",$code),"ucs2")->sjis;
if( ref($re) )
{
like($sjis, $re, "U+$code => sjis");
}else
{
ok($sjis =~ /$re/, "U+$code => sjis");
}
}
}
Unicode-Japanese-0.47/t/allchar_utf8.pl 0100644 0001761 0000144 00000002071 11046013305 016424 0 ustar hio users #!/usr/bin/perl
use strict;
use Unicode::Japanese;
# とりあえず utf8=>* 変換してみるだけ.
# あってるかはみてないにゃ.
# 3文字しかしてなかったり…
# utf8は6文字まで?
#
our @charcodes = (
'jis', 'sjis', 'euc',
'sjis-imode', 'sjis-doti', 'sjis-jsky',
);
# in: utf8 0x00 - 0xFF.FF.FF
$| = 1;
for( my $i=0; $i<=0xFFFFFF; ++$i )
{
if( ($i&0xFF)==0 )
{
if( ($i&0x3FFF)==0 )
{
print "\n" if( $i );
printf "[%#08x]",$i;
}else
{
print ".";
}
}
my $src = pack('N',$i);
$src =~ s/^\0+//;
# ------------------------------------
# utf8 => jis/eucjp/etc.
#
my $str = Unicode::Japanese->new($src,'utf8');
foreach my $ocode ( @charcodes )
{
$str->conv($ocode);
}
# ------------------------------------
# jis/eucjp/etc. => utf8
foreach my $icode ( @charcodes )
{
Unicode::Japanese->new($src,$icode);
}
}
print "\n";
check_mem();
sub check_mem
{
open FILE,"/proc/$$/status" or die "cannot open [/proc/$$/status]";
while()
{
m/^Vm\w+:\s*(\d+)/m and print;
}
close FILE;
}
Unicode-Japanese-0.47/t/v031_getcode_jiskana.t 0100644 0001761 0000144 00000000610 11046013305 017560 0 ustar hio users #! /usr/bin/perl -w
# $Id: v031_getcode_jiskana.t 4613 2005-09-28 13:17:10Z hio $
use strict;
use Test::More tests => 2;
use Unicode::Japanese;
# JIS, HANKAKU-KATAKANA, "TE SU TO"
my $txt = "\e(IC=D\e(B";
Unicode::Japanese->new(); # load dyncode.
is( Unicode::Japanese->getcode($txt), "jis", "getcode(xs): jis");
is( Unicode::Japanese::PurePerl->getcode($txt), "jis", "getcode(pp): jis");
Unicode-Japanese-0.47/bin/ 0040755 0001761 0000144 00000000000 11311614444 014031 5 ustar hio users Unicode-Japanese-0.47/bin/ujguess 0100755 0001761 0000144 00000007157 11046013305 015445 0 ustar hio users #!/usr/local/ymir/perl/bin/perl -w
## ----------------------------------------------------------------------------
# ujguess
# -----------------------------------------------------------------------------
# Mastering programmed by YAMASHINA Hio
#
# Copyright 2005 YAMASHINA Hio
# -----------------------------------------------------------------------------
# $Id: ujguess 4697 2007-09-14 06:17:00Z pho $
# -----------------------------------------------------------------------------
package Unicode::Japanese::UJGuess;
use strict;
use Unicode::Japanese;
our $VERSION = '0.02';
if( !caller )
{
__PACKAGE__->do_work(@ARGV);
}
# -----------------------------------------------------------------------------
# main.
#
sub do_work
{
my $pkg = shift;
my $string;
my @files;
my $no_filename;
while(@_)
{
my $key = shift;
if( $key !~ /^-/ )
{
push(@files,$key);
next;
}elsif( $key eq '--' )
{
push(@files,@_);
last;
}
if( $key eq '--no-filename' )
{
$no_filename = 1;
next;
}elsif( $key eq '--show-filename' )
{
$no_filename = 0;
next;
}elsif( $key eq '-s' )
{
my $value = shift;
push(@files,[$key,$value]);
next;
}elsif( $key =~ /^(-h|--help)$/ )
{
print_usage();
return 1;
}elsif( $key =~ /^(-V|--version)$/ )
{
print_version();
return 1;
}else
{
die "unkown argument [$key]";
}
}
!defined($no_filename) and $no_filename = @files<=1;
Unicode::Japanese->new(); # load stub.
local($/) = undef;
if( !@files )
{
my $text = ;
$no_filename or print "-:";
print Unicode::Japanese->getcode($text)."\n";
}
foreach my $file (@files)
{
my $filename;
my $text;
if( ref($file) )
{
$filename = join(' ',@$file);
$text = $file->[1];
}elsif( $file eq '-' )
{
$filename = '-';
$text = ;
}else
{
$filename = $file;
open(FILE,$file) or die "could not open file [$file] : $!";
$text = ;
close(FILE);
}
$no_filename or print "$filename:";
print Unicode::Japanese->getcode($text)."\n";
}
1;
}
# -----------------------------------------------------------------------------
# print_usage();
#
sub print_usage
{
print "usage: ujguess [options] [files...]\n";
print "options:\n";
print " --no-filename print only the name of character set\n";
print " --show-filename print both names of the file and character set\n";
print " -h|--help print this message\n";
print " -V|--version print the version of ujguess\n";
}
# -----------------------------------------------------------------------------
# print_version();
#
sub print_version
{
print "ujguess $VERSION\n";
print "Unicode::Janaese $Unicode::Japanese::VERSION\n";
}
__END__
=head1 NAME
ujguess -- Guess encoding of given files
=head1 SYNOPSIS
ujguess [files..]
=head1 VERSION
ujguess 0.02
=head1 DESCRIPTION
B guesses encoding of given files.
=over 4
=item --no-filename
Don't prepend file name to each results.
This is the default behavior if there is just one file to be processed.
=item --show-filename
Prepend file name to each results.
This is the default behavior if there are two or more files to be processed.
=item -h,--help
Print a short help message.
=item -V,--version
Print the version of B.
=back
=head1 SEE ALSO
L,
L
=cut
# -----------------------------------------------------------------------------
# End of File.
# -----------------------------------------------------------------------------
Unicode-Japanese-0.47/bin/ujconv 0100755 0001761 0000144 00000011157 11046013305 015257 0 ustar hio users #!/usr/local/ymir/perl/bin/perl -w
## ----------------------------------------------------------------------------
# ujconv
# -----------------------------------------------------------------------------
# Mastering programmed by YAMASHINA Hio
#
# Copyright 2005 YAMASHINA Hio
# -----------------------------------------------------------------------------
# $Id: ujconv 4697 2007-09-14 06:17:00Z pho $
# -----------------------------------------------------------------------------
package Unicode::Japanese::UJConv;
use strict;
use Unicode::Japanese;
our $VERSION = '0.02';
if( !caller )
{
__PACKAGE__->do_work(@ARGV);
}
# -----------------------------------------------------------------------------
# main.
#
sub do_work
{
my $pkg = shift;
my $from = 'auto';
my $to = 'auto';
my $string;
my @files;
while(@_)
{
my $key = shift;
if( $key !~ /^-/ )
{
push(@files,$key);
next;
}elsif( $key eq '--' )
{
push(@files,@_);
last;
}
if( $key eq '-f' )
{
$from = shift;
next;
}elsif( $key eq '-t' )
{
$to = shift;
next;
}elsif( $key eq '-s' )
{
my $value = shift;
push(@files,[$key,$value]);
next;
}elsif( $key =~ /^(-h|--help)$/ )
{
print_usage();
return 1;
}elsif( $key =~ /^(-V|--version)$/ )
{
print_version();
return 1;
}elsif( $key =~ /^(-l|--list)$/ )
{
print_list();
return 1;
}else
{
die "unkown argument [$key]";
}
}
if( $to eq 'auto' )
{
my $lang = $ENV{LANG};
if( $lang && $lang=~/\.(.*)/ )
{
my $code = $1;
if( $code=~/^(ujis|jis|iso-2022-jp)$/i )
{
$to = 'jis';
}elsif( $code=~/^(ujis|eucJP)$/i )
{
$to = 'euc';
}elsif( $code=~/^(sjis|shift_?jis)$/i )
{
$to = 'sjis';
}elsif( $code=~/^(utf-?8)$/i )
{
$to = 'utf8';
}
}
if( $to eq 'auto' )
{
$to = $^O eq 'MSWin32' ? 'sjis' : 'euc';
}
}
local($/) = undef;
if( !@files )
{
my $text = ;
print Unicode::Japanese->new($text,$from)->conv($to);
}
foreach my $file (@files)
{
my $text;
if( ref($file) )
{
$text = $file->[1];
}elsif( $file eq '-' )
{
$text = ;
}else
{
open(FILE,$file) or die "could not open file [$file] : $!";
$text = ;
close(FILE);
}
print Unicode::Japanese->new($text,$from)->conv($to);
}
1;
}
# -----------------------------------------------------------------------------
# print_usage();
#
sub print_usage
{
print "usage: ujconv [-f from_encode] [-t to_encode] [-s string] [files...]\n";
print "see \`perldoc ujconv' for details.\n";
}
# -----------------------------------------------------------------------------
# print_version();
#
sub print_version
{
print "ujconv $VERSION\n";
print "Unicode::Janaese $Unicode::Japanese::VERSION\n";
}
# -----------------------------------------------------------------------------
# print_list();
#
sub print_list
{
foreach my $enc (qw(
utf8
ucs2
ucs4
utf16
jis
euc
euc-jp
sjis
cp932
sjis-imode
sjis-doti
sjis-jsky
jis-jsky
jis-au
sjis-icon-au
euc-icon-au
jis-icon-au
utf8-icon-au
))
{
print "$enc\n";
}
}
__END__
=head1 NAME
ujconv -- reinvented iconv(1) using Unicode::Japanese
=head1 SYNOPSIS
ujconv [-f from_encoding] [-t to_encoding] [-s string] [files...]
ujconv -l
ujconv -h
ujconv -V
=head1 VERSION
ujconv 0.02
=head1 DESCRIPTION
B is an iconv-like tool which is written in perl using
Unicode::Japanese.
B reads text from STDIN or files, convert them, and print them to
STDOUT.
Available options are as follows. Each options can be in short form (-f) or long
form (--from):
=over 4
=item -f,--from I
Convert characters from I. Unlike B this option can be
omitted. In that case, the encoding of the input is guessed by B.
=item -t,--to I
Convert characters to I.
=item -s,--string I
Input from the argument string instead of file or STDIN.
=item -l,--list
List all available encodings, one name per each lines.
=item -h,--help
Print a short help message.
=item -V,--version
Print the version of B.
=back
=head1 SEE ALSO
L,
L,
L,
L
=cut
# -----------------------------------------------------------------------------
# End of File.
# -----------------------------------------------------------------------------
Unicode-Japanese-0.47/inc/ 0040755 0001761 0000144 00000000000 11311614444 014032 5 ustar hio users Unicode-Japanese-0.47/inc/ExtUtils/ 0040755 0001761 0000144 00000000000 11311614444 015613 5 ustar hio users Unicode-Japanese-0.47/inc/ExtUtils/MY_Metafile.pm 0100644 0001761 0000144 00000031676 11046013305 020310 0 ustar hio users ## ----------------------------------------------------------------------------
# ExtUtils::MY_Metafile
# -----------------------------------------------------------------------------
# Mastering programmed by YAMASHINA Hio
#
# Copyright 2006-2008 YAMASHINA Hio
# -----------------------------------------------------------------------------
# $Id: MY_Metafile.pm 5362 2008-01-30 05:09:42Z hio $
# -----------------------------------------------------------------------------
package ExtUtils::MY_Metafile;
use strict;
#use warnings; # warnings pragma was first released with perl 5.006.
use ExtUtils::MakeMaker;
use vars qw($VERSION @EXPORT);
$VERSION = '0.09';
@EXPORT = qw(my_metafile);
use vars qw(%META_PARAMS); # DISTNAME(pkgname)=>HASHREF.
use vars qw($DEFAULT_META_SPEC_VERSION);
$DEFAULT_META_SPEC_VERSION = '1.3';
1;
# -----------------------------------------------------------------------------
# for: use inc::ExtUtils::MY_Metafile;
#
sub inc::ExtUtils::MY_Metafile::import
{
my $pkg = 'ExtUtils::MY_Metafile';
push(@inc::ExtUtils::MY_Metafile::ISA, $pkg);
goto &import;
}
# -----------------------------------------------------------------------------
# import.
#
sub import
{
my $pkg = shift;
my @syms = (!@_ || grep{/^:all$/}@_) ? @EXPORT : @_;
my $callerpkg = caller;
foreach my $name (@syms)
{
my $sub = $pkg->can($name);
$sub or next;
no strict 'refs';
*{$callerpkg.'::'.$name} = $sub;
}
if( !grep{ /^:no_setup$/ } @_ )
{
# override.
*MM::metafile_target = \&_mm_metafile;
}
}
# -----------------------------------------------------------------------------
# _diag_version();
#
sub _diag_version
{
my $mmver = $ExtUtils::MakeMaker::VERSION;
my $mmvernum = $mmver;
if( $mmvernum =~ /^(\d+)\.(\d+)_(\d+)\z/ )
{
$mmvernum = "$1.$2$3";
$mmver .= "=$mmvernum";
}
if( $mmvernum >= 6.30 )
{
print STDERR "# ExtUtils::MY_Metafile for MM 6.30 or later ($mmver).\n";
}else
{
print STDERR "# ExtUtils::MY_Metafile for MM 6.25 or earlier ($mmver).\n";
}
}
# -----------------------------------------------------------------------------
# my_metafile($distname => $param);
# my_metafile($param);
#
sub my_metafile
{
my $distname = @_>=2 && shift;
my $param = shift;
UNIVERSAL::isa($distname,'HASH') and $distname = $distname->{DISTNAME};
$distname ||= '';
$distname =~ s/::/-/g;
$META_PARAMS{$distname} and warn "# overwrite previous meta config $distname.\n";
$META_PARAMS{$distname} = $param;
}
# -----------------------------------------------------------------------------
# _mm_metafile($MM)
# altanative of MM::metafile_target.
# takes $MM object and returns makefile text.
#
sub _mm_metafile
{
my $this = shift;
if( $this->{NO_META} )
{
return
"metafile:\n" .
"\t\$(NOECHO) \$(NOOP)\n";
}
# generate META.yml text.
#
my $meta = _gen_meta_yml($this);
my @write_meta = (
'$(NOECHO) $(ECHO) Generating META.yml',
$this->echo($meta, 'META_new.yml'),
);
# format as makefile text.
#
my ($make_target, $metaout_file);
my $mmvernum = $ExtUtils::MakeMaker::VERSION;
if( $mmvernum =~ /^(\d+)\.(\d+)_(\d+)\z/ )
{
$mmvernum = "$1.$2$3";
}
if( $mmvernum >= 6.30 )
{
$make_target = "# for MM 6.30 or later.\n";
$make_target .= "metafile : create_distdir\n";
$metaout_file = '$(DISTVNAME)/META.yml';
}else
{
$make_target = "# for MM 6.25 or earlier.\n";
$make_target .= "metafile :\n";
$metaout_file = 'META.yml',
}
my $rename_meta = "-\$(NOECHO) \$(MV) META_new.yml $metaout_file";
my $make_body = join('', map{"\t$_\n"} @write_meta, $rename_meta);
"$make_target$make_body";
}
# -----------------------------------------------------------------------------
# _gen_meta_yml($MM);
# generate META.yml text.
#
sub _gen_meta_yml
{
# from MakeMaker-6.30.
my $this = shift;
my $param = shift;
my $check_meta_spec = 1;
if( !$param )
{
$param = $META_PARAMS{$this->{DISTNAME}} || $META_PARAMS{''};
if( !$param )
{
$param = {};
$check_meta_spec = 0;
}
}
if( $META_PARAMS{':all'} )
{
# special key.
$param = { %{$META_PARAMS{':all'}}, %$param };
}
# meta_spec and meta_spec_version.
my $meta_spec = $param->{meta_spec} || $param->{'meta-spec'};
if($param->{meta_spec} && $param->{'meta-spec'} )
{
warn "both meta_spec and meta-spec exist.\n";
}
$meta_spec &&= {%$meta_spec}; # sharrow-copy.
$meta_spec ||= {};
if( exists($param->{meta_spec_version}) && exists($meta_spec->{version}) )
{
warn "both meta_spec_vesrion and meta_spec.version exist.\n";
}
$meta_spec->{version} ||= $param->{meta_spec_version} || $DEFAULT_META_SPEC_VERSION;
$meta_spec->{url} ||= "http://module-build.sourceforge.net/META-spec-v$meta_spec->{version}.html";
# requires:, build_requires:
my $requires_to_yaml = sub{
my $key = shift;
my $hash = shift;
my $yaml = '';
my ($maxkeylen) = sort{$b<=>$a} map{length($_)} keys %$hash;
my ($maxvallen) = sort{$b<=>$a} map{length($_)} values %$hash;
foreach my $name ( sort { lc $a cmp lc $b } keys %$hash )
{
my $ver = $hash->{$name};
$yaml .= sprintf " %-*s %*s\n", $maxkeylen+1, "$name:", $maxvallen, $ver;
}
chomp $yaml;
$yaml ? "$key:\n$yaml" : '';
};
my $requires = $requires_to_yaml->(requires => $param->{requires} || $this->{PREREQ_PM});
my $build_requires = $requires_to_yaml->(build_requires => $param->{build_requires});
# no_index:
my $no_index = $param->{no_index};
if( !$no_index || !$no_index->{directory} )
{
my @dirs = grep{-d $_} (qw(
inc t
ex eg example examples sample samples demo demos
));
$no_index = @dirs && +{ directory => \@dirs };
}
$no_index = $no_index ? _yaml_out({no_index=>$no_index}) : '';
chomp $no_index;
if( $param->{no_index} && !$ENV{NO_NO_INDEX_CHECK} )
{
my $warned;
foreach my $key (keys %{$param->{no_index}})
{
# dir is in spec-v1.2, directory is from spec-v1.3.
if( $key eq 'dir' && $meta_spec->{version}>=1.3 )
{
$warned ||= print STDERR "\n";
warn "$key should be `directory' in META-spec-v1.3 and later.\n";
next;
}
$key =~ /^(file|dir|directory|package|namespace)$/ and next;
$warned ||= print STDERR "\n";
warn "$key is invalid field for no_index.\n";
}
$warned and print STDERR "\n";
}
# abstract is from file.
my $abstract = '';
if( $this->{ABSTRACT} )
{
$abstract = _yaml_out({abstract => $this->{ABSTRACT}});
}elsif( $this->{ABSTRACT_FROM} && open(my$fh, "< $this->{ABSTRACT_FROM}") )
{
while(<$fh>)
{
/^=head1 NAME$/ or next;
(my $pkg = $this->{DISTNAME}) =~ s/-/::/g;
while(<$fh>)
{
/^=/ and last;
/^(\Q$pkg\E\s+-+\s+)(.*)/ or next;
$abstract = $2;
last;
}
last;
}
$abstract = $abstract ? _yaml_out({abstract=>$abstract}) : '';
}
chomp $abstract;
# build yaml object as hash.
my $yaml = {}; # key=>"value as yaml-text".
# first, set from arguments for WriteMakefile().
$yaml->{name} = $this->{DISTNAME};
$yaml->{version} = $this->{VERSION};
$yaml->{version_from} = $this->{VERSION_FROM};
$yaml->{installdirs} = $this->{INSTALLDIRS};
$yaml->{author} = $this->{AUTHOR};
$yaml->{license} = $this->{LICENSE};
foreach my $key (keys %$yaml)
{
if( $yaml->{$key} )
{
my $pad = ' 'x(12-length($key));
$yaml->{$key} = sprintf('%s:%s %s', $key, $pad, $yaml->{$key});
}
}
$yaml->{abstract} = $abstract;
$yaml->{no_index} = $no_index;
$yaml->{requires} = $requires;
$yaml->{build_requires} = $build_requires;
$yaml->{distribution_type} = 'distribution_type: module';
$yaml->{generated_by} = "generated_by: ExtUtils::MY_Metafile version $VERSION, EUMM-$ExtUtils::MakeMaker::VERSION.";
$yaml->{'meta-spec'} = "meta-spec:\n";
$yaml->{'meta-spec'} .= " version: ".delete($meta_spec->{version})."\n";
$yaml->{'meta-spec'} .= " url: ".delete($meta_spec->{url})."\n";
# next, set from arguments for my_metafile().
my $extras = {};
foreach my $key (sort keys %$param)
{
grep{$key eq $_} qw(no_index requires build_requires meta_spec meta-spec meta_spec_version) and next;
my $line = _yaml_out->({$key=>$param->{$key}});
if( exists($yaml->{$key}) )
{
chomp $line;
$yaml->{$key} = $line;
}else
{
$extras->{$key} = $line;
}
}
$yaml->{extras} = join('', map{$extras->{$_}} sort keys %$extras);
# then, check required keys by yaml-spec.
my @required_keys = qw(meta-spec name version abstract author license generated_by);
foreach my $key (@required_keys)
{
$check_meta_spec or next;
my $ok = $yaml->{$key} && $yaml->{$key}=~/\w/;
$ok ||= $extras->{$key} and next;
warn "$key is required for meta-spec v1.2 ($this->{DISTNAME}).\n";
}
if( exists($param->{license}) && exists($this->{LICENSE}) && $param->{license} ne $this->{LICENSE} )
{
warn "WriteMakefile.LICENSE ($this->{LICENSE}) is different from my_metafile.license ($param->{license}).";
}
$yaml->{license} ||= 'license: unknown';
foreach my $key (keys %$yaml)
{
$key eq 'extras' and next;
$yaml->{$key} ||= "#$key:";
}
$yaml->{extras} &&= "\n# extras.\n$yaml->{extras}";
foreach my $key (qw(abstract license))
{
my $pad = ' 'x(12-length($key));
$yaml->{$key} =~ s/^$key: +(.+)\z/$key:$pad $1/;
}
# packing into singple text.
my $meta = <{name}
$yaml->{version}
$yaml->{version_from}
$yaml->{installdirs}
$yaml->{author}
$yaml->{abstract}
$yaml->{license}
$yaml->{requires}
$yaml->{build_requires}
$yaml->{no_index}
$yaml->{extras}
$yaml->{distribution_type}
$yaml->{generated_by}
$yaml->{'meta-spec'}
YAML
#print "$meta";
$meta;
}
# -----------------------------------------------------------------------------
# generate simple yaml.
#
sub _yaml_out
{
my $obj = shift;
my $depth = shift || 0;
my $out = '';
if( !defined($obj) )
{
$out = " "x$depth."~\n";
}elsif( !ref($obj) )
{
$out = " "x$depth.$obj."\n";
}elsif( ref($obj)eq'ARRAY' )
{
my @e = map{_yaml_out->($_, $depth+1)} @$obj;
@e = map{ " "x$depth."- ".substr($_, ($depth+1)*2)} @e;
$out = join('', @e);
$out ||= " "x$depth."[]";
}elsif( ref($obj)eq'HASH' )
{
foreach my $k (sort keys %$obj)
{
$out .= " "x$depth."$k:";
$out .= ref($obj->{$k}) ? "\n"._yaml_out($obj->{$k}, $depth+1) : " $obj->{$k}\n";
}
$out ||= " "x$depth."{}";
}else
{
die "not supported: $obj";
}
$out;
}
# -----------------------------------------------------------------------------
# End of Code.
# -----------------------------------------------------------------------------
__END__
=encoding utf8
=for stopwords
YAMASHINA
Hio
ACKNOWLEDGEMENTS
AnnoCPAN
CPAN
EUMM
META.yml
RT
=head1 NAME
ExtUtils::MY_Metafile - META.yml customize with ExtUtil::MakeMaker
=head1 VERSION
Version 0.09
=head1 SYNOPSIS
put ExtUtils/MY_Metafile.pm into inc/ExtUtils/MY_Metafile.pm:
$ mkdir -p inc/ExtUtils
$ cp `perldoc -l ExtUtils::MY_Metafile` inc/ExtUtils/
and write in your Makefile.PL:
use ExtUtils::MakeMaker;
use inc::ExtUtils::MY_Metafile;
my_metafile {
no_index => {
directory => [ qw(inc example t), ],
},
license => 'perl',
};
WriteMakefile(
DISTNAME => 'Your::Module',
...
);
=head1 EXPORT
This module exports one function.
=head1 FUNCTIONS
=head2 my_metafile $modname => \%meta_param;
Takes two arguments.
First one is package name to be generated, and you can omit this
argument. Second is hashref which contains META.yml contents.
my_metafile {
no_index => {
directory => [ qw(inc example t), ],
},
license => 'perl',
};
Some parameters are checked automatically.
=over
=item no_index
If you not specify C parameter for C and
there are any directory of F, they are set as it.
=item requires
C directive is set from C parameter
of EUMM. If you want to use C, you can write it.
=back
=head1 AUTHOR
YAMASHINA Hio, C<< >>
=head1 BUGS
Please report any bugs or feature requests to
C, or through the web interface at
L.
I will be notified, and then you'll automatically be notified of progress on
your bug as I make changes.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc ExtUtils::MY_Metafile
You can also look for information at:
=over 4
=item * AnnoCPAN: Annotated CPAN documentation
L
=item * CPAN Ratings
L
=item * RT: CPAN's request tracker
L
=item * Search CPAN
L
=back
=head1 ACKNOWLEDGEMENTS
=head1 COPYRIGHT & LICENSE
Copyright 2006-2008 YAMASHINA Hio, all rights reserved.
This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.
=cut
# -----------------------------------------------------------------------------
# End of File.
# -----------------------------------------------------------------------------
Unicode-Japanese-0.47/lib/ 0040755 0001761 0000144 00000000000 11311614444 014027 5 ustar hio users Unicode-Japanese-0.47/lib/Unicode/ 0040755 0001761 0000144 00000000000 11311614444 015415 5 ustar hio users Unicode-Japanese-0.47/lib/Unicode/Japanese/ 0040755 0001761 0000144 00000000000 11311614444 017143 5 ustar hio users Unicode-Japanese-0.47/lib/Unicode/Japanese/JA.pod 0100644 0001761 0000144 00000053353 11046013305 020141 0 ustar hio users =encoding utf-8
=head1 NAME
Unicode::Japanese::JA - ユ茯絖潟若紊
=head1 网荀
use Unicode::Japanese;
use Unicode::Japanese qw(unijp);
# convert utf8 -> sjis
print Unicode::Japanese->new($str)->sjis;
print unijp($str)->sjis; # same as above.
# convert sjis -> utf8
print Unicode::Japanese->new($str,'sjis')->get;
# convert sjis (imode_EMOJI) -> utf8
print Unicode::Japanese->new($str,'sjis-imode')->get;
# convert zenkaku (utf8) -> hankaku (utf8)
print Unicode::Japanese->new($str)->z2h->get;
=head1 茯
Unicode::Japanese 鐚ユ茯絖潟若娯紊茵≪吾ャ若с鐚
=head2 罘
=over 2
=item *
Unicode::Japanese ゃ潟鴻帥潟鴻鐚UTF-8 ф絖篆障鐚
=item *
XS 篏睡/筝篏睡宴泣若障鐚
XS 若潟鴻綽荀翫鐚
No-XS 荵純篏睡翫篏睡筝
(Japanese.pm 潟若у篏障)鐚
=item *
茹茹紊鐚帥蚊紊泣若障鐚
=item *
阪遣肢 (DoCoMo i-mode鐚KDDI AU, Softbank Mobile, ASTEL dot-i) 腟究絖
Unicode 腱潟違эDB 膈у宴с障鐚
=item *
違阪遣肢怨紕эゃ<若吾腟究絖娯紊純с鐚
=item *
SJIS 鐚 MS-CP932 帥 Unicode 潟違茵障鐚
=item *
Unicode -> SJIS鐚EUC-JP/JIS鐚 潟井鐚SJIS ц;憗с絖
dddd; 綵√紊障鐚 Unicode腱腟究絖
'?'障. 障, 阪遣肢怨紊, 鴻絲上絖'?'障.
=item *
Perl-5.8.0 篁ラ, utf8 違荐絎茵障.
utf-8 `ゃ' 緇 utf8() <純,
utf-8 `絖' 緇 getu() <純篏帥障.
get() <純憝鴻с utf-8 `ゃ' 菴障
(絨ョ紊眼醇с障).
sjis(), jis(), utf8(), etc.. <純сゃ菴障.
new, set, getcode <純ュ, utf8-flaged/bytes 障.
=back
=head1 篏綽荀
=over 4
=item *
perl 5.10.x, 5.8.x, etc. (5.004 篁ラ).
=item *
(OK)
C 潟潟ゃ.
≪吾ャ若 XS Pure Perl 筝≧鴻絲上障.
C 潟潟ゃ, Unicode::Japanese
Pure Perl ≪吾ャ若ゃ潟鴻若障.
=item *
(OK)
鴻 Test.pm Test::More.
=back
絎茵綽≪吾ャ若障.
=head1 <純
=over 4
=item $s = Unicode::Japanese->new($str [, $icode [, $encode]])
違 Unicode::Japanese ゃ潟鴻帥潟鴻絎障鐚
<若帥絎鐚L <純羝<障鐚
=item $s = unijp($str [, $icode [, $encode]])
Unicode::Janaese->new(...) 臂.
=item $s->set($str [, $icode [, $encode]])
X
=over 2
=item $str: 絖
=item $icode: 絖潟若絎鐚ュ鐚ユ 'utf8'
=item $encode: ゃ膃垸劫鐚ュ鐚
=back
ゃ潟鴻帥潟鴻絖祉障鐚
絖潟若絎ャ UTF-8 荀障鐚
純絖潟若:
auto
utf8 ucs2 ucs4
utf16-be utf16-le utf16
utf32-be utf32-le utf32
sjis cp932 euc euc-jp jis
sjis-imode sjis-imode1 sjis-imode2
utf8-imode utf8-imode1 utf8-imode2
sjis-doti sjis-doti1
sjis-jsky sjis-jsky1 sjis-jsky2
jis-jsky jis-jsky1 jis-jsky2
utf8-jsky utf8-jsky1 utf8-jsky2
sjis-au sjis-au1 sjis-au2
jis-au jis-au1 jis-au2
sjis-icon-au sjis-icon-au1 sjis-icon-au2
euc-icon-au euc-icon-au1 euc-icon-au2
jis-icon-au jis-icon-au1 jis-icon-au2
utf8-icon-au utf8-icon-au1 utf8-icon-au2
ascii binary
(L<|/泣若潟潟若c潟>
.)
絖潟若ゅャ翫鐚'auto' 絎障鐚
'auto' 絖潟若ゅャ鐚getcode() <純
茵障鐚
ゃ膃垸劫鐚'base64' 炊絎純с鐚
base64 絎翫鐚base64 潟若
Unicode::Japanese 鴻絖障鐚
羝<絖紊眼障丈主罨蚊翫鐚絖潟若
'binary' 絎障鐚
sjis-imode鐚sjis-doti鐚翫鐚絖筝 dddd;
腟究絖紊障鐚
絖潟若c翫鐚
ゅャ腆阪с障鐚
sjis, utf8 筝≧鴻茹iс絖翫鐚sjis鐚
sjis-au鐚sjis-doti 筝≧鴻茹iс絖翫鐚sjis-au鐚
菴障鐚
=item $str = $s->get
=over 2
=item $str: 絖(UTF-8)
=back
絖 UTF-8 潟若у冴障鐚
憜 `ゃ' 菴障, 絨ョ紊眼醇с障.
ゃ綽荀 utf8() <純,
絖綽荀 getu() <純篏帥鴻鴻<障.
=item $str = $s->getu
=over 2
=item $str: 絖(UTF-8)
=back
絖 UTF-8 潟若у冴障鐚
Perl-5.8.0 篁ラ, utf-8 違ゃ utf-8 絖
菴障.
=item $code = $s->getcode($str)
=over 2
=item $str: 絖
=item $code: 絖潟若茵絖
=back
羝<絖(I<$str>)絖潟若ゅャ障鐚
∽違с, 箴紊, ゃ潟鴻帥潟鴻篆
絖潟若ゅャс羈鐚
絖潟若ゅユ鐚篁ヤ≪眼冴ゅ茵障鐚
(PurePerl)
=over 4
=item 1
UTF-32 BOM 逸utf32 ゅ障鐚
=item 2
UTF-16 BOM 逸utf16 ゅ障鐚
=item 3
UTF-32BE 罩c綵√鐚utf32-be ゅ障鐚
=item 4
UTF-32LE 罩c綵√鐚utf32-le ゅ障鐚
=item 5
ESC 絖 障 8 腴c絖障逸ascii ゅ
ESC ゃ ASCII 九勝絖 (0x00-0x1F 0x7F) ascii 膀峨荀
=item 6
JIS鴻宴若激若宴潟鴻障逸jis ゅ障鐚
=item 7
J-PHONE 腟究絖障逸sjis-jsky ゅャ障鐚
=item 8
EUC-JP 潟若罩c綵√鐚euc ゅ障鐚
=item 9
SJIS 潟若罩c綵√鐚sjis ゅ障鐚
=item 10
SJIS 潟若 au 腟究絖罩c綵√鐚sjis-au ゅ障鐚
=item 11
SJIS i-mode 腟究絖罩c綵√鐚sjis-imode ゅャ障鐚
=item 12
SJIS dot-i 腟究絖罩c綵√鐚sjis-doti ゅャ障鐚
=item 13
UTF-8 罩c綵√鐚utf8 ゅ障鐚
=item 14
綵障翫鐚unknown ゅ障鐚
=back
(XS)
=over 4
=item 1
UTF-32 BOM 逸utf32 ゅ障鐚
=item 2
UTF-16 BOM 逸utf16 ゅ障鐚
=item 3
篁ヤ潟若ゃ, 罩c絖潟若с倶欠Щ茯帥鴻障.
ascii / euc / sjis / jis / utf8 / utf32-be / utf32-le / sjis-jsky /
sjis-imode / sjis-au / sjis-doti
=item 4
緇障фcc筝, 篁ヤэゃ, ゅ障.
utf32-be / utf32-le / ascii / jis / euc / sjis / sjis-jsky / sjis-imode /
sjis-au / sjis-doti / utf8
=item 5
綵障翫鐚unknown ゅ障鐚
=back
篁ヤ≪眼冴鐚篁ヤ鴻羈鐚
=over 2
=item *
UTF-8 絖с鐚SJIS潟若荀醇с障鐚
=item *
UCS2 ゅャс障鐚
=item *
UTF-16 BOM 翫粋茯茘障鐚
=item *
阪遣腟究絖鐚ゃх贋・腟究絖翫粋茘с障鐚
dddd; 綵√ц菴違翫鐚阪遣腟究絖ゅャ茵障鐚
=back
XSPurePerlс, ゅャ≪眼冴, 違腟醇с障.
鴻, 鴻宴若絖сsjis翫, PurePerlсsjis茯茘障
XSс茯茘障. sjis-jsky阪ャсс. 障,
篏茯よ茘蚊, euc-jp, 罕鴻宴若絖篁
c障.
=item $code = $s->getcodelist($str)
=over 2
=item $str: 絖
=item $code: 絖潟若茵絖
=back
羝<絖(I<$str>)絖潟若ゅャ障鐚
getcode , 鴻純絖潟若
筝荀с菴障.
=item $str = $s->conv($ocode, $encode)
=over 2
=item $ocode: 阪潟若 (篁ヤ絎)
utf8 ucs2 ucs4 utf16
sjis cp932 euc euc-jp jis
sjis-imode sjis-imode1 sjis-imode2
utf8-imode utf8-imode1 utf8-imode2
sjis-doti sjis-doti1
sjis-jsky sjis-jsky1 sjis-jsky2
jis-jsky jis-jsky1 jis-jsky2
utf8-jsky utf8-jsky1 utf8-jsky2
sjis-au sjis-au1 sjis-au2
jis-au jis-au1 jis-au2
sjis-icon-au sjis-icon-au1 sjis-icon-au2
euc-icon-au euc-icon-au1 euc-icon-au2
jis-icon-au jis-icon-au1 jis-icon-au2
utf8-icon-au utf8-icon-au1 utf8-icon-au2
binary
(L<|/泣若潟潟若c潟>
.)
阪遣絖潟若¥絨障医ゃ鐚医紊с祉
紊с腟究絖祉鐚井腮腟究絖祉鐚茵障鐚
医鐚c医紊с絖潟若筝с鐚
=item $encode: ゃ膃垸劫鐚ュ鐚
=item $str: 絖
=back
絖絎絖潟若紊冴障鐚
絖潟潟若鐚'base64' 炊絎純с鐚
base64 絎翫鐚base64 潟潟若
絖菴障鐚
perl-5.8.0 篁ラ, 阪 utf-8 違ゃ障.
=item $s->tag2bin
絖筝障 dddd; 綵√絖鐚茵絖篏臀障鐚
=item $s->z2h
茹茹紊障鐚
=item $s->h2z
茹茹紊障鐚
=item $s->hira2kata
蚊帥紊障鐚
=item $s->kata2hira
帥蚊紊障鐚
=item $str = $s->jis
$str: JIS 潟潟若c潟医就綣ゃ
絖 JIS鐚ISO-2022-JP鐚 潟若у冴障鐚
=item $str = $s->euc
$str: euc-jp 潟潟若c潟医就綣ゃ
絖 EUC-JP 潟若у冴障鐚
=item $str = $s->utf8
$str: utf-8 潟潟若c潟医就綣ゃ
絖 UTF-8 潟若у冴障鐚
perl-5.8.0 篁ラ, ゃ菴障.
=item $str = $s->ucs2
$str: ucs2 潟潟若c潟医就綣ゃ
絖 UCS2 潟若у冴障鐚
=item $str = $s->ucs4
$str: ucs4 潟潟若c潟医就綣ゃ
絖 UCS4 潟若у冴障鐚
=item $str = $s->utf16
$str: ucs-16 潟潟若c潟医就綣ゃ
絖 UTF-16 潟若у冴障鐚
BOM篁障鐚
潟c≪喝就綣ц障鐚
=item $str = $s->sjis
$str: sjis 潟潟若c潟医就綣ゃ
絖 SJIS鐚MS-CP932鐚 潟若у冴障鐚
=item $str = $s->sjis_imode
$str: sjis/imode腟究絖 潟潟若c潟医就綣ゃ
絖 i-mode 腴 SJIS 潟若у冴障鐚
違imode腟究絖ュс.
=item $str = $s->sjis_imode1
$str: sjis/imode 腟究絖 潟潟若c潟医就綣ゃ
絖 i-mode 腴 SJIS 潟若у冴障鐚
堺腟究絖障.
=item $str = $s->sjis_imode2
$str: sjis/imode 腟究絖 潟潟若c潟医就綣ゃ
絖 i-mode 腴 SJIS 潟若у冴障鐚
堺腟究絖, ≦宍腟究絖帥障.
=item $str = $s->sjis_doti
$str: sjis/dot-i 腟究絖 潟潟若c潟医就綣ゃ
絖 dot-i 腴 SJIS 潟若у冴障鐚
=item $str = $s->sjis_jsky
$str: sjis/j-sky 腟究絖 潟潟若c潟医就綣ゃ
絖 j-sky 腴 SJIS 潟若у冴障鐚
違j-sky腟究絖(VERSION 0.15 с, jsky2)ュс.
=item $str = $s->sjis_jsky1
$str: sjis/j-sky 腟究絖 潟潟若c潟医就綣ゃ
絖 j-sky 腴 SJIS 潟若у冴障鐚
Page 1鐔3 帥腟究絖帥障.
=item $str = $s->sjis_jsky
$str: sjis/j-sky 腟究絖 潟潟若c潟医就綣ゃ
絖 j-sky 腴 SJIS 潟若у冴障鐚
Page 1鐔6 腟究絖帥障.
=item $str = $s->sjis_icon_au
$str: sjis/AU icon帥 潟潟若c潟医就綣ゃ
絖 AU 腴 SJIS 潟若у冴障鐚
=item $str_arrayref = $s->strcut($len)
=over 2
=item $len: 蚊絖(茹後)
=item $str_arrayref: 絖
=back
I<$len>ф絎絖(茹)篁ヤ絖蚊障鐚
荀膣, utf-8 違cutf-8絖с.
=item $len = $s->strlen
$len: 絖茵腓阪
UTF-8 絖絲障 length() 篏帥茹絖鐚絖激 3 c障障鐚
<純篏睡鐚緇ャ SJIS 鐚茹絖鐚絖激 2 菴障鐚
=item $s->join_csv(@values);
@values: 若翠
CSV 絖紊鐚ゃ潟鴻帥潟鴻主障鐚
絖緇壕("\n")菴遵障鐚
=item @values = $s->split_csv;
@values: 若翠
ゃ潟鴻帥潟鴻主絖 CSV 荀鐚蚊障鐚
絖緇壕("\n")ゃ蚊障鐚
ュ binary с utf-8 絖菴障.
binary cゃ菴障.
=back
=head1 泣若潟潟若c潟
+---------------+----+-----+-------+
|encoding | in | out | guess |
+---------------+----+-----+-------+
|auto : OK : -- | ----- |
+---------------+----+-----+-------+
|utf8 : OK : OK | OK |
|ucs2 : OK : OK | ----- |
|ucs4 : OK : OK | ----- |
|utf16-be : OK : -- | ----- |
|utf16-le : OK : -- | ----- |
|utf16 : OK : OK | OK(#) |
|utf32-be : OK : -- | OK |
|utf32-le : OK : -- | OK |
|utf32 : OK : -- | OK(#) |
+---------------+----+-----+-------+
|sjis : OK : OK | OK |
|cp932 : OK : OK | ----- |
|euc : OK : OK | OK |
|euc-jp : OK : OK | ----- |
|jis : OK : OK | OK |
+---------------+----+-----+-------+
|sjis-imode : OK : OK | OK |
|sjis-imode1 : OK : OK | ----- |
|sjis-imode2 : OK : OK | ----- |
|utf8-imode : OK : OK | ----- |
|utf8-imode1 : OK : OK | ----- |
|utf8-imode2 : OK : OK | ----- |
+---------------+----+-----+-------+
|sjis-doti : OK : OK | OK |
|sjis-doti1 : OK : OK | ----- |
+---------------+----+-----+-------+
|sjis-jsky : OK : OK | OK |
|sjis-jsky1 : OK : OK | ----- |
|sjis-jsky2 : OK : OK | ----- |
|jis-jsky : OK : OK | ----- |
|jis-jsky1 : OK : OK | ----- |
|jis-jsky2 : OK : OK | ----- |
|utf8-jsky : OK : OK | ----- |
|utf8-jsky1 : OK : OK | ----- |
|utf8-jsky2 : OK : OK | ----- |
+---------------+----+-----+-------+
|sjis-au : OK : OK | OK |
|sjis-au1 : OK : OK | ----- |
|sjis-au2 : OK : OK | ----- |
|jis-au : OK : OK | ----- |
|jis-au1 : OK : OK | ----- |
|jis-au2 : OK : OK | ----- |
|sjis-icon-au : OK : OK | ----- |
|sjis-icon-au1 : OK : OK | ----- |
|sjis-icon-au2 : OK : OK | ----- |
|euc-icon-au : OK : OK | ----- |
|euc-icon-au1 : OK : OK | ----- |
|euc-icon-au2 : OK : OK | ----- |
|jis-icon-au : OK : OK | ----- |
|jis-icon-au1 : OK : OK | ----- |
|jis-icon-au2 : OK : OK | ----- |
|utf8-icon-au : OK : OK | ----- |
|utf8-icon-au1 : OK : OK | ----- |
|utf8-icon-au2 : OK : OK | ----- |
+---------------+----+-----+-------+
|ascii : OK : -- | OK |
|binary : OK : OK | ----- |
+---------------+----+-----+-------+
(#): guessed when it has bom.
=head2 茯茘篏
1. utf32 (#)
2. utf16 (#)
3. utf32-be
4. utf32-le
5. ascii
6. jis
7. sjis-jsky (pp)
8. euc
9. sjis
10. sjis-jsky (xs)
11. sjis-au
12. sjis-imode
13. sjis-doti
14. utf8
15. unknown
=head1 DESCRIPTION OF UNICODE MAPPING
Unicode 潟違篁ヤ茵障鐚
=over 2
=item Shift_JIS
MS-CP932 Unicode 吾潟違茵障鐚
潟違若篁ヤURL篏睡障鐚
L<< ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT >>
Unicode SJIS 吾潟違翫鐚茵憗с絖鐚
絖 dddd; 綵√紊障鐚
鐚阪遣腟究絖?紊障鐚
障鐚阪遣 SJIS 後鐚茵憗с絖?紊障鐚
=item EUC-JP/ISO-2022-JP
筝綺SJIS潟若紊鐚Unicode 吾潟違障鐚
鐚SJIS ц;憗с絖障翫鐚
絖罩c潟違с障鐚
=item DoCoMo i-mode
F800 - F9FF ∞亀絖絖鐚U+0FF800 - U+0FF9FF
潟違障鐚
=item ASTEL dot-i
F000 - F4FF ∞亀絖絖鐚U+0FF000 - U+0FF4FF
潟違障鐚
=item J-PHONE J-SKY
J-SKY 腟究絖鐚鴻宴若激若宴潟 "\e\$" 緇鐚腟究絖1ゃ鐚
1や札筝腟究絖2ゃ鐚"\x0f"鐚膓障鐚
1ゃ腟究絖膓翫鐚2ゃ帥g吾
х軒с障鐚
1ゃ2ゃ≪1絖荀鐚4500 - 47FF 鐚
U+0FFB00 - U+0FFDFF 潟違障鐚
Unicode::Japanese с鐚Unicode J-SKY 腟究絖潟違鐚
1ゃ筝с腟究絖g翫鐚х軒茵障鐚
=item AU
腟究絖絖鐚U+0FF500 - U+0FF6FF 潟違障鐚
=back
=head1 PurePerl mode
use Unicode::Japanese qw(PurePerl);
use 綣違 C<'PurePerl'> 筝,
XS篏帥腓榊絎hс障.
=head1
違荀 C 絎
怨. ャ
L.
web ゃ潟帥若с若鴻с障障.
腱ャ, 腱紊眼茵у怨違我
篌障.
=over 2
=item *
EUC-JP鐚JIS 潟若鐚SJIS 紊 UTF-8 後鐚
SJIS ц;憗с絖罩c鎀с障鐚
=item *
XS篏睡翫鐚EUC-JP鐚SJIS(腟究絖)潟若絖筝
\e 障鐚EUC-JP鐚SJIS 潟若ゅ紊掩鐚
罩cゅャ紊茵堺ャ障鐚
=item *
Japanese.pm <ゃ緇ゃ鐚FTP ASCII ≪若
荵∫<ゃ紕障鐚
=back
=head1 泣若
≪吾ャ若ャ<潟 perldoc 潟潟ц堺ャ障.
perldoc Unicode::Japanese
障, 篁ヤ贋с荀堺ャ障:
=over 4
=item * AnnoCPAN: Annotated CPAN documentation
L
=item * CPAN Ratings
L
=item * RT: CPAN's request tracker
L
=item * Search CPAN
L
=back
=head1 CREDITS
Thanks very much to:
NAKAYAMA Nao
SUGIURA Tatsuki & Debian JP Project
=head1 篏罔潟ゃ祉潟
Copyright 2001-2008
SANO Taku (SAWATARI Mikage) and YAMASHINA Hio,
all rights reserved.
違若純с≪с Perl
ゃ祉潟鴻 у絽喝眼茵堺ャ障.
Unicode-Japanese-0.47/lib/Unicode/Japanese.mlpod 0100644 0001761 0000144 00000113056 11046013305 020175 0 ustar hio users
=encoding utf-8
=head1 NAME
Unicode::Japanese - Convert encoding of japanese text
J<< ja; Unicode::Japanese::JA - ユ茯絖潟若紊 >>
=head1 SYNOPSIS
use Unicode::Japanese;
use Unicode::Japanese qw(unijp);
# convert utf8 -> sjis
print Unicode::Japanese->new($str)->sjis;
print unijp($str)->sjis; # same as above.
# convert sjis -> utf8
print Unicode::Japanese->new($str,'sjis')->get;
# convert sjis (imode_EMOJI) -> utf8
print Unicode::Japanese->new($str,'sjis-imode')->get;
# convert zenkaku (utf8) -> hankaku (utf8)
print Unicode::Japanese->new($str)->z2h->get;
=head1 DESCRIPTION
The Unicode::Japanese module converts encoding of japanese text from one
encoding to another.
J<< ja;
Unicode::Japanese 鐚ユ茯絖潟若娯紊茵≪吾ャ若с鐚
>>
=head2 FEATURES
=over 2
=item *
An instance of Unicode::Japanese internally holds a string in UTF-8.
J<< ja;
Unicode::Japanese ゃ潟鴻帥潟鴻鐚UTF-8 ф絖篆障鐚
>>
=item *
This module is implemented in two ways: XS and pure perl. If efficiency is
important for you, you should build and install the XS module. If you don't want
to, or if you can't build the XS module, you may use the pure perl module
instead. In that case, only you have to do is to copy Japanese.pm into somewhere
in @INC.
J<< ja;
XS 篏睡/筝篏睡宴泣若障鐚
XS 若潟鴻綽荀翫鐚
No-XS 荵純篏睡翫篏睡筝
(Japanese.pm 潟若у篏障)鐚
>>
=item *
This module can convert characters from zenkaku (full-width) form to hankaku
(half-width) form, and vice versa. Conversion between hiragana (one of two sets
of japanese phonetical alphabet) and katakana (another set of japanese
phonetical alphabet) is also supported.
J<< ja;
茹茹紊鐚帥蚊紊泣若障鐚
>>
=item *
This module has mapping tables for emoji (graphic characters) defined by various
japanese mobile phones; DoCoMo i-mode, ASTEL dot-i and J-PHONE J-Sky. Those
letters are mapped on Unicode Private Use Area so unicode strings it outputs are
still valid even if they contain emoji, and you can safely pass them to other
softwares that can handle Unicode.
J<< ja;
阪遣肢 (DoCoMo i-mode鐚KDDI AU, Softbank Mobile, ASTEL dot-i) 腟究絖
Unicode 腱潟違эDB 膈у宴с障鐚
>>
=item *
This module can map some emoji from one set to another. Different mobile phones
define different sets of emoji, so mapping each other is not always
possible. But since some emoji exist in two or more sets with similar
appearance, this module considers those emoji to be the same.
J<< ja;
違阪遣肢怨紕эゃ<若吾腟究絖娯紊純с鐚
>>
=item *
This module uses the mapping table for MS-CP932 instead of the standard
Shift_JIS. The Shift_JIS encoding used by MS-Windows (MS-SJIS/MS-CP932) slightly
differs from the standard.
J<< ja;
SJIS 鐚 MS-CP932 帥 Unicode 潟違茵障鐚
>>
=item *
When the module converts strings from Unicode to Shift_JIS, EUC-JP or
ISO-2022-JP, unicode letters which can't be represented in those encodings will
be encoded in "dddd;" form (decimal character reference). Note, however, that
letters in Unicode Private Use Area will be replaced with '?' mark ('QUESTION
MARK'; U+003F) instead of being encoded. In addition, encoding to character sets
for mobile phones makes every unrepresentable letters being '?' mark.
J<< ja;
Unicode -> SJIS鐚EUC-JP/JIS鐚 潟井鐚SJIS ц;憗с絖
dddd; 綵√紊障鐚 Unicode腱腟究絖
'?'障. 障, 阪遣肢怨紊, 鴻絲上絖'?'障.
>>
=item *
On perl-5.8.0 or later, this module handles the UTF-8 flag: the method utf8()
returns UTF-8 I string, and the method getu() returns UTF-8 I
string.
J<< ja;
Perl-5.8.0 篁ラ, utf8 違荐絎茵障.
utf-8 `ゃ' 緇 utf8() <純,
utf-8 `絖' 緇 getu() <純篏帥障.
>>
Currently the method get() returns UTF-8 I string but this behavior may be
changed in the future.
J<< ja;
get() <純憝鴻с utf-8 `ゃ' 菴障
(絨ョ紊眼醇с障).
>>
Methods like sjis(), jis(), utf8(), and such like return I string. new(),
set(), getcode() methods just ignore the UTF-8 flag of strings they take.
J<< ja;
sjis(), jis(), utf8(), etc.. <純сゃ菴障.
new, set, getcode <純ュ, utf8-flaged/bytes 障.
>>
=back
=head1 REQUIREMENT
J<< ja; 篏綽荀 >>
=over 4
=item *
perl 5.10.x, 5.8.x, etc. (5.004 and later)
J<< ja;
perl 5.10.x, 5.8.x, etc. (5.004 篁ラ).
>>
=item *
(optional)
C Compiler.
This module supports both XS and Pure Perl.
If you have no C Compilers,
Unicode::Japanese will be installed as Pure Perl module.
J<< ja;
(OK)
C 潟潟ゃ.
≪吾ャ若 XS Pure Perl 筝≧鴻絲上障.
C 潟潟ゃ, Unicode::Japanese
Pure Perl ≪吾ャ若ゃ潟鴻若障.
>>
=item *
(optional)
Test.pm and Test::More for testing.
J<< ja;
(OK)
鴻 Test.pm Test::More.
>>
=back
No other modules are required at run time.
J<< ja;
絎茵綽≪吾ャ若障.
>>
=head1 METHODS
=over 4
=item $s = Unicode::Japanese->new($str [, $icode [, $encode]])
Create a new instance of Unicode::Japanese.
J<< ja;
違 Unicode::Japanese ゃ潟鴻帥潟鴻絎障鐚
>>
Any given parameters will be internally passed to the method L().
J<< ja;
<若帥絎鐚L <純羝<障鐚
>>
=item $s = unijp($str [, $icode [, $encode]])
Same as Unicode::Jananese->new(...).
J<< ja;
Unicode::Janaese->new(...) 臂.
>>
=item $s->set($str [, $icode [, $encode]])
X
=over 2
=item $str: string
J<< ja; $str: 絖 >>
=item $icode: optional character encoding (default: 'utf8')
J<< ja; $icode: 絖潟若絎鐚ュ鐚ユ 'utf8' >>
=item $encode: optional binary encoding (default: no binary encodings are assumed)
J<< ja; $encode: ゃ膃垸劫鐚ュ鐚 >>
=back
Store a string into the instance.
J<< ja;
ゃ潟鴻帥潟鴻絖祉障鐚
絖潟若絎ャ UTF-8 荀障鐚
>>
Possible character encodings are:
J<< ja;
純絖潟若:
>>
auto
utf8 ucs2 ucs4
utf16-be utf16-le utf16
utf32-be utf32-le utf32
sjis cp932 euc euc-jp jis
sjis-imode sjis-imode1 sjis-imode2
utf8-imode utf8-imode1 utf8-imode2
sjis-doti sjis-doti1
sjis-jsky sjis-jsky1 sjis-jsky2
jis-jsky jis-jsky1 jis-jsky2
utf8-jsky utf8-jsky1 utf8-jsky2
sjis-au sjis-au1 sjis-au2
jis-au jis-au1 jis-au2
sjis-icon-au sjis-icon-au1 sjis-icon-au2
euc-icon-au euc-icon-au1 euc-icon-au2
jis-icon-au jis-icon-au1 jis-icon-au2
utf8-icon-au utf8-icon-au1 utf8-icon-au2
ascii binary
(see also L"SUPPORTED ENCODINGS">.)
J<< ja;
(L"SUPPORTED ENCODINGS J<< ja; 泣若潟潟若c潟 >> ">
.)
>>
If you want the Unicode::Japanese detect the character encoding of string, you
must explicitly specify 'auto' as the second argument. In that case, the given
string will be passed to the method getcode() to guess the encoding.
J<< ja;
絖潟若ゅャ翫鐚'auto' 絎障鐚
'auto' 絖潟若ゅャ鐚getcode() <純
茵障鐚
>>
For binary encodings, only 'base64' is currently supported. If you specify
'base64' as the third argument, the given string will be decoded using Base64
decoder.
J<< ja;
ゃ膃垸劫鐚'base64' 炊絎純с鐚
base64 絎翫鐚base64 潟若
Unicode::Japanese 鴻絖障鐚
>>
Specify 'binary' as the second argument if you want your string to be stored
without modification.
J<< ja;
羝<絖紊眼障丈主罨蚊翫鐚絖潟若
'binary' 絎障鐚
>>
When you specify 'sjis-imode' or 'sjis-doti' as the character encoding, any
occurences of 'dddd;' (decimal character reference) in the string will be
interpreted and decoded as code point of emoji, just like emoji implanted into
the string in binary form.
J<< ja;
sjis-imode鐚sjis-doti鐚翫鐚絖筝 dddd;
腟究絖紊障鐚
>>
Since encoded forms of strings in various encodings are not clearly distinctive
to each other, it is not always certainly possible to detect what encoding is
used for a given string.
J<< ja;
絖潟若c翫鐚
ゅャ腆阪с障鐚
>>
When a given string is possibly interpreted as both Shift_JIS and UTF-8 string,
this module considers such a string to be encoded in Shift_JIS. And if the
encoding is not distinguishable between 'sjis-au' and 'sjis-doti', this module
considers it 'sjis-au'.
J<< ja;
sjis, utf8 筝≧鴻茹iс絖翫鐚sjis鐚
sjis-au鐚sjis-doti 筝≧鴻茹iс絖翫鐚sjis-au鐚
菴障鐚
>>
=item $str = $s->get
=over 2
=item $str: string (UTF-8)
J<< ja; $str: 絖(UTF-8) >>
=back
Get the internal string in UTF-8.
J<< ja;
絖 UTF-8 潟若у冴障鐚
>>
This method currently returns a byte string (whose UTF-8 flag is turned off),
but this behavior may be changed in the future.
J<< ja;
憜 `ゃ' 菴障, 絨ョ紊眼醇с障.
>>
If you absolutely want a byte string, you should use the method utf8()
instead. And if you want a character string (whose UTF-8 flag is turned on), you
have to use the method getu().
J<< ja;
ゃ綽荀 utf8() <純,
絖綽荀 getu() <純篏帥鴻鴻<障.
>>
=item $str = $s->getu
=over 2
=item $str: string (UTF-8)
J<< ja; $str: 絖(UTF-8) >>
=back
Get the internal string in UTF-8.
J<< ja;
絖 UTF-8 潟若у冴障鐚
>>
On perl-5.8.0 or later, this method returns a character string with its UTF-8
flag turned on.
J<< ja;
Perl-5.8.0 篁ラ, utf-8 違ゃ utf-8 絖
菴障.
>>
=item $code = $s->getcode($str)
=over 2
=item $str: string
J<< ja; $str: 絖 >>
=item $code: name of character encoding
J<< ja; $code: 絖潟若茵絖 >>
=back
Detect the character encoding of given string.
J<< ja;
羝<絖(I<$str>)絖潟若ゅャ障鐚
>>
Note that this method, exceptionaly, doesn't deal with the internal string of an
instance.
J<< ja;
∽違с, 箴紊, ゃ潟鴻帥潟鴻篆
絖潟若ゅャс羈鐚
>>
To guess the encoding, the following algorithm is used:
J<< ja;
絖潟若ゅユ鐚篁ヤ≪眼冴ゅ茵障鐚
>>
(For pure perl implementation)
J<< ja;
(PurePerl)
>>
=over 4
=item 1
If the string has an UTF-32 BOM, its encoding is 'utf32'.
J<< ja;
UTF-32 BOM 逸utf32 ゅ障鐚
>>
=item 2
If it has an UTF-16 BOM, its encoding is 'utf16'.
J<< ja;
UTF-16 BOM 逸utf16 ゅ障鐚
>>
=item 3
If it is valid for UTF-32BE, its encoding is 'utf32-be'.
J<< ja;
UTF-32BE 罩c綵√鐚utf32-be ゅ障鐚
>>
=item 4
If it is valid for UTF-32LE, its encoding is 'utf32-le'.
J<< ja;
UTF-32LE 罩c綵√鐚utf32-le ゅ障鐚
>>
=item 5
If it contains no ESC characters or bytes whose eighth bit is on, its encoding
is 'ascii'. Every ASCII control characters (0x00-0x1F and 0x7F) except ESC
(0x1B) are considered to be in the range of 'ascii'.
J<< ja;
ESC 絖 障 8 腴c絖障逸ascii ゅ
ESC ゃ ASCII 九勝絖 (0x00-0x1F 0x7F) ascii 膀峨荀
>>
=item 6
If it contains escape sequences of ISO-2022-JP, its encoding is 'jis'.
J<< ja;
JIS鴻宴若激若宴潟鴻障逸jis ゅ障鐚
>>
=item 7
If it contains any emoji defined for J-PHONE, its encoding is 'sjis-jsky'.
J<< ja;
J-PHONE 腟究絖障逸sjis-jsky ゅャ障鐚
>>
=item 8
If it is valid for EUC-JP, its encoding is 'euc'.
J<< ja;
EUC-JP 潟若罩c綵√鐚euc ゅ障鐚
>>
=item 9
If it is valid for Shift_JIS, its encoding is 'sjis'.
J<< ja;
SJIS 潟若罩c綵√鐚sjis ゅ障鐚
>>
=item 10
If it contains any emoji defined for au, and everything else is valid for
Shift_JIS, its encoding is 'sjis-au'.
J<< ja;
SJIS 潟若 au 腟究絖罩c綵√鐚sjis-au ゅ障鐚
>>
=item 11
If it contains any emoji defined for i-mode, and everything else is valid for
Shift_JIS, its encoding is 'sjis-imode'.
J<< ja;
SJIS i-mode 腟究絖罩c綵√鐚sjis-imode ゅャ障鐚
>>
=item 12
If it contains any emoji defined for dot-i, and everything else is valid for
Shift_JIS, its encoding is 'sjis-doti'.
J<< ja;
SJIS dot-i 腟究絖罩c綵√鐚sjis-doti ゅャ障鐚
>>
=item 13
If it is valid for UTF-8, its encoding is 'utf8'.
J<< ja;
UTF-8 罩c綵√鐚utf8 ゅ障鐚
>>
=item 14
If no conditions above are fulfilled, its encoding is 'unknown'.
J<< ja;
綵障翫鐚unknown ゅ障鐚
>>
=back
(For XS implementation)
J<< ja;
(XS)
>>
=over 4
=item 1
If the string has an UTF-32 BOM, its encoding is 'utf32'.
J<< ja;
UTF-32 BOM 逸utf32 ゅ障鐚
>>
=item 2
If it has an UTF-16 BOM, its encoding is 'utf16'.
J<< ja;
UTF-16 BOM 逸utf16 ゅ障鐚
>>
=item 3
Find all possible encodings that might have been applied to the string from the
following:
J<< ja;
篁ヤ潟若ゃ, 罩c絖潟若с倶欠Щ茯帥鴻障.
>>
ascii / euc / sjis / jis / utf8 / utf32-be / utf32-le / sjis-jsky /
sjis-imode / sjis-au / sjis-doti
=item 4
If any encodings have been found possible, this module picks out one encoding
having the highest priority among them. The priority order is as follows:
J<< ja;
緇障фcc筝, 篁ヤэゃ, ゅ障.
>>
utf32-be / utf32-le / ascii / jis / euc / sjis / sjis-jsky / sjis-imode /
sjis-au / sjis-doti / utf8
=item 5
If no conditions above are fulfilled, its encoding is 'unknown'.
J<< ja;
綵障翫鐚unknown ゅ障鐚
>>
=back
Pay attention to the following pitfalls in the above algorithm:
J<< ja;
篁ヤ≪眼冴鐚篁ヤ鴻羈鐚
>>
=over 2
=item *
UTF-8 strings might be accidentally considered to be encoded in Shift_JIS.
J<< ja;
UTF-8 絖с鐚SJIS潟若荀醇с障鐚
>>
=item *
UCS-2 strings (sequence of raw UCS-2 letters in big-endian; each letters has
always 2 bytes) can't be detected because they look like nothing but sequences
of random bytes whose length is an even number.
J<< ja;
UCS2 ゅャс障鐚
>>
=item *
UTF-16 strings must have BOM to be detected.
J<< ja;
UTF-16 BOM 翫粋茯茘障鐚
>>
=item *
Emoji are only be recognized if they are implanted into the string in binary
form. If they are described in 'dddd;' form, they aren't considered to be
emoji.
J<< ja;
阪遣腟究絖鐚ゃх贋・腟究絖翫粋茘с障鐚
dddd; 綵√ц菴違翫鐚阪遣腟究絖ゅャ茵障鐚
>>
=back
Since the XS and pure perl implementations use different algorithms to guess
encoding, they may guess differently for the same string. Especially, the pure
perl implementation finds Shift_JIS strings containing ESC character (0x1B) to
be actually encoded in Shift_JIS but XS implementation doesn't. This is because
such strings can hardly be distinguished from 'sjis-jsky'. In addition, EUC-JP
strings containing ESC character are also rejected for the same reason.
J<< ja;
XSPurePerlс, ゅャ≪眼冴, 違腟醇с障.
鴻, 鴻宴若絖сsjis翫, PurePerlсsjis茯茘障
XSс茯茘障. sjis-jsky阪ャсс. 障,
篏茯よ茘蚊, euc-jp, 罕鴻宴若絖篁
c障.
>>
=item $code = $s->getcodelist($str)
=over 2
=item $str: string
J<< ja; $str: 絖 >>
=item $code: name of character encodings
J<< ja; $code: 絖潟若茵絖 >>
=back
Detect the character encoding of given string.
J<< ja;
羝<絖(I<$str>)絖潟若ゅャ障鐚
>>
Unlike the method getcode(), getcodelist() returns a list of possible encodings.
J<< ja;
getcode , 鴻純絖潟若
筝荀с菴障.
>>
=item $str = $s->conv($ocode, $encode)
=over 2
=item $ocode: character encoding (possible encodings are:)
J<< ja; $ocode: 阪潟若 (篁ヤ絎) >>
utf8 ucs2 ucs4 utf16
sjis cp932 euc euc-jp jis
sjis-imode sjis-imode1 sjis-imode2
utf8-imode utf8-imode1 utf8-imode2
sjis-doti sjis-doti1
sjis-jsky sjis-jsky1 sjis-jsky2
jis-jsky jis-jsky1 jis-jsky2
utf8-jsky utf8-jsky1 utf8-jsky2
sjis-au sjis-au1 sjis-au2
jis-au jis-au1 jis-au2
sjis-icon-au sjis-icon-au1 sjis-icon-au2
euc-icon-au euc-icon-au1 euc-icon-au2
jis-icon-au jis-icon-au1 jis-icon-au2
utf8-icon-au utf8-icon-au1 utf8-icon-au2
binary
(see also L"SUPPORTED ENCODINGS">.)
J<< ja;
(L"SUPPORTED ENCODINGS J<< ja; 泣若潟潟若c潟 >> ">
.)
>>
Some encodings for mobile phones have a trailing digit like 'sjis-au2'. Those
digits represent the version number of encodings. Such encodings have a variant
with no trailing digits, like 'sjis-au', which is the same as the latest version
among its variants.
J<< ja;
阪遣絖潟若¥絨障医ゃ鐚医紊с祉
紊с腟究絖祉鐚井腮腟究絖祉鐚茵障鐚
医鐚c医紊с絖潟若筝с鐚
>>
=item $encode: optional binary encoding
J<< ja; $encode: ゃ膃垸劫鐚ュ鐚 >>
=item $str: string
J<< ja; $str: 絖 >>
=back
Get the internal string of instance with encoding it using a given character
encoding method.
J<< ja;
絖絎絖潟若紊冴障鐚
>>
If you want the resulting string to be encoded in Base64, specify 'base64' as
the second argument.
J<< ja;
絖潟潟若鐚'base64' 炊絎純с鐚
base64 絎翫鐚base64 潟潟若
絖菴障鐚
>>
On perl-5.8.0 or later, the UTF-8 flag of resulting string is turned off even if
you specify 'utf8' to the first argument.
J<< ja;
perl-5.8.0 篁ラ, 阪 utf-8 違ゃ障.
>>
=item $s->tag2bin
Interpret decimal character references (dddd;) in the instance, and replaces
them with single characters they represent.
J<< ja;
絖筝障 dddd; 綵√絖鐚茵絖篏臀障鐚
>>
=item $s->z2h
Replace zenkaku (full-width) letters in the instance with hankaku (half-width)
letters.
J<< ja;
茹茹紊障鐚
>>
=item $s->h2z
Replace hankaku (half-width) letters in the instance with zenkaku (full-width)
letters.
J<< ja;
茹茹紊障鐚
>>
=item $s->hira2kata
Replace any hiragana in the instance with katakana.
J<< ja;
蚊帥紊障鐚
>>
=item $s->kata2hira
Replace any katakana in the instance with hiragana.
J<< ja;
帥蚊紊障鐚
>>
=item $str = $s->jis
$str: byte string in ISO-2022-JP
J<< ja;
$str: JIS 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in ISO-2022-JP.
J<< ja;
絖 JIS鐚ISO-2022-JP鐚 潟若у冴障鐚
>>
=item $str = $s->euc
$str: byte string in EUC-JP
J<< ja;
$str: euc-jp 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in EUC-JP.
J<< ja;
絖 EUC-JP 潟若у冴障鐚
>>
=item $str = $s->utf8
$str: byte string in UTF-8
J<< ja;
$str: utf-8 潟潟若c潟医就綣ゃ
>>
Get the internal UTF-8 string of instance.
J<< ja;
絖 UTF-8 潟若у冴障鐚
>>
On perl-5.8.0 or later, the UTF-8 flag of resulting string is turned off.
J<< ja;
perl-5.8.0 篁ラ, ゃ菴障.
>>
=item $str = $s->ucs2
$str: byte string in UCS-2
J<< ja;
$str: ucs2 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance as a sequence of raw UCS-2 letters in
big-endian. Note that this is different from UTF-16BE as raw UCS-2 sequence has
no concept of surrogate pair.
J<< ja;
絖 UCS2 潟若у冴障鐚
>>
=item $str = $s->ucs4
$str: byte string in UCS-4
J<< ja;
$str: ucs4 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance as a sequence of raw UCS-4 letters in
big-endian. This is practically the same as UTF-32BE.
J<< ja;
絖 UCS4 潟若у冴障鐚
>>
=item $str = $s->utf16
$str: byte string in UTF-16
J<< ja;
$str: ucs-16 潟潟若c潟医就綣ゃ
>>
Get the insternal string of instance with encoding it in UTF-16 in big-endian
with no BOM prepended.
J<< ja;
絖 UTF-16 潟若у冴障鐚
BOM篁障鐚
潟c≪喝就綣ц障鐚
>>
=item $str = $s->sjis
$str: byte string in Shift_JIS
J<< ja;
$str: sjis 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in Shift_JIS (MS-SJIS /
MS-CP932).
J<< ja;
絖 SJIS鐚MS-CP932鐚 潟若у冴障鐚
>>
=item $str = $s->sjis_imode
$str: byte string in 'sjis-imode'
J<< ja;
$str: sjis/imode腟究絖 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in 'sjis-imode'.
J<< ja;
絖 i-mode 腴 SJIS 潟若у冴障鐚
違imode腟究絖ュс.
>>
=item $str = $s->sjis_imode1
$str: byte string in 'sjis-imode1'
J<< ja;
$str: sjis/imode 腟究絖 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in 'sjis-imode1'.
J<< ja;
絖 i-mode 腴 SJIS 潟若у冴障鐚
堺腟究絖障.
>>
=item $str = $s->sjis_imode2
$str: byte string in 'sjis-imode2'
J<< ja;
$str: sjis/imode 腟究絖 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in 'sjis-imode2'.
J<< ja;
絖 i-mode 腴 SJIS 潟若у冴障鐚
堺腟究絖, ≦宍腟究絖帥障.
>>
=item $str = $s->sjis_doti
$str: byte string in 'sjis-doti'
J<< ja;
$str: sjis/dot-i 腟究絖 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in 'sjis-doti'.
J<< ja;
絖 dot-i 腴 SJIS 潟若у冴障鐚
>>
=item $str = $s->sjis_jsky
$str: byte string in 'sjis-jsky'
J<< ja;
$str: sjis/j-sky 腟究絖 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in 'sjis-jsky'.
J<< ja;
絖 j-sky 腴 SJIS 潟若у冴障鐚
違j-sky腟究絖(VERSION 0.15 с, jsky2)ュс.
>>
=item $str = $s->sjis_jsky1
$str: byte string in 'sjis-jsky1'
J<< ja;
$str: sjis/j-sky 腟究絖 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in 'sjis-jsky1'.
J<< ja;
絖 j-sky 腴 SJIS 潟若у冴障鐚
Page 1鐔3 帥腟究絖帥障.
>>
=item $str = $s->sjis_jsky
$str: byte string in 'sjis-jsky'
J<< ja;
$str: sjis/j-sky 腟究絖 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in 'sjis-jsky'.
J<< ja;
絖 j-sky 腴 SJIS 潟若у冴障鐚
Page 1鐔6 腟究絖帥障.
>>
=item $str = $s->sjis_icon_au
$str: byte string in 'sjis-icon-au'
J<< ja;
$str: sjis/AU icon帥 潟潟若c潟医就綣ゃ
>>
Get the internal string of instance with encoding it in 'sjis-icon-au'.
J<< ja;
絖 AU 腴 SJIS 潟若у冴障鐚
>>
=item $str_arrayref = $s->strcut($len)
=over 2
=item $len: maximum length of each chunks (in number of
full-width characters)
J<< ja; $len: 蚊絖(茹後) >>
=item $str_arrayref: reference to array of strings
J<< ja; $str_arrayref: 絖 >>
=back
Split the internal string of instance into chunks of a given length.
J<< ja;
I<$len>ф絎絖(茹)篁ヤ絖蚊障鐚
>>
On perl-5.8.0 or later, UTF-8 flags of each chunks are turned on.
J<< ja;
荀膣, utf-8 違cutf-8絖с.
>>
=item $len = $s->strlen
$len: character width of the internal string
J<< ja;
$len: 絖茵腓阪
>>
Calculate the character width of the internal string. Half-width characters have
width of one unit, and full-width characters have width of two units.
J<< ja;
UTF-8 絖絲障 length() 篏帥茹絖鐚絖激 3 c障障鐚
<純篏睡鐚緇ャ SJIS 鐚茹絖鐚絖激 2 菴障鐚
>>
=item $s->join_csv(@values);
@values: array of strings
J<< ja;
@values: 若翠
>>
Build a line of CSV from the arguments, and store it into the instance. The
resulting line has a trailing line break ("\n").
J<< ja;
CSV 絖紊鐚ゃ潟鴻帥潟鴻主障鐚
絖緇壕("\n")菴遵障鐚
>>
=item @values = $s->split_csv;
@values: array of strings
J<< ja;
@values: 若翠
>>
Parse a line of CSV in the instance and return each columns. The line will be
chomp()ed before getting parsed.
J<< ja;
ゃ潟鴻帥潟鴻主絖 CSV 荀鐚蚊障鐚
絖緇壕("\n")ゃ蚊障鐚
>>
If the internal string was decoded from 'binary' encoding (see methods new() and
set()), the UTF-8 flags of the resulting array of strings are turned
off. Otherwise the flags are turned on.
J<< ja;
ュ binary с utf-8 絖菴障.
binary cゃ菴障.
>>
=back
=head1 SUPPORTED ENCODINGS
J<< ja; 泣若潟潟若c潟 >>
+---------------+----+-----+-------+
|encoding | in | out | guess |
+---------------+----+-----+-------+
|auto : OK : -- | ----- |
+---------------+----+-----+-------+
|utf8 : OK : OK | OK |
|ucs2 : OK : OK | ----- |
|ucs4 : OK : OK | ----- |
|utf16-be : OK : -- | ----- |
|utf16-le : OK : -- | ----- |
|utf16 : OK : OK | OK(#) |
|utf32-be : OK : -- | OK |
|utf32-le : OK : -- | OK |
|utf32 : OK : -- | OK(#) |
+---------------+----+-----+-------+
|sjis : OK : OK | OK |
|cp932 : OK : OK | ----- |
|euc : OK : OK | OK |
|euc-jp : OK : OK | ----- |
|jis : OK : OK | OK |
+---------------+----+-----+-------+
|sjis-imode : OK : OK | OK |
|sjis-imode1 : OK : OK | ----- |
|sjis-imode2 : OK : OK | ----- |
|utf8-imode : OK : OK | ----- |
|utf8-imode1 : OK : OK | ----- |
|utf8-imode2 : OK : OK | ----- |
+---------------+----+-----+-------+
|sjis-doti : OK : OK | OK |
|sjis-doti1 : OK : OK | ----- |
+---------------+----+-----+-------+
|sjis-jsky : OK : OK | OK |
|sjis-jsky1 : OK : OK | ----- |
|sjis-jsky2 : OK : OK | ----- |
|jis-jsky : OK : OK | ----- |
|jis-jsky1 : OK : OK | ----- |
|jis-jsky2 : OK : OK | ----- |
|utf8-jsky : OK : OK | ----- |
|utf8-jsky1 : OK : OK | ----- |
|utf8-jsky2 : OK : OK | ----- |
+---------------+----+-----+-------+
|sjis-au : OK : OK | OK |
|sjis-au1 : OK : OK | ----- |
|sjis-au2 : OK : OK | ----- |
|jis-au : OK : OK | ----- |
|jis-au1 : OK : OK | ----- |
|jis-au2 : OK : OK | ----- |
|sjis-icon-au : OK : OK | ----- |
|sjis-icon-au1 : OK : OK | ----- |
|sjis-icon-au2 : OK : OK | ----- |
|euc-icon-au : OK : OK | ----- |
|euc-icon-au1 : OK : OK | ----- |
|euc-icon-au2 : OK : OK | ----- |
|jis-icon-au : OK : OK | ----- |
|jis-icon-au1 : OK : OK | ----- |
|jis-icon-au2 : OK : OK | ----- |
|utf8-icon-au : OK : OK | ----- |
|utf8-icon-au1 : OK : OK | ----- |
|utf8-icon-au2 : OK : OK | ----- |
+---------------+----+-----+-------+
|ascii : OK : -- | OK |
|binary : OK : OK | ----- |
+---------------+----+-----+-------+
(#): guessed when it has bom.
=head2 GUESSING ORDER
J<< ja; 茯茘篏 >>
1. utf32 (#)
2. utf16 (#)
3. utf32-be
4. utf32-le
5. ascii
6. jis
7. sjis-jsky (pp)
8. euc
9. sjis
10. sjis-jsky (xs)
11. sjis-au
12. sjis-imode
13. sjis-doti
14. utf8
15. unknown
=head1 DESCRIPTION OF UNICODE MAPPING
Transcoding between Unicode encodings and other ones is performed as below:
J<< ja;
Unicode 潟違篁ヤ茵障鐚
>>
=over 2
=item Shift_JIS
This module uses the mapping table of MS-CP932.
J<< ja;
MS-CP932 Unicode 吾潟違茵障鐚
潟違若篁ヤURL篏睡障鐚
>>
L<< ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT >>
When the module tries to convert Unicode string to Shift_JIS, it represents most
letters which isn't available in Shift_JIS as decimal character reference
('dddd;'). There is one exception to this: every graphic characters for mobile
phones are replaced with '?' mark.
J<< ja;
Unicode SJIS 吾潟違翫鐚茵憗с絖鐚
絖 dddd; 綵√紊障鐚
鐚阪遣腟究絖?紊障鐚
>>
For variants of Shift_JIS defined for mobile phones, every unrepresentable
characters are replaced with '?' mark unlike the plain Shift_JIS.
J<< ja;
障鐚阪遣 SJIS 後鐚茵憗с絖?紊障鐚
>>
=item EUC-JP/ISO-2022-JP
This module doesn't directly convert Unicode string from/to EUC-JP or
ISO-2022-JP: it once converts from/to Shift_JIS and then do the rest
translation. So characters which aren't available in the Shift_JIS can not be
properly translated.
J<< ja;
筝綺SJIS潟若紊鐚Unicode 吾潟違障鐚
鐚SJIS ц;憗с絖障翫鐚
絖罩c潟違с障鐚
>>
=item DoCoMo i-mode
This module maps emoji in the range of F800 - F9FF to U+0FF800 - U+0FF9FF.
J<< ja;
F800 - F9FF ∞亀絖絖鐚U+0FF800 - U+0FF9FF
潟違障鐚
>>
=item ASTEL dot-i
This module maps emoji in the range of F000 - F4FF to U+0FF000 - U+0FF4FF.
J<< ja;
F000 - F4FF ∞亀絖絖鐚U+0FF000 - U+0FF4FF
潟違障鐚
>>
=item J-PHONE J-SKY
The encoding method defined by J-SKY is as follows: first an escape sequence
"\e\$" comes to indicate the beginning of emoji, then the first byte of an emoji
comes next, then the second bytes of at least one emoji comes next, then "\x0f"
comes last to indicate the end of emoji. If a string contains a series of emoji
whose first bytes are identical, such sequence can be compressed by cascading
second bytes of them to the single first byte.
J<< ja;
J-SKY 腟究絖鐚鴻宴若激若宴潟 "\e\$" 緇鐚腟究絖1ゃ鐚
1や札筝腟究絖2ゃ鐚"\x0f"鐚膓障鐚
1ゃ腟究絖膓翫鐚2ゃ帥g吾
х軒с障鐚
>>
This module considers a pair of those first and second bytes to be one letter,
and map them from 4500 - 47FF to U+0FFB00 - U+0FFDFF.
J<< ja;
1ゃ2ゃ≪1絖荀鐚4500 - 47FF 鐚
U+0FFB00 - U+0FFDFF 潟違障鐚
>>
When the module encodes J-SKY emoji, it performs the compression automatically.
J<< ja;
Unicode::Japanese с鐚Unicode J-SKY 腟究絖潟違鐚
1ゃ筝с腟究絖g翫鐚х軒茵障鐚
>>
=item AU
This module maps AU emoji to U+0FF500 - U+0FF6FF.
J<< ja;
腟究絖絖鐚U+0FF500 - U+0FF6FF 潟違障鐚
>>
=back
=head1 PurePerl mode
use Unicode::Japanese qw(PurePerl);
If you want to explicitly take the pure perl implementation, pass
C<'PurePerl'> to the argument of the C().
=item $s = unijp($str [, $icode [, $encode]])
Same as Unicode::Jananese->new(...).
=item $s->set($str [, $icode [, $encode]])
X
=over 2
=item $str: string
=item $icode: optional character encoding (default: 'utf8')
=item $encode: optional binary encoding (default: no binary encodings are assumed)
=back
Store a string into the instance.
Possible character encodings are:
auto
utf8 ucs2 ucs4
utf16-be utf16-le utf16
utf32-be utf32-le utf32
sjis cp932 euc euc-jp jis
sjis-imode sjis-imode1 sjis-imode2
utf8-imode utf8-imode1 utf8-imode2
sjis-doti sjis-doti1
sjis-jsky sjis-jsky1 sjis-jsky2
jis-jsky jis-jsky1 jis-jsky2
utf8-jsky utf8-jsky1 utf8-jsky2
sjis-au sjis-au1 sjis-au2
jis-au jis-au1 jis-au2
sjis-icon-au sjis-icon-au1 sjis-icon-au2
euc-icon-au euc-icon-au1 euc-icon-au2
jis-icon-au jis-icon-au1 jis-icon-au2
utf8-icon-au utf8-icon-au1 utf8-icon-au2
ascii binary
(see also L.)
If you want the Unicode::Japanese detect the character encoding of string, you
must explicitly specify 'auto' as the second argument. In that case, the given
string will be passed to the method getcode() to guess the encoding.
For binary encodings, only 'base64' is currently supported. If you specify
'base64' as the third argument, the given string will be decoded using Base64
decoder.
Specify 'binary' as the second argument if you want your string to be stored
without modification.
When you specify 'sjis-imode' or 'sjis-doti' as the character encoding, any
occurences of 'dddd;' (decimal character reference) in the string will be
interpreted and decoded as code point of emoji, just like emoji implanted into
the string in binary form.
Since encoded forms of strings in various encodings are not clearly distinctive
to each other, it is not always certainly possible to detect what encoding is
used for a given string.
When a given string is possibly interpreted as both Shift_JIS and UTF-8 string,
this module considers such a string to be encoded in Shift_JIS. And if the
encoding is not distinguishable between 'sjis-au' and 'sjis-doti', this module
considers it 'sjis-au'.
=item $str = $s->get
=over 2
=item $str: string (UTF-8)
=back
Get the internal string in UTF-8.
This method currently returns a byte string (whose UTF-8 flag is turned off),
but this behavior may be changed in the future.
If you absolutely want a byte string, you should use the method utf8()
instead. And if you want a character string (whose UTF-8 flag is turned on), you
have to use the method getu().
=item $str = $s->getu
=over 2
=item $str: string (UTF-8)
=back
Get the internal string in UTF-8.
On perl-5.8.0 or later, this method returns a character string with its UTF-8
flag turned on.
=item $code = $s->getcode($str)
=over 2
=item $str: string
=item $code: name of character encoding
=back
Detect the character encoding of given string.
Note that this method, exceptionaly, doesn't deal with the internal string of an
instance.
To guess the encoding, the following algorithm is used:
(For pure perl implementation)
=over 4
=item 1
If the string has an UTF-32 BOM, its encoding is 'utf32'.
=item 2
If it has an UTF-16 BOM, its encoding is 'utf16'.
=item 3
If it is valid for UTF-32BE, its encoding is 'utf32-be'.
=item 4
If it is valid for UTF-32LE, its encoding is 'utf32-le'.
=item 5
If it contains no ESC characters or bytes whose eighth bit is on, its encoding
is 'ascii'. Every ASCII control characters (0x00-0x1F and 0x7F) except ESC
(0x1B) are considered to be in the range of 'ascii'.
=item 6
If it contains escape sequences of ISO-2022-JP, its encoding is 'jis'.
=item 7
If it contains any emoji defined for J-PHONE, its encoding is 'sjis-jsky'.
=item 8
If it is valid for EUC-JP, its encoding is 'euc'.
=item 9
If it is valid for Shift_JIS, its encoding is 'sjis'.
=item 10
If it contains any emoji defined for au, and everything else is valid for
Shift_JIS, its encoding is 'sjis-au'.
=item 11
If it contains any emoji defined for i-mode, and everything else is valid for
Shift_JIS, its encoding is 'sjis-imode'.
=item 12
If it contains any emoji defined for dot-i, and everything else is valid for
Shift_JIS, its encoding is 'sjis-doti'.
=item 13
If it is valid for UTF-8, its encoding is 'utf8'.
=item 14
If no conditions above are fulfilled, its encoding is 'unknown'.
=back
(For XS implementation)
=over 4
=item 1
If the string has an UTF-32 BOM, its encoding is 'utf32'.
=item 2
If it has an UTF-16 BOM, its encoding is 'utf16'.
=item 3
Find all possible encodings that might have been applied to the string from the
following:
ascii / euc / sjis / jis / utf8 / utf32-be / utf32-le / sjis-jsky /
sjis-imode / sjis-au / sjis-doti
=item 4
If any encodings have been found possible, this module picks out one encoding
having the highest priority among them. The priority order is as follows:
utf32-be / utf32-le / ascii / jis / euc / sjis / sjis-jsky / sjis-imode /
sjis-au / sjis-doti / utf8
=item 5
If no conditions above are fulfilled, its encoding is 'unknown'.
=back
Pay attention to the following pitfalls in the above algorithm:
=over 2
=item *
UTF-8 strings might be accidentally considered to be encoded in Shift_JIS.
=item *
UCS-2 strings (sequence of raw UCS-2 letters in big-endian; each letters has
always 2 bytes) can't be detected because they look like nothing but sequences
of random bytes whose length is an even number.
=item *
UTF-16 strings must have BOM to be detected.
=item *
Emoji are only be recognized if they are implanted into the string in binary
form. If they are described in 'dddd;' form, they aren't considered to be
emoji.
=back
Since the XS and pure perl implementations use different algorithms to guess
encoding, they may guess differently for the same string. Especially, the pure
perl implementation finds Shift_JIS strings containing ESC character (0x1B) to
be actually encoded in Shift_JIS but XS implementation doesn't. This is because
such strings can hardly be distinguished from 'sjis-jsky'. In addition, EUC-JP
strings containing ESC character are also rejected for the same reason.
=item $code = $s->getcodelist($str)
=over 2
=item $str: string
=item $code: name of character encodings
=back
Detect the character encoding of given string.
Unlike the method getcode(), getcodelist() returns a list of possible encodings.
=item $str = $s->conv($ocode, $encode)
=over 2
=item $ocode: character encoding (possible encodings are:)
utf8 ucs2 ucs4 utf16
sjis cp932 euc euc-jp jis
sjis-imode sjis-imode1 sjis-imode2
utf8-imode utf8-imode1 utf8-imode2
sjis-doti sjis-doti1
sjis-jsky sjis-jsky1 sjis-jsky2
jis-jsky jis-jsky1 jis-jsky2
utf8-jsky utf8-jsky1 utf8-jsky2
sjis-au sjis-au1 sjis-au2
jis-au jis-au1 jis-au2
sjis-icon-au sjis-icon-au1 sjis-icon-au2
euc-icon-au euc-icon-au1 euc-icon-au2
jis-icon-au jis-icon-au1 jis-icon-au2
utf8-icon-au utf8-icon-au1 utf8-icon-au2
binary
(see also L.)
Some encodings for mobile phones have a trailing digit like 'sjis-au2'. Those
digits represent the version number of encodings. Such encodings have a variant
with no trailing digits, like 'sjis-au', which is the same as the latest version
among its variants.
=item $encode: optional binary encoding
=item $str: string
=back
Get the internal string of instance with encoding it using a given character
encoding method.
If you want the resulting string to be encoded in Base64, specify 'base64' as
the second argument.
On perl-5.8.0 or later, the UTF-8 flag of resulting string is turned off even if
you specify 'utf8' to the first argument.
=item $s->tag2bin
Interpret decimal character references (dddd;) in the instance, and replaces
them with single characters they represent.
=item $s->z2h
Replace zenkaku (full-width) letters in the instance with hankaku (half-width)
letters.
=item $s->h2z
Replace hankaku (half-width) letters in the instance with zenkaku (full-width)
letters.
=item $s->hira2kata
Replace any hiragana in the instance with katakana.
=item $s->kata2hira
Replace any katakana in the instance with hiragana.
=item $str = $s->jis
$str: byte string in ISO-2022-JP
Get the internal string of instance with encoding it in ISO-2022-JP.
=item $str = $s->euc
$str: byte string in EUC-JP
Get the internal string of instance with encoding it in EUC-JP.
=item $str = $s->utf8
$str: byte string in UTF-8
Get the internal UTF-8 string of instance.
On perl-5.8.0 or later, the UTF-8 flag of resulting string is turned off.
=item $str = $s->ucs2
$str: byte string in UCS-2
Get the internal string of instance as a sequence of raw UCS-2 letters in
big-endian. Note that this is different from UTF-16BE as raw UCS-2 sequence has
no concept of surrogate pair.
=item $str = $s->ucs4
$str: byte string in UCS-4
Get the internal string of instance as a sequence of raw UCS-4 letters in
big-endian. This is practically the same as UTF-32BE.
=item $str = $s->utf16
$str: byte string in UTF-16
Get the insternal string of instance with encoding it in UTF-16 in big-endian
with no BOM prepended.
=item $str = $s->sjis
$str: byte string in Shift_JIS
Get the internal string of instance with encoding it in Shift_JIS (MS-SJIS /
MS-CP932).
=item $str = $s->sjis_imode
$str: byte string in 'sjis-imode'
Get the internal string of instance with encoding it in 'sjis-imode'.
=item $str = $s->sjis_imode1
$str: byte string in 'sjis-imode1'
Get the internal string of instance with encoding it in 'sjis-imode1'.
=item $str = $s->sjis_imode2
$str: byte string in 'sjis-imode2'
Get the internal string of instance with encoding it in 'sjis-imode2'.
=item $str = $s->sjis_doti
$str: byte string in 'sjis-doti'
Get the internal string of instance with encoding it in 'sjis-doti'.
=item $str = $s->sjis_jsky
$str: byte string in 'sjis-jsky'
Get the internal string of instance with encoding it in 'sjis-jsky'.
=item $str = $s->sjis_jsky1
$str: byte string in 'sjis-jsky1'
Get the internal string of instance with encoding it in 'sjis-jsky1'.
=item $str = $s->sjis_jsky
$str: byte string in 'sjis-jsky'
Get the internal string of instance with encoding it in 'sjis-jsky'.
=item $str = $s->sjis_icon_au
$str: byte string in 'sjis-icon-au'
Get the internal string of instance with encoding it in 'sjis-icon-au'.
=item $str_arrayref = $s->strcut($len)
=over 2
=item $len: maximum length of each chunks (in number of
full-width characters)
=item $str_arrayref: reference to array of strings
=back
Split the internal string of instance into chunks of a given length.
On perl-5.8.0 or later, UTF-8 flags of each chunks are turned on.
=item $len = $s->strlen
$len: character width of the internal string
Calculate the character width of the internal string. Half-width characters have
width of one unit, and full-width characters have width of two units.
=item $s->join_csv(@values);
@values: array of strings
Build a line of CSV from the arguments, and store it into the instance. The
resulting line has a trailing line break ("\n").
=item @values = $s->split_csv;
@values: array of strings
Parse a line of CSV in the instance and return each columns. The line will be
chomp()ed before getting parsed.
If the internal string was decoded from 'binary' encoding (see methods new() and
set()), the UTF-8 flags of the resulting array of strings are turned
off. Otherwise the flags are turned on.
=back
=head1 SUPPORTED ENCODINGS
+---------------+----+-----+-------+
|encoding | in | out | guess |
+---------------+----+-----+-------+
|auto : OK : -- | ----- |
+---------------+----+-----+-------+
|utf8 : OK : OK | OK |
|ucs2 : OK : OK | ----- |
|ucs4 : OK : OK | ----- |
|utf16-be : OK : -- | ----- |
|utf16-le : OK : -- | ----- |
|utf16 : OK : OK | OK(#) |
|utf32-be : OK : -- | OK |
|utf32-le : OK : -- | OK |
|utf32 : OK : -- | OK(#) |
+---------------+----+-----+-------+
|sjis : OK : OK | OK |
|cp932 : OK : OK | ----- |
|euc : OK : OK | OK |
|euc-jp : OK : OK | ----- |
|jis : OK : OK | OK |
+---------------+----+-----+-------+
|sjis-imode : OK : OK | OK |
|sjis-imode1 : OK : OK | ----- |
|sjis-imode2 : OK : OK | ----- |
|utf8-imode : OK : OK | ----- |
|utf8-imode1 : OK : OK | ----- |
|utf8-imode2 : OK : OK | ----- |
+---------------+----+-----+-------+
|sjis-doti : OK : OK | OK |
|sjis-doti1 : OK : OK | ----- |
+---------------+----+-----+-------+
|sjis-jsky : OK : OK | OK |
|sjis-jsky1 : OK : OK | ----- |
|sjis-jsky2 : OK : OK | ----- |
|jis-jsky : OK : OK | ----- |
|jis-jsky1 : OK : OK | ----- |
|jis-jsky2 : OK : OK | ----- |
|utf8-jsky : OK : OK | ----- |
|utf8-jsky1 : OK : OK | ----- |
|utf8-jsky2 : OK : OK | ----- |
+---------------+----+-----+-------+
|sjis-au : OK : OK | OK |
|sjis-au1 : OK : OK | ----- |
|sjis-au2 : OK : OK | ----- |
|jis-au : OK : OK | ----- |
|jis-au1 : OK : OK | ----- |
|jis-au2 : OK : OK | ----- |
|sjis-icon-au : OK : OK | ----- |
|sjis-icon-au1 : OK : OK | ----- |
|sjis-icon-au2 : OK : OK | ----- |
|euc-icon-au : OK : OK | ----- |
|euc-icon-au1 : OK : OK | ----- |
|euc-icon-au2 : OK : OK | ----- |
|jis-icon-au : OK : OK | ----- |
|jis-icon-au1 : OK : OK | ----- |
|jis-icon-au2 : OK : OK | ----- |
|utf8-icon-au : OK : OK | ----- |
|utf8-icon-au1 : OK : OK | ----- |
|utf8-icon-au2 : OK : OK | ----- |
+---------------+----+-----+-------+
|ascii : OK : -- | OK |
|binary : OK : OK | ----- |
+---------------+----+-----+-------+
(#): guessed when it has bom.
=head2 GUESSING ORDER
1. utf32 (#)
2. utf16 (#)
3. utf32-be
4. utf32-le
5. ascii
6. jis
7. sjis-jsky (pp)
8. euc
9. sjis
10. sjis-jsky (xs)
11. sjis-au
12. sjis-imode
13. sjis-doti
14. utf8
15. unknown
=head1 DESCRIPTION OF UNICODE MAPPING
Transcoding between Unicode encodings and other ones is performed as below:
=over 2
=item Shift_JIS
This module uses the mapping table of MS-CP932.
L<< ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT >>
When the module tries to convert Unicode string to Shift_JIS, it represents most
letters which isn't available in Shift_JIS as decimal character reference
('dddd;'). There is one exception to this: every graphic characters for mobile
phones are replaced with '?' mark.
For variants of Shift_JIS defined for mobile phones, every unrepresentable
characters are replaced with '?' mark unlike the plain Shift_JIS.
=item EUC-JP/ISO-2022-JP
This module doesn't directly convert Unicode string from/to EUC-JP or
ISO-2022-JP: it once converts from/to Shift_JIS and then do the rest
translation. So characters which aren't available in the Shift_JIS can not be
properly translated.
=item DoCoMo i-mode
This module maps emoji in the range of F800 - F9FF to U+0FF800 - U+0FF9FF.
=item ASTEL dot-i
This module maps emoji in the range of F000 - F4FF to U+0FF000 - U+0FF4FF.
=item J-PHONE J-SKY
The encoding method defined by J-SKY is as follows: first an escape sequence
"\e\$" comes to indicate the beginning of emoji, then the first byte of an emoji
comes next, then the second bytes of at least one emoji comes next, then "\x0f"
comes last to indicate the end of emoji. If a string contains a series of emoji
whose first bytes are identical, such sequence can be compressed by cascading
second bytes of them to the single first byte.
This module considers a pair of those first and second bytes to be one letter,
and map them from 4500 - 47FF to U+0FFB00 - U+0FFDFF.
When the module encodes J-SKY emoji, it performs the compression automatically.
=item AU
This module maps AU emoji to U+0FF500 - U+0FF6FF.
=back
=head1 PurePerl mode
use Unicode::Japanese qw(PurePerl);
If you want to explicitly take the pure perl implementation, pass
C<'PurePerl'> to the argument of the C