XML-TMX-0.31/000755 000765 000024 00000000000 12645001536 012632 5ustar00ambsstaff000000 000000 XML-TMX-0.31/Changes000644 000765 000024 00000006731 12645001531 014127 0ustar00ambsstaff000000 000000 Revision history for Perl extension XML::TMX. 0.31 Mon Jan 11 19:32:33 WET 2016 - Fixed utf8 POD. 0.30 Mon Jan 11 19:07:37 WET 2016 - Added tsv2tmx script. 0.29 Sat Oct 10 20:04:11 WEST 2015 - Consistent version numbers 0.28 Thu Sep 17 14:24:20 WEST 2015 - Fixed TMX spec URL (thanks to Alexander Becker) 0.27 Mon Aug 31 18:14:12 WEST 2015 - Fixed tmx-POStagger. 0.26 Tue Apr 28 19:08:28 WEST 2015 - Fixed bug when processing CDATA elements. - Added -raw option to for_tu - Reworked TMX annotated format. - Various improvements in tmx-POStagger. 0.25 Tue Jul 30 16:06:58 WEST 2013 - tmx2tmx -cat : fix a bug related with a missing ":utf8" - tmxclean -len: remove tu if len > 50 and len-min * 2 < len-max - tmxclean -eq: remove tu if equal ( seg(l1)=seg(l2) ) - output formatting of the TMX slightly changed (tu = lines/5) 0.24 Thu Nov 29 11:51:03 WET 2012 - Require Perl v5.10 0.23 Nov 24, 2012 - sort languages when writing (good for tests) 0.22 Jun 6, 2012 - more UTF-8 pod stuff 0.21 Jun 5, 2012 - properly encode UTF-8 Scripts - tmx2tmx -cat preserves props and notes from all documents - add POD to some scripts - added tmx-tokenize - Moved some scripts to example folder Reader - save info about CDATA sections - add -raw option on for_tu2 - for_tu2 renamed as for_tu - reads header correctly (props and notes) Writer - write correctly CDATA sections - better output - handle header props and notes - more testing 0.20 Jan 31, 2012 - add 'verbose' option to for_tu family of methods 0.19 Jan 27, 2012 - fix 'ignore_markup' in Reader.pm (thanks to Achim Ruopp) 0.18 Feb 09, 2011 - tmxwc working for multiple files; - Fixed tmxuniq to work without complaining on unicode; 0.17 Dec 22, 2008 - Added tmxwc and tmxclean scripts. 0.16 Fev 5, 2007 - refactored for_tu2 function (it used $&, $` and $'). 0.15 Dec 9, 2006 - new for_tu2 function that will soon replace for_tu more efficient for big files - languages function just checks the first few 100 TUs 0.14 Jun 7, 2006 - in XML::TMX::Writer, if we know the encoding, use binmode to force it. 0.13 Jul 20, 2005 - for_tu now handles direct output for another TMX file. - removed some newlined in the output. 0.12 Jul 17, 2005 - added some tests to XML::TMX::Writer; - changed test names to be tested in order; 0.11 Jul 12, 2005 - corrected stupid bug lying around since 0.05 0.10 Sep 30, 2004 0.09 - problem with upload to CPAN; 0.09 - problem with upload to CPAN; - added pod and pod-coverage tests; - added documentation; 0.07 May 18, 2004 - option to remove sub-tags is now global and not specific for the for_tu command; 0.06 Apr 22, 2004 - corrected bug on test files - tmx2tmx is installed - tmx2tmx conversion to TRADOS TMX1.1 format - tmx2tmx basic TMX cleaner 0.05 Jan 07, 2004 - removed dependency with XML::Writer; - option to remove sub-tags when processing s; 0.04 Nov 11, 2003 - use xml:lang instead of lang if the first exists 0.03 Oct 12, 2003 - Many bugs were found. Here is a working version (we hope) 0.02 Oct 09, 2003 - Corrected warnings with perl 5.8.1 0.01 Sep 14, 2003 - Created XML::TMX::{Reader|Query} XML-TMX-0.31/examples/000755 000765 000024 00000000000 12645001535 014447 5ustar00ambsstaff000000 000000 XML-TMX-0.31/lib/000755 000765 000024 00000000000 12645001535 013377 5ustar00ambsstaff000000 000000 XML-TMX-0.31/Makefile.PL000644 000765 000024 00000002060 12644775453 014621 0ustar00ambsstaff000000 000000 use ExtUtils::MakeMaker; use 5.010; WriteMakefile( 'NAME' => 'XML::TMX', 'VERSION_FROM' => 'lib/XML/TMX.pm', 'AUTHOR' => 'Projecto Natura', 'ABSTRACT' => 'TMX tools', 'EXE_FILES' => [ qw!scripts/tmxsplit scripts/tmxuniq scripts/tmx-POStagger scripts/tmx-tokenize scripts/tmxclean scripts/tmxwc scripts/tmx2html scripts/tmxgrep scripts/tsv2tmx scripts/tmx2tmx!], 'PREREQ_PM' => { 'Test::Pod' => '1.00', 'Test::Pod::Coverage' => '0.08', 'XML::DT' => '0.57', 'Exporter' => '0', 'Pod::Usage' => 0, 'Getopt::Long' => 0, }, ); XML-TMX-0.31/MANIFEST000644 000765 000024 00000001257 12645001537 013771 0ustar00ambsstaff000000 000000 Makefile.PL MANIFEST README Changes lib/XML/TMX/Writer.pm lib/XML/TMX/FromPO.pm lib/XML/TMX/Reader.pm lib/XML/TMX.pm scripts/tmx2tmx scripts/tmx2html scripts/tmxsplit scripts/tmxuniq scripts/tmxgrep scripts/tmxwc scripts/tmxclean scripts/tmx-explode scripts/tmx-tokenize scripts/tmx-POStagger examples/xpdf-tmx examples/po2tmx t/01_main.t t/10_writer.t t/20_reader.t t/25_for_tu.t t/50_tmxcat.t t/80_frompo.t t/98_pod.t t/99_pod-coverage.t t/sample.tmx t/writer1.xml t/writer2.xml t/cat.xml META.yml Module meta-data (added by MakeMaker) t/00-aux.t scripts/tsv2tmx META.json Module JSON meta-data (added by MakeMaker) XML-TMX-0.31/META.json000644 000765 000024 00000002054 12645001536 014254 0ustar00ambsstaff000000 000000 { "abstract" : "TMX tools", "author" : [ "Projecto Natura" ], "dynamic_config" : 1, "generated_by" : "ExtUtils::MakeMaker version 7.1, CPAN::Meta::Converter version 2.150005", "license" : [ "unknown" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : "2" }, "name" : "XML-TMX", "no_index" : { "directory" : [ "t", "inc" ] }, "prereqs" : { "build" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "configure" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "runtime" : { "requires" : { "Exporter" : "0", "Getopt::Long" : "0", "Pod::Usage" : "0", "Test::Pod" : "1.00", "Test::Pod::Coverage" : "0.08", "XML::DT" : "0.57" } } }, "release_status" : "stable", "version" : "0.31", "x_serialization_backend" : "JSON::PP version 2.27300" } XML-TMX-0.31/META.yml000644 000765 000024 00000001155 12645001536 014105 0ustar00ambsstaff000000 000000 --- abstract: 'TMX tools' author: - 'Projecto Natura' build_requires: ExtUtils::MakeMaker: '0' configure_requires: ExtUtils::MakeMaker: '0' dynamic_config: 1 generated_by: 'ExtUtils::MakeMaker version 7.1, CPAN::Meta::Converter version 2.150005' license: unknown meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: XML-TMX no_index: directory: - t - inc requires: Exporter: '0' Getopt::Long: '0' Pod::Usage: '0' Test::Pod: '1.00' Test::Pod::Coverage: '0.08' XML::DT: '0.57' version: '0.31' x_serialization_backend: 'CPAN::Meta::YAML version 0.018' XML-TMX-0.31/README000644 000765 000024 00000000572 12644776723 013536 0ustar00ambsstaff000000 000000 XML::TMX ======== INSTALLATION To install this module type the following: perl Makefile.PL make make test make install Note that some installed scripts require modules that are not required for XML::TMX installation. This gives you the possibility to install quickly XML::TMX for your use, but probably installs some commands that will not work out-of-the-box. XML-TMX-0.31/scripts/000755 000765 000024 00000000000 12645001535 014320 5ustar00ambsstaff000000 000000 XML-TMX-0.31/t/000755 000765 000024 00000000000 12645001535 013074 5ustar00ambsstaff000000 000000 XML-TMX-0.31/t/00-aux.t000644 000765 000024 00000002322 12004264465 014274 0ustar00ambsstaff000000 000000 #!/usr/bin/perl use XML::TMX::Reader; use Test::More; is_deeply(XML::TMX::Reader::_merge_notes(undef, "foo"), ["foo"]); is_deeply(XML::TMX::Reader::_merge_notes(undef, ["foo"]), ["foo"]); is_deeply(XML::TMX::Reader::_merge_notes([], ["foo"]), ["foo"]); is_deeply(XML::TMX::Reader::_merge_notes("foo","foo"), ["foo"]); is_deeply([ sort @{ XML::TMX::Reader::_merge_notes("foo", ["foo", "bar"]) }], [ sort (qw'foo bar') ]); is_deeply([ sort @{ XML::TMX::Reader::_merge_notes([qw'a b c e'], [qw'c d e f']) }], [ sort (qw'a b c d e f') ]); is_deeply(XML::TMX::Reader::_merge_props(undef, {foo => 'bar'}), {foo=>['bar']}); is_deeply(XML::TMX::Reader::_merge_props({foo => 'bar'}, {foo => 'bar'}), {foo=>['bar']}); my $m = XML::TMX::Reader::_merge_props({foo => 'ugh'}, {foo => 'bar'}); ok exists $m->{foo}; is_deeply [ sort @{$m->{foo}} ], [sort(qw'bar ugh')]; is_deeply(XML::TMX::Reader::_merge_props({foz => 'bar'}, {foo => 'bar'}), {foz=>'bar', foo=>['bar']}); $m = XML::TMX::Reader::_merge_props({foz => 'bar'}, {foz => 'baz', foo => 'bar'}); ok exists $m->{foo}; ok exists $m->{foz}; is_deeply [ sort @{$m->{foz}} ], [sort(qw'bar baz')]; is_deeply $m->{foo}, ['bar']; done_testing(); XML-TMX-0.31/t/01_main.t000644 000765 000024 00000000102 11735643720 014505 0ustar00ambsstaff000000 000000 # -*- cperl -*- use Test::More tests => 1; use XML::TMX; ok 1; XML-TMX-0.31/t/10_writer.t000644 000765 000024 00000003046 12606260725 015105 0ustar00ambsstaff000000 000000 # -*- cperl -*- use Test::More tests => 4; use XML::TMX::Writer; ok 1; my $tmx = new XML::TMX::Writer(); isa_ok($tmx, "XML::TMX::Writer"); $tmx->start_tmx(id => "foobar", -prop => { prop1 => 'val1', prop2 => 'val2', }, -note => [ 'note1', 'note2', 'note3' ], -output => "_${$}_"); $tmx->add_tu(srclang => 'en', -note => ['snote1', 'snote2', 'snote3'], -prop => { sprop1 => 'sval1', sprop2 => 'sval2' }, 'en' => {-prop => { a=>'b',c=>'d'}, -note => [qw,a b c d,], -seg =>'some text', }, 'pt' => 'algum texto'); $tmx->end_tmx(); ok(-f "_${$}_"); ok file_contents_almost_identical("t/writer1.xml", "_${$}_"); unlink "_${$}_"; sub file_contents_almost_identical { my ($file1, $file2) = @_; return 0 unless -f $file1; return 0 unless -f $file2; open F1, $file1 or die; open F2, $file2 or die; my ($l1,$l2); while (defined($l1 = ) && defined($l2 = )) { s/>\s*; return 0 if ; return 1; } XML-TMX-0.31/t/20_reader.t000644 000765 000024 00000003663 11742354675 015051 0ustar00ambsstaff000000 000000 # -*- cperl -*- use Test::More tests => 10; use XML::TMX::Reader; ok 1; my $reader; $reader = XML::TMX::Reader->new('foobar.tmx'); ok(!$reader,"rigth foobar.tmx is not present"); $reader = XML::TMX::Reader->new('t/sample.tmx'); ok($reader, "reading sample.tmx"); is_deeply($reader->{header}, { '-prop' => { 'bodykey' => ['bodyvalue'] }, '-note' => [ 'bodynote' ], 'o-tmf' => 'TW4Win 2.0 Format', adminlang => 'EN-US', creationdate => '20020312T164816Z', creationtoolversion => '5.0', creationtool => 'MyTool', srclang => 'EN-GB', segtype => 'sentence', datatype => 'html', }); my $count = 0; $reader->for_tu( sub { my $tu = shift; $count++; }); is($count, 7, "counting tu's with for_tu"); $reader->for_tu( { -output => "t/_tmp.tmx", }, sub { my $tu = shift; $tu->{-prop}={q=>[77], aut=>["jj","ambs"]}; $tu->{-note}=[2..5]; $tu; }); ok( -f "t/_tmp.tmx"); $reader = XML::TMX::Reader->new('t/_tmp.tmx'); ok $reader,"loading t/_tmp.tmx"; $reader->for_tu( {output => "t/_tmp2.tmx", }, sub { my $tu = shift; for (keys %{$tu->{-prop}}){ $tu->{-prop}{$_} .= "batatas"; } for (@{$tu->{-note}}){ $_ = "$_ cabolas" } $tu; }); my @langs = $reader->languages; is(@langs, 2 , "languages".join(",",@langs)); ok(grep { $_ eq "EN-GB" } @langs, "en"); ok(grep { $_ eq "PT-PT" } @langs, "pt"); unlink( "t/_tmp.tmx"); unlink( "t/_tmp2.tmx"); XML-TMX-0.31/t/25_for_tu.t000644 000765 000024 00000002420 11736072550 015070 0ustar00ambsstaff000000 000000 # -*- cperl -*- use Data::Dumper; use Test::More tests => 9; BEGIN { use_ok(XML::TMX::Reader); }; my $reader; $reader = XML::TMX::Reader->new('t/sample.tmx'); ok($reader, "reading sample.tmx"); my $count = 0; $reader->for_tu( sub { my $tu = shift; $count++; }); is($count, 7, "counting tu's with for_tu"); $count = 0; $reader->for_tu( {output => "t/_tmp.tmx", proc_tu => 6 }, sub { my $tu = shift; $tu->{-prop}={number => ++$count}; $tu; }); ok( -f "t/_tmp.tmx"); #unlink( "t/_tmp.tmx"); $reader = XML::TMX::Reader->new('t/_tmp.tmx'); ok($reader,"loadind t/_tmp.tmx"); $count = 0; $reader->for_tu( sub { my $tu = shift; $count++; }); is($count, 6, "counting tu's with for_tu"); $reader->for_tu( {output => "t/_tmp2.tmx", gen_tu=>2}, sub { my $tu = shift; if($tu->{-prop}{number} % 2 == 0) { return undef } else { $tu->{-note}[0]="This one is even"; return $tu;} }); my @langs = $reader->languages; is(@langs, 2 , "languages".join(",",@langs)); ok(grep { $_ eq "EN-GB" } @langs, "en"); ok(grep { $_ eq "PT-PT" } @langs, "pt"); unlink( "t/_tmp.tmx"); unlink( "t/_tmp2.tmx"); XML-TMX-0.31/t/50_tmxcat.t000644 000765 000024 00000001727 12606261055 015076 0ustar00ambsstaff000000 000000 #!/usr/bin/perl use strict; use warnings; use XML::TMX::Reader; use Test::More tests => 4; `$^X -Iblib scripts/tmx2tmx -cat t/writer1.xml t/writer2.xml > mycat.tmx`; ok -f 'mycat.tmx'; my $reader = XML::TMX::Reader->new('mycat.tmx'); isa_ok $reader => 'XML::TMX::Reader'; like $reader->{header}{creationdate} => qr/^\d+T\d+Z$/; delete $reader->{header}{creationtoolversion}; delete $reader->{header}{creationdate}; is_deeply($reader->{header}, { 'o-tmf' => 'plain text', adminlang => 'en', creationtool => 'XML::TMX::Writer', srclang => 'en', segtype => 'sentence', datatype => 'plaintext', '-note' => [qw.note1 note2 note3 note4 note5 note6.], '-prop' => {prop3 => ['val3'], prop4 => ['val4'], prop2 => ['val2', 'val22'], prop1 => ['val1', 'val11'], }, }); #unlink 'mycat.tmx'; XML-TMX-0.31/t/80_frompo.t000644 000765 000024 00000000125 10730327732 015073 0ustar00ambsstaff000000 000000 # -*- cperl -*- use Test::More tests => 1; BEGIN { use_ok(XML::TMX::FromPO); }; XML-TMX-0.31/t/98_pod.t000644 000765 000024 00000000201 10730327732 014357 0ustar00ambsstaff000000 000000 use Test::More; eval "use Test::Pod 1.00"; plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; all_pod_files_ok(); XML-TMX-0.31/t/99_pod-coverage.t000644 000765 000024 00000000244 10730327732 016160 0ustar00ambsstaff000000 000000 use Test::More; eval "use Test::Pod::Coverage 0.08"; plan skip_all => "Test::Pod::Coverage 0.08 required for testing POD coverage" if $@; all_pod_coverage_ok(); XML-TMX-0.31/t/cat.xml000644 000765 000024 00000003022 11736074667 014402 0ustar00ambsstaff000000 000000
val11 val22 val3 val4 note1 note2 note3 note4 note5 note6
sval1 sval2 snote1 snote2 snote3 algum texto b d a b c d some text svalA svalB snoteA snoteB snoteC Hello World 2 4 1 2 3 4 olá mundo
XML-TMX-0.31/t/sample.tmx000644 000765 000024 00000007611 11736103573 015121 0ustar00ambsstaff000000 000000
bodyvalue bodynote
1 tu nr 1 1 tuv nr 1 en From Paul, called to be an apostle of Christ Jesus by the will of God, and from Sosthenes, our brother, 2 tuv nr 2 pt Paulo, Apóstolo de Jesus Cristo por vontade e chamamento de Deus, e o irmão Sóstenes, 2 tu nr 2 to God's Church which is in Corinth; to you whom God has sanctified in Christ Jesus and called to be holy, together with those who everywhere call upon the name of our Lord Christ Jesus, their Lord and ours. à Igreja de Deus que está em Corinto. Dirigimo-nos àqueles que foram santificados em Jesus Cristo e chamados a ser santos, juntamente com todos os que invocam em todo o lugar o nome de nosso Senhor Jesus Cristo, Senhor deles e nosso. 3 tu nr 3 Receive grace and peace from God our Father, and Christ Jesus our Lord. Graça e paz vos sejam dadas da parte de Deus nosso Pai e do Senhor Jesus Cristo. 4 tu nr 4 I give thanks constantly to my God for you and for the grace of God given to you in Christ Jesus. Sem cessar, agradeço a Deus por vossa causa, em vista da graça de Deus que vos foi concedida em Jesus Cristo. 5 tu nr 5 For you have been fully enriched in him with words as well as with knowledge, Pois em Jesus é que recebestes todas as riquezas, tanto da palavra como do conhecimento. 6 tu nr 6 even as the testimony concerning Christ was confirmed in you. Na verdade, o testemunho de Cristo tornou-se firme em vós, 7 tu nr 7 You do not lack any spiritual gift and only await the glorious coming of Christ Jesus, our Lord. a tal ponto que não vos falta nenhum dom, a vós que esperais a Revelação de nosso Senhor Jesus Cristo.
XML-TMX-0.31/t/writer1.xml000644 000765 000024 00000001616 12221072234 015211 0ustar00ambsstaff000000 000000
val1 val2 note1 note2 note3
sval1 sval2 snote1 snote2 snote3 b d a b c d some text algum texto
XML-TMX-0.31/t/writer2.xml000644 000765 000024 00000001752 11736074650 015231 0ustar00ambsstaff000000 000000
val11 val22 val3 val4 note4 note5 note6
svalA svalB snoteA snoteB snoteC Hello World 2 4 1 2 3 4 olá mundo
XML-TMX-0.31/scripts/tmx-explode000644 000765 000024 00000003262 11736073204 016516 0ustar00ambsstaff000000 000000 #!/usr/bin/perl use warnings; use strict; use XML::TMX::Reader; my @files = grep {-f $_} @ARGV; for my $file (@files) { print STDERR "exploding $file.."; my $reader = XML::TMX::Reader->new($file); my $name = $file; $name =~ s/\.tmx//; my %fh; my $i = 1; $reader->for_tu( sub { my ($langs) = @_; print STDERR "." unless $i % 10; for my $l (keys %$langs) { next if $l eq "-prop"; mkdir $l unless -d $l; $langs->{$l} =~ s/\n/ /g; $langs->{$l} =~ s/\s+/ /g; unless ($fh{$l}) { open $fh{$l}, ">$l/$name"; binmode $fh{$l}, ":utf8"; } print {$fh{$l}} "$i\t$langs->{$l}\n"; } ++$i; }, ); close $fh{$_} for (keys %fh); print STDERR "DONE\n"; } =encoding UTF-8 =head1 NAME tmx-explode - explodes tmx files in a file per language =head1 SYNOPSIS tmx-explode foo.tmx bar.tmx # if foo.tmx and bar.tmx have only PT and EN languages, you will get # \-EN # \-- foo # \-- bar # \-PT # \-- foo # \-- bar =head1 DESCRIPTION Explodes a set of tmx files, creating a folder per language, and a file per tmx file. =head1 SEE ALSO perl(1) =head1 AUTHOR Alberto Manuel Brandão Simões, Eambs@cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2012 by Alberto Manuel Brandão Simões =cut XML-TMX-0.31/scripts/tmx-POStagger000644 000765 000024 00000012104 12571111301 016671 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -s use strict; use warnings; use File::Temp qw/ :POSIX /; my $tmpName = tmpnam(); our ( $o, # output filename $mwe, # tag mwe -- STILL NOT SUPPORTED $dates, # do dates analysis... $ner, # try to do NER.... $compact, # compact form $s, # use tags (on by default) ); $s = 1 unless defined $s; eval { require FL3 }; die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@; FL3->import(); eval { require Lingua::FreeLing3::Word }; die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@; Lingua::FreeLing3::Word->import(); my %initted = (); use XML::TMX::Reader '0.25'; my $file = shift or die "You must supply the name of the file to tokenize"; my $reader = XML::TMX::Reader->new($file); my $output = "pos_$file"; $output = $o if $o; my $pos_languages = {}; binmode STDOUT, ":utf8"; $reader->for_tu( { -output => $tmpName, -verbose => 1, -verbatim => 1, }, sub { my $tu = shift; for my $lang (keys %$tu) { if ($lang =~ /(pt|es|gl|it|fr|ru|en)/i) { $pos_languages->{$1} = 1; my $ln = lc $1; my $seg = $compact ? "" : "{$lang}{-seg}; my @tokens = map { Lingua::FreeLing3::Word->new($_) } split /\s+/, $txt; my $sentences = splitter($ln)->split(\@tokens); $sentences = morph($ln)->analyze($sentences); $sentences = hmm($ln)->tag($sentences); for my $stc (@$sentences) { $seg .= "\n" if $s; $seg .= $compact ? _dump_compact($stc->words) : _dump_words($stc->words); $seg .= "\n" if $s; } $tu->{$lang}{-iscdata} = $compact ? 0 : 1; $seg .= "]]>" unless $compact; $tu->{$lang}{-seg} = $seg; } } return $tu; }); $reader = XML::TMX::Reader->new($tmpName); $reader->for_tu({ -output => $output, -verbatim => 1, -raw => 1, -prop => ($compact ? {} : { 'pos-tagged' => join(",",keys %$pos_languages), 'pos-fields' => "word,lemma,pos", 'pos-s-attributes' => ['s'], })}, sub { $_ = $_[0]; if (!/\[CDATA\[/) { s{()(.*?)()} {my ($before,$middle,$after) = ($1,$2,$3); for ($middle) { s/&/&/g;s//>/g; }; $before.$middle.$after }ge; } return $_; }); #print STDERR "$tmpName\n"; unlink $tmpName if -e $tmpName; sub _init_ma { my $lang = shift; morph($lang, QuantitiesDetection => 0, MultiwordsDetection => ($mwe // 0), NumbersDetection => 0, DatesDetection => ($dates // 0), NERecognition => ($ner // 0)); $initted{$lang}++; } sub _dump_words { my @words = @_; my $seg = ""; for my $t (@words) { if ($t->is_multiword) { $seg .= sprintf("\n", $t->lemma(), $t->tag()); $seg .= _dump_words($t->get_mw_words); $seg .= "\n"; } else { $seg .= $t->form() ."\t". $t->lemma() ."\t". $t->tag() ."\n"; } } return $seg; } sub _dump_compact { my $seg = join(" " => map { my $t = $_; ($t->is_multiword ? join("_", map { $_->lemma() || $_->lc_form() } $t->get_mw_words) : ($t->lemma() || $t->lc_form())) . "/" . substr($t->tag(), 0, 1); } @_) . "\n"; $seg =~ s/&/&/g; $seg =~ s//>/g; return $seg; } =encoding UTF-8 =head1 NAME tmx-POStagger - POStaggers translation units on a tmx file. =head1 SYNOPSIS tmx-POStagger file.tmx # creates pos_file.tmx tmx-POStagger -o=out.tmx file.tmx -ner ... tags multiword named entities -dates -compact =head1 DESCRIPTION Although this script is bundled in C, it has a soft dependency on C. Soft means that the dependency is not ensured at install time, and other features of the module can still be used without C. Nevertheless, if you want to use this tool you should install that module. At the moment the supported languages are the same as supported by FreeLing3: English, Spanish, Russian, Portuguese and Italian. It your TMX file includes any other language, they will be maintained without a change. This behavior can change in the future, as a basic regexp based POStaggerr might be implemented. =head2 Options -ner -- groups Proper names with tag C which which WDT includes include VBZ Edward Cole -compact -dates =head1 SEE ALSO XML::TMX, Lingua::FreeLing3 =head1 AUTHOR Alberto Manuel Brandão Simões, Eambs@cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2012-2014 by Alberto Manuel Brandão Simões =cut XML-TMX-0.31/scripts/tmx-tokenize000755 000765 000024 00000004442 12571074123 016711 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -s use strict; use warnings; our $o; eval { require FL3 }; die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@; FL3->import(); use XML::TMX::Reader '0.25'; my $file = shift or die "You must supply the name of the file to tokenize"; my $reader = XML::TMX::Reader->new($file); my $output = "t_$file"; $output = $o if $o; binmode STDOUT, ":utf8"; $reader->for_tu( { -output => $output, -prop => { tokenized => "true" }, verbose => 1 }, sub { my $tu = shift; for my $lang (keys %$tu) { if ($lang =~ /(pt|es|it|ru|en|gl)/i) { my $ln = lc $1; my $txt = $tu->{$lang}{-seg}; if ($txt !~ /^\s*$/) { $txt = join(" ", @{ tokenizer($ln)->tokenize($txt, to_text => 1)}); } $tu->{$lang}{-seg} = $txt; } } return $tu; }); =encoding UTF-8 =head1 NAME tmx-tokenize - tokenizes translation units on a tmx file. =head1 SYNOPSIS tmx-tokenize file.tmx # creates t_file.tmx tmx-tokenize -o=out.tmx file.tmx =head1 DESCRIPTION Although this script is bundled in C, it has a soft dependency on C. Soft means that the dependency is not ensured at install time, and other features of the module can still be used without C. Nevertheless, if you want to use this tool you should install that module. At the moment the supported languages are the same as supported by FreeLing3: English, Spanish, Russian, Portuguese and Italian. It your TMX file includes any other language, they will be maintained without a change. This behavior can change in the future, as a basic regexp based tokenizer might be implemented. =head1 SEE ALSO XML::TMX, Lingua::FreeLing3 =head1 AUTHOR Alberto Manuel Brandão Simões, Eambs@cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2012 by Alberto Manuel Brandão Simões =cut XML-TMX-0.31/scripts/tmx2html000755 000765 000024 00000001210 11736073640 016025 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -w -s use strict; use warnings; use XML::TMX; use XML::TMX::Reader; our ($icons); my $tmx = shift; my $tmx_obj = XML::TMX::Reader->new($tmx); print $tmx_obj->to_html(icons => $icons); =encoding UTF-8 =head1 NAME tmx2html - Converts a TMX to an HTML formated page =head1 SYNOPSIS tmx2html [-icons] file.tmx > file.html =head1 DESCRIPTION Pretty prints a TMX as an HTML file. Icons for languages can be produced, but still buggy. =head1 SEE ALSO perl(1) =head1 AUTHOR Alberto Manuel Brandão Simões, Eambs@cpan.orgE =head1 COPYRIGHT AND LICENSE Copyright (C) 2011 by Alberto Manuel Brandão Simões =cut XML-TMX-0.31/scripts/tmx2tmx000755 000765 000024 00000015276 12221072234 015676 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -s use strict; use warnings; use XML::TMX::Reader; use XML::TMX::Writer; use XML::DT; our ( $toTrados, ## Fix some issues in the TMX file the old trados software did not cope with $clean, ## Remove empty UTs and UTs with only junk characters $cat, ## Concatenates TMX files $select, ## Filters the TMX to the selected languages ); # here we must take care that only one of the options is being used. =encoding utf-8 =head1 NAME tmx2tmx - utility to convert and filter TMX files =head1 SYNOPSYS tmx2tmx -cat file1.tmx ... filen.tmx > file.tmx tmx2tmx -toTrados file1.tmx > file2.tmx tmx2tmx -clean file1.tmx > file2.tmx tmx2tmx -select=PT,EN multilingual.tmx > pt-en.tmx =head1 DESCRIPTION This utility processes TMX documents and return TMX documents. Tasks done with this utility include conversion between TMX versions and TMX cleaning. =head2 TRADOS conversion As you maybe know, TRADOS is a company producing computer software for translators. It includes WorkBench which imports TMX files. Unfortunately, the version I used do not import TMX version 1.4. This process is done using the switch C<-toTrados>: tmx2tmx -toTrados file.tmx > trados.tmx =head2 TMX Cleaning Specially when translation memories are created from anotated text, or extracted directly from the Internet using any automatic method. This switch is used to remove junk in translation units. This option tries to remove junk from TMX files, like empty pairs where one of the sides is empty, or removing other junk type. Use it this way: tmx2tmx -clean file.tmx > file2.tmx =head2 Concatenating TMX tmx2tmx -cat file1.tmx ... filen.tmx > file.tmx ls | grep '.tmx$' | tmx2tmx -cat > file.tmx =head2 Select languages Select a bunch of languages: tmx2tmx -select=PT,EN,FR huge.tmx > pt-en-fr.tmx =cut toTrados() if $toTrados; clean() if $clean; cat() if $cat; selectl() if $select; die "No option supplied\n"; =head1 SEE ALSO tmx2html, po2tmx, XML::TMX =head1 AUTHOR Alberto Simões, Ealbie@alfarrabio.di.uminho.ptE =head1 COPYRIGHT AND LICENSE Copyright 2004 by Projecto Natura This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut #--( AUX FUNCS )----------- sub trim { my $x = shift; for ($x) { s!^\s+!!; s!\s+$!! } $x } sub toTrados { $/ = "
"; my $header = <>; #
->
$header =~ s!]+)/>!
!; # -> $header =~ s!]+>!!; # tmx version -> $header =~ s!]+>!!; # srclang -> srclang="foo" if ($header =~ m!(xml:)?lang=(["'])([^"']+)\2!) { my $lang = $3; $header =~ s!srclang=(["'])[^"']+\1!srclang="$lang"!; } $header =~ s/xml:lang/lang/g; # xml:lang -> lang print $header; while (<>) { s/xml:lang/lang/g; print } # ugly, but prevents us from forgetting an else somewhere. exit; } sub clean { my $file; my $reader; if ($file = shift) { # isto tv fosse mais rápido em sax... print dt( $file, -default => sub{ toxml() }, seg => sub{ $c = trim($c); toxml }, tu => sub{ # remove entries with junk, only return "" if $c =~ m!(\s|[\|\!@#\$%^&\-])*!; toxml() }, ); } else { die "At the moment, we do not handle files from stdin"; # TODO... aqui fazia jeito que o XML::TMX::Reader lêsse de um # filehandle já aberto (STDIN, por exemplo); } exit; } sub _join_and_print_headers { my $header; while (@_) { my $file = shift @_; my $reader = XML::TMX::Reader->new($file); for my $key (keys %{$reader->{header}}) { if ($key eq "-prop") { for my $type (keys %{$reader->{header}{$key}}) { push @{$header->{$key}{$type}}, @{$reader->{header}{$key}{$type}} } } elsif ($key eq "-note") { push @{$header->{$key}}, @{$reader->{header}{$key}} } else { $header->{$key} = $reader->{header}{$key} } } } my $writer = XML::TMX::Writer->new(); $writer->start_tmx(%$header); } sub cat { my @files ; if (@ARGV) { @files = @ARGV; } else { @files = map { chomp; $_ } <>; } binmode STDOUT, ":utf8"; _join_and_print_headers(@files); my $file = shift @files; open F, "<:utf8", $file or die "Cannot open file: $file\n"; while () { last if m!\n"; while () { s!!!; print; } close F; for $file (@files) { open F, "<:utf8", $file or die "Cannot open file: $file\n"; print "\n"; while () { last if m!]*>//g; print; while () { s!!!; print; } close F; } print "\n"; print "\n"; exit; } sub selectl { my @files; if (@ARGV) { @files = @ARGV; } else { @files = map { chomp; $_ } <>; } _join_and_print_headers(@files); my %languages = map { (lc $_ => 1) } split( /,/, $select ); my $tmx; while ($tmx = shift @files) { die "Can't find $tmx" if (!$tmx || !-f$tmx); print STDERR "$tmx\n"; my $reader = XML::TMX::Reader->new($tmx); $reader->for_tu( { -verbose => 1, -output => 1, -header => 0 }, sub { my $tu = shift; my $l = 0; for my $k (keys %$tu) { next if $k =~ /^-/; if (exists $languages{lc $k}) { $l++; } else { delete $tu->{$k} } } if ($l == keys %languages) { return $tu; } else { return undef; } }); } print "\n\n"; exit; } XML-TMX-0.31/scripts/tmxclean000755 000765 000024 00000003305 12153701756 016070 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -s use v5.10; use strict; use warnings; use XML::TMX::Reader; our ( $junk, # remove if one of the languages just have junk $output, # output filename $eq , # remove if seg(l1) = seg(l2) $len , # remove if len(li) > 50 ∧ len(lj) > 2len(li) $v, $verbose ); my $cleaned = 0; my $processed = 0; my $tmx = shift or help(); my $reader = XML::TMX::Reader->new($tmx); $junk//=1; $output ||= "_cleaned_$tmx"; print STDERR "loading..." if $v; $reader->for_tu( {output => $output}, \&cleaner); printf STDERR "\rRemoved %d/%d (%.3f%%).\n", $cleaned, $processed, 100*$cleaned/$processed if $v; sub cleaner { my $langs = shift; $processed++; my $remove = 0; my %seg=(); my @len=(); for my $k (keys %$langs) { next if $k =~ /^-/; $remove = 1 if $eq && $seg{$langs->{$k}{-seg}}++; $remove = 1 if $junk && $langs->{$k}{-seg} =~ /^[-.,0-9\s]+$/; $remove = 1 if $junk && $langs->{$k}{-seg} =~ /^\W*$/; push(@len, length($langs->{$k}{-seg})); } @len = sort{$a <=> $b} @len; $remove = 1 if $len && $len[0] > 50 && $len[0]*2< $len[-1]; $cleaned++ if $remove; printf STDERR "\rRemoved %d/%d (%.3f%%)...", $cleaned, $processed, 100*$cleaned/$processed if $v && $processed%1000==0; return $remove ? undef : $langs; } sub help { print " tmxclean [-junk=1] \n"; exit 1; } =encoding UTF-8 =head1 NAME tmx-clean - clean TMX files ??? =head1 SYNOPSIS $ tmx-clean file.tmx # ??? =head1 DESCRIPTION Removes the Translation units that 1. have no letters... (unless -junk=0) 2. seg(L1) = seg(L2) (if -eq) =head1 SEE ALSO XML::TMX =cut XML-TMX-0.31/scripts/tmxgrep000755 000765 000024 00000002775 12641534470 015754 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -w -s ## use utf8::all; use Data::Dumper; our ($all,$w,$max,$pdf, $o, $n, $a,$debug); $max //= 500; $max = 1000000000 if ($max == 0 or $all); use strict; use warnings; my $p = shift; $p = qr{\b$p\b} if $w; use XML::TMX; use XML::TMX::Reader; our ($icons); my $tmx = shift; my $tmx_obj = XML::TMX::Reader->new($tmx); $tmx_obj->for_tu( { patt => $p, gen_tu=> $max, n => $n || 0, output => $o || undef }, #### "__tmxgrep.tmx" sub { my($tu,$at) = @_; print STDERR Dumper($tu,$at) if $main::debug; # print STDERR "."; if($main::a){ for my $k(keys %$tu){ for(keys %{$tu->{$k}}){ $tu->{$k}{$_} =~ s!($p)!=($1)=!g; } } } return $tu; } ); if($pdf){ system("xpdf-tmx __.tmx");} __END__ =head1 NAME tmxgrep - grep translation units in a TMX file =head1 SYNOPSIS tmxgrep file.tmx options: -pdf -- output is transformed in a PDF file -max=300 -- extract up to 300 TU (def=500) -max=0 -- all matches -all -- all matches -a -- annotate matches with '==(...)' -o=out.tmx -- define output file (defaut= STDOUT) -n -- print original TU number (eg: ) =head1 DESCRIPTION Creates a TMX file with the tranlation units tha macth the provided regular expression. =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). xpdf-tmx XML::TMX TMX =cut XML-TMX-0.31/scripts/tmxsplit000644 000765 000024 00000005410 11763630336 016136 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -s use strict; use utf8; use XML::DT; #use bytes; our ($twente,$latin1,$utf8,$q,$cutmaxlen); my $filename = $ARGV[0]; my %files; my $data; my %h = ( # '-outputenc' => "ISO-8859-1", 'seg' => sub{ for ($c){ # s/\&/\&/g; # s//\>/g; s/\s\s+|^\s+|\s+$/ /g; }; $c }, 'ut' => sub{" "}, 'tu' => sub{$c}, 'tuv' => sub{ $c =~ s/^[\s\n]*//; $c =~ s/[\s\n]*$//; $data->{$v{lang}||$v{"xml:lang"}} = $cutmaxlen && length($c) > $cutmaxlen ? substr($c,0,$cutmaxlen)."||" : $c}, ); $h{-outputenc} = "ISO-8859-1" if $twente || $latin1; undef $h{-outputenc} if $utf8; my $i = 0; $| = 1; my $f; for $f (@ARGV){ # print "\n$f" unless $q; print STDERR "\n$f"; $/ = "\n"; open X, $f or die "cannot open file $f"; do { if(/encoding=.ISO-8859-1./i){$h{-outputenc}=$h{-inputenc}="ISO-8859-1";} } while ($_ = and $_ !~ /!s and $resto = $'; $/ = ""; while() { ($_ = $resto . $_ and $resto = "" ) if $resto; $i++; last if /<\/body>/; #print "." if (!$q && $i%500==0); print STDERR "." if ($i % 1000==0); s/\>\s+/>/; undef($data); eval {dtstring($_, %h)} ; ## don't die in invalid XML if($@){warn($@)} else{ for my $k (keys %$data) { if (exists($files{"$filename-$k"})) { myprint($files{"$filename-$k"}, $data->{$k},$i); } else { my $x; open $x, ">$filename-$k" or die("cant >$filename-$k\n"); binmode($x,":utf8") if $utf8; myprint($x, $data->{$k},$i); $files{"$filename-$k"} = $x; } } } } close X; } for my $key (keys %files) { print "$key\n"; } sub myprint{ my($f,$tu,$i)=@_; if ($twente){ for ($tu){ s/<.*?>/ /gs; s/[\|\$]/ /gs; s/(\w)([.;,!:?«»"])/$1 $2/g; s/([.;,!:?«»"])(\w)/$1 $2/g; s/\s\s+|^\s+|\s+$/ /g; } print {$f} "$tu\n\$\n"; } else { print {$f} "$tu\n"; } } __END__ =encoding utf-8 =head1 NAME tmxsplit - splits a TMX file several files, one for each language =head1 SYNOPSIS tmxsplit f.tmx f2.tmx ... tmxsplit -twente f.tmx =head1 DESCRIPTION splits a TMX file in several files (one per language) and puts a tag C in each translation unit. The names for output files is based on the first tmx file supplied. =head1 Options -twente -- makes a format compatible with twente-aligner -latin1 -- a make latin1-encoded output -utf8 -- a make utf8-encoded output -q -- don't print filenames and "." -cutmaxlen=n -- cut translations by the n character =head1 AUTHOR Alberto Simões, albie@di.uminho.pt J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). tmx2cqp(1) =cut XML-TMX-0.31/scripts/tmxuniq000755 000765 000024 00000004542 12062106764 015763 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -s use DB_File; use Fcntl ; use Lingua::PT::PLNbase; use XML::TMX::Reader; use Digest::MD5 qw(md5_hex); use Encode; our ($cont,$id,$dig,$tok,$o,$fast); if ($cont) { tie %dic, 'DB_File', "__tmxuniq_$$.db", O_RDWR|O_CREAT , 0640, $DB_BTREE; } else { tie %dic, 'DB_File', "__tmxuniq_$$.db", O_RDWR|O_CREAT|O_TRUNC , 0640, $DB_BTREE; } my $cid = 0; for my $file (@ARGV){ my $tm = XML::TMX::Reader->new($file); print STDERR "Processing..."; $tm->for_tu ( { output => $o || "$file._" }, sub { my $tu = shift; $cid++; $tu->{-prop}{id} = $cid if $id; my $key = join("|||", map { n($tu->{$_}{-seg}) } sort grep { !/^-/ } keys %$tu); my $digest = md5_hex(encode_utf8($key)); unless ($cid % 10000) { my $size = -s "__tmxuniq_$$.db"; printf STDERR "\rTotal: %10d Removed: %8d (%.2f%%) Database size: %10d bytes", $cid, $rem, (100*$rem/$cid), $size; } if ($dic{$digest}) { $dic{$digest} .= "$cid;" unless $fast; $rem ++; return undef } else { $dic{$digest} = "$cid;"; $tu->{-prop}{digest} = $digest if $dig; return {%$tu} ; # used clone.. no idea why } } ); my $size = -s "__tmxuniq_$$.db"; if ($cid) { printf STDERR "\rTotal: %10d Removed: %8d (%.2f%%) Database size: %10d bytes\n", $cid, $rem, (100*$rem/$cid), $size; } else { printf STDERR "\rHuh.. empty TMX?\n"; } undef $tm; } untie %h; sub n { my $a = shift; $a =~ s/\.{6,}/...../g; $a = tokenize( { rs => ' ' } => $a ) if $tok; $a =~ s/\s+/ /g; $a =~ s/ $//; $a =~ s/^ //; return $a; } __END__ =head1 NAME tmxuniq - removes duplicated translation units from TMXs =head1 SYNOPSIS tmxuniq [options] -l=en:pt tmx1 ... =head1 DESCRIPTION Removes duplicated translation units from a set of TMX (Translation Memory eXange format). =head1 OPTIONS -id -- insert a uniq id property in each TU -dig -- insert a digest property in each TU -tok -- tokenize/normalize text -o=out.tmx -- (with 1 argument) redefine output (default = input._) =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). =cut XML-TMX-0.31/scripts/tmxwc000755 000765 000024 00000001376 11763406075 015427 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -s use XML::TMX::Reader; print STDERR ""; my $TTU = 0; my $n = scalar(@ARGV); for my $file (@ARGV) { my $tm = XML::TMX::Reader->new($file); my $TU = 0; $tm->for_tu({-raw => 1 }, sub { $TU++ ; print STDERR "\r$TU..." unless $TU%1000; } ); print STDERR "\r$file: $TU tu.\n"; $TTU += $TU; } print STDERR "total: $TTU tu.\n" if $n > 1; __END__ =encoding utf-8 =head1 NAME tmxwc - gives statistics about tmx files =head1 SYNOPSIS tmxwc tmx1 [tmx2 ...] =head1 DESCRIPTION Gives statistical information about TMX files like the number of translation units. =head1 AUTHOR Alberto Simões, Eambs@cpan.orgE =head1 SEE ALSO perl(1). =cut XML-TMX-0.31/scripts/tsv2tmx000644 000765 000024 00000005567 12645001504 015703 0ustar00ambsstaff000000 000000 #!/usr/bin/perl use warnings; use strict; use Getopt::Long; use Pod::Usage; ### Option varialbes my $man = 0; my $help = 0; my ($sl, $tl); our $verbose = 1; my $header = 1; my $columns = "1,2"; #### ------ binmode STDIN, ":utf8"; binmode STDOUT, ":utf8"; GetOptions ('help|h' => \$help, 'man' => \$man, "source=s" => \$sl, "target=s" => \$tl, "header" => \$header, "columns=s" => \$columns, "verbose|v" => \$verbose) or pod2usage(2); pod2usage(1) if $help; pod2usage(-exitval => 0, -verbose => 2) if $man; ## our ($c1, $c2); if ($columns =~ /^(\d+),(\d+)$/) { ($c1,$c2) = ($1,$2); } else { die "Columns definition should be a pair of integers: 1,2\n"; } if (!$sl || !$tl) { if ($header) { _log ("No source language or target language defined. Guessing!"); my ($l1, $l2) = readLine(); $sl = $l1 unless defined $sl; $tl = $l2 unless defined $tl; } else { die "No header, and one of the source or target languages not defined!\n"; } $header = 0; } readLine() if $header; use XML::TMX::Writer; my $tmx = XML::TMX::Writer->new(); $tmx->start_tmx(id => 'tsv2tmx'); my @r; while (@r = readLine()) { $tmx->add_tu($sl=>$r[0],$tl=>$r[1]); } $tmx->end_tmx(); sub _log { say STDERR @_ if $verbose; } sub readLine { my $line = ; if ($line) { chomp $line; return (split /\t/, $line)[$c1,$c2] } else { return (); } } __END__ =encoding utf-8 =head1 NAME tsv2tmx - Create a TMX from a TSV file =head1 SYNOPSIS tsv2tmx [options] Options: --help brief help message --man full documentation --verbose | -v activated verbose mode --sl=EN --tl=PT describe source and target language names --header treat first line as a heading --columns=1,2 specify which columns to extract =head1 OPTIONS =over 8 =item B<--help> Print a brief help message and exits. =item B<--man> Prints the manual page and exits. =item B<--verbose> | B<-v> Activates the verbose mode. =item B<--sl> | B<--tl> Use these options to specify the names for the source and target languages. =item B<--header> By default this switch is on, and it means that the TSV file includes a first line with a heading. If no source or target language names are specified, the first line will be used to guess them. =item B<--columns=1,2> Specify which columns should be extracted. Needs to be a pair of integers, separated by a comma. Columns indexes start at 0. Default to C<1,2>. =back =head1 DESCRIPTION Useful to create translation memories from TSV files, that can be easily exported from spreadsheet software. =head1 =head1 SEE ALSO XML::TMX =head1 AUTHOR Alberto Simões, C<< >> =head1 COPYRIGHT AND LICENSE Copyright 2016 by Projecto Natura This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut XML-TMX-0.31/lib/XML/000755 000765 000024 00000000000 12645001535 014037 5ustar00ambsstaff000000 000000 XML-TMX-0.31/lib/XML/TMX/000755 000765 000024 00000000000 12645001535 014507 5ustar00ambsstaff000000 000000 XML-TMX-0.31/lib/XML/TMX.pm000644 000765 000024 00000001622 12645001510 015037 0ustar00ambsstaff000000 000000 package XML::TMX; # vim:sw=3:ts=3:et: use 5.004; use warnings; use strict; use parent 'Exporter'; our $VERSION = '0.31'; our @ISA = 'Exporter'; our @EXPORT_OK = qw(); =encoding utf-8 =head1 NAME XML::TMX - Perl extensions for managing TMX files =head1 SYNOPSIS use XML::TMX; =head1 DESCRIPTION XML::TMX is the top level module. At the moment it does not contain any useful code, so check sub-modules, please. =head1 SEE ALSO XML::TMX::Writer, XML::TMX::Reader, XML::TMX::FromPO L, TMX Specification L =head1 AUTHOR Alberto Simões, Ealbie@alfarrabio.di.uminho.ptE Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE =head1 COPYRIGHT AND LICENSE Copyright 2003-2012 by Projecto Natura This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1; XML-TMX-0.31/lib/XML/TMX/FromPO.pm000644 000765 000024 00000020377 12645001510 016211 0ustar00ambsstaff000000 000000 package XML::TMX::FromPO; use 5.004; use warnings; use strict; use XML::TMX::Writer; use Exporter (); use vars qw(@ISA @EXPORT_OK $VERSION); $VERSION = '0.31'; @ISA = 'Exporter'; @EXPORT_OK = qw(&new &parse_dir &create_tmx &clean_tmx); =pod =encoding utf-8 =head1 NAME XML::TMX::FromPO - Generates a TMX file from a group of PO files =head1 SYNOPSIS use XML::TMX::FromPO; my $conv = new XML::TMX::FromPO(OUTPUT => '%f.tmx'); =head1 DESCRIPTION This module can be used to generate TMX files from a group of PO files. =head1 METHODS The following methods are available: =head2 new $tmx = new XML::TMX::FromPO(); Creates a new XML::TMX::FromPO object. Please check the L section for details on the options. =cut sub new { my $proto = shift; my $class = ref($proto) || $proto; my $self = {}; $self->{LANG} = undef; $self->{OUTPUT} = undef; $self->{DEBUG} = 0; __common_conf($self, @_); unless(defined($self->{CONVER})) { if(system('recode >/dev/null 2>&1')) { $self->{CONVERT} = 'iconv -f %t -t utf8 < %f'; } else { $self->{CONVERT} = 'recode %t..utf8 < %f'; } } $self->{TMX} = {}; bless($self, $class); return($self); } =head2 rec_get_po TODO: Document method =cut sub rec_get_po { my $self = shift; my $dir = shift; my $lan1 = shift; __common_conf($self, @_); # check if directory is readable if(-f $dir) { my $file=$dir; my $lang = lc($lan1); if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } else { die("$dir is not a readable directory\n") unless(-d $dir); for my $file (<$dir/*>) { if($file =~ /(.*)\.(po|messages)$/) { my $lang = lc($lan1); if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } elsif($file =~ /(.*)\.(\w+)\.(po|messages)$/) { my $lang = lc($lan1); ## ??? my $lang = "\L$2"; if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } elsif(-d $file) { rec_get_po($self,$file,$lan1) } else { ## warn ("$file ... não tem lingua\n") if($self->{DEBUG}); } } } __add_en($self) if(__check_lang($self, 'en')); __limpa($self); } =head2 parse_dir TODO: Document method =cut sub parse_dir { my $self = shift; my $dir = shift; __common_conf($self, @_); # check if directory is readable die("$dir is not a readable directory\n") unless(-d $dir); for my $file ((<$dir/*.po>),(<$dir/*.messages>)) { if($file =~ /(\w+)\.(po|messages)$/) { my $lang = "\L$1"; if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } elsif($file =~ /(.*)\.(\w+)\.(po|messages)$/) { my $lang = "\L$2"; if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } else { warn ("$file ... não tem lingua\n") if($self->{DEBUG}); } } __add_en($self) if(__check_lang($self, 'en')); __limpa($self); } # return value: # * 0 -> lang does not exist # * 1 -> lang exists sub __check_lang { my $self = shift; my $lang = shift; my @regex = @{$self->{LANG}}; while(my $regex = shift(@regex)) { last if($regex gt $lang); if($lang =~ /^$regex$/i) { return(1); } } return(0); } sub __add_en { my $self = shift; for my $str (keys %{$self->{TMX}}) { $self->{TMX}{$str}{'en'} = $str; } } =head2 create_tmx TODO: Document function =cut sub create_tmx { my $self = shift; my $tmx = new XML::TMX::Writer(); __common_conf($self, @_); my $n_langs = @{$self->{LANG}}; if(defined($self->{OUTPUT})) { $tmx->start_tmx(ID => 'XML::TMX::FromPO', OUTPUT => $self->{OUTPUT}); } else { $tmx->start_tmx(ID => 'XML::TMX::FromPO'); } for my $chave (keys %{$self->{TMX}}) { my $reg = __make_tu($self, $self->{TMX}{$chave}); # only write to file if all languages are defined $tmx->add_tu(%{$reg}) if(keys(%{$reg}) >= $n_langs); } $tmx->end_tmx(); } =head2 clean_tmx TODO: Document method =cut sub clean_tmx { my $self = shift; $self->{TMX} = {}; } sub __make_tu { my $self = shift; my $block = shift; my $reg = {}; if(!defined($self->{LANG})) { return($block); } for my $lang (keys %$block) { $reg->{$lang} = $block->{$lang} if(__check_lang($self, $lang)); } return($reg); } sub __processa { my $self = shift; my $a = shift; my $l = shift; local $/ = "\nmsgid"; #local $/ = "\nmsgid "; print STDERR "$a\n" if($self->{DEBUG}); my $codeline = `grep -i Content-Type $a | grep -i charset`; my $code = "?"; if($codeline =~ /charset=([\w-]+)/) { $code = $1; } my $convert = $self->{CONVERT}; $convert =~ s/\%t/$code/i; $convert =~ s/\%f/$a/i; if($code eq "?" || $code =~ /utf-?8/i ) { open(F,$a) or die;} else { open(F,"$convert|") or die;} my $mi = 0; while() { chomp; next if($mi == 0 && /^msgid\s+""/); if(/"Content-Type:/ && /charset=([\w-]+)/) { $code = $1; next } s/(^|\n)\s*#.*//g; # s/_//g unless $under; next unless(/\n\s*msgstr/); my ($m1,$m2) = ($`,$'); $m1 =~ s/(^\s*"|"\s*$)//g; $m1 =~ s/("\s*\n\s*")/ /g; $m2 =~ s/(^\s*"|"\s*$)//g; $m2 =~ s/("\s*\n\s*")/ /g; unless($m1) { warn "\n====M1 vazio... \n$m1\n=$m2\n"; next; } if($m2) { $self->{TMX}{$m1}{$l} = $m2; } # || "????? $m1"; #$self->{TMX}{$m1}{'en'} = $m1; # print "\n====\n$m1\n=$m2\n"; $mi++; } print STDERR "Charset: $code\n" if($self->{DEGUB}); close F; } sub __limpa { my $self = shift; # possíveis limpezas # (1) eliminar traduções que sejam igual ao original # (2) eliminar strings que não contenham pelo menos 2 letras # consecutivas # (3) eliminar frases que fiquem sem traduções # # um teste realizado com os po's do evolution mostrou uma redução do # ficheiro final de 12M para 8,6M, uma análise com o diff aos dumps # permitiu ver que grande parte do ''lixo'' eram de (1) for my $h1 (keys %{$self->{TMX}}) { if($h1 =~ /[a-z][a-z]/i) { for my $h2 (keys %{$self->{TMX}{$h1}}) { # optimização (1) delete($self->{TMX}{$h1}{$h2}) if($h2 !~ /^en/i && $h1 eq $self->{TMX}{$h1}{$h2}); } # optimização (3) delete($self->{TMX}{$h1}) unless(keys %{$self->{TMX}{$h1}}); } else { # optimização (2) delete($self->{TMX}{$h1}); } } } =pod =head1 COMMON CONFIGURATION These configuration options can be passed to all methods in the module: =over =item LANG => 'list' A case insensitive list of regular expression separated by whitespaces that matches the code of the languages that are to be processed. Defaults to all. =item CONVERT => 'iconv -f %t -t utf8 < %f' A string that contains the command to convert a file (%f) from some charset (%t) to Unicode. If none is specified, the module tries to use L, if it fails then the module defaults to L. =item OUTPUT => 'x.tmx' The name of the output file. If none is specified it defaults to the standard output. =item DEBUG => 1 Activate debugging information. Defaults to 0. =back =cut sub __common_conf { my $self = shift; my %opt = @_; if(defined($opt{LANG})) { my @list; for my $l (sort(split(/\s+/, $opt{LANG}))) { push(@list, $l) if($l =~ /^[a-z0-9_]+$/i); } $self->{LANG} = \@list if(@list); } $self->{CONVERT} = $opt{CONVERT} if defined($opt{CONVERT}); $self->{OUTPUT} = $opt{OUTPUT} if defined($opt{OUTPUT}); $self->{DEBUG} = $opt{DEBUG} if defined($opt{DEBUG}); } =head1 SEE ALSO L, L, L, L =head1 AUTHOR Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE =head1 COPYRIGHT AND LICENSE Copyright 2003 by Paulo Jorge Jesus Silva This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1; XML-TMX-0.31/lib/XML/TMX/Reader.pm000644 000765 000024 00000030506 12645001510 016244 0ustar00ambsstaff000000 000000 package XML::TMX::Reader; use 5.010; use warnings; use strict; use Exporter (); use vars qw($VERSION @ISA @EXPORT_OK); use XML::DT; use XML::TMX::Writer; $VERSION = '0.31'; @ISA = 'Exporter'; @EXPORT_OK = qw(); =encoding utf-8 =head1 NAME XML::TMX::Reader - Perl extension for reading TMX files =head1 SYNOPSIS use XML::TMX::Reader; my $reader = XML::TMX::Reader->new( $filename ); $reader -> for_tu( sub { my $tu = shift; #blah blah blah }); @used_languages = $reader->languages; $reader->to_html() =head1 DESCRIPTION This module provides a simple way for reading TMX files. =head1 METHODS The following methods are available: =head2 C This method creates a new XML::TMX::Reader object. This process checks for the existence of the file and extracts some meta-information from the TMX header; my $reader = XML::TMX::Reader->new("my.tmx"); =cut sub new { my ($class, $file) = @_; return undef unless -f $file; my $self = bless { encoding => _guess_encoding($file), filename => $file, ignore_markup => 1, } => $class; $self->_parse_header; return $self; } sub _guess_encoding { my $file = shift; my $encoding = 'UTF-8'; open my $fh, "<", $file or die "can't open $file"; my $line = <$fh>; if ($line =~ /encoding=['"]([^'"]+)['"]/) { $encoding = $1; } close $fh; return $encoding; } sub _parse_header { my $self = shift; my $header = ""; { local $/ = ""; open my $fh, "<:encoding($self->{encoding})", $self->{filename} or die "$!"; $header = <$fh>; close $fh; } $header =~ s/^.*().*$/$1/s; $header =~ s!().*$!$1!s; dtstring($header => ( 'header' => sub { $self->{header}{$_} = $v{$_} for (keys %v); }, 'prop' => sub { $v{type} ||= "_"; push @{$self->{header}{-prop}{$v{type}}}, $c; }, 'note' => sub { push @{$self->{header}{-note}}, $c; }, )); } =head2 C This method is used to set the flag to ignore (or not) markup inside translation unit segments. The default is to ignore those markup. If called without parameters, it sets the flag to ignore the markup. If you don't want to do that, use $reader->ignore_markup(0); =cut sub ignore_markup { my ($self, $opt) = @_; $opt = 1 unless defined $opt; $self->{ignore_markup} = $opt; } =head2 C This method returns the languages being used on the specified translation memory. Note that the module does not check for language code correctness or existence. =cut sub languages { my $self = shift; my %languages = (); $self->for_tu({proc_tu => 100}, sub { my $tu = shift; for ( keys %$tu ) { $languages{$_}++ unless m/^-/; } } ); return keys %languages; } =head2 C Use C to process all translation units from a TMX file. This version iterates for all tu (one at the time) The configuration hash is a reference to a Perl hash. At the moment these are valid options: =over =item C<-verbose> Set this option to a true value and a counter of the number of processed translation units will be printed to stderr. =item C<-output> | C Filename to output the changed TMX to. Note that if you use this option, your function should return a hash reference where keys are language names, and values their respective translation. =item C Write at most C TUs =item C Process at most C TUs =item C Only process TU that match C. =item C<-raw> Pass the XML directly to the method instead of parsing it. =item C<-verbatim> Use segment contents verbatim, without any normalization. =item C<-prop> A hashref of properties to be B to the TMX header block. =item C<-note> An arrayref of notes to be B to the TMX header block. =item C<-header> A boolean value. If set to true, the heading tags (and closing tag) of the TMX file are written. Otherwise, only the translation unit tags are written. =back The function will receive two arguments: =over =item * a reference to a hash which maps: the language codes to the respective translation unit segment; a special key "-prop" that maps property names to properties; a special key "-note" that maps to a list of notes. =item * a reference to a hash which contains the attributes for those translation unit tag; =back If you want to process the TMX and return it again, your function should return an hash reference where keys are the languages, and values their respective translation. =cut sub _merge_notes { my ($orig, $new) = @_; $orig //= []; $orig = [$orig] unless ref $orig eq "ARRAY"; $new = [$new] unless ref $new eq "ARRAY"; push @$orig => grep { my $x = $_; !grep { $x eq $_} @$orig } @$new; return $orig; } sub _merge_props { my ($orig, $new) = @_; die "-prop should be hash" if $orig and ref $orig ne "HASH"; die "-prop should be hash" if $new and ref $new ne "HASH"; for my $key (keys %$new) { $orig->{$key} = _merge_notes($orig->{$key}, $new->{$key}); } return $orig; } sub _compute_header { my ($current, $conf) = @_; my %header = %$current; if (exists($conf->{-note})) { $header{-note} = _merge_notes($header{-note}, $conf->{-note}); } if (exists($conf->{-prop})) { $header{-prop} = _merge_props($header{-prop}, $conf->{-prop}); } return \%header; } sub for_tu { my $self = shift; my $conf = { -header => 1 }; my $i = 0; ref($_[0]) eq "HASH" and $conf = {%$conf , %{shift(@_)}}; my $code = shift; die "invalid processor" unless ref($code) eq "CODE"; local $/; my $outputingTMX = 0; my $tmx; my $data; my $gen=0; my %h = ( -type => { tu => 'SEQ', tuv => 'SEQ' }, tu => sub { my $tu; for my $va (@$c) { if ($va->[0] eq "-prop") { push @{$tu->{$va->[0]}{$va->[1]}}, $va->[2] } elsif ($va->[0] eq "-note") { push @{$tu->{$va->[0]}}, $va->[1] } else { $tu->{$va->[0]} = $va->[1] } } my ($ans, $v) = $code->($tu, \%v); # Check if the user wants to create a TMX and # forgot to say us if (!$outputingTMX && $ans && ref($ans) eq "HASH") { $outputingTMX = 1; $tmx = XML::TMX::Writer->new(); if ($conf->{-header}) { my $header = _compute_header($self->{header}, $conf); $tmx->start_tmx(encoding => $self->{encoding}, %$header); } } # Add the translation unit if ($ans && ref($ans) eq "HASH") { $gen++; %v = %$v if ($v && ref($v) eq "HASH"); my %ans = (%v, %$ans); $ans{"-n"}=$i if $conf->{n} ; $tmx->add_tu(-verbatim => $conf->{-verbatim}, %ans); } }, tuv => sub { my $tuv; for my $v (@$c) { if ($v->[0] eq "-prop") { push @{$tuv->{$v->[0]}{$v->[1]}}, $v->[2] } elsif ($v->[0] eq "-note") { push @{$tuv->{$v->[0]}}, $v->[1] } elsif ($v->[0] eq "-cdata") { $tuv->{-iscdata} = 1; $tuv->{-seg} = $v->[1]; } else { $tuv->{-seg} = $v->[0]; } } [ $v{lang} || $v{'xml:lang'} || "_" => $tuv ] }, prop => sub { ["-prop", $v{type} || "_", $c] }, note => sub { ["-note" , $c] }, seg => sub { return ($v{iscdata}) ? [ -cdata => $c ] : [ $c ] }, -cdata => sub { father->{'iscdata'} = 1; $c }, hi => sub { $self->{ignore_markup}?$c:toxml }, ph => sub { $self->{ignore_markup}?$c:toxml }, ); $/ = "\n"; $h{-outputenc} = $h{-inputenc} = $self->{encoding}; my $resto = ""; ## Go through the header... open X, "{encoding})" ,$self->{filename} or die "cannot open file $self->{filename}\n"; while () { if (/^\xFF\xFE/) { die("UTF16 encoding not supported; try 'iconv -f unicode -t utf8 tmx' before\n"); } next if /^\s*$/; last if /)(.*)!s) { $resto = $3; } # If we have an output filename, user wants to output a TMX $conf->{-output} = $conf->{output} if defined($conf->{output}); if (defined($conf->{-output})) { $outputingTMX = 1; $tmx = XML::TMX::Writer->new(); if ($conf->{-header}) { my $header = _compute_header($self->{header}, $conf); $tmx->start_tmx(encoding => $self->{encoding}, -output => $conf->{-output}, %$header); } } $/ = ""; $conf->{-verbose}++ if $conf->{verbose}; print STDERR "." if $conf->{-verbose}; while () { ($_ = $resto . $_ and $resto = "" ) if $resto; last if /<\/body>/; $i++; print STDERR "\r$i" if $conf->{-verbose} && !($i % 10); last if defined $conf->{proc_tu} && $i > $conf->{proc_tu} ; last if defined $conf->{gen_tu} && $gen > $conf->{gen_tu}; next if defined $conf->{patt} && !m/$conf->{patt}/ ; #### # This can't be done. Not sure why it was being done. # So, please, unless you know the implications for tagged crpora # do not uncomment it. # s/\>\s+/>/; undef($data); if ($conf->{'-raw'}) { my $ans = $code->($_); if ($conf->{-output}) { $ans->{"-n"}=$i if $conf->{n} ; $tmx->add_tu(-raw => $ans); } } else { eval { dtstring($_, %h) } ; ## dont die in invalid XML warn $@ if $@; } } print STDERR "\r$i\n" if $conf->{-verbose}; close X; $tmx->end_tmx if $conf->{-header} && $outputingTMX; } =head2 C Use this method to create a nice HTML file with the translation memories. Notice that this method is not finished yet, and relies on some images, on some specific locations. =cut sub to_html { my $self = shift; my %opt = @_; $self->for_tu(sub { my ($langs, $opts) = @_; my $ret = ""; for (keys %$langs) { next if /^-/; $ret .= "\n" } $ret .= "
"; if ($opt{icons}) { $ret .= "\"$_\"/" } else { $ret .= "$_" } $ret .= "$langs->{$_}{-seg}
"; $ret; } ); } sub for_tu2 { warn "Please update your code to use 'for_tu'\n"; &for_tu; } =head2 C deprecated. use C =head1 SEE ALSO L, TMX Specification L =head1 AUTHOR Alberto Simões, Ealbie@alfarrabio.di.uminho.ptE Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE J.João Almeida, Ejj@di.uminho.ptE =head1 COPYRIGHT AND LICENSE Copyright 2003-2012 by Projecto Natura This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1; XML-TMX-0.31/lib/XML/TMX/Writer.pm000644 000765 000024 00000026505 12645001510 016322 0ustar00ambsstaff000000 000000 package XML::TMX::Writer; use 5.004; use warnings; use strict; use Exporter (); use vars qw($VERSION @ISA @EXPORT_OK); $VERSION = '0.31'; @ISA = 'Exporter'; @EXPORT_OK = qw(); =encoding utf-8 =head1 NAME XML::TMX::Writer - Perl extension for writing TMX files =head1 SYNOPSIS use XML::TMX::Writer; my $tmx = XML::TMX::Writer->new(); $tmx->start_tmx(id => 'paulojjs'); $tmx->add_tu('en' => 'some text', 'pt' => 'algum texto'); $tmx->add_tu('en' => 'some text', 'pt' => 'algum texto', -note => [32, 34 ], -prop => { q => 23, aut => "jj"} ); $tmx->end_tmx(); =head1 DESCRIPTION This module provides a simple way for writing TMX files. =head1 METHODS The following methods are available: =head2 new $tmx = new XML::TMX::Writer(); Creates a new XML::TMX::Writer object =cut sub new { my $proto = shift; my $class = ref($proto) || $proto; my %ops = @_; my $self = { OUTPUT => \*STDOUT }; binmode $self->{OUTPUT}, ":utf8" unless exists $ops{-encoding} and $ops{-encoding} !~ /utf.?8/i; bless($self, $class); return($self); } =head2 start_tmx $tmx->start_tmx(-output => 'some_file.tmx'); Begins a TMX file. Several options are available: =over 4 =item -output Output of the TMX, if none is defined stdout is used by default. =item tool Tool used to create the TMX. Defaults to 'XML::TMX::Writer' =item toolversion Some version identification of the tool used to create the TMX. Defaults to the current module version =item segtype Segment type used in the ItuE> elements. Possible values are I, I, I and I. Defaults to I. =item srctmf Specifies the format of the translation memory file from which the TMX document or segment thereof have been generated. =item adminlang Specifies the default language for the administrative and informative elements InoteE> and IpropE>. =item srclang Specifies the language of the source text. If a ItuE> element does not have a srclang attribute specified, it uses the one defined in the IheaderE> element. Defaults to I<*all*>. =item datatype Specifies the type of data contained in the element. Depending on that type, you may apply different processes to the data. The recommended values for the datatype attribute are as follow (this list is not exhaustive): =over 4 =item unknown undefined =item alptext WinJoust data =item cdf Channel Definition Format =item cmx Corel CMX Format =item cpp C and C++ style text =item hptag HP-Tag =item html HTML, DHTML, etc =item interleaf Interleaf documents =item ipf IPF/BookMaster =item java Java, source and property files =item javascript JavaScript, ECMAScript scripts =item lisp Lisp =item mif Framemaker MIF, MML, etc =item opentag OpenTag data =item pascal Pascal, Delphi style text =item plaintext Plain text (default) =item pm PageMaker =item rtf Rich Text Format =item sgml SGML =item stf-f S-Tagger for FrameMaker =item stf-i S-Tagger for Interleaf =item transit Transit data =item vbscript Visual Basic scripts =item winres Windows resources from RC, DLL, EXE =item xml XML =item xptag Quark XPressTag =back =item srcencoding All TMX documents are in Unicode. However, it is sometimes useful to know what code set was used to encode text that was converted to Unicode for purposes of interchange. This option specifies the original or preferred code set of the data of the element in case it is to be re-encoded in a non-Unicode code set. Defaults to none. =item id Specifies the identifier of the user who created the element. Defaults to none. =item -note A reference to a list of notes to be added in the header. =item -prop A reference fo a hash of properties to be added in the header. Keys are used as the C attribute, value as the tag contents. =back =cut sub start_tmx { my $self = shift; my %options = @_; my %o; my @time = gmtime(time); $o{'creationdate'} = sprintf("%d%02d%02dT%02d%02d%02dZ", $time[5]+1900, $time[4]+1, $time[3], $time[2], $time[1], $time[0]); my $encoding = $options{encoding} || "UTF-8"; if (defined($options{'-output'})) { delete $self->{OUTPUT}; # because it is a glob open $self->{OUTPUT}, ">", $options{'-output'} or die "Cannot open file '$options{'-output'}': $!\n"; } if ($encoding =~ m!utf.?8!i) { binmode $self->{OUTPUT}, ":utf8" } $self->_write("\n"); my @valid_segtype = qw'block sentence paragraph phrase'; if(defined($options{SEGTYPE}) && grep { $_ eq $options{SEGTYPE} } @valid_segtype) { $o{segtype} = $options{SEGTYPE}; } else { $o{segtype} = 'sentence' } $o{'creationtool'} = $options{tool} || 'XML::TMX::Writer'; $o{'creationtoolversion'} = $options{toolversion} || $VERSION; $o{'o-tmf'} = $options{srctmf} || 'plain text'; $o{'adminlang'} = $options{adminlang} || 'en'; $o{'srclang'} = $options{srclang} || 'en'; $o{'datatype'} = $options{datatype} || 'plaintext'; defined($options{srcencoding}) and $o{'o-encoding'} = $options{srcencoding}; defined($options{id}) and $o{'creationid'} = $options{id}; $self->_startTag(0, 'tmx', 'version' => 1.4)->_nl; $self->_startTag(1, 'header', %o)->_nl; $self->_write_props(2, $options{'-prop'}) if defined $options{'-prop'}; $self->_write_notes(2, $options{'-note'}) if defined $options{'-note'}; $self->_indent(1)->_endTag('header')->_nl; $self->_startTag(0,'body')->_nl->_nl; } sub _write_props { my ($self, $indent, $props) = @_; return unless ref($props) eq "HASH"; for my $key (sort keys %$props) { if (ref($props->{$key}) eq "ARRAY") { for my $val (@{$props->{$key}}) { if ($key eq "_") { $self->_startTag($indent, 'prop'); } else { $self->_startTag($indent, prop => (type => $key)); } $self->_characters($val); $self->_endTag('prop')->_nl; } } else { if ($key eq "_") { $self->_startTag($indent, 'prop'); } else { $self->_startTag($indent, prop => (type => $key)); } $self->_characters($props->{$key}); $self->_endTag('prop')->_nl; } } } sub _write_notes { my ($self, $indent, $notes) = @_; return unless ref($notes) eq "ARRAY"; for my $p (@{$notes}) { $self->_startTag($indent, 'note'); $self->_characters($p); $self->_endTag('note')->_nl; } } =head2 add_tu $tmx->add_tu(srclang => LANG1, LANG1 => 'text1', LANG2 => 'text2'); $tmx->add_tu(srclang => LANG1, LANG1 => 'text1', LANG2 => 'text2', -note => ["value1", ## notes "value2"], -prop => { type1 => ["value1","value"], #multiple values _ => 'value2', # anonymound properties typen => ["valuen"],} ); Adds a translation unit to the TMX file. Several optional labels can be specified: =over =item id Specifies an identifier for the ItuE> element. Its value is not defined by the standard (it could be unique or not, numeric or alphanumeric, etc.). =item srcencoding Same meaning as told in B method. =item datatype Same meaning as told in B method. =item segtype Same meaning as told in B method. =item srclang Same meaning as told in B method. =back =cut sub add_tu { my $self = shift; my %tuv = @_; my %prop = (); my @note = (); my %opt; my $verbatim = 0; my $cdata = 0; if (exists($tuv{-raw})) { # value already includes tags, hopefully, at least! # so we will not mess with it. $self->_write($tuv{-raw}); return; } for my $key (qw'id datatype segtype srclang creationid creationdate') { if (exists($tuv{$key})) { $opt{$key} = $tuv{$key}; delete $tuv{$key}; } } if (defined($tuv{srcencoding})) { $opt{'o-encoding'} = $tuv{srcencoding}; delete $tuv{srcencoding}; } $verbatim++ if defined $tuv{-verbatim}; delete $tuv{-verbatim} if exists $tuv{-verbatim}; if (defined($tuv{"-prop"})) { %prop = %{$tuv{"-prop"}}; delete $tuv{"-prop"}; } if (defined($tuv{"-note"})) { @note = @{$tuv{"-note"}}; delete $tuv{"-note"}; } if (defined($tuv{"-n"})) { $opt{id}=$tuv{"-n"}; delete $tuv{"-n"}; } $self->_startTag(0,'tu', %opt)->_nl; ### write the prop s problemas 23 $self->_write_props(3, \%prop); $self->_write_notes(3, \@note); for my $lang (sort keys %tuv) { my $cdata = 0; $self->_startTag(1, 'tuv', 'xml:lang' => $lang); if (ref($tuv{$lang}) eq "HASH") { $cdata++ if defined($tuv{$lang}{-iscdata}); delete($tuv{$lang}{-iscdata}) if exists($tuv{$lang}{-iscdata}); $self->_write_props(2, $tuv{$lang}{-prop}) if exists $tuv{$lang}{-prop}; $self->_write_notes(2, $tuv{$lang}{-note}) if exists $tuv{$lang}{-note}; $tuv{$lang} = $tuv{$lang}{-seg} || ""; } $self->_startTag(0, 'seg'); if ($verbatim) { $self->_write($tuv{$lang}); } elsif ($cdata) { $self->_write("_write($tuv{$lang}); $self->_write("]]>"); } else { $self->_characters($tuv{$lang}); } $self->_endTag('seg'); $self->_endTag('tuv')->_nl; } $self->_endTag('tu')->_nl->_nl; } =head2 end_tmx $tmx->end_tmx(); Ends the TMX file, closing file handles if necessary. =cut sub end_tmx { my $self = shift(); $self->_endTag('body')->_nl; $self->_endTag('tmx')->_nl; close($self->{OUTPUT}); } =head1 SEE ALSO TMX Specification L =head1 AUTHOR Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE Alberto Simões, Ealbie@alfarrabio.di.uminho.ptE =head1 COPYRIGHT AND LICENSE Copyright 2003 by Projecto Natura This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut sub _write { my $self = shift; print {$self->{OUTPUT}} @_; return $self; } sub _nl { my $self = shift; $self->_write("\n"); } sub _startTag { my ($self, $indent, $tagName, %attributes) = @_; my $attributes = ""; $attributes = " ".join(" ",map {"$_=\"$attributes{$_}\""} sort keys %attributes) if %attributes; $self->_indent($indent)->_write("<$tagName$attributes>"); } sub _indent { my ($self, $indent) = @_; $indent = " " x $indent; $self->_write($indent); } sub _characters { my ($self, $text) = @_; $text = "" unless defined $text; $text =~ s/\n/ /g; $text =~ s/ +/ /g; $text =~ s/&/&/g; $text =~ s//>/g; $self->_write($text); } sub _endTag { my ($self, $tagName) = @_; $self->_write(""); } 1; XML-TMX-0.31/examples/po2tmx000755 000765 000024 00000003717 11764151056 015643 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -w -s our($s); use Data::Dumper; use strict; use lib 'lib'; use XML::TMX::FromPO; my $tmx = new XML::TMX::FromPO(DEBUG => 1); my %convert = ( 'pt_en' => 'pt en', 'pt_es' => 'pt es', 'pt_de' => 'pt de', 'pt_it' => 'pt it', 'pt_ru' => 'pt ru', 'pt_fr' => 'pt fr', 'pt_br' => 'pt pt_BR', 'br_en' => 'pt_BR en', 'br_es' => 'pt_BR es', 'br_de' => 'pt_BR de', 'br_it' => 'pt_BR it', 'br_ru' => 'pt_BR ru', 'br_fr' => 'pt_BR fr', 'en_fr' => 'en fr' ); if($s){ my $lang = $s; if ($s eq "1") { die("Usage $0 -s=pt DIR\n")} while(my $dir = shift()) { $tmx->rec_get_po($dir, $lang, LANG=>"$lang en"); open(XML, "| tee _1.aux |xmllint --recover --format - | bzip2 > $lang-en.tmx.bz2"); # print STDOUT Dumper($tmx); *STDOUT = *XML; $tmx->create_tmx(LANG => "$lang en"); close(XML); $tmx->clean_tmx() } } else{ my $lang = ''; for my $c (keys %convert) { $lang = $lang . " $convert{$c}"; } while(my $dir = shift()) { if(-d "$dir/po" and <$dir/po/*.po>){ $tmx->parse_dir("$dir/po", LANG => $lang) ; } if(-d "$dir" and <$dir/*.po>) { $tmx->parse_dir("$dir", LANG => $lang) ; } $dir =~ m/([a-z0-9_\.-]+)\/*$/i; for my $conv (keys %convert) { open(XML, "|xmllint --format - | bzip2 > $1_$conv.tmx.bz2"); *STDOUT = *XML; $tmx->create_tmx(LANG => "$convert{$conv}"); close(XML); } $tmx->clean_tmx() } } =encoding UTF-8 =head1 NAME po2tmx - creates TMX files from PO files =head1 SYNOPSIS po2tmx POdir # po2tmx -s=fr #single PO files en-fr (does not join multiple PO-files) =head1 DESCRIPTION read several POs, and joinds their translation units to build several TMX pairs. =head1 SEE ALSO XML::TMX =cut XML-TMX-0.31/examples/xpdf-tmx000755 000765 000024 00000003527 12221072235 016146 0ustar00ambsstaff000000 000000 #!/usr/bin/perl -s ### NOT IN DISTRO our ($m); $m ||= 1000; my $tmp="/tmp/_tmxpdf_$$_.pdf"; my $tmx=shift or die("usage:\n\t$0 file.tmx\n"); use strict; if(not -d "$ENV{HOME}/.gwb"){ mkdir("$ENV{HOME}/.gwb"); } if(not -f "$ENV{HOME}/.gwb/tmx.css"){ open(F,">","$ENV{HOME}/.gwb/tmx.css") or die("cant create'$ENV{HOME}/.gwb/tmx.css'\n"); while(){ last if /END/; print F $_} close F; } system("prince -s $ENV{HOME}/.gwb/tmx.css $tmx $tmp 2>/dev/null"); system("xpdf -z width -cont -g 1500x1050 $tmp"); unlink $tmp; __DATA__ @charset "UTF-8"; tu { display: block; border-top: solid red 1px; padding: 2pt; counter-increment: num; columns:2; } tuv { display: block; page-break-inside: avoid; margin-bottom: 4pt; } tuv {color:orange; font-size:70% ; } tuv:lang(pt) {color:green; } tuv:lang(pt_pt){color:green; } tuv:lang(pt-pt){color:green; } tuv:lang(PT_PT){color:green; } tuv:lang(PT-PT){color:green; } tuv::before {color: black; font-size:70%; } tuv[lang]::before { content:counter(num) " " attr(lang) " "; } tuv:lang(pt)::before { content:counter(num) " PT "; } tuv:lang(en)::before { content:counter(num) " EN "; } tuv:lang(en_GB)::before { content:counter(num) " EN "; } tuv:lang(en_US)::before { content:counter(num) " EN "; } seg { } header{ display:none; } @page{ margin: 10mm 10mm 10mm 10mm; columns:2; size:A4; } @screen{ margin: 8mm 8mm 8mm 8mm; columns:2; size:screen; } /* END */ =head1 NAME xpdf-tmx - convert TMX files to PDF and show them =head1 SYNOPSIS xpdf-tmx file.tmx =head1 DESCRIPTION =head2 Dependencies C converter must be installed xpdf =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). prince xpdf =cut