XML-TMX-0.31/ 000755 000765 000024 00000000000 12645001536 012632 5 ustar 00ambs staff 000000 000000 XML-TMX-0.31/Changes 000644 000765 000024 00000006731 12645001531 014127 0 ustar 00ambs staff 000000 000000 Revision history for Perl extension XML::TMX.
0.31 Mon Jan 11 19:32:33 WET 2016
- Fixed utf8 POD.
0.30 Mon Jan 11 19:07:37 WET 2016
- Added tsv2tmx script.
0.29 Sat Oct 10 20:04:11 WEST 2015
- Consistent version numbers
0.28 Thu Sep 17 14:24:20 WEST 2015
- Fixed TMX spec URL (thanks to Alexander Becker)
0.27 Mon Aug 31 18:14:12 WEST 2015
- Fixed tmx-POStagger.
0.26 Tue Apr 28 19:08:28 WEST 2015
- Fixed bug when processing CDATA elements.
- Added -raw option to for_tu
- Reworked TMX annotated format.
- Various improvements in tmx-POStagger.
0.25 Tue Jul 30 16:06:58 WEST 2013
- tmx2tmx -cat : fix a bug related with a missing ":utf8"
- tmxclean -len: remove tu if len > 50 and len-min * 2 < len-max
- tmxclean -eq: remove tu if equal ( seg(l1)=seg(l2) )
- output formatting of the TMX slightly changed (tu = lines/5)
0.24 Thu Nov 29 11:51:03 WET 2012
- Require Perl v5.10
0.23 Nov 24, 2012
- sort languages when writing (good for tests)
0.22 Jun 6, 2012
- more UTF-8 pod stuff
0.21 Jun 5, 2012
- properly encode UTF-8
Scripts
- tmx2tmx -cat preserves props and notes from all documents
- add POD to some scripts
- added tmx-tokenize
- Moved some scripts to example folder
Reader
- save info about CDATA sections
- add -raw option on for_tu2
- for_tu2 renamed as for_tu
- reads header correctly (props and notes)
Writer
- write correctly CDATA sections
- better output
- handle header props and notes
- more testing
0.20 Jan 31, 2012
- add 'verbose' option to for_tu family of methods
0.19 Jan 27, 2012
- fix 'ignore_markup' in Reader.pm (thanks to Achim Ruopp)
0.18 Feb 09, 2011
- tmxwc working for multiple files;
- Fixed tmxuniq to work without complaining on unicode;
0.17 Dec 22, 2008
- Added tmxwc and tmxclean scripts.
0.16 Fev 5, 2007
- refactored for_tu2 function (it used $&, $` and $').
0.15 Dec 9, 2006
- new for_tu2 function that will soon replace for_tu
more efficient for big files
- languages function just checks the first few 100 TUs
0.14 Jun 7, 2006
- in XML::TMX::Writer, if we know the encoding, use binmode
to force it.
0.13 Jul 20, 2005
- for_tu now handles direct output for another TMX file.
- removed some newlined in the output.
0.12 Jul 17, 2005
- added some tests to XML::TMX::Writer;
- changed test names to be tested in order;
0.11 Jul 12, 2005
- corrected stupid bug lying around since 0.05
0.10 Sep 30, 2004
0.09 - problem with upload to CPAN;
0.09 - problem with upload to CPAN;
- added pod and pod-coverage tests;
- added documentation;
0.07 May 18, 2004
- option to remove sub-tags is now global and not specific for
the for_tu command;
0.06 Apr 22, 2004
- corrected bug on test files
- tmx2tmx is installed
- tmx2tmx conversion to TRADOS TMX1.1 format
- tmx2tmx basic TMX cleaner
0.05 Jan 07, 2004
- removed dependency with XML::Writer;
- option to remove sub-tags when processing s;
0.04 Nov 11, 2003
- use xml:lang instead of lang if the first exists
0.03 Oct 12, 2003
- Many bugs were found. Here is a working version (we hope)
0.02 Oct 09, 2003
- Corrected warnings with perl 5.8.1
0.01 Sep 14, 2003
- Created XML::TMX::{Reader|Query}
XML-TMX-0.31/examples/ 000755 000765 000024 00000000000 12645001535 014447 5 ustar 00ambs staff 000000 000000 XML-TMX-0.31/lib/ 000755 000765 000024 00000000000 12645001535 013377 5 ustar 00ambs staff 000000 000000 XML-TMX-0.31/Makefile.PL 000644 000765 000024 00000002060 12644775453 014621 0 ustar 00ambs staff 000000 000000 use ExtUtils::MakeMaker;
use 5.010;
WriteMakefile(
'NAME' => 'XML::TMX',
'VERSION_FROM' => 'lib/XML/TMX.pm',
'AUTHOR' => 'Projecto Natura',
'ABSTRACT' => 'TMX tools',
'EXE_FILES' => [
qw!scripts/tmxsplit
scripts/tmxuniq
scripts/tmx-POStagger
scripts/tmx-tokenize
scripts/tmxclean
scripts/tmxwc
scripts/tmx2html
scripts/tmxgrep
scripts/tsv2tmx
scripts/tmx2tmx!],
'PREREQ_PM' => {
'Test::Pod' => '1.00',
'Test::Pod::Coverage' => '0.08',
'XML::DT' => '0.57',
'Exporter' => '0',
'Pod::Usage' => 0,
'Getopt::Long' => 0,
},
);
XML-TMX-0.31/MANIFEST 000644 000765 000024 00000001257 12645001537 013771 0 ustar 00ambs staff 000000 000000 Makefile.PL
MANIFEST
README
Changes
lib/XML/TMX/Writer.pm
lib/XML/TMX/FromPO.pm
lib/XML/TMX/Reader.pm
lib/XML/TMX.pm
scripts/tmx2tmx
scripts/tmx2html
scripts/tmxsplit
scripts/tmxuniq
scripts/tmxgrep
scripts/tmxwc
scripts/tmxclean
scripts/tmx-explode
scripts/tmx-tokenize
scripts/tmx-POStagger
examples/xpdf-tmx
examples/po2tmx
t/01_main.t
t/10_writer.t
t/20_reader.t
t/25_for_tu.t
t/50_tmxcat.t
t/80_frompo.t
t/98_pod.t
t/99_pod-coverage.t
t/sample.tmx
t/writer1.xml
t/writer2.xml
t/cat.xml
META.yml Module meta-data (added by MakeMaker)
t/00-aux.t
scripts/tsv2tmx
META.json Module JSON meta-data (added by MakeMaker)
XML-TMX-0.31/META.json 000644 000765 000024 00000002054 12645001536 014254 0 ustar 00ambs staff 000000 000000 {
"abstract" : "TMX tools",
"author" : [
"Projecto Natura"
],
"dynamic_config" : 1,
"generated_by" : "ExtUtils::MakeMaker version 7.1, CPAN::Meta::Converter version 2.150005",
"license" : [
"unknown"
],
"meta-spec" : {
"url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
"version" : "2"
},
"name" : "XML-TMX",
"no_index" : {
"directory" : [
"t",
"inc"
]
},
"prereqs" : {
"build" : {
"requires" : {
"ExtUtils::MakeMaker" : "0"
}
},
"configure" : {
"requires" : {
"ExtUtils::MakeMaker" : "0"
}
},
"runtime" : {
"requires" : {
"Exporter" : "0",
"Getopt::Long" : "0",
"Pod::Usage" : "0",
"Test::Pod" : "1.00",
"Test::Pod::Coverage" : "0.08",
"XML::DT" : "0.57"
}
}
},
"release_status" : "stable",
"version" : "0.31",
"x_serialization_backend" : "JSON::PP version 2.27300"
}
XML-TMX-0.31/META.yml 000644 000765 000024 00000001155 12645001536 014105 0 ustar 00ambs staff 000000 000000 ---
abstract: 'TMX tools'
author:
- 'Projecto Natura'
build_requires:
ExtUtils::MakeMaker: '0'
configure_requires:
ExtUtils::MakeMaker: '0'
dynamic_config: 1
generated_by: 'ExtUtils::MakeMaker version 7.1, CPAN::Meta::Converter version 2.150005'
license: unknown
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
version: '1.4'
name: XML-TMX
no_index:
directory:
- t
- inc
requires:
Exporter: '0'
Getopt::Long: '0'
Pod::Usage: '0'
Test::Pod: '1.00'
Test::Pod::Coverage: '0.08'
XML::DT: '0.57'
version: '0.31'
x_serialization_backend: 'CPAN::Meta::YAML version 0.018'
XML-TMX-0.31/README 000644 000765 000024 00000000572 12644776723 013536 0 ustar 00ambs staff 000000 000000 XML::TMX
========
INSTALLATION
To install this module type the following:
perl Makefile.PL
make
make test
make install
Note that some installed scripts require modules that are not required
for XML::TMX installation. This gives you the possibility to install
quickly XML::TMX for your use, but probably installs some commands
that will not work out-of-the-box.
XML-TMX-0.31/scripts/ 000755 000765 000024 00000000000 12645001535 014320 5 ustar 00ambs staff 000000 000000 XML-TMX-0.31/t/ 000755 000765 000024 00000000000 12645001535 013074 5 ustar 00ambs staff 000000 000000 XML-TMX-0.31/t/00-aux.t 000644 000765 000024 00000002322 12004264465 014274 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl
use XML::TMX::Reader;
use Test::More;
is_deeply(XML::TMX::Reader::_merge_notes(undef, "foo"), ["foo"]);
is_deeply(XML::TMX::Reader::_merge_notes(undef, ["foo"]), ["foo"]);
is_deeply(XML::TMX::Reader::_merge_notes([], ["foo"]), ["foo"]);
is_deeply(XML::TMX::Reader::_merge_notes("foo","foo"), ["foo"]);
is_deeply([ sort @{ XML::TMX::Reader::_merge_notes("foo", ["foo", "bar"]) }],
[ sort (qw'foo bar') ]);
is_deeply([ sort @{ XML::TMX::Reader::_merge_notes([qw'a b c e'], [qw'c d e f']) }],
[ sort (qw'a b c d e f') ]);
is_deeply(XML::TMX::Reader::_merge_props(undef, {foo => 'bar'}), {foo=>['bar']});
is_deeply(XML::TMX::Reader::_merge_props({foo => 'bar'}, {foo => 'bar'}),
{foo=>['bar']});
my $m = XML::TMX::Reader::_merge_props({foo => 'ugh'}, {foo => 'bar'});
ok exists $m->{foo};
is_deeply [ sort @{$m->{foo}} ], [sort(qw'bar ugh')];
is_deeply(XML::TMX::Reader::_merge_props({foz => 'bar'}, {foo => 'bar'}),
{foz=>'bar', foo=>['bar']});
$m = XML::TMX::Reader::_merge_props({foz => 'bar'}, {foz => 'baz', foo => 'bar'});
ok exists $m->{foo};
ok exists $m->{foz};
is_deeply [ sort @{$m->{foz}} ], [sort(qw'bar baz')];
is_deeply $m->{foo}, ['bar'];
done_testing();
XML-TMX-0.31/t/01_main.t 000644 000765 000024 00000000102 11735643720 014505 0 ustar 00ambs staff 000000 000000 # -*- cperl -*-
use Test::More tests => 1;
use XML::TMX;
ok 1;
XML-TMX-0.31/t/10_writer.t 000644 000765 000024 00000003046 12606260725 015105 0 ustar 00ambs staff 000000 000000 # -*- cperl -*-
use Test::More tests => 4;
use XML::TMX::Writer;
ok 1;
my $tmx = new XML::TMX::Writer();
isa_ok($tmx, "XML::TMX::Writer");
$tmx->start_tmx(id => "foobar",
-prop => {
prop1 => 'val1',
prop2 => 'val2',
},
-note => [
'note1', 'note2', 'note3'
],
-output => "_${$}_");
$tmx->add_tu(srclang => 'en',
-note => ['snote1', 'snote2', 'snote3'],
-prop => { sprop1 => 'sval1',
sprop2 => 'sval2' },
'en' => {-prop => { a=>'b',c=>'d'},
-note => [qw,a b c d,],
-seg =>'some text', },
'pt' => 'algum texto');
$tmx->end_tmx();
ok(-f "_${$}_");
ok file_contents_almost_identical("t/writer1.xml", "_${$}_");
unlink "_${$}_";
sub file_contents_almost_identical {
my ($file1, $file2) = @_;
return 0 unless -f $file1;
return 0 unless -f $file2;
open F1, $file1 or die;
open F2, $file2 or die;
my ($l1,$l2);
while (defined($l1 = ) && defined($l2 = )) {
s/>\s*>;
return 0 if ;
return 1;
}
XML-TMX-0.31/t/20_reader.t 000644 000765 000024 00000003663 11742354675 015051 0 ustar 00ambs staff 000000 000000 # -*- cperl -*-
use Test::More tests => 10;
use XML::TMX::Reader;
ok 1;
my $reader;
$reader = XML::TMX::Reader->new('foobar.tmx');
ok(!$reader,"rigth foobar.tmx is not present");
$reader = XML::TMX::Reader->new('t/sample.tmx');
ok($reader, "reading sample.tmx");
is_deeply($reader->{header}, { '-prop' => { 'bodykey' => ['bodyvalue'] },
'-note' => [ 'bodynote' ],
'o-tmf' => 'TW4Win 2.0 Format',
adminlang => 'EN-US',
creationdate => '20020312T164816Z',
creationtoolversion => '5.0',
creationtool => 'MyTool',
srclang => 'EN-GB',
segtype => 'sentence',
datatype => 'html',
});
my $count = 0;
$reader->for_tu( sub {
my $tu = shift;
$count++;
});
is($count, 7, "counting tu's with for_tu");
$reader->for_tu( { -output => "t/_tmp.tmx", },
sub {
my $tu = shift;
$tu->{-prop}={q=>[77], aut=>["jj","ambs"]};
$tu->{-note}=[2..5];
$tu;
});
ok( -f "t/_tmp.tmx");
$reader = XML::TMX::Reader->new('t/_tmp.tmx');
ok $reader,"loading t/_tmp.tmx";
$reader->for_tu( {output => "t/_tmp2.tmx", },
sub {
my $tu = shift;
for (keys %{$tu->{-prop}}){
$tu->{-prop}{$_} .= "batatas";
}
for (@{$tu->{-note}}){
$_ = "$_ cabolas"
}
$tu;
});
my @langs = $reader->languages;
is(@langs, 2 , "languages".join(",",@langs));
ok(grep { $_ eq "EN-GB" } @langs, "en");
ok(grep { $_ eq "PT-PT" } @langs, "pt");
unlink( "t/_tmp.tmx");
unlink( "t/_tmp2.tmx");
XML-TMX-0.31/t/25_for_tu.t 000644 000765 000024 00000002420 11736072550 015070 0 ustar 00ambs staff 000000 000000 # -*- cperl -*-
use Data::Dumper;
use Test::More tests => 9;
BEGIN {
use_ok(XML::TMX::Reader);
};
my $reader;
$reader = XML::TMX::Reader->new('t/sample.tmx');
ok($reader, "reading sample.tmx");
my $count = 0;
$reader->for_tu( sub {
my $tu = shift;
$count++;
});
is($count, 7, "counting tu's with for_tu");
$count = 0;
$reader->for_tu( {output => "t/_tmp.tmx", proc_tu => 6 },
sub {
my $tu = shift;
$tu->{-prop}={number => ++$count};
$tu;
});
ok( -f "t/_tmp.tmx");
#unlink( "t/_tmp.tmx");
$reader = XML::TMX::Reader->new('t/_tmp.tmx');
ok($reader,"loadind t/_tmp.tmx");
$count = 0;
$reader->for_tu( sub {
my $tu = shift;
$count++;
});
is($count, 6, "counting tu's with for_tu");
$reader->for_tu( {output => "t/_tmp2.tmx", gen_tu=>2},
sub {
my $tu = shift;
if($tu->{-prop}{number} % 2 == 0) { return undef }
else { $tu->{-note}[0]="This one is even";
return $tu;}
});
my @langs = $reader->languages;
is(@langs, 2 , "languages".join(",",@langs));
ok(grep { $_ eq "EN-GB" } @langs, "en");
ok(grep { $_ eq "PT-PT" } @langs, "pt");
unlink( "t/_tmp.tmx");
unlink( "t/_tmp2.tmx");
XML-TMX-0.31/t/50_tmxcat.t 000644 000765 000024 00000001727 12606261055 015076 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl
use strict;
use warnings;
use XML::TMX::Reader;
use Test::More tests => 4;
`$^X -Iblib scripts/tmx2tmx -cat t/writer1.xml t/writer2.xml > mycat.tmx`;
ok -f 'mycat.tmx';
my $reader = XML::TMX::Reader->new('mycat.tmx');
isa_ok $reader => 'XML::TMX::Reader';
like $reader->{header}{creationdate} => qr/^\d+T\d+Z$/;
delete $reader->{header}{creationtoolversion};
delete $reader->{header}{creationdate};
is_deeply($reader->{header},
{
'o-tmf' => 'plain text',
adminlang => 'en',
creationtool => 'XML::TMX::Writer',
srclang => 'en',
segtype => 'sentence',
datatype => 'plaintext',
'-note' => [qw.note1 note2 note3 note4 note5 note6.],
'-prop' => {prop3 => ['val3'],
prop4 => ['val4'],
prop2 => ['val2', 'val22'],
prop1 => ['val1', 'val11'],
},
});
#unlink 'mycat.tmx';
XML-TMX-0.31/t/80_frompo.t 000644 000765 000024 00000000125 10730327732 015073 0 ustar 00ambs staff 000000 000000 # -*- cperl -*-
use Test::More tests => 1;
BEGIN {
use_ok(XML::TMX::FromPO);
};
XML-TMX-0.31/t/98_pod.t 000644 000765 000024 00000000201 10730327732 014357 0 ustar 00ambs staff 000000 000000 use Test::More;
eval "use Test::Pod 1.00";
plan skip_all => "Test::Pod 1.00 required for testing POD" if $@;
all_pod_files_ok();
XML-TMX-0.31/t/99_pod-coverage.t 000644 000765 000024 00000000244 10730327732 016160 0 ustar 00ambs staff 000000 000000 use Test::More;
eval "use Test::Pod::Coverage 0.08";
plan skip_all => "Test::Pod::Coverage 0.08 required for testing POD coverage" if $@;
all_pod_coverage_ok();
XML-TMX-0.31/t/cat.xml 000644 000765 000024 00000003022 11736074667 014402 0 ustar 00ambs staff 000000 000000
val11
val22
val3
val4
note1
note2
note3
note4
note5
note6
sval1
sval2
snote1
snote2
snote3
algum texto
b
d
a
b
c
d
some text
svalA
svalB
snoteA
snoteB
snoteC
Hello World
2
4
1
2
3
4
olá mundo
XML-TMX-0.31/t/sample.tmx 000644 000765 000024 00000007611 11736103573 015121 0 ustar 00ambs staff 000000 000000
1
tu nr 1
1
tuv nr 1 en
From Paul, called to be an apostle of Christ Jesus by the
will of God, and from Sosthenes, our brother,
2
tuv nr 2 pt
Paulo, Apóstolo de Jesus Cristo por vontade e chamamento
de Deus, e o irmão Sóstenes,
2
tu nr 2
to God's Church which is in Corinth; to you whom God has
sanctified in Christ Jesus and called to be holy, together
with those who everywhere call upon the name of our Lord
Christ Jesus, their Lord and ours.
à Igreja de Deus que está em Corinto. Dirigimo-nos
àqueles que foram santificados em Jesus Cristo e chamados a
ser santos, juntamente com todos os que invocam em todo o
lugar o nome de nosso Senhor Jesus Cristo, Senhor deles e
nosso.
3
tu nr 3
Receive grace and peace from God our Father, and Christ
Jesus our Lord.
Graça e paz vos sejam dadas da parte de Deus nosso Pai e
do Senhor Jesus Cristo.
4
tu nr 4
I give thanks constantly to my God for you and for the
grace of God given to you in Christ Jesus.
Sem cessar, agradeço a Deus por vossa causa, em vista da
graça de Deus que vos foi concedida em Jesus Cristo.
5
tu nr 5
For you have been fully enriched in him with words as
well as with knowledge,
Pois em Jesus é que recebestes todas as riquezas, tanto
da palavra como do conhecimento.
6
tu nr 6
even as the testimony concerning Christ was confirmed in
you.
Na verdade, o testemunho de Cristo tornou-se firme em
vós,
7
tu nr 7
You do not lack any spiritual gift and only await the
glorious coming of Christ Jesus, our Lord.
a tal ponto que não vos falta nenhum dom, a vós que
esperais a Revelação de nosso Senhor Jesus Cristo.
XML-TMX-0.31/t/writer1.xml 000644 000765 000024 00000001616 12221072234 015211 0 ustar 00ambs staff 000000 000000
val1
val2
note1
note2
note3
sval1
sval2
snote1
snote2
snote3
b
d
a
b
c
d
some text
algum texto
XML-TMX-0.31/t/writer2.xml 000644 000765 000024 00000001752 11736074650 015231 0 ustar 00ambs staff 000000 000000
val11
val22
val3
val4
note4
note5
note6
svalA
svalB
snoteA
snoteB
snoteC
Hello World
2
4
1
2
3
4
olá mundo
XML-TMX-0.31/scripts/tmx-explode 000644 000765 000024 00000003262 11736073204 016516 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl
use warnings;
use strict;
use XML::TMX::Reader;
my @files = grep {-f $_} @ARGV;
for my $file (@files) {
print STDERR "exploding $file..";
my $reader = XML::TMX::Reader->new($file);
my $name = $file;
$name =~ s/\.tmx//;
my %fh;
my $i = 1;
$reader->for_tu(
sub {
my ($langs) = @_;
print STDERR "." unless $i % 10;
for my $l (keys %$langs) {
next if $l eq "-prop";
mkdir $l unless -d $l;
$langs->{$l} =~ s/\n/ /g;
$langs->{$l} =~ s/\s+/ /g;
unless ($fh{$l}) {
open $fh{$l}, ">$l/$name";
binmode $fh{$l}, ":utf8";
}
print {$fh{$l}} "$i\t$langs->{$l}\n";
}
++$i;
},
);
close $fh{$_} for (keys %fh);
print STDERR "DONE\n";
}
=encoding UTF-8
=head1 NAME
tmx-explode - explodes tmx files in a file per language
=head1 SYNOPSIS
tmx-explode foo.tmx bar.tmx
# if foo.tmx and bar.tmx have only PT and EN languages, you will get
# \-EN
# \-- foo
# \-- bar
# \-PT
# \-- foo
# \-- bar
=head1 DESCRIPTION
Explodes a set of tmx files, creating a folder per language, and a
file per tmx file.
=head1 SEE ALSO
perl(1)
=head1 AUTHOR
Alberto Manuel Brandão Simões, Eambs@cpan.orgE
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2012 by Alberto Manuel Brandão Simões
=cut
XML-TMX-0.31/scripts/tmx-POStagger 000644 000765 000024 00000012104 12571111301 016671 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -s
use strict;
use warnings;
use File::Temp qw/ :POSIX /;
my $tmpName = tmpnam();
our (
$o, # output filename
$mwe, # tag mwe -- STILL NOT SUPPORTED
$dates, # do dates analysis...
$ner, # try to do NER....
$compact, # compact form
$s, # use tags (on by default)
);
$s = 1 unless defined $s;
eval { require FL3 };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
FL3->import();
eval { require Lingua::FreeLing3::Word };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
Lingua::FreeLing3::Word->import();
my %initted = ();
use XML::TMX::Reader '0.25';
my $file = shift or die "You must supply the name of the file to tokenize";
my $reader = XML::TMX::Reader->new($file);
my $output = "pos_$file";
$output = $o if $o;
my $pos_languages = {};
binmode STDOUT, ":utf8";
$reader->for_tu(
{
-output => $tmpName,
-verbose => 1,
-verbatim => 1,
},
sub {
my $tu = shift;
for my $lang (keys %$tu) {
if ($lang =~ /(pt|es|gl|it|fr|ru|en)/i) {
$pos_languages->{$1} = 1;
my $ln = lc $1;
my $seg = $compact ? "" : "{$lang}{-seg};
my @tokens = map { Lingua::FreeLing3::Word->new($_) } split /\s+/, $txt;
my $sentences = splitter($ln)->split(\@tokens);
$sentences = morph($ln)->analyze($sentences);
$sentences = hmm($ln)->tag($sentences);
for my $stc (@$sentences) {
$seg .= "\n" if $s;
$seg .= $compact ? _dump_compact($stc->words)
: _dump_words($stc->words);
$seg .= "\n" if $s;
}
$tu->{$lang}{-iscdata} = $compact ? 0 : 1;
$seg .= "]]>" unless $compact;
$tu->{$lang}{-seg} = $seg;
}
}
return $tu;
});
$reader = XML::TMX::Reader->new($tmpName);
$reader->for_tu({
-output => $output,
-verbatim => 1,
-raw => 1,
-prop => ($compact ? {} : {
'pos-tagged' => join(",",keys %$pos_languages),
'pos-fields' => "word,lemma,pos",
'pos-s-attributes' => ['s'],
})},
sub {
$_ = $_[0];
if (!/\[CDATA\[/) {
s{()(.*?)()}
{my ($before,$middle,$after) = ($1,$2,$3);
for ($middle) {
s/&/&/g;s/</g;s/>/>/g;
};
$before.$middle.$after
}ge;
}
return $_; });
#print STDERR "$tmpName\n";
unlink $tmpName if -e $tmpName;
sub _init_ma {
my $lang = shift;
morph($lang,
QuantitiesDetection => 0,
MultiwordsDetection => ($mwe // 0),
NumbersDetection => 0,
DatesDetection => ($dates // 0),
NERecognition => ($ner // 0));
$initted{$lang}++;
}
sub _dump_words {
my @words = @_;
my $seg = "";
for my $t (@words) {
if ($t->is_multiword) {
$seg .= sprintf("\n", $t->lemma(), $t->tag());
$seg .= _dump_words($t->get_mw_words);
$seg .= "\n";
} else {
$seg .= $t->form() ."\t". $t->lemma() ."\t". $t->tag() ."\n";
}
}
return $seg;
}
sub _dump_compact {
my $seg = join(" " => map {
my $t = $_;
($t->is_multiword ? join("_", map { $_->lemma() || $_->lc_form() } $t->get_mw_words)
: ($t->lemma() || $t->lc_form()))
. "/" . substr($t->tag(), 0, 1);
} @_) . "\n";
$seg =~ s/&/&/g;
$seg =~ s/</g;
$seg =~ s/>/>/g;
return $seg;
}
=encoding UTF-8
=head1 NAME
tmx-POStagger - POStaggers translation units on a tmx file.
=head1 SYNOPSIS
tmx-POStagger file.tmx # creates pos_file.tmx
tmx-POStagger -o=out.tmx file.tmx
-ner ... tags multiword named entities
-dates
-compact
=head1 DESCRIPTION
Although this script is bundled in C, it has a soft
dependency on C. Soft means that the dependency is
not ensured at install time, and other features of the module can
still be used without C. Nevertheless, if you want
to use this tool you should install that module.
At the moment the supported languages are the same as supported by
FreeLing3: English, Spanish, Russian, Portuguese and Italian.
It your TMX file includes any other language, they will be maintained
without a change. This behavior can change in the future, as a basic
regexp based POStaggerr might be implemented.
=head2 Options
-ner -- groups Proper names with tag C
which which WDT
includes include VBZ
Edward
Cole
-compact
-dates
=head1 SEE ALSO
XML::TMX, Lingua::FreeLing3
=head1 AUTHOR
Alberto Manuel Brandão Simões, Eambs@cpan.orgE
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2012-2014 by Alberto Manuel Brandão Simões
=cut
XML-TMX-0.31/scripts/tmx-tokenize 000755 000765 000024 00000004442 12571074123 016711 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -s
use strict;
use warnings;
our $o;
eval { require FL3 };
die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@;
FL3->import();
use XML::TMX::Reader '0.25';
my $file = shift or die "You must supply the name of the file to tokenize";
my $reader = XML::TMX::Reader->new($file);
my $output = "t_$file";
$output = $o if $o;
binmode STDOUT, ":utf8";
$reader->for_tu( {
-output => $output,
-prop => { tokenized => "true" },
verbose => 1
},
sub {
my $tu = shift;
for my $lang (keys %$tu) {
if ($lang =~ /(pt|es|it|ru|en|gl)/i) {
my $ln = lc $1;
my $txt = $tu->{$lang}{-seg};
if ($txt !~ /^\s*$/) {
$txt = join(" ",
@{ tokenizer($ln)->tokenize($txt,
to_text => 1)});
}
$tu->{$lang}{-seg} = $txt;
}
}
return $tu;
});
=encoding UTF-8
=head1 NAME
tmx-tokenize - tokenizes translation units on a tmx file.
=head1 SYNOPSIS
tmx-tokenize file.tmx # creates t_file.tmx
tmx-tokenize -o=out.tmx file.tmx
=head1 DESCRIPTION
Although this script is bundled in C, it has a soft
dependency on C. Soft means that the dependency is
not ensured at install time, and other features of the module can
still be used without C. Nevertheless, if you want
to use this tool you should install that module.
At the moment the supported languages are the same as supported by
FreeLing3: English, Spanish, Russian, Portuguese and Italian.
It your TMX file includes any other language, they will be maintained
without a change. This behavior can change in the future, as a basic
regexp based tokenizer might be implemented.
=head1 SEE ALSO
XML::TMX, Lingua::FreeLing3
=head1 AUTHOR
Alberto Manuel Brandão Simões, Eambs@cpan.orgE
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2012 by Alberto Manuel Brandão Simões
=cut
XML-TMX-0.31/scripts/tmx2html 000755 000765 000024 00000001210 11736073640 016025 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -w -s
use strict;
use warnings;
use XML::TMX;
use XML::TMX::Reader;
our ($icons);
my $tmx = shift;
my $tmx_obj = XML::TMX::Reader->new($tmx);
print $tmx_obj->to_html(icons => $icons);
=encoding UTF-8
=head1 NAME
tmx2html - Converts a TMX to an HTML formated page
=head1 SYNOPSIS
tmx2html [-icons] file.tmx > file.html
=head1 DESCRIPTION
Pretty prints a TMX as an HTML file. Icons for languages can be
produced, but still buggy.
=head1 SEE ALSO
perl(1)
=head1 AUTHOR
Alberto Manuel Brandão Simões, Eambs@cpan.orgE
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2011 by Alberto Manuel Brandão Simões
=cut
XML-TMX-0.31/scripts/tmx2tmx 000755 000765 000024 00000015276 12221072234 015676 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -s
use strict;
use warnings;
use XML::TMX::Reader;
use XML::TMX::Writer;
use XML::DT;
our (
$toTrados, ## Fix some issues in the TMX file the old trados software did not cope with
$clean, ## Remove empty UTs and UTs with only junk characters
$cat, ## Concatenates TMX files
$select, ## Filters the TMX to the selected languages
);
# here we must take care that only one of the options is being used.
=encoding utf-8
=head1 NAME
tmx2tmx - utility to convert and filter TMX files
=head1 SYNOPSYS
tmx2tmx -cat file1.tmx ... filen.tmx > file.tmx
tmx2tmx -toTrados file1.tmx > file2.tmx
tmx2tmx -clean file1.tmx > file2.tmx
tmx2tmx -select=PT,EN multilingual.tmx > pt-en.tmx
=head1 DESCRIPTION
This utility processes TMX documents and return TMX documents. Tasks
done with this utility include conversion between TMX versions and TMX
cleaning.
=head2 TRADOS conversion
As you maybe know, TRADOS is a company producing computer software for
translators. It includes WorkBench which imports TMX
files. Unfortunately, the version I used do not import TMX version
1.4.
This process is done using the switch C<-toTrados>:
tmx2tmx -toTrados file.tmx > trados.tmx
=head2 TMX Cleaning
Specially when translation memories are created from anotated text, or
extracted directly from the Internet using any automatic method. This
switch is used to remove junk in translation units.
This option tries to remove junk from TMX files, like empty pairs
where one of the sides is empty, or removing other junk type.
Use it this way:
tmx2tmx -clean file.tmx > file2.tmx
=head2 Concatenating TMX
tmx2tmx -cat file1.tmx ... filen.tmx > file.tmx
ls | grep '.tmx$' | tmx2tmx -cat > file.tmx
=head2 Select languages
Select a bunch of languages:
tmx2tmx -select=PT,EN,FR huge.tmx > pt-en-fr.tmx
=cut
toTrados() if $toTrados;
clean() if $clean;
cat() if $cat;
selectl() if $select;
die "No option supplied\n";
=head1 SEE ALSO
tmx2html, po2tmx, XML::TMX
=head1 AUTHOR
Alberto Simões, Ealbie@alfarrabio.di.uminho.ptE
=head1 COPYRIGHT AND LICENSE
Copyright 2004 by Projecto Natura
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
#--( AUX FUNCS )-----------
sub trim {
my $x = shift;
for ($x) {
s!^\s+!!;
s!\s+$!!
}
$x
}
sub toTrados {
$/ = "";
my $header = <>;
# ->
$header =~ s!]+)/>!!;
#
->
$header =~ s!]+>!!;
# tmx version ->
$header =~ s!]+>!!;
# srclang -> srclang="foo"
if ($header =~ m!(xml:)?lang=(["'])([^"']+)\2!) {
my $lang = $3;
$header =~ s!srclang=(["'])[^"']+\1!srclang="$lang"!;
}
$header =~ s/xml:lang/lang/g;
# xml:lang -> lang
print $header;
while (<>) {
s/xml:lang/lang/g;
print
}
# ugly, but prevents us from forgetting an else somewhere.
exit;
}
sub clean {
my $file;
my $reader;
if ($file = shift) {
# isto tv fosse mais rápido em sax...
print dt( $file,
-default => sub{ toxml() },
seg => sub{
$c = trim($c); toxml
},
tu => sub{
# remove entries with junk, only
return "" if $c =~ m!(\s|[\|\!@#\$%^&\-])*!;
toxml()
},
);
} else {
die "At the moment, we do not handle files from stdin";
# TODO... aqui fazia jeito que o XML::TMX::Reader lêsse de um
# filehandle já aberto (STDIN, por exemplo);
}
exit;
}
sub _join_and_print_headers {
my $header;
while (@_) {
my $file = shift @_;
my $reader = XML::TMX::Reader->new($file);
for my $key (keys %{$reader->{header}}) {
if ($key eq "-prop") {
for my $type (keys %{$reader->{header}{$key}}) {
push @{$header->{$key}{$type}}, @{$reader->{header}{$key}{$type}}
}
}
elsif ($key eq "-note") {
push @{$header->{$key}}, @{$reader->{header}{$key}}
}
else {
$header->{$key} = $reader->{header}{$key}
}
}
}
my $writer = XML::TMX::Writer->new();
$writer->start_tmx(%$header);
}
sub cat {
my @files ;
if (@ARGV) {
@files = @ARGV;
} else {
@files = map { chomp; $_ } <>;
}
binmode STDOUT, ":utf8";
_join_and_print_headers(@files);
my $file = shift @files;
open F, "<:utf8", $file or die "Cannot open file: $file\n";
while () {
last if m!\n";
while () {
s!\s*(body|tmx)\s*>!!;
print;
}
close F;
for $file (@files) {
open F, "<:utf8", $file or die "Cannot open file: $file\n";
print "\n";
while () {
last if m!]*>//g;
print;
while () {
s!\s*(body|tmx)\s*>!!;
print;
}
close F;
}
print "\n";
print "\n";
exit;
}
sub selectl {
my @files;
if (@ARGV) {
@files = @ARGV;
} else {
@files = map { chomp; $_ } <>;
}
_join_and_print_headers(@files);
my %languages = map { (lc $_ => 1) } split( /,/, $select );
my $tmx;
while ($tmx = shift @files) {
die "Can't find $tmx" if (!$tmx || !-f$tmx);
print STDERR "$tmx\n";
my $reader = XML::TMX::Reader->new($tmx);
$reader->for_tu(
{
-verbose => 1,
-output => 1,
-header => 0
},
sub {
my $tu = shift;
my $l = 0;
for my $k (keys %$tu) {
next if $k =~ /^-/;
if (exists $languages{lc $k}) {
$l++;
} else {
delete $tu->{$k}
}
}
if ($l == keys %languages) {
return $tu;
} else {
return undef;
}
});
}
print "\n\n";
exit;
}
XML-TMX-0.31/scripts/tmxclean 000755 000765 000024 00000003305 12153701756 016070 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -s
use v5.10;
use strict;
use warnings;
use XML::TMX::Reader;
our (
$junk, # remove if one of the languages just have junk
$output, # output filename
$eq , # remove if seg(l1) = seg(l2)
$len , # remove if len(li) > 50 ∧ len(lj) > 2len(li)
$v, $verbose
);
my $cleaned = 0;
my $processed = 0;
my $tmx = shift or help();
my $reader = XML::TMX::Reader->new($tmx);
$junk//=1;
$output ||= "_cleaned_$tmx";
print STDERR "loading..." if $v;
$reader->for_tu( {output => $output},
\&cleaner);
printf STDERR "\rRemoved %d/%d (%.3f%%).\n",
$cleaned, $processed, 100*$cleaned/$processed if $v;
sub cleaner {
my $langs = shift;
$processed++;
my $remove = 0;
my %seg=();
my @len=();
for my $k (keys %$langs) {
next if $k =~ /^-/;
$remove = 1 if $eq && $seg{$langs->{$k}{-seg}}++;
$remove = 1 if $junk && $langs->{$k}{-seg} =~ /^[-.,0-9\s]+$/;
$remove = 1 if $junk && $langs->{$k}{-seg} =~ /^\W*$/;
push(@len, length($langs->{$k}{-seg}));
}
@len = sort{$a <=> $b} @len;
$remove = 1 if $len && $len[0] > 50 && $len[0]*2< $len[-1];
$cleaned++ if $remove;
printf STDERR "\rRemoved %d/%d (%.3f%%)...", $cleaned, $processed,
100*$cleaned/$processed if $v && $processed%1000==0;
return $remove ? undef : $langs;
}
sub help {
print " tmxclean [-junk=1] \n";
exit 1;
}
=encoding UTF-8
=head1 NAME
tmx-clean - clean TMX files ???
=head1 SYNOPSIS
$ tmx-clean file.tmx # ???
=head1 DESCRIPTION
Removes the Translation units that
1. have no letters... (unless -junk=0)
2. seg(L1) = seg(L2) (if -eq)
=head1 SEE ALSO
XML::TMX
=cut
XML-TMX-0.31/scripts/tmxgrep 000755 000765 000024 00000002775 12641534470 015754 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -w -s
## use utf8::all;
use Data::Dumper;
our ($all,$w,$max,$pdf, $o, $n, $a,$debug);
$max //= 500;
$max = 1000000000 if ($max == 0 or $all);
use strict;
use warnings;
my $p = shift;
$p = qr{\b$p\b} if $w;
use XML::TMX;
use XML::TMX::Reader;
our ($icons);
my $tmx = shift;
my $tmx_obj = XML::TMX::Reader->new($tmx);
$tmx_obj->for_tu(
{ patt => $p,
gen_tu=> $max,
n => $n || 0,
output => $o || undef }, #### "__tmxgrep.tmx"
sub { my($tu,$at) = @_;
print STDERR Dumper($tu,$at) if $main::debug;
# print STDERR ".";
if($main::a){
for my $k(keys %$tu){
for(keys %{$tu->{$k}}){
$tu->{$k}{$_} =~ s!($p)!=($1)=!g;
}
}
}
return $tu;
}
);
if($pdf){ system("xpdf-tmx __.tmx");}
__END__
=head1 NAME
tmxgrep - grep translation units in a TMX file
=head1 SYNOPSIS
tmxgrep file.tmx
options:
-pdf -- output is transformed in a PDF file
-max=300 -- extract up to 300 TU (def=500)
-max=0 -- all matches
-all -- all matches
-a -- annotate matches with '==(...)'
-o=out.tmx -- define output file (defaut= STDOUT)
-n -- print original TU number (eg: )
=head1 DESCRIPTION
Creates a TMX file with the tranlation units tha macth the provided
regular expression.
=head1 AUTHOR
J.Joao Almeida, jj@di.uminho.pt
=head1 SEE ALSO
perl(1).
xpdf-tmx
XML::TMX
TMX
=cut
XML-TMX-0.31/scripts/tmxsplit 000644 000765 000024 00000005410 11763630336 016136 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -s
use strict;
use utf8;
use XML::DT;
#use bytes;
our ($twente,$latin1,$utf8,$q,$cutmaxlen);
my $filename = $ARGV[0];
my %files;
my $data;
my %h = (
# '-outputenc' => "ISO-8859-1",
'seg' => sub{
for ($c){
# s/\&/\&/g;
# s/\</g;
# s/>/\>/g;
s/\s\s+|^\s+|\s+$/ /g;
};
$c
},
'ut' => sub{" "},
'tu' => sub{$c},
'tuv' => sub{
$c =~ s/^[\s\n]*//;
$c =~ s/[\s\n]*$//;
$data->{$v{lang}||$v{"xml:lang"}} =
$cutmaxlen && length($c) > $cutmaxlen
? substr($c,0,$cutmaxlen)."||" : $c},
);
$h{-outputenc} = "ISO-8859-1" if $twente || $latin1;
undef $h{-outputenc} if $utf8;
my $i = 0;
$| = 1;
my $f;
for $f (@ARGV){
# print "\n$f" unless $q;
print STDERR "\n$f";
$/ = "\n";
open X, $f or die "cannot open file $f";
do {
if(/encoding=.ISO-8859-1./i){$h{-outputenc}=$h{-inputenc}="ISO-8859-1";}
} while ($_ = and $_ !~ /!s and $resto = $';
$/ = "";
while() {
($_ = $resto . $_ and $resto = "" ) if $resto;
$i++;
last if /<\/body>/;
#print "." if (!$q && $i%500==0);
print STDERR "." if ($i % 1000==0);
s/\>\s+/>/;
undef($data);
eval {dtstring($_, %h)} ; ## don't die in invalid XML
if($@){warn($@)}
else{
for my $k (keys %$data) {
if (exists($files{"$filename-$k"})) {
myprint($files{"$filename-$k"}, $data->{$k},$i);
} else {
my $x;
open $x, ">$filename-$k" or die("cant >$filename-$k\n");
binmode($x,":utf8") if $utf8;
myprint($x, $data->{$k},$i);
$files{"$filename-$k"} = $x;
}
}
}
}
close X;
}
for my $key (keys %files) {
print "$key\n";
}
sub myprint{
my($f,$tu,$i)=@_;
if ($twente){
for ($tu){
s/<.*?>/ /gs;
s/[\|\$]/ /gs;
s/(\w)([.;,!:?«»"])/$1 $2/g;
s/([.;,!:?«»"])(\w)/$1 $2/g;
s/\s\s+|^\s+|\s+$/ /g;
}
print {$f} "$tu\n\$\n";
} else {
print {$f} "$tu\n";
}
}
__END__
=encoding utf-8
=head1 NAME
tmxsplit - splits a TMX file several files, one for each language
=head1 SYNOPSIS
tmxsplit f.tmx f2.tmx ...
tmxsplit -twente f.tmx
=head1 DESCRIPTION
splits a TMX file in several files (one per language) and puts
a tag C in each translation unit.
The names for output files is based on the first tmx file supplied.
=head1 Options
-twente -- makes a format compatible with twente-aligner
-latin1 -- a make latin1-encoded output
-utf8 -- a make utf8-encoded output
-q -- don't print filenames and "."
-cutmaxlen=n -- cut translations by the n character
=head1 AUTHOR
Alberto Simões, albie@di.uminho.pt
J.Joao Almeida, jj@di.uminho.pt
=head1 SEE ALSO
perl(1).
tmx2cqp(1)
=cut
XML-TMX-0.31/scripts/tmxuniq 000755 000765 000024 00000004542 12062106764 015763 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -s
use DB_File;
use Fcntl ;
use Lingua::PT::PLNbase;
use XML::TMX::Reader;
use Digest::MD5 qw(md5_hex);
use Encode;
our ($cont,$id,$dig,$tok,$o,$fast);
if ($cont) {
tie %dic, 'DB_File', "__tmxuniq_$$.db", O_RDWR|O_CREAT , 0640, $DB_BTREE;
} else {
tie %dic, 'DB_File', "__tmxuniq_$$.db", O_RDWR|O_CREAT|O_TRUNC , 0640, $DB_BTREE;
}
my $cid = 0;
for my $file (@ARGV){
my $tm = XML::TMX::Reader->new($file);
print STDERR "Processing...";
$tm->for_tu
(
{ output => $o || "$file._" },
sub {
my $tu = shift;
$cid++;
$tu->{-prop}{id} = $cid if $id;
my $key = join("|||", map { n($tu->{$_}{-seg}) } sort grep { !/^-/ } keys %$tu);
my $digest = md5_hex(encode_utf8($key));
unless ($cid % 10000) {
my $size = -s "__tmxuniq_$$.db";
printf STDERR
"\rTotal: %10d Removed: %8d (%.2f%%) Database size: %10d bytes",
$cid, $rem, (100*$rem/$cid), $size;
}
if ($dic{$digest}) {
$dic{$digest} .= "$cid;" unless $fast;
$rem ++;
return undef
} else {
$dic{$digest} = "$cid;";
$tu->{-prop}{digest} = $digest if $dig;
return {%$tu} ; # used clone.. no idea why
}
}
);
my $size = -s "__tmxuniq_$$.db";
if ($cid) {
printf STDERR "\rTotal: %10d Removed: %8d (%.2f%%) Database size: %10d bytes\n",
$cid, $rem, (100*$rem/$cid), $size;
} else {
printf STDERR "\rHuh.. empty TMX?\n";
}
undef $tm;
}
untie %h;
sub n {
my $a = shift;
$a =~ s/\.{6,}/...../g;
$a = tokenize( { rs => ' ' } => $a ) if $tok;
$a =~ s/\s+/ /g;
$a =~ s/ $//;
$a =~ s/^ //;
return $a;
}
__END__
=head1 NAME
tmxuniq - removes duplicated translation units from TMXs
=head1 SYNOPSIS
tmxuniq [options] -l=en:pt tmx1 ...
=head1 DESCRIPTION
Removes duplicated translation units from a set of TMX (Translation
Memory eXange format).
=head1 OPTIONS
-id -- insert a uniq id property in each TU
-dig -- insert a digest property in each TU
-tok -- tokenize/normalize text
-o=out.tmx -- (with 1 argument) redefine output (default = input._)
=head1 AUTHOR
J.Joao Almeida, jj@di.uminho.pt
=head1 SEE ALSO
perl(1).
=cut
XML-TMX-0.31/scripts/tmxwc 000755 000765 000024 00000001376 11763406075 015427 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -s
use XML::TMX::Reader;
print STDERR "";
my $TTU = 0;
my $n = scalar(@ARGV);
for my $file (@ARGV) {
my $tm = XML::TMX::Reader->new($file);
my $TU = 0;
$tm->for_tu({-raw => 1 },
sub { $TU++ ;
print STDERR "\r$TU..." unless $TU%1000;
}
);
print STDERR "\r$file: $TU tu.\n";
$TTU += $TU;
}
print STDERR "total: $TTU tu.\n" if $n > 1;
__END__
=encoding utf-8
=head1 NAME
tmxwc - gives statistics about tmx files
=head1 SYNOPSIS
tmxwc tmx1 [tmx2 ...]
=head1 DESCRIPTION
Gives statistical information about TMX files like the number of
translation units.
=head1 AUTHOR
Alberto Simões, Eambs@cpan.orgE
=head1 SEE ALSO
perl(1).
=cut
XML-TMX-0.31/scripts/tsv2tmx 000644 000765 000024 00000005567 12645001504 015703 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl
use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
### Option varialbes
my $man = 0;
my $help = 0;
my ($sl, $tl);
our $verbose = 1;
my $header = 1;
my $columns = "1,2";
#### ------
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
GetOptions ('help|h' => \$help, 'man' => \$man,
"source=s" => \$sl,
"target=s" => \$tl,
"header" => \$header,
"columns=s" => \$columns,
"verbose|v" => \$verbose)
or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-exitval => 0, -verbose => 2) if $man;
##
our ($c1, $c2);
if ($columns =~ /^(\d+),(\d+)$/) {
($c1,$c2) = ($1,$2);
} else {
die "Columns definition should be a pair of integers: 1,2\n";
}
if (!$sl || !$tl) {
if ($header) {
_log ("No source language or target language defined. Guessing!");
my ($l1, $l2) = readLine();
$sl = $l1 unless defined $sl;
$tl = $l2 unless defined $tl;
} else {
die "No header, and one of the source or target languages not defined!\n";
}
$header = 0;
}
readLine() if $header;
use XML::TMX::Writer;
my $tmx = XML::TMX::Writer->new();
$tmx->start_tmx(id => 'tsv2tmx');
my @r;
while (@r = readLine()) {
$tmx->add_tu($sl=>$r[0],$tl=>$r[1]);
}
$tmx->end_tmx();
sub _log {
say STDERR @_ if $verbose;
}
sub readLine {
my $line = ;
if ($line) {
chomp $line;
return (split /\t/, $line)[$c1,$c2]
} else {
return ();
}
}
__END__
=encoding utf-8
=head1 NAME
tsv2tmx - Create a TMX from a TSV file
=head1 SYNOPSIS
tsv2tmx [options]
Options:
--help brief help message
--man full documentation
--verbose | -v activated verbose mode
--sl=EN --tl=PT describe source and target language names
--header treat first line as a heading
--columns=1,2 specify which columns to extract
=head1 OPTIONS
=over 8
=item B<--help>
Print a brief help message and exits.
=item B<--man>
Prints the manual page and exits.
=item B<--verbose> | B<-v>
Activates the verbose mode.
=item B<--sl> | B<--tl>
Use these options to specify the names for the source and target
languages.
=item B<--header>
By default this switch is on, and it means that the TSV file
includes a first line with a heading. If no source or target
language names are specified, the first line will be used to
guess them.
=item B<--columns=1,2>
Specify which columns should be extracted. Needs to be a pair
of integers, separated by a comma. Columns indexes start at 0.
Default to C<1,2>.
=back
=head1 DESCRIPTION
Useful to create translation memories from TSV files, that can be
easily exported from spreadsheet software.
=head1
=head1 SEE ALSO
XML::TMX
=head1 AUTHOR
Alberto Simões, C<< >>
=head1 COPYRIGHT AND LICENSE
Copyright 2016 by Projecto Natura
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
XML-TMX-0.31/lib/XML/ 000755 000765 000024 00000000000 12645001535 014037 5 ustar 00ambs staff 000000 000000 XML-TMX-0.31/lib/XML/TMX/ 000755 000765 000024 00000000000 12645001535 014507 5 ustar 00ambs staff 000000 000000 XML-TMX-0.31/lib/XML/TMX.pm 000644 000765 000024 00000001622 12645001510 015037 0 ustar 00ambs staff 000000 000000 package XML::TMX;
# vim:sw=3:ts=3:et:
use 5.004;
use warnings;
use strict;
use parent 'Exporter';
our $VERSION = '0.31';
our @ISA = 'Exporter';
our @EXPORT_OK = qw();
=encoding utf-8
=head1 NAME
XML::TMX - Perl extensions for managing TMX files
=head1 SYNOPSIS
use XML::TMX;
=head1 DESCRIPTION
XML::TMX is the top level module. At the moment it does not contain
any useful code, so check sub-modules, please.
=head1 SEE ALSO
XML::TMX::Writer, XML::TMX::Reader, XML::TMX::FromPO
L, TMX Specification L
=head1 AUTHOR
Alberto Simões, Ealbie@alfarrabio.di.uminho.ptE
Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE
=head1 COPYRIGHT AND LICENSE
Copyright 2003-2012 by Projecto Natura
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
1;
XML-TMX-0.31/lib/XML/TMX/FromPO.pm 000644 000765 000024 00000020377 12645001510 016211 0 ustar 00ambs staff 000000 000000 package XML::TMX::FromPO;
use 5.004;
use warnings;
use strict;
use XML::TMX::Writer;
use Exporter ();
use vars qw(@ISA @EXPORT_OK $VERSION);
$VERSION = '0.31';
@ISA = 'Exporter';
@EXPORT_OK = qw(&new &parse_dir &create_tmx &clean_tmx);
=pod
=encoding utf-8
=head1 NAME
XML::TMX::FromPO - Generates a TMX file from a group of PO files
=head1 SYNOPSIS
use XML::TMX::FromPO;
my $conv = new XML::TMX::FromPO(OUTPUT => '%f.tmx');
=head1 DESCRIPTION
This module can be used to generate TMX files from a group of PO files.
=head1 METHODS
The following methods are available:
=head2 new
$tmx = new XML::TMX::FromPO();
Creates a new XML::TMX::FromPO object. Please check the L section for details on the options.
=cut
sub new {
my $proto = shift;
my $class = ref($proto) || $proto;
my $self = {};
$self->{LANG} = undef;
$self->{OUTPUT} = undef;
$self->{DEBUG} = 0;
__common_conf($self, @_);
unless(defined($self->{CONVER})) {
if(system('recode >/dev/null 2>&1')) {
$self->{CONVERT} = 'iconv -f %t -t utf8 < %f';
} else {
$self->{CONVERT} = 'recode %t..utf8 < %f';
}
}
$self->{TMX} = {};
bless($self, $class);
return($self);
}
=head2 rec_get_po
TODO: Document method
=cut
sub rec_get_po {
my $self = shift;
my $dir = shift;
my $lan1 = shift;
__common_conf($self, @_);
# check if directory is readable
if(-f $dir) {
my $file=$dir;
my $lang = lc($lan1);
if(!defined($self->{LANG}) || __check_lang($self, $lang)) {
__processa($self, $file, $lang);
}
}
else {
die("$dir is not a readable directory\n") unless(-d $dir);
for my $file (<$dir/*>) {
if($file =~ /(.*)\.(po|messages)$/) {
my $lang = lc($lan1);
if(!defined($self->{LANG}) || __check_lang($self, $lang)) {
__processa($self, $file, $lang);
}
}
elsif($file =~ /(.*)\.(\w+)\.(po|messages)$/) {
my $lang = lc($lan1);
## ??? my $lang = "\L$2";
if(!defined($self->{LANG}) || __check_lang($self, $lang)) {
__processa($self, $file, $lang);
}
}
elsif(-d $file) {
rec_get_po($self,$file,$lan1)
}
else {
## warn ("$file ... não tem lingua\n") if($self->{DEBUG});
}
}
}
__add_en($self) if(__check_lang($self, 'en'));
__limpa($self);
}
=head2 parse_dir
TODO: Document method
=cut
sub parse_dir {
my $self = shift;
my $dir = shift;
__common_conf($self, @_);
# check if directory is readable
die("$dir is not a readable directory\n") unless(-d $dir);
for my $file ((<$dir/*.po>),(<$dir/*.messages>)) {
if($file =~ /(\w+)\.(po|messages)$/) {
my $lang = "\L$1";
if(!defined($self->{LANG}) || __check_lang($self, $lang)) {
__processa($self, $file, $lang);
}
} elsif($file =~ /(.*)\.(\w+)\.(po|messages)$/) {
my $lang = "\L$2";
if(!defined($self->{LANG}) || __check_lang($self, $lang)) {
__processa($self, $file, $lang);
}
} else {
warn ("$file ... não tem lingua\n") if($self->{DEBUG});
}
}
__add_en($self) if(__check_lang($self, 'en'));
__limpa($self);
}
# return value:
# * 0 -> lang does not exist
# * 1 -> lang exists
sub __check_lang {
my $self = shift;
my $lang = shift;
my @regex = @{$self->{LANG}};
while(my $regex = shift(@regex)) {
last if($regex gt $lang);
if($lang =~ /^$regex$/i) {
return(1);
}
}
return(0);
}
sub __add_en {
my $self = shift;
for my $str (keys %{$self->{TMX}}) {
$self->{TMX}{$str}{'en'} = $str;
}
}
=head2 create_tmx
TODO: Document function
=cut
sub create_tmx {
my $self = shift;
my $tmx = new XML::TMX::Writer();
__common_conf($self, @_);
my $n_langs = @{$self->{LANG}};
if(defined($self->{OUTPUT})) {
$tmx->start_tmx(ID => 'XML::TMX::FromPO', OUTPUT => $self->{OUTPUT});
} else {
$tmx->start_tmx(ID => 'XML::TMX::FromPO');
}
for my $chave (keys %{$self->{TMX}}) {
my $reg = __make_tu($self, $self->{TMX}{$chave});
# only write to file if all languages are defined
$tmx->add_tu(%{$reg}) if(keys(%{$reg}) >= $n_langs);
}
$tmx->end_tmx();
}
=head2 clean_tmx
TODO: Document method
=cut
sub clean_tmx {
my $self = shift;
$self->{TMX} = {};
}
sub __make_tu {
my $self = shift;
my $block = shift;
my $reg = {};
if(!defined($self->{LANG})) {
return($block);
}
for my $lang (keys %$block) {
$reg->{$lang} = $block->{$lang} if(__check_lang($self, $lang));
}
return($reg);
}
sub __processa {
my $self = shift;
my $a = shift;
my $l = shift;
local $/ = "\nmsgid";
#local $/ = "\nmsgid ";
print STDERR "$a\n" if($self->{DEBUG});
my $codeline = `grep -i Content-Type $a | grep -i charset`;
my $code = "?";
if($codeline =~ /charset=([\w-]+)/) { $code = $1; }
my $convert = $self->{CONVERT};
$convert =~ s/\%t/$code/i;
$convert =~ s/\%f/$a/i;
if($code eq "?" || $code =~ /utf-?8/i ) { open(F,$a) or die;}
else { open(F,"$convert|") or die;}
my $mi = 0;
while() {
chomp;
next if($mi == 0 && /^msgid\s+""/);
if(/"Content-Type:/ && /charset=([\w-]+)/) { $code = $1; next }
s/(^|\n)\s*#.*//g;
# s/_//g unless $under;
next unless(/\n\s*msgstr/);
my ($m1,$m2) = ($`,$');
$m1 =~ s/(^\s*"|"\s*$)//g;
$m1 =~ s/("\s*\n\s*")/ /g;
$m2 =~ s/(^\s*"|"\s*$)//g;
$m2 =~ s/("\s*\n\s*")/ /g;
unless($m1) {
warn "\n====M1 vazio... \n$m1\n=$m2\n";
next;
}
if($m2) {
$self->{TMX}{$m1}{$l} = $m2;
} # || "????? $m1";
#$self->{TMX}{$m1}{'en'} = $m1;
# print "\n====\n$m1\n=$m2\n";
$mi++;
}
print STDERR "Charset: $code\n" if($self->{DEGUB});
close F;
}
sub __limpa {
my $self = shift;
# possíveis limpezas
# (1) eliminar traduções que sejam igual ao original
# (2) eliminar strings que não contenham pelo menos 2 letras
# consecutivas
# (3) eliminar frases que fiquem sem traduções
#
# um teste realizado com os po's do evolution mostrou uma redução do
# ficheiro final de 12M para 8,6M, uma análise com o diff aos dumps
# permitiu ver que grande parte do ''lixo'' eram de (1)
for my $h1 (keys %{$self->{TMX}}) {
if($h1 =~ /[a-z][a-z]/i) {
for my $h2 (keys %{$self->{TMX}{$h1}}) {
# optimização (1)
delete($self->{TMX}{$h1}{$h2}) if($h2 !~ /^en/i && $h1 eq $self->{TMX}{$h1}{$h2});
}
# optimização (3)
delete($self->{TMX}{$h1}) unless(keys %{$self->{TMX}{$h1}});
} else {
# optimização (2)
delete($self->{TMX}{$h1});
}
}
}
=pod
=head1 COMMON CONFIGURATION
These configuration options can be passed to all methods in the module:
=over
=item LANG => 'list'
A case insensitive list of regular expression separated by whitespaces
that matches the code of the languages that are to be processed.
Defaults to all.
=item CONVERT => 'iconv -f %t -t utf8 < %f'
A string that contains the command to convert a file (%f) from some
charset (%t) to Unicode. If none is specified, the module tries to
use L, if it fails then the module defaults to L.
=item OUTPUT => 'x.tmx'
The name of the output file. If none is specified it defaults to the
standard output.
=item DEBUG => 1
Activate debugging information. Defaults to 0.
=back
=cut
sub __common_conf {
my $self = shift;
my %opt = @_;
if(defined($opt{LANG})) {
my @list;
for my $l (sort(split(/\s+/, $opt{LANG}))) {
push(@list, $l) if($l =~ /^[a-z0-9_]+$/i);
}
$self->{LANG} = \@list if(@list);
}
$self->{CONVERT} = $opt{CONVERT} if defined($opt{CONVERT});
$self->{OUTPUT} = $opt{OUTPUT} if defined($opt{OUTPUT});
$self->{DEBUG} = $opt{DEBUG} if defined($opt{DEBUG});
}
=head1 SEE ALSO
L, L, L, L
=head1 AUTHOR
Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE
=head1 COPYRIGHT AND LICENSE
Copyright 2003 by Paulo Jorge Jesus Silva
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
1;
XML-TMX-0.31/lib/XML/TMX/Reader.pm 000644 000765 000024 00000030506 12645001510 016244 0 ustar 00ambs staff 000000 000000 package XML::TMX::Reader;
use 5.010;
use warnings;
use strict;
use Exporter ();
use vars qw($VERSION @ISA @EXPORT_OK);
use XML::DT;
use XML::TMX::Writer;
$VERSION = '0.31';
@ISA = 'Exporter';
@EXPORT_OK = qw();
=encoding utf-8
=head1 NAME
XML::TMX::Reader - Perl extension for reading TMX files
=head1 SYNOPSIS
use XML::TMX::Reader;
my $reader = XML::TMX::Reader->new( $filename );
$reader -> for_tu( sub {
my $tu = shift;
#blah blah blah
});
@used_languages = $reader->languages;
$reader->to_html()
=head1 DESCRIPTION
This module provides a simple way for reading TMX files.
=head1 METHODS
The following methods are available:
=head2 C
This method creates a new XML::TMX::Reader object. This process checks
for the existence of the file and extracts some meta-information from
the TMX header;
my $reader = XML::TMX::Reader->new("my.tmx");
=cut
sub new {
my ($class, $file) = @_;
return undef unless -f $file;
my $self = bless {
encoding => _guess_encoding($file),
filename => $file,
ignore_markup => 1,
} => $class;
$self->_parse_header;
return $self;
}
sub _guess_encoding {
my $file = shift;
my $encoding = 'UTF-8';
open my $fh, "<", $file or die "can't open $file";
my $line = <$fh>;
if ($line =~ /encoding=['"]([^'"]+)['"]/) {
$encoding = $1;
}
close $fh;
return $encoding;
}
sub _parse_header {
my $self = shift;
my $header = "";
{
local $/ = "";
open my $fh, "<:encoding($self->{encoding})", $self->{filename} or die "$!";
$header = <$fh>;
close $fh;
}
$header =~ s/^.*().*$/$1/s;
$header =~ s!().*$!$1!s;
dtstring($header => (
'header' => sub {
$self->{header}{$_} = $v{$_} for (keys %v);
},
'prop' => sub {
$v{type} ||= "_";
push @{$self->{header}{-prop}{$v{type}}}, $c;
},
'note' => sub {
push @{$self->{header}{-note}}, $c;
},
));
}
=head2 C
This method is used to set the flag to ignore (or not) markup inside
translation unit segments. The default is to ignore those markup.
If called without parameters, it sets the flag to ignore the
markup. If you don't want to do that, use
$reader->ignore_markup(0);
=cut
sub ignore_markup {
my ($self, $opt) = @_;
$opt = 1 unless defined $opt;
$self->{ignore_markup} = $opt;
}
=head2 C
This method returns the languages being used on the specified
translation memory. Note that the module does not check for language
code correctness or existence.
=cut
sub languages {
my $self = shift;
my %languages = ();
$self->for_tu({proc_tu => 100},
sub {
my $tu = shift;
for ( keys %$tu ) {
$languages{$_}++ unless m/^-/;
}
} );
return keys %languages;
}
=head2 C
Use C to process all translation units from a TMX file.
This version iterates for all tu (one at the time)
The configuration hash is a reference to a Perl hash. At the moment
these are valid options:
=over
=item C<-verbose>
Set this option to a true value and a counter of the number of
processed translation units will be printed to stderr.
=item C<-output> | C";
$conf->{-verbose}++ if $conf->{verbose};
print STDERR "." if $conf->{-verbose};
while () {
($_ = $resto . $_ and $resto = "" ) if $resto;
last if /<\/body>/;
$i++;
print STDERR "\r$i" if $conf->{-verbose} && !($i % 10);
last if defined $conf->{proc_tu} && $i > $conf->{proc_tu} ;
last if defined $conf->{gen_tu} && $gen > $conf->{gen_tu};
next if defined $conf->{patt} && !m/$conf->{patt}/ ;
####
# This can't be done. Not sure why it was being done.
# So, please, unless you know the implications for tagged crpora
# do not uncomment it.
# s/\>\s+/>/;
undef($data);
if ($conf->{'-raw'}) {
my $ans = $code->($_);
if ($conf->{-output}) {
$ans->{"-n"}=$i if $conf->{n} ;
$tmx->add_tu(-raw => $ans);
}
} else {
eval { dtstring($_, %h) } ; ## dont die in invalid XML
warn $@ if $@;
}
}
print STDERR "\r$i\n" if $conf->{-verbose};
close X;
$tmx->end_tmx if $conf->{-header} && $outputingTMX;
}
=head2 C
Use this method to create a nice HTML file with the translation
memories. Notice that this method is not finished yet, and relies on
some images, on some specific locations.
=cut
sub to_html {
my $self = shift;
my %opt = @_;
$self->for_tu(sub {
my ($langs, $opts) = @_;
my $ret = "";
for (keys %$langs) {
next if /^-/;
$ret .= "";
if ($opt{icons}) {
$ret .= " "
} else {
$ret .= "$_"
}
$ret .= " | $langs->{$_}{-seg} |
\n"
}
$ret .= " |
";
$ret;
}
);
}
sub for_tu2 {
warn "Please update your code to use 'for_tu'\n";
&for_tu;
}
=head2 C
deprecated. use C
=head1 SEE ALSO
L, TMX Specification L
=head1 AUTHOR
Alberto Simões, Ealbie@alfarrabio.di.uminho.ptE
Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE
J.João Almeida, Ejj@di.uminho.ptE
=head1 COPYRIGHT AND LICENSE
Copyright 2003-2012 by Projecto Natura
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
1;
XML-TMX-0.31/lib/XML/TMX/Writer.pm 000644 000765 000024 00000026505 12645001510 016322 0 ustar 00ambs staff 000000 000000 package XML::TMX::Writer;
use 5.004;
use warnings;
use strict;
use Exporter ();
use vars qw($VERSION @ISA @EXPORT_OK);
$VERSION = '0.31';
@ISA = 'Exporter';
@EXPORT_OK = qw();
=encoding utf-8
=head1 NAME
XML::TMX::Writer - Perl extension for writing TMX files
=head1 SYNOPSIS
use XML::TMX::Writer;
my $tmx = XML::TMX::Writer->new();
$tmx->start_tmx(id => 'paulojjs');
$tmx->add_tu('en' => 'some text', 'pt' => 'algum texto');
$tmx->add_tu('en' => 'some text',
'pt' => 'algum texto',
-note => [32, 34 ],
-prop => { q => 23,
aut => "jj"}
);
$tmx->end_tmx();
=head1 DESCRIPTION
This module provides a simple way for writing TMX files.
=head1 METHODS
The following methods are available:
=head2 new
$tmx = new XML::TMX::Writer();
Creates a new XML::TMX::Writer object
=cut
sub new {
my $proto = shift;
my $class = ref($proto) || $proto;
my %ops = @_;
my $self = { OUTPUT => \*STDOUT };
binmode $self->{OUTPUT}, ":utf8" unless exists $ops{-encoding} and $ops{-encoding} !~ /utf.?8/i;
bless($self, $class);
return($self);
}
=head2 start_tmx
$tmx->start_tmx(-output => 'some_file.tmx');
Begins a TMX file. Several options are available:
=over 4
=item -output
Output of the TMX, if none is defined stdout is used by default.
=item tool
Tool used to create the TMX. Defaults to 'XML::TMX::Writer'
=item toolversion
Some version identification of the tool used to create the TMX. Defaults
to the current module version
=item segtype
Segment type used in the ItuE> elements. Possible values are I,
I, I and I. Defaults to I.
=item srctmf
Specifies the format of the translation memory file from which the TMX document or
segment thereof have been generated.
=item adminlang
Specifies the default language for the administrative and informative
elements InoteE> and IpropE>.
=item srclang
Specifies the language of the source text. If a ItuE> element does
not have a srclang attribute specified, it uses the one defined in the
IheaderE> element. Defaults to I<*all*>.
=item datatype
Specifies the type of data contained in the element. Depending on that
type, you may apply different processes to the data.
The recommended values for the datatype attribute are as follow (this list is
not exhaustive):
=over 4
=item unknown
undefined
=item alptext
WinJoust data
=item cdf
Channel Definition Format
=item cmx
Corel CMX Format
=item cpp
C and C++ style text
=item hptag
HP-Tag
=item html
HTML, DHTML, etc
=item interleaf
Interleaf documents
=item ipf
IPF/BookMaster
=item java
Java, source and property files
=item javascript
JavaScript, ECMAScript scripts
=item lisp
Lisp
=item mif
Framemaker MIF, MML, etc
=item opentag
OpenTag data
=item pascal
Pascal, Delphi style text
=item plaintext
Plain text (default)
=item pm
PageMaker
=item rtf
Rich Text Format
=item sgml
SGML
=item stf-f
S-Tagger for FrameMaker
=item stf-i
S-Tagger for Interleaf
=item transit
Transit data
=item vbscript
Visual Basic scripts
=item winres
Windows resources from RC, DLL, EXE
=item xml
XML
=item xptag
Quark XPressTag
=back
=item srcencoding
All TMX documents are in Unicode. However, it is sometimes useful to know
what code set was used to encode text that was converted to Unicode for
purposes of interchange. This option specifies the original or preferred
code set of the data of the element in case it is to be re-encoded in a
non-Unicode code set. Defaults to none.
=item id
Specifies the identifier of the user who created the element. Defaults to none.
=item -note
A reference to a list of notes to be added in the header.
=item -prop
A reference fo a hash of properties to be added in the header. Keys
are used as the C attribute, value as the tag contents.
=back
=cut
sub start_tmx {
my $self = shift;
my %options = @_;
my %o;
my @time = gmtime(time);
$o{'creationdate'} = sprintf("%d%02d%02dT%02d%02d%02dZ", $time[5]+1900,
$time[4]+1, $time[3], $time[2], $time[1], $time[0]);
my $encoding = $options{encoding} || "UTF-8";
if (defined($options{'-output'})) {
delete $self->{OUTPUT}; # because it is a glob
open $self->{OUTPUT}, ">", $options{'-output'}
or die "Cannot open file '$options{'-output'}': $!\n";
}
if ($encoding =~ m!utf.?8!i) {
binmode $self->{OUTPUT}, ":utf8"
}
$self->_write("\n");
my @valid_segtype = qw'block sentence paragraph phrase';
if(defined($options{SEGTYPE}) && grep { $_ eq $options{SEGTYPE} } @valid_segtype) {
$o{segtype} = $options{SEGTYPE};
} else {
$o{segtype} = 'sentence'
}
$o{'creationtool'} = $options{tool} || 'XML::TMX::Writer';
$o{'creationtoolversion'} = $options{toolversion} || $VERSION;
$o{'o-tmf'} = $options{srctmf} || 'plain text';
$o{'adminlang'} = $options{adminlang} || 'en';
$o{'srclang'} = $options{srclang} || 'en';
$o{'datatype'} = $options{datatype} || 'plaintext';
defined($options{srcencoding}) and $o{'o-encoding'} = $options{srcencoding};
defined($options{id}) and $o{'creationid'} = $options{id};
$self->_startTag(0, 'tmx', 'version' => 1.4)->_nl;
$self->_startTag(1, 'header', %o)->_nl;
$self->_write_props(2, $options{'-prop'}) if defined $options{'-prop'};
$self->_write_notes(2, $options{'-note'}) if defined $options{'-note'};
$self->_indent(1)->_endTag('header')->_nl;
$self->_startTag(0,'body')->_nl->_nl;
}
sub _write_props {
my ($self, $indent, $props) = @_;
return unless ref($props) eq "HASH";
for my $key (sort keys %$props) {
if (ref($props->{$key}) eq "ARRAY") {
for my $val (@{$props->{$key}}) {
if ($key eq "_") {
$self->_startTag($indent, 'prop');
} else {
$self->_startTag($indent, prop => (type => $key));
}
$self->_characters($val);
$self->_endTag('prop')->_nl;
}
} else {
if ($key eq "_") {
$self->_startTag($indent, 'prop');
} else {
$self->_startTag($indent, prop => (type => $key));
}
$self->_characters($props->{$key});
$self->_endTag('prop')->_nl;
}
}
}
sub _write_notes {
my ($self, $indent, $notes) = @_;
return unless ref($notes) eq "ARRAY";
for my $p (@{$notes}) {
$self->_startTag($indent, 'note');
$self->_characters($p);
$self->_endTag('note')->_nl;
}
}
=head2 add_tu
$tmx->add_tu(srclang => LANG1, LANG1 => 'text1', LANG2 => 'text2');
$tmx->add_tu(srclang => LANG1,
LANG1 => 'text1',
LANG2 => 'text2',
-note => ["value1", ## notes
"value2"],
-prop => { type1 => ["value1","value"], #multiple values
_ => 'value2', # anonymound properties
typen => ["valuen"],}
);
Adds a translation unit to the TMX file. Several optional labels can be
specified:
=over
=item id
Specifies an identifier for the ItuE> element. Its value is not
defined by the standard (it could be unique or not, numeric or
alphanumeric, etc.).
=item srcencoding
Same meaning as told in B method.
=item datatype
Same meaning as told in B method.
=item segtype
Same meaning as told in B method.
=item srclang
Same meaning as told in B method.
=back
=cut
sub add_tu {
my $self = shift;
my %tuv = @_;
my %prop = ();
my @note = ();
my %opt;
my $verbatim = 0;
my $cdata = 0;
if (exists($tuv{-raw})) {
# value already includes tags, hopefully, at least!
# so we will not mess with it.
$self->_write($tuv{-raw});
return;
}
for my $key (qw'id datatype segtype srclang creationid creationdate') {
if (exists($tuv{$key})) {
$opt{$key} = $tuv{$key};
delete $tuv{$key};
}
}
if (defined($tuv{srcencoding})) {
$opt{'o-encoding'} = $tuv{srcencoding};
delete $tuv{srcencoding};
}
$verbatim++ if defined $tuv{-verbatim};
delete $tuv{-verbatim} if exists $tuv{-verbatim};
if (defined($tuv{"-prop"})) {
%prop = %{$tuv{"-prop"}};
delete $tuv{"-prop"};
}
if (defined($tuv{"-note"})) {
@note = @{$tuv{"-note"}};
delete $tuv{"-note"};
}
if (defined($tuv{"-n"})) {
$opt{id}=$tuv{"-n"};
delete $tuv{"-n"};
}
$self->_startTag(0,'tu', %opt)->_nl;
### write the prop s problemas 23
$self->_write_props(3, \%prop);
$self->_write_notes(3, \@note);
for my $lang (sort keys %tuv) {
my $cdata = 0;
$self->_startTag(1, 'tuv', 'xml:lang' => $lang);
if (ref($tuv{$lang}) eq "HASH") {
$cdata++ if defined($tuv{$lang}{-iscdata});
delete($tuv{$lang}{-iscdata}) if exists($tuv{$lang}{-iscdata});
$self->_write_props(2, $tuv{$lang}{-prop}) if exists $tuv{$lang}{-prop};
$self->_write_notes(2, $tuv{$lang}{-note}) if exists $tuv{$lang}{-note};
$tuv{$lang} = $tuv{$lang}{-seg} || "";
}
$self->_startTag(0, 'seg');
if ($verbatim) {
$self->_write($tuv{$lang});
} elsif ($cdata) {
$self->_write("_write($tuv{$lang});
$self->_write("]]>");
} else {
$self->_characters($tuv{$lang});
}
$self->_endTag('seg');
$self->_endTag('tuv')->_nl;
}
$self->_endTag('tu')->_nl->_nl;
}
=head2 end_tmx
$tmx->end_tmx();
Ends the TMX file, closing file handles if necessary.
=cut
sub end_tmx {
my $self = shift();
$self->_endTag('body')->_nl;
$self->_endTag('tmx')->_nl;
close($self->{OUTPUT});
}
=head1 SEE ALSO
TMX Specification L
=head1 AUTHOR
Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE
Alberto Simões, Ealbie@alfarrabio.di.uminho.ptE
=head1 COPYRIGHT AND LICENSE
Copyright 2003 by Projecto Natura
This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=cut
sub _write {
my $self = shift;
print {$self->{OUTPUT}} @_;
return $self;
}
sub _nl {
my $self = shift;
$self->_write("\n");
}
sub _startTag {
my ($self, $indent, $tagName, %attributes) = @_;
my $attributes = "";
$attributes = " ".join(" ",map {"$_=\"$attributes{$_}\""} sort keys %attributes) if %attributes;
$self->_indent($indent)->_write("<$tagName$attributes>");
}
sub _indent {
my ($self, $indent) = @_;
$indent = " " x $indent;
$self->_write($indent);
}
sub _characters {
my ($self, $text) = @_;
$text = "" unless defined $text;
$text =~ s/\n/ /g;
$text =~ s/ +/ /g;
$text =~ s/&/&/g;
$text =~ s/</g;
$text =~ s/>/>/g;
$self->_write($text);
}
sub _endTag {
my ($self, $tagName) = @_;
$self->_write("$tagName>");
}
1;
XML-TMX-0.31/examples/po2tmx 000755 000765 000024 00000003717 11764151056 015643 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -w -s
our($s);
use Data::Dumper;
use strict;
use lib 'lib';
use XML::TMX::FromPO;
my $tmx = new XML::TMX::FromPO(DEBUG => 1);
my %convert = ( 'pt_en' => 'pt en',
'pt_es' => 'pt es',
'pt_de' => 'pt de',
'pt_it' => 'pt it',
'pt_ru' => 'pt ru',
'pt_fr' => 'pt fr',
'pt_br' => 'pt pt_BR',
'br_en' => 'pt_BR en',
'br_es' => 'pt_BR es',
'br_de' => 'pt_BR de',
'br_it' => 'pt_BR it',
'br_ru' => 'pt_BR ru',
'br_fr' => 'pt_BR fr',
'en_fr' => 'en fr'
);
if($s){
my $lang = $s;
if ($s eq "1") { die("Usage $0 -s=pt DIR\n")}
while(my $dir = shift()) {
$tmx->rec_get_po($dir, $lang, LANG=>"$lang en");
open(XML, "| tee _1.aux |xmllint --recover --format - | bzip2 > $lang-en.tmx.bz2");
# print STDOUT Dumper($tmx);
*STDOUT = *XML;
$tmx->create_tmx(LANG => "$lang en");
close(XML);
$tmx->clean_tmx()
}
}
else{
my $lang = '';
for my $c (keys %convert) {
$lang = $lang . " $convert{$c}";
}
while(my $dir = shift()) {
if(-d "$dir/po" and <$dir/po/*.po>){ $tmx->parse_dir("$dir/po", LANG => $lang) ; }
if(-d "$dir" and <$dir/*.po>) { $tmx->parse_dir("$dir", LANG => $lang) ; }
$dir =~ m/([a-z0-9_\.-]+)\/*$/i;
for my $conv (keys %convert) {
open(XML, "|xmllint --format - | bzip2 > $1_$conv.tmx.bz2");
*STDOUT = *XML;
$tmx->create_tmx(LANG => "$convert{$conv}");
close(XML);
}
$tmx->clean_tmx()
}
}
=encoding UTF-8
=head1 NAME
po2tmx - creates TMX files from PO files
=head1 SYNOPSIS
po2tmx POdir #
po2tmx -s=fr #single PO files en-fr (does not join multiple PO-files)
=head1 DESCRIPTION
read several POs, and joinds their translation units to build several
TMX pairs.
=head1 SEE ALSO
XML::TMX
=cut
XML-TMX-0.31/examples/xpdf-tmx 000755 000765 000024 00000003527 12221072235 016146 0 ustar 00ambs staff 000000 000000 #!/usr/bin/perl -s
### NOT IN DISTRO
our ($m);
$m ||= 1000;
my $tmp="/tmp/_tmxpdf_$$_.pdf";
my $tmx=shift or die("usage:\n\t$0 file.tmx\n");
use strict;
if(not -d "$ENV{HOME}/.gwb"){ mkdir("$ENV{HOME}/.gwb"); }
if(not -f "$ENV{HOME}/.gwb/tmx.css"){
open(F,">","$ENV{HOME}/.gwb/tmx.css") or
die("cant create'$ENV{HOME}/.gwb/tmx.css'\n");
while(){ last if /END/;
print F $_}
close F;
}
system("prince -s $ENV{HOME}/.gwb/tmx.css $tmx $tmp 2>/dev/null");
system("xpdf -z width -cont -g 1500x1050 $tmp");
unlink $tmp;
__DATA__
@charset "UTF-8";
tu { display: block;
border-top: solid red 1px;
padding: 2pt;
counter-increment: num;
columns:2;
}
tuv { display: block;
page-break-inside: avoid;
margin-bottom: 4pt; }
tuv {color:orange; font-size:70% ; }
tuv:lang(pt) {color:green; }
tuv:lang(pt_pt){color:green; }
tuv:lang(pt-pt){color:green; }
tuv:lang(PT_PT){color:green; }
tuv:lang(PT-PT){color:green; }
tuv::before {color: black; font-size:70%; }
tuv[lang]::before { content:counter(num) " " attr(lang) " "; }
tuv:lang(pt)::before { content:counter(num) " PT "; }
tuv:lang(en)::before { content:counter(num) " EN "; }
tuv:lang(en_GB)::before { content:counter(num) " EN "; }
tuv:lang(en_US)::before { content:counter(num) " EN "; }
seg { }
header{ display:none; }
@page{
margin: 10mm 10mm 10mm 10mm;
columns:2;
size:A4;
}
@screen{
margin: 8mm 8mm 8mm 8mm;
columns:2;
size:screen;
}
/* END */
=head1 NAME
xpdf-tmx - convert TMX files to PDF and show them
=head1 SYNOPSIS
xpdf-tmx file.tmx
=head1 DESCRIPTION
=head2 Dependencies
C converter must be installed
xpdf
=head1 AUTHOR
J.Joao Almeida, jj@di.uminho.pt
=head1 SEE ALSO
perl(1).
prince
xpdf
=cut