XML-TMX-0.36/0000755000175000017500000000000013154220371011174 5ustar ambsambsXML-TMX-0.36/lib/0000755000175000017500000000000013154220371011742 5ustar ambsambsXML-TMX-0.36/lib/XML/0000755000175000017500000000000013154220371012402 5ustar ambsambsXML-TMX-0.36/lib/XML/TMX.pm0000644000175000017500000000156613154220371013420 0ustar ambsambspackage XML::TMX; $XML::TMX::VERSION = '0.36'; # ABSTRACT: Perl extensions for managing TMX files use 5.010; use warnings; use strict; use parent 'Exporter'; our @ISA = 'Exporter'; our @EXPORT_OK = qw(); 1; __END__ =pod =encoding utf-8 =head1 NAME XML::TMX - Perl extensions for managing TMX files =head1 VERSION version 0.36 =head1 SYNOPSIS use XML::TMX; =head1 DESCRIPTION XML::TMX is the top level module. At the moment it does not contain any useful code, so check sub-modules, please. =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/lib/XML/TMX/0000755000175000017500000000000013154220371013052 5ustar ambsambsXML-TMX-0.36/lib/XML/TMX/Reader.pm0000644000175000017500000003133013154220371014612 0ustar ambsambspackage XML::TMX::Reader; $XML::TMX::Reader::VERSION = '0.36'; # ABSTRACT: Perl extension for reading TMX files use 5.010; use warnings; use strict; use Exporter (); use XML::DT; use XML::TMX::Writer; use File::BOM qw( open_bom); our @ISA = 'Exporter'; our @EXPORT_OK = qw(); sub new { my ($class, $file) = @_; return undef unless -f $file; my $self = bless { encoding => _guess_encoding($file), filename => $file, ignore_markup => 1, } => $class; $self->_parse_header; return $self; } sub _guess_encoding { my $file = shift; my $encoding = 'UTF-8'; my $enc = open_bom (my $fh, $file, ":utf8"); # or die "can't open $file ($!)"; $encoding= $enc if $enc; my $line = <$fh>; if ($line =~ /encoding=['"]([^'"]+)['"]/) { $encoding = $1; } #print STDERR "Debug: defuse.ENC= $enc; enc=$encoding\n"; close $fh; return $encoding; } sub _enc2bin{ my $enc=shift; if($enc =~ /utf.*8/i){ $enc = "utf8" } else {$enc = "encoding($enc)"} return ":$enc"; } sub _parse_header { my $self = shift; my $header = ""; { my $fh; local $/ = ""; open_bom($fh, $self->{filename},_enc2bin($self->{encoding})) ;# or die "$!"; #print STDERR "Debug2: defuse.ENC= $!- ; enc=$self->{encoding}\n"; $header = <$fh>; close $fh; } $header =~ s/^.*().*$/$1/s; $header =~ s!().*$!$1!s; dtstring($header => ( 'header' => sub { $self->{header}{$_} = $v{$_} for (keys %v); }, 'prop' => sub { $v{type} ||= "_"; push @{$self->{header}{-prop}{$v{type}}}, $c; }, 'note' => sub { push @{$self->{header}{-note}}, $c; }, )); } sub ignore_markup { my ($self, $opt) = @_; $opt = 1 unless defined $opt; $self->{ignore_markup} = $opt; } sub languages { my $self = shift; my %languages = (); $self->for_tu({proc_tu => 100}, sub { my $tu = shift; for ( keys %$tu ) { $languages{$_}++ unless m/^-/; } } ); return keys %languages; } sub _merge_notes { my ($orig, $new) = @_; $orig //= []; $orig = [$orig] unless ref $orig eq "ARRAY"; $new = [$new] unless ref $new eq "ARRAY"; push @$orig => grep { my $x = $_; !grep { $x eq $_} @$orig } @$new; return $orig; } sub _merge_props { my ($orig, $new) = @_; die "-prop should be hash" if $orig and ref $orig ne "HASH"; die "-prop should be hash" if $new and ref $new ne "HASH"; for my $key (keys %$new) { $orig->{$key} = _merge_notes($orig->{$key}, $new->{$key}); } return $orig; } sub _compute_header { my ($current, $conf) = @_; my %header = %$current; if (exists($conf->{-note})) { $header{-note} = _merge_notes($header{-note}, $conf->{-note}); } if (exists($conf->{-prop})) { $header{-prop} = _merge_props($header{-prop}, $conf->{-prop}); } return \%header; } sub for_tu { my $self = shift; my $conf = { -header => 1 }; my $i = 0; ref($_[0]) eq "HASH" and $conf = {%$conf , %{shift(@_)}}; my $code = shift; die "invalid processor" unless ref($code) eq "CODE"; local $/; my $outputingTMX = 0; my $tmx; my $data; my $gen=0; my %h = ( -type => { tu => 'SEQ', tuv => 'SEQ' }, tu => sub { my $tu; for my $va (@$c) { if ($va->[0] eq "-prop") { push @{$tu->{$va->[0]}{$va->[1]}}, $va->[2] } elsif ($va->[0] eq "-note") { push @{$tu->{$va->[0]}}, $va->[1] } else { $tu->{$va->[0]} = $va->[1] } } my ($ans, $v) = $code->($tu, \%v); # Check if the user wants to create a TMX and # forgot to say us if (!$outputingTMX && $ans && ref($ans) eq "HASH") { $outputingTMX = 1; $tmx = XML::TMX::Writer->new(); if ($conf->{-header}) { my $header = _compute_header($self->{header}, $conf); $tmx->start_tmx(encoding => $self->{encoding}, %$header); } } # Add the translation unit if ($ans && ref($ans) eq "HASH") { $gen++; %v = %$v if ($v && ref($v) eq "HASH"); my %ans = (%v, %$ans); $ans{"-n"}=$i if $conf->{n} ; $tmx->add_tu(-verbatim => $conf->{-verbatim}, %ans); } }, tuv => sub { my $tuv; for my $v (@$c) { if ($v->[0] eq "-prop") { push @{$tuv->{$v->[0]}{$v->[1]}}, $v->[2] } elsif ($v->[0] eq "-note") { push @{$tuv->{$v->[0]}}, $v->[1] } elsif ($v->[0] eq "-cdata") { $tuv->{-iscdata} = 1; $tuv->{-seg} = $v->[1]; } else { $tuv->{-seg} = $v->[0]; } } [ $v{lang} || $v{'xml:lang'} || "_" => $tuv ] }, prop => sub { ["-prop", $v{type} || "_", $c] }, note => sub { ["-note" , $c] }, seg => sub { return ($v{iscdata}) ? [ -cdata => $c ] : [ $c ] }, -cdata => sub { father->{'iscdata'} = 1; $c }, hi => sub { $self->{ignore_markup}?$c:toxml }, ph => sub { $self->{ignore_markup}?$c:toxml }, ); $/ = "\n"; $h{-outputenc} = $h{-inputenc} = $self->{encoding}; my $resto = ""; ## Go through the header... my $fh; open_bom($fh, $self->{filename},_enc2bin($self->{encoding})) ;# or die "$!"; #print STDERR "Debug2: defuse.ENC= $! ; enc=" ,_enc2bin($self->{encoding}),"\n"; while (<$fh>) { next if /^\s*$/; last if /)(.*)!s) { $resto = $3; } # If we have an output filename, user wants to output a TMX $conf->{-output} = $conf->{output} if defined($conf->{output}); if (defined($conf->{-output})) { $outputingTMX = 1; $tmx = XML::TMX::Writer->new(); if ($conf->{-header}) { my $header = _compute_header($self->{header}, $conf); $tmx->start_tmx(encoding => $self->{encoding}, -output => $conf->{-output}, %$header); } } $/ = ""; $conf->{-verbose}++ if $conf->{verbose}; print STDERR "." if $conf->{-verbose}; while (<$fh>) { ($_ = $resto . $_ and $resto = "" ) if $resto; last if /<\/body>/; $i++; print STDERR "\r$i" if $conf->{-verbose} && !($i % 10); last if defined $conf->{proc_tu} && $i > $conf->{proc_tu} ; last if defined $conf->{gen_tu} && $gen > $conf->{gen_tu}; next if defined $conf->{patt} && !m/$conf->{patt}/ ; #### # This can't be done. Not sure why it was being done. # So, please, unless you know the implications for tagged crpora # do not uncomment it. # s/\>\s+/>/; undef($data); if ($conf->{'-raw'}) { my $ans = $code->($_); if ($conf->{-output}) { $ans->{"-n"}=$i if $conf->{n} ; $tmx->add_tu(-raw => $ans); } } else { eval { dtstring($_, %h) } ; ## dont die in invalid XML warn $@ if $@; } } print STDERR "\r$i\n" if $conf->{-verbose}; close $fh; $tmx->end_tmx if $conf->{-header} && $outputingTMX; } sub to_html { my $self = shift; my %opt = @_; my @ls=$self->languages; my $html="". join("\n",map{""} @ls) ."\n"; $self->for_tu(sub { my ($langs, $opts) = @_; my $ret = ""; # "
$_
"; for (@ls) { $ret .= "\n\t"} #if ($opt{icons}) { # $ret .= "\"$_\"/" # } else { # $ret .= "$_" # } # $ret .= "\n" $html.= "$ret\n\n"; return "" } ); return $html; } sub for_tu2 { warn "Please update your code to use 'for_tu'\n"; &for_tu; } 1; __END__ =pod =encoding utf-8 =head1 NAME XML::TMX::Reader - Perl extension for reading TMX files =head1 VERSION version 0.36 =head1 SYNOPSIS use XML::TMX::Reader; my $reader = XML::TMX::Reader->new( $filename ); $reader -> for_tu( sub { my $tu = shift; #blah blah blah }); @used_languages = $reader->languages; $reader->to_html() =head1 DESCRIPTION This module provides a simple way for reading TMX files. =head1 METHODS The following methods are available: =head2 C This method creates a new XML::TMX::Reader object. This process checks for the existence of the file and extracts some meta-information from the TMX header; my $reader = XML::TMX::Reader->new("my.tmx"); =head2 C This method is used to set the flag to ignore (or not) markup inside translation unit segments. The default is to ignore those markup. If called without parameters, it sets the flag to ignore the markup. If you don't want to do that, use $reader->ignore_markup(0); =head2 C This method returns the languages being used on the specified translation memory. Note that the module does not check for language code correctness or existence. =head2 C Use C to process all translation units from a TMX file. This version iterates for all tu (one at the time) The configuration hash is a reference to a Perl hash. At the moment these are valid options: =over =item C<-verbose> Set this option to a true value and a counter of the number of processed translation units will be printed to stderr. =item C<-output> | C Filename to output the changed TMX to. Note that if you use this option, your function should return a hash reference where keys are language names, and values their respective translation. =item C Write at most C TUs =item C Process at most C TUs =item C Only process TU that match C. =item C<-raw> Pass the XML directly to the method instead of parsing it. =item C<-verbatim> Use segment contents verbatim, without any normalization. =item C<-prop> A hashref of properties to be B to the TMX header block. =item C<-note> An arrayref of notes to be B to the TMX header block. =item C<-header> A boolean value. If set to true, the heading tags (and closing tag) of the TMX file are written. Otherwise, only the translation unit tags are written. =back The function will receive two arguments: =over =item * a reference to a hash which maps: the language codes to the respective translation unit segment; a special key "-prop" that maps property names to properties; a special key "-note" that maps to a list of notes. =item * a reference to a hash which contains the attributes for those translation unit tag; =back If you want to process the TMX and return it again, your function should return an hash reference where keys are the languages, and values their respective translation. =head2 C Use this method to create a nice HTML file with the translation memories. Notice that this method is not finished yet, and relies on some images, on some specific locations. =head2 C deprecated. use C =head1 SEE ALSO L, TMX Specification L =head1 CONTRIBUTORS Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/lib/XML/TMX/FromPO.pm0000644000175000017500000002076213154220371014561 0ustar ambsambspackage XML::TMX::FromPO; $XML::TMX::FromPO::VERSION = '0.36'; # ABSTRACT: Generates a TMX file from a group of PO files use 5.010; use warnings; use strict; use XML::TMX::Writer; use Exporter (); our @ISA = 'Exporter'; our @EXPORT_OK = qw(&new &parse_dir &create_tmx &clean_tmx); sub new { my $proto = shift; my $class = ref($proto) || $proto; my $self = {}; $self->{LANG} = undef; $self->{OUTPUT} = undef; $self->{DEBUG} = 0; __common_conf($self, @_); unless(defined($self->{CONVER})) { if(system('recode >/dev/null 2>&1')) { $self->{CONVERT} = 'iconv -f %t -t utf8 < %f'; } else { $self->{CONVERT} = 'recode %t..utf8 < %f'; } } $self->{TMX} = {}; bless($self, $class); return($self); } sub rec_get_po { my $self = shift; my $dir = shift; my $lan1 = shift; __common_conf($self, @_); # check if directory is readable if(-f $dir) { my $file=$dir; my $lang = lc($lan1); if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } else { die("$dir is not a readable directory\n") unless(-d $dir); for my $file (<$dir/*>) { if($file =~ /(.*)\.(po|messages)$/) { my $lang = lc($lan1); if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } elsif($file =~ /(.*)\.(\w+)\.(po|messages)$/) { my $lang = lc($lan1); ## ??? my $lang = "\L$2"; if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } elsif(-d $file) { rec_get_po($self,$file,$lan1) } else { ## warn ("$file ... não tem lingua\n") if($self->{DEBUG}); } } } __add_en($self) if(__check_lang($self, 'en')); __limpa($self); } sub parse_dir { my $self = shift; my $dir = shift; __common_conf($self, @_); # check if directory is readable die("$dir is not a readable directory\n") unless(-d $dir); for my $file ((<$dir/*.po>),(<$dir/*.messages>)) { if($file =~ /(\w+)\.(po|messages)$/) { my $lang = "\L$1"; if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } elsif($file =~ /(.*)\.(\w+)\.(po|messages)$/) { my $lang = "\L$2"; if(!defined($self->{LANG}) || __check_lang($self, $lang)) { __processa($self, $file, $lang); } } else { warn ("$file ... não tem lingua\n") if($self->{DEBUG}); } } __add_en($self) if(__check_lang($self, 'en')); __limpa($self); } # return value: # * 0 -> lang does not exist # * 1 -> lang exists sub __check_lang { my $self = shift; my $lang = shift; my @regex = @{$self->{LANG}}; while(my $regex = shift(@regex)) { last if($regex gt $lang); if($lang =~ /^$regex$/i) { return(1); } } return(0); } sub __add_en { my $self = shift; for my $str (keys %{$self->{TMX}}) { $self->{TMX}{$str}{'en'} = $str; } } sub create_tmx { my $self = shift; my $tmx = new XML::TMX::Writer(); __common_conf($self, @_); my $n_langs = @{$self->{LANG}}; if(defined($self->{OUTPUT})) { $tmx->start_tmx(ID => 'XML::TMX::FromPO', OUTPUT => $self->{OUTPUT}); } else { $tmx->start_tmx(ID => 'XML::TMX::FromPO'); } for my $chave (keys %{$self->{TMX}}) { my $reg = __make_tu($self, $self->{TMX}{$chave}); # only write to file if all languages are defined $tmx->add_tu(%{$reg}) if(keys(%{$reg}) >= $n_langs); } $tmx->end_tmx(); } sub clean_tmx { my $self = shift; $self->{TMX} = {}; } sub __make_tu { my $self = shift; my $block = shift; my $reg = {}; if(!defined($self->{LANG})) { return($block); } for my $lang (keys %$block) { $reg->{$lang} = $block->{$lang} if(__check_lang($self, $lang)); } return($reg); } sub __processa { my $self = shift; my $a = shift; my $l = shift; local $/ = "\nmsgid"; #local $/ = "\nmsgid "; print STDERR "$a\n" if($self->{DEBUG}); my $codeline = `grep -i Content-Type $a | grep -i charset`; my $code = "?"; if($codeline =~ /charset=([\w-]+)/) { $code = $1; } my $convert = $self->{CONVERT}; $convert =~ s/\%t/$code/i; $convert =~ s/\%f/$a/i; if($code eq "?" || $code =~ /utf-?8/i ) { open(F,$a) or die;} else { open(F,"$convert|") or die;} my $mi = 0; while() { chomp; next if($mi == 0 && /^msgid\s+""/); if(/"Content-Type:/ && /charset=([\w-]+)/) { $code = $1; next } s/(^|\n)\s*#.*//g; # s/_//g unless $under; next unless(/\n\s*msgstr/); my ($m1,$m2) = ($`,$'); $m1 =~ s/(^\s*"|"\s*$)//g; $m1 =~ s/("\s*\n\s*")/ /g; $m2 =~ s/(^\s*"|"\s*$)//g; $m2 =~ s/("\s*\n\s*")/ /g; unless($m1) { warn "\n====M1 vazio... \n$m1\n=$m2\n"; next; } if($m2) { $self->{TMX}{$m1}{$l} = $m2; } # || "????? $m1"; #$self->{TMX}{$m1}{'en'} = $m1; # print "\n====\n$m1\n=$m2\n"; $mi++; } print STDERR "Charset: $code\n" if($self->{DEGUB}); close F; } sub __limpa { my $self = shift; # possíveis limpezas # (1) eliminar traduções que sejam igual ao original # (2) eliminar strings que não contenham pelo menos 2 letras # consecutivas # (3) eliminar frases que fiquem sem traduções # # um teste realizado com os po's do evolution mostrou uma redução do # ficheiro final de 12M para 8,6M, uma análise com o diff aos dumps # permitiu ver que grande parte do ''lixo'' eram de (1) for my $h1 (keys %{$self->{TMX}}) { if($h1 =~ /[a-z][a-z]/i) { for my $h2 (keys %{$self->{TMX}{$h1}}) { # optimização (1) delete($self->{TMX}{$h1}{$h2}) if($h2 !~ /^en/i && $h1 eq $self->{TMX}{$h1}{$h2}); } # optimização (3) delete($self->{TMX}{$h1}) unless(keys %{$self->{TMX}{$h1}}); } else { # optimização (2) delete($self->{TMX}{$h1}); } } } sub __common_conf { my $self = shift; my %opt = @_; if(defined($opt{LANG})) { my @list; for my $l (sort(split(/\s+/, $opt{LANG}))) { push(@list, $l) if($l =~ /^[a-z0-9_]+$/i); } $self->{LANG} = \@list if(@list); } $self->{CONVERT} = $opt{CONVERT} if defined($opt{CONVERT}); $self->{OUTPUT} = $opt{OUTPUT} if defined($opt{OUTPUT}); $self->{DEBUG} = $opt{DEBUG} if defined($opt{DEBUG}); } 1; __END__ =pod =encoding utf-8 =head1 NAME XML::TMX::FromPO - Generates a TMX file from a group of PO files =head1 VERSION version 0.36 =head1 SYNOPSIS use XML::TMX::FromPO; my $conv = new XML::TMX::FromPO(OUTPUT => '%f.tmx'); =head1 DESCRIPTION This module can be used to generate TMX files from a group of PO files. =head1 METHODS The following methods are available: =head2 new $tmx = new XML::TMX::FromPO(); Creates a new XML::TMX::FromPO object. Please check the L section for details on the options. =head2 rec_get_po TODO: Document method =head2 parse_dir TODO: Document method =head2 create_tmx TODO: Document function =head2 clean_tmx TODO: Document method =head1 COMMON CONFIGURATION These configuration options can be passed to all methods in the module: =over =item LANG => 'list' A case insensitive list of regular expression separated by whitespaces that matches the code of the languages that are to be processed. Defaults to all. =item CONVERT => 'iconv -f %t -t utf8 < %f' A string that contains the command to convert a file (%f) from some charset (%t) to Unicode. If none is specified, the module tries to use L, if it fails then the module defaults to L. =item OUTPUT => 'x.tmx' The name of the output file. If none is specified it defaults to the standard output. =item DEBUG => 1 Activate debugging information. Defaults to 0. =back =head1 SEE ALSO L, L, L, L =head1 CONTRIBUTORS Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/lib/XML/TMX/Writer.pm0000644000175000017500000002714213154220371014672 0ustar ambsambspackage XML::TMX::Writer; $XML::TMX::Writer::VERSION = '0.36'; # ABSTRACT: Perl extension for writing TMX files use 5.010; use warnings; use strict; use Exporter (); our @ISA = 'Exporter'; our @EXPORT_OK = qw(); sub new { my $proto = shift; my $class = ref($proto) || $proto; my %ops = @_; my $self = { OUTPUT => \*STDOUT }; binmode $self->{OUTPUT}, ":utf8" unless exists $ops{-encoding} and $ops{-encoding} !~ /utf.?8/i; bless($self, $class); return($self); } sub start_tmx { my $self = shift; my %options = @_; my %o; my @time = gmtime(time); $o{'creationdate'} = sprintf("%d%02d%02dT%02d%02d%02dZ", $time[5]+1900, $time[4]+1, $time[3], $time[2], $time[1], $time[0]); my $encoding = $options{encoding} || "UTF-8"; if (defined($options{'-output'})) { delete $self->{OUTPUT}; # because it is a glob open $self->{OUTPUT}, ">", $options{'-output'} or die "Cannot open file '$options{'-output'}': $!\n"; } if ($encoding =~ m!utf.?8!i) { binmode $self->{OUTPUT}, ":utf8" } $self->_write("\n"); my @valid_segtype = qw'block sentence paragraph phrase'; if(defined($options{SEGTYPE}) && grep { $_ eq $options{SEGTYPE} } @valid_segtype) { $o{segtype} = $options{SEGTYPE}; } else { $o{segtype} = 'sentence' } $o{'creationtool'} = $options{tool} || 'XML::TMX::Writer'; $o{'creationtoolversion'} = $options{toolversion} || $XML::TMX::Writer::VERSION; $o{'o-tmf'} = $options{srctmf} || 'plain text'; $o{'adminlang'} = $options{adminlang} || 'en'; $o{'srclang'} = $options{srclang} || 'en'; $o{'datatype'} = $options{datatype} || 'plaintext'; defined($options{srcencoding}) and $o{'o-encoding'} = $options{srcencoding}; defined($options{id}) and $o{'creationid'} = $options{id}; $self->_startTag(0, 'tmx', 'version' => 1.4)->_nl; $self->_startTag(1, 'header', %o)->_nl; $self->_write_props(2, $options{'-prop'}) if defined $options{'-prop'}; $self->_write_notes(2, $options{'-note'}) if defined $options{'-note'}; $self->_indent(1)->_endTag('header')->_nl; $self->_startTag(0,'body')->_nl->_nl; } sub _write_props { my ($self, $indent, $props) = @_; return unless ref($props) eq "HASH"; for my $key (sort keys %$props) { if (ref($props->{$key}) eq "ARRAY") { for my $val (@{$props->{$key}}) { if ($key eq "_") { $self->_startTag($indent, 'prop'); } else { $self->_startTag($indent, prop => (type => $key)); } $self->_characters($val); $self->_endTag('prop')->_nl; } } else { if ($key eq "_") { $self->_startTag($indent, 'prop'); } else { $self->_startTag($indent, prop => (type => $key)); } $self->_characters($props->{$key}); $self->_endTag('prop')->_nl; } } } sub _write_notes { my ($self, $indent, $notes) = @_; return unless ref($notes) eq "ARRAY"; for my $p (@{$notes}) { $self->_startTag($indent, 'note'); $self->_characters($p); $self->_endTag('note')->_nl; } } sub add_tu { my $self = shift; my %tuv = @_; my %prop = (); my @note = (); my %opt; my $verbatim = 0; my $cdata = 0; if (exists($tuv{-raw})) { # value already includes tags, hopefully, at least! # so we will not mess with it. $self->_write($tuv{-raw}); return; } for my $key (qw'id datatype segtype srclang creationid creationdate') { if (exists($tuv{$key})) { $opt{$key} = $tuv{$key}; delete $tuv{$key}; } } if (defined($tuv{srcencoding})) { $opt{'o-encoding'} = $tuv{srcencoding}; delete $tuv{srcencoding}; } $verbatim++ if defined $tuv{-verbatim}; delete $tuv{-verbatim} if exists $tuv{-verbatim}; if (defined($tuv{"-prop"})) { %prop = %{$tuv{"-prop"}}; delete $tuv{"-prop"}; } if (defined($tuv{"-note"})) { @note = @{$tuv{"-note"}}; delete $tuv{"-note"}; } if (defined($tuv{"-n"})) { $opt{id}=$tuv{"-n"}; delete $tuv{"-n"}; } $self->_startTag(0,'tu', %opt)->_nl; ### write the prop s problemas 23 $self->_write_props(3, \%prop); $self->_write_notes(3, \@note); for my $lang (sort keys %tuv) { my $cdata = 0; $self->_startTag(1, 'tuv', 'xml:lang' => $lang); if (ref($tuv{$lang}) eq "HASH") { $cdata++ if defined($tuv{$lang}{-iscdata}); delete($tuv{$lang}{-iscdata}) if exists($tuv{$lang}{-iscdata}); $self->_write_props(2, $tuv{$lang}{-prop}) if exists $tuv{$lang}{-prop}; $self->_write_notes(2, $tuv{$lang}{-note}) if exists $tuv{$lang}{-note}; $tuv{$lang} = $tuv{$lang}{-seg} || ""; } $self->_startTag(0, 'seg'); if ($verbatim) { $self->_write($tuv{$lang}); } elsif ($cdata) { $self->_write("_write($tuv{$lang}); $self->_write("]]>"); } else { $self->_characters($tuv{$lang}); } $self->_endTag('seg'); $self->_endTag('tuv')->_nl; } $self->_endTag('tu')->_nl->_nl; } sub end_tmx { my $self = shift(); $self->_endTag('body')->_nl; $self->_endTag('tmx')->_nl; close($self->{OUTPUT}); } sub _write { my $self = shift; print {$self->{OUTPUT}} @_; return $self; } sub _nl { my $self = shift; $self->_write("\n"); } sub _startTag { my ($self, $indent, $tagName, %attributes) = @_; my $attributes = ""; $attributes = " ".join(" ",map {"$_=\"$attributes{$_}\""} sort keys %attributes) if %attributes; $self->_indent($indent)->_write("<$tagName$attributes>"); } sub _indent { my ($self, $indent) = @_; $indent = " " x $indent; $self->_write($indent); } sub _characters { my ($self, $text) = @_; $text = "" unless defined $text; $text =~ s/\n/ /g; $text =~ s/ +/ /g; $text =~ s/&/&/g; $text =~ s//>/g; $text =~ s!<(b|emph)>(.+?)</\1>!<$1>$2!gs; $self->_write($text); } sub _endTag { my ($self, $tagName) = @_; $self->_write(""); } 1; __END__ =pod =encoding utf-8 =head1 NAME XML::TMX::Writer - Perl extension for writing TMX files =head1 VERSION version 0.36 =head1 SYNOPSIS use XML::TMX::Writer; my $tmx = XML::TMX::Writer->new(); $tmx->start_tmx(id => 'paulojjs'); $tmx->add_tu('en' => 'some text', 'pt' => 'algum texto'); $tmx->add_tu('en' => 'some text', 'pt' => 'algum texto', -note => [32, 34 ], -prop => { q => 23, aut => "jj"} ); $tmx->end_tmx(); =head1 DESCRIPTION This module provides a simple way for writing TMX files. =head1 METHODS The following methods are available: =head2 new $tmx = new XML::TMX::Writer(); Creates a new XML::TMX::Writer object =head2 start_tmx $tmx->start_tmx(-output => 'some_file.tmx'); Begins a TMX file. Several options are available: =over 4 =item -output Output of the TMX, if none is defined stdout is used by default. =item tool Tool used to create the TMX. Defaults to 'XML::TMX::Writer' =item toolversion Some version identification of the tool used to create the TMX. Defaults to the current module version =item segtype Segment type used in the ItuE> elements. Possible values are I, I, I and I. Defaults to I. =item srctmf Specifies the format of the translation memory file from which the TMX document or segment thereof have been generated. =item adminlang Specifies the default language for the administrative and informative elements InoteE> and IpropE>. =item srclang Specifies the language of the source text. If a ItuE> element does not have a srclang attribute specified, it uses the one defined in the IheaderE> element. Defaults to I<*all*>. =item datatype Specifies the type of data contained in the element. Depending on that type, you may apply different processes to the data. The recommended values for the datatype attribute are as follow (this list is not exhaustive): =over 4 =item unknown undefined =item alptext WinJoust data =item cdf Channel Definition Format =item cmx Corel CMX Format =item cpp C and C++ style text =item hptag HP-Tag =item html HTML, DHTML, etc =item interleaf Interleaf documents =item ipf IPF/BookMaster =item java Java, source and property files =item javascript JavaScript, ECMAScript scripts =item lisp Lisp =item mif Framemaker MIF, MML, etc =item opentag OpenTag data =item pascal Pascal, Delphi style text =item plaintext Plain text (default) =item pm PageMaker =item rtf Rich Text Format =item sgml SGML =item stf-f S-Tagger for FrameMaker =item stf-i S-Tagger for Interleaf =item transit Transit data =item vbscript Visual Basic scripts =item winres Windows resources from RC, DLL, EXE =item xml XML =item xptag Quark XPressTag =back =item srcencoding All TMX documents are in Unicode. However, it is sometimes useful to know what code set was used to encode text that was converted to Unicode for purposes of interchange. This option specifies the original or preferred code set of the data of the element in case it is to be re-encoded in a non-Unicode code set. Defaults to none. =item id Specifies the identifier of the user who created the element. Defaults to none. =item -note A reference to a list of notes to be added in the header. =item -prop A reference fo a hash of properties to be added in the header. Keys are used as the C attribute, value as the tag contents. =back =head2 add_tu $tmx->add_tu(srclang => LANG1, LANG1 => 'text1', LANG2 => 'text2'); $tmx->add_tu(srclang => LANG1, LANG1 => 'text1', LANG2 => 'text2', -note => ["value1", ## notes "value2"], -prop => { type1 => ["value1","value"], #multiple values _ => 'value2', # anonymound properties typen => ["valuen"],} ); Adds a translation unit to the TMX file. Several optional labels can be specified: =over =item id Specifies an identifier for the ItuE> element. Its value is not defined by the standard (it could be unique or not, numeric or alphanumeric, etc.). =item srcencoding Same meaning as told in B method. =item datatype Same meaning as told in B method. =item segtype Same meaning as told in B method. =item srclang Same meaning as told in B method. =back =head2 end_tmx $tmx->end_tmx(); Ends the TMX file, closing file handles if necessary. =head1 SEE ALSO TMX Specification L =head1 CONTRIBUTORS Paulo Jorge Jesus Silva, Epaulojjs@bragatel.ptE =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/t/0000755000175000017500000000000013154220371011437 5ustar ambsambsXML-TMX-0.36/t/98_pod.t0000644000175000017500000000020113154220371012717 0ustar ambsambsuse Test::More; eval "use Test::Pod 1.00"; plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; all_pod_files_ok(); XML-TMX-0.36/t/80_frompo.t0000644000175000017500000000012513154220371013433 0ustar ambsambs# -*- cperl -*- use Test::More tests => 1; BEGIN { use_ok(XML::TMX::FromPO); }; XML-TMX-0.36/t/25_for_tu.t0000644000175000017500000000242013154220371013426 0ustar ambsambs# -*- cperl -*- use Data::Dumper; use Test::More tests => 9; BEGIN { use_ok(XML::TMX::Reader); }; my $reader; $reader = XML::TMX::Reader->new('t/sample.tmx'); ok($reader, "reading sample.tmx"); my $count = 0; $reader->for_tu( sub { my $tu = shift; $count++; }); is($count, 7, "counting tu's with for_tu"); $count = 0; $reader->for_tu( {output => "t/_tmp.tmx", proc_tu => 6 }, sub { my $tu = shift; $tu->{-prop}={number => ++$count}; $tu; }); ok( -f "t/_tmp.tmx"); #unlink( "t/_tmp.tmx"); $reader = XML::TMX::Reader->new('t/_tmp.tmx'); ok($reader,"loadind t/_tmp.tmx"); $count = 0; $reader->for_tu( sub { my $tu = shift; $count++; }); is($count, 6, "counting tu's with for_tu"); $reader->for_tu( {output => "t/_tmp2.tmx", gen_tu=>2}, sub { my $tu = shift; if($tu->{-prop}{number} % 2 == 0) { return undef } else { $tu->{-note}[0]="This one is even"; return $tu;} }); my @langs = $reader->languages; is(@langs, 2 , "languages".join(",",@langs)); ok(grep { $_ eq "EN-GB" } @langs, "en"); ok(grep { $_ eq "PT-PT" } @langs, "pt"); unlink( "t/_tmp.tmx"); unlink( "t/_tmp2.tmx"); XML-TMX-0.36/t/20_reader.t0000644000175000017500000000371313154220371013373 0ustar ambsambs# -*- cperl -*- use Test::More tests => 10; use File::Temp; use XML::TMX::Reader; ok 1; my $reader; $reader = XML::TMX::Reader->new('foobar.tmx'); ok(!$reader,"rigth foobar.tmx is not present"); $reader = XML::TMX::Reader->new('t/sample.tmx'); ok($reader, "reading sample.tmx"); is_deeply($reader->{header}, { '-prop' => { 'bodykey' => ['bodyvalue'] }, '-note' => [ 'bodynote' ], 'o-tmf' => 'TW4Win 2.0 Format', adminlang => 'EN-US', creationdate => '20020312T164816Z', creationtoolversion => '5.0', creationtool => 'MyTool', srclang => 'EN-GB', segtype => 'sentence', datatype => 'html', }); my $count = 0; $reader->for_tu( sub { my $tu = shift; $count++; }); my $tmp = File::Temp->new(SUFFIX=>'.tmx', UNLINK => 0); is($count, 7, "counting tu's with for_tu"); $reader->for_tu( { -output => $tmp->filename }, sub { my $tu = shift; $tu->{-prop}={q=>[77], aut=>["jj","ambs"]}; $tu->{-note}=[2..5]; $tu; }); ok( -f $tmp->filename ); $reader = XML::TMX::Reader->new( $tmp->filename ); ok $reader,"loading " . $tmp->filename; my $tmp2 = File::Temp->new(SUFFIXE=>'.tmx', UNLINK => 0); $reader->for_tu( {output => $tmp2->filename }, sub { my $tu = shift; for (keys %{$tu->{-prop}}){ $tu->{-prop}{$_} .= "batatas"; } for (@{$tu->{-note}}){ $_ = "$_ cabolas" } $tu; }); my @langs = $reader->languages; is(@langs, 2 , "languages".join(",",@langs)); ok(grep { $_ eq "EN-GB" } @langs, "en"); ok(grep { $_ eq "PT-PT" } @langs, "pt"); unlink( $tmp->filename ); unlink( $tmp2->filename ); XML-TMX-0.36/t/tmx14.dtd0000644000175000017500000002436113154220371013117 0ustar ambsambs XML-TMX-0.36/t/cat.xml0000644000175000017500000000302213154220371012725 0ustar ambsambs
val11 val22 val3 val4 note1 note2 note3 note4 note5 note6
sval1 sval2 snote1 snote2 snote3 algum texto b d a b c d some text svalA svalB snoteA snoteB snoteC Hello World 2 4 1 2 3 4 olá mundo
XML-TMX-0.36/t/writer1.xml0000644000175000017500000000161613154220371013562 0ustar ambsambs
val1 val2 note1 note2 note3
sval1 sval2 snote1 snote2 snote3 b d a b c d some text algum texto
XML-TMX-0.36/t/10_writer.t0000644000175000017500000000304613154220371013443 0ustar ambsambs# -*- cperl -*- use Test::More tests => 4; use XML::TMX::Writer; ok 1; my $tmx = new XML::TMX::Writer(); isa_ok($tmx, "XML::TMX::Writer"); $tmx->start_tmx(id => "foobar", -prop => { prop1 => 'val1', prop2 => 'val2', }, -note => [ 'note1', 'note2', 'note3' ], -output => "_${$}_"); $tmx->add_tu(srclang => 'en', -note => ['snote1', 'snote2', 'snote3'], -prop => { sprop1 => 'sval1', sprop2 => 'sval2' }, 'en' => {-prop => { a=>'b',c=>'d'}, -note => [qw,a b c d,], -seg =>'some text', }, 'pt' => 'algum texto'); $tmx->end_tmx(); ok(-f "_${$}_"); ok file_contents_almost_identical("t/writer1.xml", "_${$}_"); unlink "_${$}_"; sub file_contents_almost_identical { my ($file1, $file2) = @_; return 0 unless -f $file1; return 0 unless -f $file2; open F1, $file1 or die; open F2, $file2 or die; my ($l1,$l2); while (defined($l1 = ) && defined($l2 = )) { s/>\s*; return 0 if ; return 1; } XML-TMX-0.36/t/50_tmxcat.t0000644000175000017500000000172713154220371013437 0ustar ambsambs#!/usr/bin/perl use strict; use warnings; use XML::TMX::Reader; use Test::More tests => 4; `$^X -Iblib scripts/tmx2tmx -cat t/writer1.xml t/writer2.xml > mycat.tmx`; ok -f 'mycat.tmx'; my $reader = XML::TMX::Reader->new('mycat.tmx'); isa_ok $reader => 'XML::TMX::Reader'; like $reader->{header}{creationdate} => qr/^\d+T\d+Z$/; delete $reader->{header}{creationtoolversion}; delete $reader->{header}{creationdate}; is_deeply($reader->{header}, { 'o-tmf' => 'plain text', adminlang => 'en', creationtool => 'XML::TMX::Writer', srclang => 'en', segtype => 'sentence', datatype => 'plaintext', '-note' => [qw.note1 note2 note3 note4 note5 note6.], '-prop' => {prop3 => ['val3'], prop4 => ['val4'], prop2 => ['val2', 'val22'], prop1 => ['val1', 'val11'], }, }); #unlink 'mycat.tmx'; XML-TMX-0.36/t/01_main.t0000644000175000017500000000010213154220371013041 0ustar ambsambs# -*- cperl -*- use Test::More tests => 1; use XML::TMX; ok 1; XML-TMX-0.36/t/00-aux.t0000644000175000017500000000232213154220371012635 0ustar ambsambs#!/usr/bin/perl use XML::TMX::Reader; use Test::More; is_deeply(XML::TMX::Reader::_merge_notes(undef, "foo"), ["foo"]); is_deeply(XML::TMX::Reader::_merge_notes(undef, ["foo"]), ["foo"]); is_deeply(XML::TMX::Reader::_merge_notes([], ["foo"]), ["foo"]); is_deeply(XML::TMX::Reader::_merge_notes("foo","foo"), ["foo"]); is_deeply([ sort @{ XML::TMX::Reader::_merge_notes("foo", ["foo", "bar"]) }], [ sort (qw'foo bar') ]); is_deeply([ sort @{ XML::TMX::Reader::_merge_notes([qw'a b c e'], [qw'c d e f']) }], [ sort (qw'a b c d e f') ]); is_deeply(XML::TMX::Reader::_merge_props(undef, {foo => 'bar'}), {foo=>['bar']}); is_deeply(XML::TMX::Reader::_merge_props({foo => 'bar'}, {foo => 'bar'}), {foo=>['bar']}); my $m = XML::TMX::Reader::_merge_props({foo => 'ugh'}, {foo => 'bar'}); ok exists $m->{foo}; is_deeply [ sort @{$m->{foo}} ], [sort(qw'bar ugh')]; is_deeply(XML::TMX::Reader::_merge_props({foz => 'bar'}, {foo => 'bar'}), {foz=>'bar', foo=>['bar']}); $m = XML::TMX::Reader::_merge_props({foz => 'bar'}, {foz => 'baz', foo => 'bar'}); ok exists $m->{foo}; ok exists $m->{foz}; is_deeply [ sort @{$m->{foz}} ], [sort(qw'bar baz')]; is_deeply $m->{foo}, ['bar']; done_testing(); XML-TMX-0.36/t/sample.tmx0000644000175000017500000000761113154220371013457 0ustar ambsambs
bodyvalue bodynote
1 tu nr 1 1 tuv nr 1 en From Paul, called to be an apostle of Christ Jesus by the will of God, and from Sosthenes, our brother, 2 tuv nr 2 pt Paulo, Apóstolo de Jesus Cristo por vontade e chamamento de Deus, e o irmão Sóstenes, 2 tu nr 2 to God's Church which is in Corinth; to you whom God has sanctified in Christ Jesus and called to be holy, together with those who everywhere call upon the name of our Lord Christ Jesus, their Lord and ours. à Igreja de Deus que está em Corinto. Dirigimo-nos àqueles que foram santificados em Jesus Cristo e chamados a ser santos, juntamente com todos os que invocam em todo o lugar o nome de nosso Senhor Jesus Cristo, Senhor deles e nosso. 3 tu nr 3 Receive grace and peace from God our Father, and Christ Jesus our Lord. Graça e paz vos sejam dadas da parte de Deus nosso Pai e do Senhor Jesus Cristo. 4 tu nr 4 I give thanks constantly to my God for you and for the grace of God given to you in Christ Jesus. Sem cessar, agradeço a Deus por vossa causa, em vista da graça de Deus que vos foi concedida em Jesus Cristo. 5 tu nr 5 For you have been fully enriched in him with words as well as with knowledge, Pois em Jesus é que recebestes todas as riquezas, tanto da palavra como do conhecimento. 6 tu nr 6 even as the testimony concerning Christ was confirmed in you. Na verdade, o testemunho de Cristo tornou-se firme em vós, 7 tu nr 7 You do not lack any spiritual gift and only await the glorious coming of Christ Jesus, our Lord. a tal ponto que não vos falta nenhum dom, a vós que esperais a Revelação de nosso Senhor Jesus Cristo.
XML-TMX-0.36/t/99_pod-coverage.t0000644000175000017500000000024413154220371014520 0ustar ambsambsuse Test::More; eval "use Test::Pod::Coverage 0.08"; plan skip_all => "Test::Pod::Coverage 0.08 required for testing POD coverage" if $@; all_pod_coverage_ok(); XML-TMX-0.36/t/writer2.xml0000644000175000017500000000175213154220371013564 0ustar ambsambs
val11 val22 val3 val4 note4 note5 note6
svalA svalB snoteA snoteB snoteC Hello World 2 4 1 2 3 4 olá mundo
XML-TMX-0.36/META.yml0000644000175000017500000000141313154220371012444 0ustar ambsambs--- abstract: 'Perl extensions for managing TMX files' author: - 'Alberto Simões ' - 'José João Almeida ' build_requires: Test::More: '0' configure_requires: ExtUtils::MakeMaker: '0' dynamic_config: 0 generated_by: 'Dist::Zilla version 6.010, CPAN::Meta::Converter version 2.150010' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html version: '1.4' name: XML-TMX requires: DB_File: '0' Data::Dumper: '0' Digest::MD5: '0' Encode: '0' Exporter: '0' Fcntl: '0' File::BOM: '0' File::Temp: '0' Getopt::Long: '0' Pod::Usage: '0' XML::DT: '0' parent: '0' perl: v5.10.0 strict: '0' utf8: '0' warnings: '0' version: '0.36' x_serialization_backend: 'YAML::Tiny version 1.70' XML-TMX-0.36/LICENSE0000644000175000017500000004400413154220371012203 0ustar ambsambsThis software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. Terms of the Perl programming language system itself a) the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version, or b) the "Artistic License" --- The GNU General Public License, Version 1, February 1989 --- This software is Copyright (c) 2010-2017 by Projeto Natura . This is free software, licensed under: The GNU General Public License, Version 1, February 1989 GNU GENERAL PUBLIC LICENSE Version 1, February 1989 Copyright (C) 1989 Free Software Foundation, Inc. 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The license agreements of most software companies try to keep users at the mercy of those companies. By contrast, our General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. The General Public License applies to the Free Software Foundation's software and to any other program whose authors commit to using it. You can use it for your programs, too. When we speak of free software, we are referring to freedom, not price. Specifically, the General Public License is designed to make sure that you have the freedom to give away or sell copies of free software, that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of a such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must tell them their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any work containing the Program or a portion of it, either verbatim or with modifications. Each licensee is addressed as "you". 1. You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this General Public License and to the absence of any warranty; and give any other recipients of the Program a copy of this General Public License along with the Program. You may charge a fee for the physical act of transferring a copy. 2. You may modify your copy or copies of the Program or any portion of it, and copy and distribute such modifications under the terms of Paragraph 1 above, provided that you also do the following: a) cause the modified files to carry prominent notices stating that you changed the files and the date of any change; and b) cause the whole of any work that you distribute or publish, that in whole or in part contains the Program or any part thereof, either with or without modifications, to be licensed at no charge to all third parties under the terms of this General Public License (except that you may choose to grant warranty protection to some or all third parties, at your option). c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the simplest and most usual way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this General Public License. d) You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. Mere aggregation of another independent work with the Program (or its derivative) on a volume of a storage or distribution medium does not bring the other work under the scope of these terms. 3. You may copy and distribute the Program (or a portion or derivative of it, under Paragraph 2) in object code or executable form under the terms of Paragraphs 1 and 2 above provided that you also do one of the following: a) accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Paragraphs 1 and 2 above; or, b) accompany it with a written offer, valid for at least three years, to give any third party free (except for a nominal charge for the cost of distribution) a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Paragraphs 1 and 2 above; or, c) accompany it with the information you received as to where the corresponding source code may be obtained. (This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form alone.) Source code for a work means the preferred form of the work for making modifications to it. For an executable file, complete source code means all the source code for all modules it contains; but, as a special exception, it need not include source code for modules which are standard libraries that accompany the operating system on which the executable file runs, or for standard header files or definitions files that accompany that operating system. 4. You may not copy, modify, sublicense, distribute or transfer the Program except as expressly provided under this General Public License. Any attempt otherwise to copy, modify, sublicense, distribute or transfer the Program is void, and will automatically terminate your rights to use the Program under this License. However, parties who have received copies, or rights to use copies, from you under this General Public License will not have their licenses terminated so long as such parties remain in full compliance. 5. By copying, distributing or modifying the Program (or any work based on the Program) you indicate your acceptance of this license to do so, and all its terms and conditions. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. 7. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of the license which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the license, you may choose any version ever published by the Free Software Foundation. 8. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 9. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 10. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS Appendix: How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to humanity, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) 19yy This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 1, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. If the program is interactive, make it output a short notice like this when it starts in an interactive mode: Gnomovision version 69, Copyright (C) 19xx name of author Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, the commands you use may be called something other than `show w' and `show c'; they could even be mouse-clicks or menu items--whatever suits your program. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the program, if necessary. Here a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the program `Gnomovision' (a program to direct compilers to make passes at assemblers) written by James Hacker. , 1 April 1989 Ty Coon, President of Vice That's all there is to it! --- The Artistic License 1.0 --- This software is Copyright (c) 2010-2017 by Projeto Natura . This is free software, licensed under: The Artistic License 1.0 The Artistic License Preamble The intent of this document is to state the conditions under which a Package may be copied, such that the Copyright Holder maintains some semblance of artistic control over the development of the package, while giving the users of the package the right to use and distribute the Package in a more-or-less customary fashion, plus the right to make reasonable modifications. Definitions: - "Package" refers to the collection of files distributed by the Copyright Holder, and derivatives of that collection of files created through textual modification. - "Standard Version" refers to such a Package if it has not been modified, or has been modified in accordance with the wishes of the Copyright Holder. - "Copyright Holder" is whoever is named in the copyright or copyrights for the package. - "You" is you, if you're thinking about copying or distributing this Package. - "Reasonable copying fee" is whatever you can justify on the basis of media cost, duplication charges, time of people involved, and so on. (You will not be required to justify it to the Copyright Holder, but only to the computing community at large as a market that must bear the fee.) - "Freely Available" means that no fee is charged for the item itself, though there may be fees involved in handling the item. It also means that recipients of the item may redistribute it under the same conditions they received it. 1. You may make and give away verbatim copies of the source form of the Standard Version of this Package without restriction, provided that you duplicate all of the original copyright notices and associated disclaimers. 2. You may apply bug fixes, portability fixes and other modifications derived from the Public Domain or from the Copyright Holder. A Package modified in such a way shall still be considered the Standard Version. 3. You may otherwise modify your copy of this Package in any way, provided that you insert a prominent notice in each changed file stating how and when you changed that file, and provided that you do at least ONE of the following: a) place your modifications in the Public Domain or otherwise make them Freely Available, such as by posting said modifications to Usenet or an equivalent medium, or placing the modifications on a major archive site such as ftp.uu.net, or by allowing the Copyright Holder to include your modifications in the Standard Version of the Package. b) use the modified Package only within your corporation or organization. c) rename any non-standard executables so the names do not conflict with standard executables, which must also be provided, and provide a separate manual page for each non-standard executable that clearly documents how it differs from the Standard Version. d) make other distribution arrangements with the Copyright Holder. 4. You may distribute the programs of this Package in object code or executable form, provided that you do at least ONE of the following: a) distribute a Standard Version of the executables and library files, together with instructions (in the manual page or equivalent) on where to get the Standard Version. b) accompany the distribution with the machine-readable source of the Package with your modifications. c) accompany any non-standard executables with their corresponding Standard Version executables, giving the non-standard executables non-standard names, and clearly documenting the differences in manual pages (or equivalent), together with instructions on where to get the Standard Version. d) make other distribution arrangements with the Copyright Holder. 5. You may charge a reasonable copying fee for any distribution of this Package. You may charge any fee you choose for support of this Package. You may not charge a fee for this Package itself. However, you may distribute this Package in aggregate with other (possibly commercial) programs as part of a larger (possibly commercial) software distribution provided that you do not advertise this Package as a product of your own. 6. The scripts and library files supplied as input to or produced as output from the programs of this Package do not automatically fall under the copyright of this Package, but belong to whomever generated them, and may be sold commercially, and may be aggregated with this Package. 7. C or perl subroutines supplied by you and linked into this Package shall not be considered part of this Package. 8. The name of the Copyright Holder may not be used to endorse or promote products derived from this software without specific prior written permission. 9. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. The End XML-TMX-0.36/Makefile.PL0000644000175000017500000000362013154220371013147 0ustar ambsambs# This file was automatically generated by Dist::Zilla::Plugin::MakeMaker v6.010. use strict; use warnings; use 5.010000; use ExtUtils::MakeMaker; my %WriteMakefileArgs = ( "ABSTRACT" => "Perl extensions for managing TMX files", "AUTHOR" => "Alberto Sim\x{f5}es , Jos\x{e9} Jo\x{e3}o Almeida ", "CONFIGURE_REQUIRES" => { "ExtUtils::MakeMaker" => 0 }, "DISTNAME" => "XML-TMX", "EXE_FILES" => [ "scripts/tmx-POStagger", "scripts/tmx-explode", "scripts/tmx-tokenize", "scripts/tmx2html", "scripts/tmx2tmx", "scripts/tmxclean", "scripts/tmxgrep", "scripts/tmxsplit", "scripts/tmxuniq", "scripts/tmxwc", "scripts/tsv2tmx" ], "LICENSE" => "perl", "MIN_PERL_VERSION" => "5.010000", "NAME" => "XML::TMX", "PREREQ_PM" => { "DB_File" => 0, "Data::Dumper" => 0, "Digest::MD5" => 0, "Encode" => 0, "Exporter" => 0, "Fcntl" => 0, "File::BOM" => 0, "File::Temp" => 0, "Getopt::Long" => 0, "Pod::Usage" => 0, "XML::DT" => 0, "parent" => 0, "strict" => 0, "utf8" => 0, "warnings" => 0 }, "TEST_REQUIRES" => { "Test::More" => 0 }, "VERSION" => "0.36", "test" => { "TESTS" => "t/*.t" } ); my %FallbackPrereqs = ( "DB_File" => 0, "Data::Dumper" => 0, "Digest::MD5" => 0, "Encode" => 0, "Exporter" => 0, "Fcntl" => 0, "File::BOM" => 0, "File::Temp" => 0, "Getopt::Long" => 0, "Pod::Usage" => 0, "Test::More" => 0, "XML::DT" => 0, "parent" => 0, "strict" => 0, "utf8" => 0, "warnings" => 0 ); unless ( eval { ExtUtils::MakeMaker->VERSION(6.63_03) } ) { delete $WriteMakefileArgs{TEST_REQUIRES}; delete $WriteMakefileArgs{BUILD_REQUIRES}; $WriteMakefileArgs{PREREQ_PM} = \%FallbackPrereqs; } delete $WriteMakefileArgs{CONFIGURE_REQUIRES} unless eval { ExtUtils::MakeMaker->VERSION(6.52) }; WriteMakefile(%WriteMakefileArgs); XML-TMX-0.36/examples/0000755000175000017500000000000013154220371013012 5ustar ambsambsXML-TMX-0.36/examples/tmx-metrics0000755000175000017500000000524313154220371015220 0ustar ambsambs#!/usr/bin/perl -s # PODNAME: tmx-explode # ABSTRACT: Explodes tmx files in a file per language use strict ; use Lingua::NATools::Client; use Data::Dumper; $Data::Dumper::Indent=1; $Data::Dumper::Terse=1; my $Gn1={}; my $Gn2={}; my $Geq=0; my $Geq2=0; my $t; my $perfeitas =0; my $nonumber =0; my $pessimas =0; my $delnumbers=0; my $desiguaist=0; # size(bigger) > 2(size(shorter)) my $iguais=0; our $debug = 1; my $tmx = shift or die; use XML::TMX::Reader; my $t = XML::TMX::Reader->new( $tmx ); my $tun = 0; print "\n================($tmx)========\n"; $t->for_tu( \&mycmp ); print "\n comon numbers = ",MScardinter($Gn1,$Gn2)*2, "\n eq-same-TU numbers = ", $Geq*2 , "\n total numbers = ", $t= MScard($Gn1)+MScard($Gn2), "\n TU = $tun", "\n good TU (Num>0) = $perfeitas", "\n good TU (Num=0) = $nonumber", "\n very bad TU = $pessimas ($delnumbers)", "\n equal TU = $iguais", "\n size too diff TU = $desiguaist", (($Geq*2.4 < $t )? "\n PROBLEMS - numbers too diff." : ""), (($pessimas / $tun > 0.2 )? "\n PROBLEMS - too many bad TU" : ""), "\n After del. bad TU:". $Geq2*2, "/" , $t-$delnumbers,"=", ($Geq2*2)/( $t-$delnumbers), "\n\n"; sub mycmp{ $tun++; my $tu = shift; if(length($tu->{en}) > 30 && (length($tu->{en}) > 2 * length($tu->{pt}) || length($tu->{pt}) > 2 * length($tu->{en}))){ print "remove $tun = size is too different\n" if $debug ; $desiguaist ++ ; } elsif(length($tu->{en}) > 30 && $tu->{en} eq $tu->{pt}){ print "remove $tun = the TUs are equal.\n" if $debug ; $iguais ++ ; } my $n1 = getn($tu->{en}); my $n2 = getn($tu->{pt}); my $n3 = MScardinter($n1,$n2); $Geq += $n3; $Geq2 += $n3; my $n4 = MScard($n1)+MScard($n2) ; $perfeitas ++ if $n4 == $n3*2 && $n4 > 0; $nonumber ++ if $n4 == $n3*2 && $n4 == 0; if((3+ $n3) * 2.4 < 3+$n4 ) { $delnumbers += $n4; $Geq2 -= $n3; $pessimas ++; print "remove $tun = Number in this TUs are too different\n" if $debug; print " ***EN\t$tu->{en}\n ***PT\t$tu->{pt}\n" if $debug; } MSaddto($Gn1,$n1); MSaddto($Gn2,$n2); } sub MSaddto{ # a = a U b my ($m1,$m2)=@_; for(keys %$m2){$m1->{$_} += $m2->{$_}} } sub MScardinter{ # #(a /\ b) my ($m1,$m2)=@_; my $com = 0; for(keys %$m2){ $com += $m1->{$_} > $m2->{$_} ? $m2->{$_} : $m1->{$_}} $com } sub MScard{ # Multiset cardinal my ($m1)=@_; my $c = 0; for(values %$m1){ $c += $_ } $c } sub getn{ # get the multiset of the numbers present in the sentence my $s=shift; my %a=(); for( $s =~ m/(\d+)/g){$a{$_}++} \%a } __END__ XML-TMX-0.36/examples/tmx2pdf-prince0000755000175000017500000000414213154220371015603 0ustar ambsambs#!/usr/bin/perl -s ### NOT IN DISTRO our ($m,$o,$v); $m ||= 1000; my $tmx=shift or die("usage:\n\t$0 file.tmx\n"); my $tmp=$o; my @u=(); ## unlink list unless($o){ if($tmx =~ /(.*\/)?(.*).tmx$/){$tmp = "${1}__$2.pdf"} else {$tmp = "/tmp/_tmxpdf_$$_.pdf"; # @u=($tmp); } } use strict; if(not -d "$ENV{HOME}/.gwb"){ mkdir("$ENV{HOME}/.gwb"); } if(not -f "$ENV{HOME}/.gwb/tmx.css"){ open(F,">","$ENV{HOME}/.gwb/tmx.css") or die("cant create'$ENV{HOME}/.gwb/tmx.css'\n"); while(){ last if /END/; print F $_} close F; } system("prince -s $ENV{HOME}/.gwb/tmx.css '$tmx' -o '$tmp' 2> /dev/null"); if($v){ system("xpdf -z width -cont -g 1500x1050 $tmp")} ; unlink (@u); __DATA__ @charset "UTF-8"; tu { display: block; border-top: solid red 1px; padding: 2pt; counter-increment: num; page-break-inside: avoid; columns:2; } tuv { display: block; page-break-inside: avoid; margin-bottom: 4pt; } tuv {color:orange; font-size:70% ; } tuv:lang(pt) {color:green; } tuv:lang(pt_pt){color:green; } tuv:lang(pt-pt){color:green; } tuv:lang(PT_PT){color:green; } tuv:lang(PT-PT){color:green; } strong, emph, b { font-weight: bold; } tuv::before {color: black; font-size:70%; } tuv[lang]::before { content:counter(num) " " attr(lang) " "; } tuv:lang(pt)::before { content:counter(num) " PT "; } tuv:lang(en)::before { content:counter(num) " EN "; } tuv:lang(en_GB)::before { content:counter(num) " EN "; } tuv:lang(en_US)::before { content:counter(num) " EN "; } seg { } header{ display:none; } @page{ margin: 10mm 10mm 10mm 10mm; columns:2; size:A4; } @screen{ margin: 8mm 8mm 8mm 8mm; columns:2; size:screen; } /* END */ =head1 NAME tmx2pdf-prince - convert TMX files to PDF =head1 SYNOPSIS tmx2pdf-prince file.tmx tmx2pdf-prince -v file.tmx =head1 DESCRIPTION =head2 Dependencies C converter must be installed xpdf =head1 AUTHOR J.Joao Almeida, jj@di.uminho.pt =head1 SEE ALSO perl(1). prince xpdf =cut XML-TMX-0.36/examples/po2tmx0000755000175000017500000000371713154220371014201 0ustar ambsambs#!/usr/bin/perl -w -s our($s); use Data::Dumper; use strict; use lib 'lib'; use XML::TMX::FromPO; my $tmx = new XML::TMX::FromPO(DEBUG => 1); my %convert = ( 'pt_en' => 'pt en', 'pt_es' => 'pt es', 'pt_de' => 'pt de', 'pt_it' => 'pt it', 'pt_ru' => 'pt ru', 'pt_fr' => 'pt fr', 'pt_br' => 'pt pt_BR', 'br_en' => 'pt_BR en', 'br_es' => 'pt_BR es', 'br_de' => 'pt_BR de', 'br_it' => 'pt_BR it', 'br_ru' => 'pt_BR ru', 'br_fr' => 'pt_BR fr', 'en_fr' => 'en fr' ); if($s){ my $lang = $s; if ($s eq "1") { die("Usage $0 -s=pt DIR\n")} while(my $dir = shift()) { $tmx->rec_get_po($dir, $lang, LANG=>"$lang en"); open(XML, "| tee _1.aux |xmllint --recover --format - | bzip2 > $lang-en.tmx.bz2"); # print STDOUT Dumper($tmx); *STDOUT = *XML; $tmx->create_tmx(LANG => "$lang en"); close(XML); $tmx->clean_tmx() } } else{ my $lang = ''; for my $c (keys %convert) { $lang = $lang . " $convert{$c}"; } while(my $dir = shift()) { if(-d "$dir/po" and <$dir/po/*.po>){ $tmx->parse_dir("$dir/po", LANG => $lang) ; } if(-d "$dir" and <$dir/*.po>) { $tmx->parse_dir("$dir", LANG => $lang) ; } $dir =~ m/([a-z0-9_\.-]+)\/*$/i; for my $conv (keys %convert) { open(XML, "|xmllint --format - | bzip2 > $1_$conv.tmx.bz2"); *STDOUT = *XML; $tmx->create_tmx(LANG => "$convert{$conv}"); close(XML); } $tmx->clean_tmx() } } =encoding UTF-8 =head1 NAME po2tmx - creates TMX files from PO files =head1 SYNOPSIS po2tmx POdir # po2tmx -s=fr #single PO files en-fr (does not join multiple PO-files) =head1 DESCRIPTION read several POs, and joinds their translation units to build several TMX pairs. =head1 SEE ALSO XML::TMX =cut XML-TMX-0.36/xt/0000755000175000017500000000000013154220371011627 5ustar ambsambsXML-TMX-0.36/xt/author/0000755000175000017500000000000013154220371013131 5ustar ambsambsXML-TMX-0.36/xt/author/pod-syntax.t0000644000175000017500000000025213154220371015423 0ustar ambsambs#!perl # This file was automatically generated by Dist::Zilla::Plugin::PodSyntaxTests. use strict; use warnings; use Test::More; use Test::Pod 1.41; all_pod_files_ok(); XML-TMX-0.36/xt/release/0000755000175000017500000000000013154220371013247 5ustar ambsambsXML-TMX-0.36/xt/release/cpan-changes.t0000644000175000017500000000034413154220371015764 0ustar ambsambsuse strict; use warnings; # this test was generated with Dist::Zilla::Plugin::Test::CPAN::Changes 0.012 use Test::More 0.96 tests => 1; use Test::CPAN::Changes; subtest 'changes_ok' => sub { changes_file_ok('Changes'); }; XML-TMX-0.36/MANIFEST.SKIP0000644000175000017500000000000413154220371013064 0ustar ambsambs.*~ XML-TMX-0.36/dist.ini0000644000175000017500000000126513154220371012644 0ustar ambsambsname = XML-TMX author = Alberto Simões author = José João Almeida license = Perl_5 copyright_holder = Projeto Natura copyright_year = 2010-2017 version = 0.36 [@Filter] -bundle = @Basic -remove = ExtraTests [ExecDir] dir = scripts [AutoPrereqs] skip=^FL3 skip=^Lingua::FreeLing skip=^Lingua::PT::PLNbase [Prereqs / RuntimeSuggests] Lingua::FreeLing3 = 0 Lingua::PT::PLNbase = 0 ; [MetaResources] ; repository.web = https://github.com/ambs/Dancer2-Plugin-JWT ; repository.url = https://github.com/ambs/Dancer2-Plugin-JWT ; repository.type = git [MetaJSON] [PkgVersion] [Test::CPAN::Changes] [PodSyntaxTests] [PodWeaver] XML-TMX-0.36/scripts/0000755000175000017500000000000013154220371012663 5ustar ambsambsXML-TMX-0.36/scripts/tmxsplit0000644000175000017500000000617313154220371014501 0ustar ambsambs#!/usr/bin/perl -s # PODNAME: tmxsplit # ABSTRACT: Splits a TMX file several files, one for each language use strict; use utf8; use XML::DT; #use bytes; our ($twente,$latin1,$utf8,$q,$cutmaxlen); my $filename = $ARGV[0]; my %files; my $data; my %h = ( # '-outputenc' => "ISO-8859-1", 'seg' => sub{ for ($c){ # s/\&/\&/g; # s//\>/g; s/\s\s+|^\s+|\s+$/ /g; }; $c }, 'ut' => sub{" "}, 'tu' => sub{$c}, 'tuv' => sub{ $c =~ s/^[\s\n]*//; $c =~ s/[\s\n]*$//; $data->{$v{lang}||$v{"xml:lang"}} = $cutmaxlen && length($c) > $cutmaxlen ? substr($c,0,$cutmaxlen)."||" : $c}, ); $h{-outputenc} = "ISO-8859-1" if $twente || $latin1; undef $h{-outputenc} if $utf8; my $i = 0; $| = 1; my $f; for $f (@ARGV){ # print "\n$f" unless $q; print STDERR "\n$f"; $/ = "\n"; open X, $f or die "cannot open file $f"; do { if(/encoding=.ISO-8859-1./i){$h{-outputenc}=$h{-inputenc}="ISO-8859-1";} } while ($_ = and $_ !~ /!s and $resto = $'; $/ = "
"; while() { ($_ = $resto . $_ and $resto = "" ) if $resto; $i++; last if /<\/body>/; #print "." if (!$q && $i%500==0); print STDERR "." if ($i % 1000==0); s/\>\s+/>/; undef($data); eval {dtstring($_, %h)} ; ## don't die in invalid XML if($@){warn($@)} else{ for my $k (keys %$data) { if (exists($files{"$filename-$k"})) { myprint($files{"$filename-$k"}, $data->{$k},$i); } else { my $x; open $x, ">$filename-$k" or die("cant >$filename-$k\n"); binmode($x,":utf8") if $utf8; myprint($x, $data->{$k},$i); $files{"$filename-$k"} = $x; } } } } close X; } for my $key (keys %files) { print "$key\n"; } sub myprint{ my($f,$tu,$i)=@_; if ($twente){ for ($tu){ s/<.*?>/ /gs; s/[\|\$]/ /gs; s/(\w)([.;,!:?«»"])/$1 $2/g; s/([.;,!:?«»"])(\w)/$1 $2/g; s/\s\s+|^\s+|\s+$/ /g; } print {$f} "$tu\n\$\n"; } else { print {$f} "$tu\n"; } } __END__ =pod =encoding utf-8 =head1 NAME tmxsplit - Splits a TMX file several files, one for each language =head1 VERSION version 0.36 =head1 SYNOPSIS tmxsplit f.tmx f2.tmx ... tmxsplit -twente f.tmx =head1 DESCRIPTION splits a TMX file in several files (one per language) and puts a tag C in each translation unit. The names for output files is based on the first tmx file supplied. =head1 Options -twente -- makes a format compatible with twente-aligner -latin1 -- a make latin1-encoded output -utf8 -- a make utf8-encoded output -q -- don't print filenames and "." -cutmaxlen=n -- cut translations by the n character =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tmx2tmx0000755000175000017500000001702513154220371014241 0ustar ambsambs#!/usr/bin/perl -s # PODNAME: tmx2tmx # ABSTRACT: Utility to convert and filter TMX files use strict; use warnings; use XML::TMX::Reader; use XML::TMX::Writer; use XML::DT; our ( $toTrados, ## Fix some issues in the TMX file the old trados software did not cope with $clean, ## Remove empty UTs and UTs with only junk characters $cat, ## Concatenates TMX files $select, ## Filters the TMX to the selected languages ); # here we must take care that only one of the options is being used. toTrados() if $toTrados; clean() if $clean; cat() if $cat; selectl() if $select; die "No option supplied\n"; #--( AUX FUNCS )----------- sub trim { my $x = shift; for ($x) { s!^\s+!!; s!\s+$!! } $x } sub toTrados { $/ = ""; my $header = <>; #
->
$header =~ s!]+)/>!
!; # -> $header =~ s!]+>!!; # tmx version -> $header =~ s!]+>!!; # srclang -> srclang="foo" if ($header =~ m!(xml:)?lang=(["'])([^"']+)\2!) { my $lang = $3; $header =~ s!srclang=(["'])[^"']+\1!srclang="$lang"!; } $header =~ s/xml:lang/lang/g; # xml:lang -> lang print $header; while (<>) { s/xml:lang/lang/g; print } # ugly, but prevents us from forgetting an else somewhere. exit; } sub clean { my $file; my $reader; if ($file = shift) { # isto tv fosse mais rápido em sax... print dt( $file, -default => sub{ toxml() }, seg => sub{ $c = trim($c); toxml }, tu => sub{ # remove entries with junk, only return "" if $c =~ m!(\s|[\|\!@#\$%^&\-])*!; toxml() }, ); } else { die "At the moment, we do not handle files from stdin"; # TODO... aqui fazia jeito que o XML::TMX::Reader lêsse de um # filehandle já aberto (STDIN, por exemplo); } exit; } sub _join_and_print_headers { my $header; while (@_) { my $file = shift @_; my $reader = XML::TMX::Reader->new($file); for my $key (keys %{$reader->{header}}) { if ($key eq "-prop") { for my $type (keys %{$reader->{header}{$key}}) { push @{$header->{$key}{$type}}, @{$reader->{header}{$key}{$type}} } } elsif ($key eq "-note") { push @{$header->{$key}}, @{$reader->{header}{$key}} } else { $header->{$key} = $reader->{header}{$key} } } } my $writer = XML::TMX::Writer->new(); $writer->start_tmx(%$header); } sub open_tmx_with_correct_encoding { # in the future it might be a good idea to use the guess_encoding method from Reader.pm my $filename = shift; open my $f, $filename or die "Can't open file $filename"; my $header = <$f>; close $f; my $fh; if ($header =~ /encoding\s*=\s*["']([^"']+)["']/) { open $fh, "<:encoding($1)", $filename or die "Can't open file $filename"; } else { open $fh, "<:utf8", $filename or die "Can't open file $filename"; } return $fh; } sub cat { my @files ; if (@ARGV) { @files = @ARGV; } else { @files = map { chomp; $_ } <>; } binmode STDOUT, ":utf8"; _join_and_print_headers(@files); my $fh; my $file = shift @files; $fh = open_tmx_with_correct_encoding($file); # open $fh, "<:utf8", $file or die "Cannot open file: $file\n"; while (<$fh>) { last if m!\n"; while (<$fh>) { s!!!; print; } close $fh; for $file (@files) { $fh = open_tmx_with_correct_encoding($file); # open $fh, "<:utf8", $file or die "Cannot open file: $file\n"; print "\n"; while (<$fh>) { last if m!]*>//g; print; while (<$fh>) { s!!!; print; } close $fh; } print "\n"; print "\n"; exit; } sub selectl { my @files; if (@ARGV) { @files = @ARGV; } else { @files = map { chomp; $_ } <>; } _join_and_print_headers(@files); my %languages = map { (lc $_ => 1) } split( /,/, $select ); my $tmx; while ($tmx = shift @files) { die "Can't find $tmx" if (!$tmx || !-f$tmx); print STDERR "$tmx\n"; my $reader = XML::TMX::Reader->new($tmx); $reader->for_tu( { -verbose => 1, -output => 1, -header => 0 }, sub { my $tu = shift; my $l = 0; for my $k (keys %$tu) { next if $k =~ /^-/; if (exists $languages{lc $k}) { $l++; } else { delete $tu->{$k} } } if ($l == keys %languages) { return $tu; } else { return undef; } }); } print "\n\n"; exit; } __END__ =pod =encoding utf-8 =head1 NAME tmx2tmx - Utility to convert and filter TMX files =head1 VERSION version 0.36 =head1 DESCRIPTION This utility processes TMX documents and return TMX documents. Tasks done with this utility include conversion between TMX versions and TMX cleaning. =head2 TRADOS conversion As you maybe know, TRADOS is a company producing computer software for translators. It includes WorkBench which imports TMX files. Unfortunately, the version I used do not import TMX version 1.4. This process is done using the switch C<-toTrados>: tmx2tmx -toTrados file.tmx > trados.tmx =head2 TMX Cleaning Specially when translation memories are created from anotated text, or extracted directly from the Internet using any automatic method. This switch is used to remove junk in translation units. This option tries to remove junk from TMX files, like empty pairs where one of the sides is empty, or removing other junk type. Use it this way: tmx2tmx -clean file.tmx > file2.tmx =head2 Concatenating TMX tmx2tmx -cat file1.tmx ... filen.tmx > file.tmx ls | grep '.tmx$' | tmx2tmx -cat > file.tmx =head2 Select languages Select a bunch of languages: tmx2tmx -select=PT,EN,FR huge.tmx > pt-en-fr.tmx =head1 SYNOPSYS tmx2tmx -cat file1.tmx ... filen.tmx > file.tmx tmx2tmx -toTrados file1.tmx > file2.tmx tmx2tmx -clean file1.tmx > file2.tmx tmx2tmx -select=PT,EN multilingual.tmx > pt-en.tmx =head1 SEE ALSO tmx2html, po2tmx, XML::TMX =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tmx-POStagger0000644000175000017500000001261613154220371015255 0ustar ambsambs#!/usr/bin/perl -s # PODNAME: tmx-POStagger # ABSTRACT: POStaggers translation units on a tmx file. use strict; use warnings; use File::Temp qw/ :POSIX /; my $tmpName = tmpnam(); our ( $o, # output filename $mwe, # tag mwe -- STILL NOT SUPPORTED $dates, # do dates analysis... $ner, # try to do NER.... $compact, # compact form $s, # use tags (on by default) ); $s = 1 unless defined $s; eval { require FL3 }; die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@; FL3->import(); eval { require Lingua::FreeLing3::Word }; die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@; Lingua::FreeLing3::Word->import(); my %initted = (); use XML::TMX::Reader '0.25'; my $file = shift or die "You must supply the name of the file to tokenize"; my $reader = XML::TMX::Reader->new($file); my $output = "pos_$file"; $output = $o if $o; my $pos_languages = {}; binmode STDOUT, ":utf8"; $reader->for_tu( { -output => $tmpName, -verbose => 1, -verbatim => 1, }, sub { my $tu = shift; for my $lang (keys %$tu) { if ($lang =~ /(pt|es|gl|it|fr|ru|en)/i) { $pos_languages->{$1} = 1; my $ln = lc $1; my $seg = $compact ? "" : "{$lang}{-seg}; my @tokens = map { Lingua::FreeLing3::Word->new($_) } split /\s+/, $txt; my $sentences = splitter($ln)->split(\@tokens); $sentences = morph($ln)->analyze($sentences); $sentences = hmm($ln)->tag($sentences); for my $stc (@$sentences) { $seg .= "\n" if $s; $seg .= $compact ? _dump_compact($stc->words) : _dump_words($stc->words); $seg .= "\n" if $s; } $tu->{$lang}{-iscdata} = $compact ? 0 : 1; $seg .= "]]>" unless $compact; $tu->{$lang}{-seg} = $seg; } } return $tu; }); $reader = XML::TMX::Reader->new($tmpName); $reader->for_tu({ -output => $output, -verbatim => 1, -raw => 1, -prop => ($compact ? {} : { 'pos-tagged' => join(",",keys %$pos_languages), 'pos-fields' => "word,lemma,pos", 'pos-s-attributes' => ['s'], })}, sub { $_ = $_[0]; if (!/\[CDATA\[/) { s{()(.*?)()} {my ($before,$middle,$after) = ($1,$2,$3); for ($middle) { s/&/&/g;s//>/g; }; $before.$middle.$after }ge; } return $_; }); #print STDERR "$tmpName\n"; unlink $tmpName if -e $tmpName; sub _init_ma { my $lang = shift; morph($lang, QuantitiesDetection => 0, MultiwordsDetection => ($mwe // 0), NumbersDetection => 0, DatesDetection => ($dates // 0), NERecognition => ($ner // 0)); $initted{$lang}++; } sub _dump_words { my @words = @_; my $seg = ""; for my $t (@words) { if ($t->is_multiword) { $seg .= sprintf("\n", $t->lemma(), $t->tag()); $seg .= _dump_words($t->get_mw_words); $seg .= "\n"; } else { $seg .= $t->form() ."\t". $t->lemma() ."\t". $t->tag() ."\n"; } } return $seg; } sub _dump_compact { my $seg = join(" " => map { my $t = $_; ($t->is_multiword ? join("_", map { $_->lemma() || $_->lc_form() } $t->get_mw_words) : ($t->lemma() || $t->lc_form())) . "/" . substr($t->tag(), 0, 1); } @_) . "\n"; $seg =~ s/&/&/g; $seg =~ s//>/g; return $seg; } __END__ =pod =encoding UTF-8 =head1 NAME tmx-POStagger - POStaggers translation units on a tmx file. =head1 VERSION version 0.36 =head1 SYNOPSIS tmx-POStagger file.tmx # creates pos_file.tmx tmx-POStagger -o=out.tmx file.tmx -ner ... tags multiword named entities -dates -compact =head1 DESCRIPTION Although this script is bundled in C, it has a soft dependency on C. Soft means that the dependency is not ensured at install time, and other features of the module can still be used without C. Nevertheless, if you want to use this tool you should install that module. At the moment the supported languages are the same as supported by FreeLing3: English, Spanish, Russian, Portuguese and Italian. It your TMX file includes any other language, they will be maintained without a change. This behavior can change in the future, as a basic regexp based POStaggerr might be implemented. =head2 Options -ner -- groups Proper names with tag C which which WDT includes include VBZ Edward Cole -compact -dates =head1 SEE ALSO XML::TMX, Lingua::FreeLing3 =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tmxwc0000755000175000017500000000226113154220371013754 0ustar ambsambs#!/usr/bin/perl -s # ABSTRACT: Gives statistics about tmx file. # PODNAME: tmxwc our($h); use XML::TMX::Reader; my $TTU = 0; my $n = scalar(@ARGV); for my $file (@ARGV) { my $tm = XML::TMX::Reader->new($file); my $TU = 0; $tm->for_tu({-raw => 1 }, sub { $TU++ ; print STDERR "\r$TU..." unless $TU%1000; } ); print (($h?"":"$file: "),"$TU tu.\n"); $TTU += $TU; } print "total: $TTU tu.\n" if $n > 1; __END__ =pod =encoding utf-8 =head1 NAME tmxwc - Gives statistics about tmx file. =head1 VERSION version 0.36 =head1 SYNOPSIS tmxwc tmx1 [tmx2 ...] -h don't print filenames (default: print filenames) =head1 DESCRIPTION Gives statistical information about TMX files like the number of translation units. =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tmx2html0000755000175000017500000000220713154220371014371 0ustar ambsambs#!/usr/bin/perl -w -s # PODNAME: tmx2html # ABSTRACT: Converts a TMX to an HTML formatted page use strict; use warnings; use XML::TMX; use XML::TMX::Reader; our ($icons); my $tmx = shift; my $tmx_obj = XML::TMX::Reader->new($tmx); binmode(STDOUT,":utf8"); print qq{ }, $tmx_obj->to_html(), qq{ }; __END__ =pod =encoding UTF-8 =head1 NAME tmx2html - Converts a TMX to an HTML formatted page =head1 VERSION version 0.36 =head1 SYNOPSIS tmx2html [-icons] file.tmx > file.html =head1 DESCRIPTION Pretty prints a TMX as an HTML file. Icons for languages can be produced, but still buggy. =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tmxgrep0000755000175000017500000000424413154220371014303 0ustar ambsambs#!/usr/bin/perl -w -s # PODNAME: tmxgrep # ABSTRACT: grep translation units in a TMX file ## use utf8::all; use Data::Dumper; our ($all,$w,$max,$pdf, $o, $n, $a,$debug, $i); my ($ab,$ae); ## emphatise marker if($a){ if($a eq "1"){ $ab="=("; $ae=")="; } elsif($a){ $ab="<$a>"; $ae=""; } } $max //= 500; $max = 1000000000 if ($max == 0 or $all); use strict; use warnings; my $p = shift; if($i and $w){ $p = qr{(?i)\b$p\b} } elsif($i) { $p = qr{(?i)$p} } elsif($w) { $p = qr{\b$p\b} } use XML::TMX; use XML::TMX::Reader; $o //= "__.tmx" if $pdf; our ($icons); my $tmx = shift; my $tmx_obj = XML::TMX::Reader->new($tmx); $tmx_obj->for_tu( { patt => $p, gen_tu=> $max, n => $n || 0, output => $o || undef }, #### "__tmxgrep.tmx" sub { my($tu,$at) = @_; print STDERR Dumper($tu,$at) if $main::debug; # print STDERR "."; if($main::a){ for my $k(keys %$tu){ for(keys %{$tu->{$k}}){ $tu->{$k}{$_} =~ s!($p)!$ab$1$ae!g; } } } return $tu; } ); if($pdf){ system("tmx2pdf-prince -v $o");} __END__ =pod =encoding UTF-8 =head1 NAME tmxgrep - grep translation units in a TMX file =head1 VERSION version 0.36 =head1 SYNOPSIS tmxgrep file.tmx options: -pdf -- output is transformed in a PDF file -max=300 -- extract up to 300 TU (def=500) -max=0 -- all matches -all -- all matches -a -- annotate matches with '=(...)=' -o=out.tmx -- define output file (defaut= STDOUT) -n -- print original TU number (eg: ) -i -- ignore case =head1 DESCRIPTION Creates a TMX file with the translation units that macth the provided regular expression. =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tmxuniq0000755000175000017500000000602713154220371014323 0ustar ambsambs#!/usr/bin/perl -s # PODNAME: tmxuniq # ABSTRACT: removes duplicated translation units from TMXs BEGIN { eval "require Lingua::PT::PLNbase"; if ($@) { print STDERR "This script requires Lingua::PT::PLNbase to run. We suggest installing\n", "it using CPAN: http://www.cpan.org/modules/INSTALL.html\n"; exit(0); } } use DB_File; use Fcntl ; use XML::TMX::Reader; use Digest::MD5 qw(md5_hex); use Encode; use Lingua::PT::PLNbase our ($cont,$id,$dig,$tok,$o,$fast); if ($cont) { tie %dic, 'DB_File', "__tmxuniq_$$.db", O_RDWR|O_CREAT , 0640, $DB_BTREE; } else { tie %dic, 'DB_File', "__tmxuniq_$$.db", O_RDWR|O_CREAT|O_TRUNC , 0640, $DB_BTREE; } my $cid = 0; for my $file (@ARGV){ my $tm = XML::TMX::Reader->new($file); print STDERR "Processing..."; $tm->for_tu ( { output => $o || "$file._" }, sub { my $tu = shift; $cid++; $tu->{-prop}{id} = $cid if $id; my $key = join("|||", map { n($tu->{$_}{-seg}) } sort grep { !/^-/ } keys %$tu); my $digest = md5_hex(encode_utf8($key)); unless ($cid % 10000) { my $size = -s "__tmxuniq_$$.db"; printf STDERR "\rTotal: %10d Removed: %8d (%.2f%%) Database size: %10d bytes", $cid, $rem, (100*$rem/$cid), $size; } if ($dic{$digest}) { $dic{$digest} .= "$cid;" unless $fast; $rem ++; return undef } else { $dic{$digest} = "$cid;"; $tu->{-prop}{digest} = $digest if $dig; return {%$tu} ; # used clone.. no idea why } } ); my $size = -s "__tmxuniq_$$.db"; if ($cid) { printf STDERR "\rTotal: %10d Removed: %8d (%.2f%%) Database size: %10d bytes\n", $cid, $rem, (100*$rem/$cid), $size; } else { printf STDERR "\rHuh.. empty TMX?\n"; } undef $tm; } untie %h; sub n { my $a = shift; $a =~ s/\.{6,}/...../g; $a = tokenize( { rs => ' ' } => $a ) if $tok; $a =~ s/\s+/ /g; $a =~ s/ $//; $a =~ s/^ //; return $a; } __END__ =pod =encoding UTF-8 =head1 NAME tmxuniq - removes duplicated translation units from TMXs =head1 VERSION version 0.36 =head1 SYNOPSIS tmxuniq [options] -l=en:pt tmx1 ... =head1 DESCRIPTION Removes duplicated translation units from a set of TMX (Translation Memory eXange format). =head1 OPTIONS -id -- insert a uniq id property in each TU -dig -- insert a digest property in each TU -tok -- tokenize/normalize text -o=out.tmx -- (with 1 argument) redefine output (default = input._) =head1 SEE ALSO perl(1). =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tsv2tmx0000644000175000017500000000611013154220371014233 0ustar ambsambs#!/usr/bin/perl # PODNAME: tsv2tmx # ABSTRACT: Create a TMX from a TSV file use warnings; use strict; use Getopt::Long; use Pod::Usage; ### Option varialbes my $man = 0; my $help = 0; my ($sl, $tl); our $verbose = 1; my $header = 1; my $columns = "1,2"; #### ------ binmode STDIN, ":utf8"; binmode STDOUT, ":utf8"; GetOptions ('help|h' => \$help, 'man' => \$man, "source=s" => \$sl, "target=s" => \$tl, "header" => \$header, "columns=s" => \$columns, "verbose|v" => \$verbose) or pod2usage(2); pod2usage(1) if $help; pod2usage(-exitval => 0, -verbose => 2) if $man; ## our ($c1, $c2); if ($columns =~ /^(\d+),(\d+)$/) { ($c1,$c2) = ($1,$2); } else { die "Columns definition should be a pair of integers: 1,2\n"; } if (!$sl || !$tl) { if ($header) { _log ("No source language or target language defined. Guessing!"); my ($l1, $l2) = readLine(); $sl = $l1 unless defined $sl; $tl = $l2 unless defined $tl; } else { die "No header, and one of the source or target languages not defined!\n"; } $header = 0; } readLine() if $header; use XML::TMX::Writer; my $tmx = XML::TMX::Writer->new(); $tmx->start_tmx(id => 'tsv2tmx'); my @r; while (@r = readLine()) { $tmx->add_tu($sl=>$r[0],$tl=>$r[1]); } $tmx->end_tmx(); sub _log { say STDERR @_ if $verbose; } sub readLine { my $line = ; if ($line) { chomp $line; return (split /\t/, $line)[$c1,$c2] } else { return (); } } __END__ =pod =encoding utf-8 =head1 NAME tsv2tmx - Create a TMX from a TSV file =head1 VERSION version 0.36 =head1 SYNOPSIS tsv2tmx [options] Options: --help brief help message --man full documentation --verbose | -v activated verbose mode --sl=EN --tl=PT describe source and target language names --header treat first line as a heading --columns=1,2 specify which columns to extract =head1 DESCRIPTION Useful to create translation memories from TSV files, that can be easily exported from spreadsheet software. =head1 OPTIONS =over 8 =item B<--help> Print a brief help message and exits. =item B<--man> Prints the manual page and exits. =item B<--verbose> | B<-v> Activates the verbose mode. =item B<--sl> | B<--tl> Use these options to specify the names for the source and target languages. =item B<--header> By default this switch is on, and it means that the TSV file includes a first line with a heading. If no source or target language names are specified, the first line will be used to guess them. =item B<--columns=1,2> Specify which columns should be extracted. Needs to be a pair of integers, separated by a comma. Columns indexes start at 0. Default to C<1,2>. =back =head1 SEE ALSO XML::TMX =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tmxclean0000755000175000017500000000425013154220371014425 0ustar ambsambs#!/usr/bin/perl -s # PODNAME: tmxclean # ABSTRACT: Simple tool to clean TMX files use v5.10; use strict; use warnings; use XML::TMX::Reader; our ( $junk, # remove if one of the languages just have junk $output, # output filename $eq , # remove if seg(l1) = seg(l2) $len , # remove if len(li) > 50 ∧ len(lj) > 2len(li) $v, $verbose ); my $cleaned = 0; my $processed = 0; my $tmx = shift or help(); my $reader = XML::TMX::Reader->new($tmx); $junk//=1; $output ||= "_cleaned_$tmx"; print STDERR "loading..." if $v; $reader->for_tu( {output => $output}, \&cleaner); printf STDERR "\rRemoved %d/%d (%.3f%%).\n", $cleaned, $processed, 100*$cleaned/$processed if $v; sub cleaner { my $langs = shift; $processed++; my $remove = 0; my %seg=(); my @len=(); for my $k (keys %$langs) { next if $k =~ /^-/; $remove = 1 if $eq && $seg{$langs->{$k}{-seg}}++; $remove = 1 if $junk && $langs->{$k}{-seg} =~ /^[-.,0-9\s]+$/; $remove = 1 if $junk && $langs->{$k}{-seg} =~ /^\W*$/; push(@len, length($langs->{$k}{-seg})); } @len = sort{$a <=> $b} @len; $remove = 1 if $len && $len[0] > 50 && $len[0]*2< $len[-1]; $cleaned++ if $remove; printf STDERR "\rRemoved %d/%d (%.3f%%)...", $cleaned, $processed, 100*$cleaned/$processed if $v && $processed%1000==0; return $remove ? undef : $langs; } sub help { print " tmxclean [-junk=1] \n"; exit 1; } __END__ =pod =encoding UTF-8 =head1 NAME tmxclean - Simple tool to clean TMX files =head1 VERSION version 0.36 =head1 SYNOPSIS $ tmx-clean file.tmx =head1 DESCRIPTION Removes the Translation units that 1. have no letters... (unless -junk=0) 2. seg(L1) = seg(L2) (if -eq) =head1 SEE ALSO XML::TMX =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tmx-tokenize0000755000175000017500000000516113154220371015252 0ustar ambsambs#!/usr/bin/perl -s # PODNAME: tmx-tokenize # ABSTRACT: Tokenizes translation units on a tmx file. use strict; use warnings; our $o; eval { require FL3 }; die "This XML::TMX script requires Lingua::FreeLing3 to be installed\n" if $@; FL3->import(); use XML::TMX::Reader '0.25'; my $file = shift or die "You must supply the name of the file to tokenize"; my $reader = XML::TMX::Reader->new($file); my $output = "t_$file"; $output = $o if $o; binmode STDOUT, ":utf8"; $reader->for_tu( { -output => $output, -prop => { tokenized => "true" }, verbose => 1 }, sub { my $tu = shift; for my $lang (keys %$tu) { if ($lang =~ /(pt|es|it|ru|en|gl)/i) { my $ln = lc $1; my $txt = $tu->{$lang}{-seg}; if ($txt !~ /^\s*$/) { $txt = join(" ", @{ tokenizer($ln)->tokenize($txt, to_text => 1)}); } $tu->{$lang}{-seg} = $txt; } } return $tu; }); __END__ =pod =encoding UTF-8 =head1 NAME tmx-tokenize - Tokenizes translation units on a tmx file. =head1 VERSION version 0.36 =head1 SYNOPSIS tmx-tokenize file.tmx # creates t_file.tmx tmx-tokenize -o=out.tmx file.tmx =head1 DESCRIPTION Although this script is bundled in C, it has a soft dependency on C. Soft means that the dependency is not ensured at install time, and other features of the module can still be used without C. Nevertheless, if you want to use this tool you should install that module. At the moment the supported languages are the same as supported by FreeLing3: English, Spanish, Russian, Portuguese and Italian. It your TMX file includes any other language, they will be maintained without a change. This behavior can change in the future, as a basic regexp based tokenizer might be implemented. =head1 SEE ALSO XML::TMX, Lingua::FreeLing3 =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/scripts/tmx-explode0000644000175000017500000000374713154220371015067 0ustar ambsambs#!/usr/bin/perl # PODNAME: tmx-explode # ABSTRACT: Explodes tmx files in a file per language use warnings; use strict; use XML::TMX::Reader; my @files = grep {-f $_} @ARGV; for my $file (@files) { print STDERR "exploding $file.."; my $reader = XML::TMX::Reader->new($file); my $name = $file; $name =~ s/\.tmx//; my %fh; my $i = 1; $reader->for_tu( sub { my ($langs) = @_; print STDERR "." unless $i % 10; for my $l (keys %$langs) { next if $l eq "-prop"; mkdir $l unless -d $l; $langs->{$l} =~ s/\n/ /g; $langs->{$l} =~ s/\s+/ /g; unless ($fh{$l}) { open $fh{$l}, ">$l/$name"; binmode $fh{$l}, ":utf8"; } print {$fh{$l}} "$i\t$langs->{$l}\n"; } ++$i; }, ); close $fh{$_} for (keys %fh); print STDERR "DONE\n"; } __END__ =pod =encoding UTF-8 =head1 NAME tmx-explode - Explodes tmx files in a file per language =head1 VERSION version 0.36 =head1 SYNOPSIS tmx-explode foo.tmx bar.tmx # if foo.tmx and bar.tmx have only PT and EN languages, you will get # \-EN # \-- foo # \-- bar # \-PT # \-- foo # \-- bar =head1 DESCRIPTION Explodes a set of tmx files, creating a folder per language, and a file per tmx file. =head1 AUTHORS =over 4 =item * Alberto Simões =item * José João Almeida =back =head1 COPYRIGHT AND LICENSE This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. =cut XML-TMX-0.36/Changes0000644000175000017500000000732613154220371012477 0ustar ambsambsRevision history for Perl extension XML::TMX. 0.36 2017-09-08 - Minor package Fixes (Nuno Smash Carvalho) 0.35 2017-09-07 - Remove spurious Emacs temp files. 0.34 2017-09-06 - Minor Dist::Zilla improvements; - Use File::Temp on some tests; 0.33 2017-09-05 - Distribution based on Dist::Zilla - Documentation Improvements 0.32 2017-08-11 - Added better handling of files with BOM - Added better handling of tmx2tmx -cat, handling input encodings (still needing extra work with BOM files) 0.31 2016-01-11 - Fixed utf8 POD. 0.30 2016-01-11 - Added tsv2tmx script. 0.29 2015-10-10 - Consistent version numbers 0.28 2015-09-17 - Fixed TMX spec URL (thanks to Alexander Becker) 0.27 2015-08-31 - Fixed tmx-POStagger. 0.26 2015-04-28 - Fixed bug when processing CDATA elements. - Added -raw option to for_tu - Reworked TMX annotated format. - Various improvements in tmx-POStagger. 0.25 2013-07-30 - tmx2tmx -cat : fix a bug related with a missing ":utf8" - tmxclean -len: remove tu if len > 50 and len-min * 2 < len-max - tmxclean -eq: remove tu if equal ( seg(l1)=seg(l2) ) - output formatting of the TMX slightly changed (tu = lines/5) 0.24 2012-11-29 - Require Perl v5.10 0.23 2012-11-24 - sort languages when writing (good for tests) 0.22 2012-06-06 - more UTF-8 pod stuff 0.21 2012-06-05 - properly encode UTF-8 Scripts - tmx2tmx -cat preserves props and notes from all documents - add POD to some scripts - added tmx-tokenize - Moved some scripts to example folder Reader - save info about CDATA sections - add -raw option on for_tu2 - for_tu2 renamed as for_tu - reads header correctly (props and notes) Writer - write correctly CDATA sections - better output - handle header props and notes - more testing 0.20 2012-01-31 - add 'verbose' option to for_tu family of methods 0.19 2012-01-27 - fix 'ignore_markup' in Reader.pm (thanks to Achim Ruopp) 0.18 2011-02-09 - tmxwc working for multiple files; - Fixed tmxuniq to work without complaining on unicode; 0.17 2008-12-22 - Added tmxwc and tmxclean scripts. 0.16 2007-02-05 - refactored for_tu2 function (it used $&, $` and $'). 0.15 2006-12-09 - new for_tu2 function that will soon replace for_tu more efficient for big files - languages function just checks the first few 100 TUs 0.14 2006-06-07 - in XML::TMX::Writer, if we know the encoding, use binmode to force it. 0.13 2005-06-20 - for_tu now handles direct output for another TMX file. - removed some newlined in the output. 0.12 2005-07-17 - added some tests to XML::TMX::Writer; - changed test names to be tested in order; 0.11 2005-07-12 - corrected stupid bug lying around since 0.05 0.10 2004-11-30 - added pod and pod-coverage tests; - added documentation; 0.07 2004-05-18 - option to remove sub-tags is now global and not specific for the for_tu command; 0.06 2004-04-22 - corrected bug on test files - tmx2tmx is installed - tmx2tmx conversion to TRADOS TMX1.1 format - tmx2tmx basic TMX cleaner 0.05 2004-01-07 - removed dependency with XML::Writer; - option to remove sub-tags when processing s; 0.04 2003-11-11 - use xml:lang instead of lang if the first exists 0.03 2003-10-12 - Many bugs were found. Here is a working version (we hope) 0.02 2003-10-09 - Corrected warnings with perl 5.8.1 0.01 2003-09-14 - Created XML::TMX::{Reader|Query} XML-TMX-0.36/README0000644000175000017500000000061613154220371012057 0ustar ambsambs This archive contains the distribution XML-TMX, version 0.36: Perl extensions for managing TMX files This software is copyright (c) 2010-2017 by Projeto Natura . This is free software; you can redistribute it and/or modify it under the same terms as the Perl 5 programming language system itself. This README file was generated by Dist::Zilla::Plugin::Readme v6.010. XML-TMX-0.36/MANIFEST0000644000175000017500000000134113154220371012324 0ustar ambsambs# This file was automatically generated by Dist::Zilla::Plugin::Manifest v6.010. Changes LICENSE MANIFEST MANIFEST.SKIP META.json META.yml Makefile.PL README dist.ini examples/po2tmx examples/tmx-metrics examples/tmx2pdf-prince lib/XML/TMX.pm lib/XML/TMX/FromPO.pm lib/XML/TMX/Reader.pm lib/XML/TMX/Writer.pm scripts/tmx-POStagger scripts/tmx-explode scripts/tmx-tokenize scripts/tmx2html scripts/tmx2tmx scripts/tmxclean scripts/tmxgrep scripts/tmxsplit scripts/tmxuniq scripts/tmxwc scripts/tsv2tmx t/00-aux.t t/01_main.t t/10_writer.t t/20_reader.t t/25_for_tu.t t/50_tmxcat.t t/80_frompo.t t/98_pod.t t/99_pod-coverage.t t/cat.xml t/sample.tmx t/tmx14.dtd t/writer1.xml t/writer2.xml xt/author/pod-syntax.t xt/release/cpan-changes.t XML-TMX-0.36/META.json0000644000175000017500000000315613154220371012622 0ustar ambsambs{ "abstract" : "Perl extensions for managing TMX files", "author" : [ "Alberto Sim\u00f5es ", "Jos\u00e9 Jo\u00e3o Almeida " ], "dynamic_config" : 0, "generated_by" : "Dist::Zilla version 6.010, CPAN::Meta::Converter version 2.150010", "license" : [ "perl_5" ], "meta-spec" : { "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", "version" : 2 }, "name" : "XML-TMX", "prereqs" : { "configure" : { "requires" : { "ExtUtils::MakeMaker" : "0" } }, "develop" : { "requires" : { "Test::CPAN::Changes" : "0.19", "Test::More" : "0.96", "Test::Pod" : "1.41" } }, "runtime" : { "requires" : { "DB_File" : "0", "Data::Dumper" : "0", "Digest::MD5" : "0", "Encode" : "0", "Exporter" : "0", "Fcntl" : "0", "File::BOM" : "0", "File::Temp" : "0", "Getopt::Long" : "0", "Pod::Usage" : "0", "XML::DT" : "0", "parent" : "0", "perl" : "v5.10.0", "strict" : "0", "utf8" : "0", "warnings" : "0" }, "suggests" : { "Lingua::FreeLing3" : "0", "Lingua::PT::PLNbase" : "0" } }, "test" : { "requires" : { "Test::More" : "0" } } }, "release_status" : "stable", "version" : "0.36", "x_serialization_backend" : "Cpanel::JSON::XS version 3.0239" }
$langs->{$_}{-seg}$langs->{$_}{-seg}