#!/usr/bin/perl

# PODNAME: tsv2tmx
# ABSTRACT: Create a TMX from a TSV file

use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
use utf8::all;

### Option variables
# Defaults for every command-line option; GetOptions below overwrites them.
my $man = 0;          # --man: print the full manual page
my $help = 0;         # --help / -h: print the brief usage message
my ($sl, $tl);        # source / target language names
our $verbose = 1;     # package variable, read by _log
my $header = 1;       # the first input line is a heading
my $columns = "0,1";  # "source,target" column pair
my $fs = qr'\t';      # field separator, applied as a regex by split
#### ------

# 'header' and 'verbose' default to on, so they are declared negatable ('!'):
# as plain flags they could only be set to 1 again and never switched off.
GetOptions(
    'help|h'      => \$help,
    'man'         => \$man,
    'source|sl=s' => \$sl,
    'target|tl=s' => \$tl,
    'header!'     => \$header,
    'columns|c=s' => \$columns,
    'fs=s'        => \$fs,
    'verbose|v!'  => \$verbose,
) or pod2usage(2);

pod2usage(1) if $help;
pod2usage(-exitval => 0, -verbose => 2) if $man;

##
# Return a copy of the argument with leading and trailing whitespace removed.
sub strip {
    my ($text) = @_;
    $text =~ s/\A\s*//;
    $text =~ s/\s*\z//;
    return $text;
}

# Indexes of the source and target columns. Package variables: readLine
# uses them to pick the two fields out of every row.
our ($c1, $c2);
if ($columns =~ /^(\d+),(\d+)$/) {
    ($c1, $c2) = ($1, $2);
}
else {
    die "Columns definition should be a pair of integers. Ex: 1,2\n";
}

# Language names missing from the command line are taken from the heading.
if (!$sl || !$tl) {
    if ($header) {
        my ($l1, $l2) = readLine("find langs in header");
        # Same truthiness test as the guard above, so an empty --sl/--tl
        # value is replaced by the heading instead of being kept.
        $sl = strip($l1) unless $sl;
        $tl = strip($l2) unless $tl;
    }
    else {
        die "No header, and one of the source or target languages not defined!\n";
    }
    $header = 0;    # the heading line has been consumed
}

# Both languages were supplied, so the heading row is read and discarded.
readLine() if $header;

use XML::TMX::Writer;
my $tmx = XML::TMX::Writer->new();
$tmx->start_tmx(id => 'tsv2tmx');

# One translation unit per remaining row; readLine returns the empty list
# at end of input, which ends the loop.
while (my ($source_text, $target_text) = readLine()) {
    $tmx->add_tu($sl => $source_text, $tl => $target_text);
}

$tmx->end_tmx();

# Write a diagnostic line to STDERR when verbose mode is on.
# The arguments are printed one after another, followed by a newline.
sub _log {
    return unless $verbose;

    # Plain print rather than `say`: this file has no explicit
    # `use feature 'say'`, so `say STDERR ...` is not parsed as the builtin.
    print STDERR @_, "\n";
    return;
}

# Read the next usable row from the input (read with <>) and return its
# (source, target) pair.
#
# Blank lines, lines starting with '#', and rows whose two selected columns
# are both empty are skipped. A bare '&' is rewritten as '&amp;', while an
# '&' followed by word characters and ';' is left as it is.
#
# Parameter: a true value when the row being read is the heading; that row
#   must have at least two fields and non-empty values in both columns.
# Returns:   ($source, $target), whitespace-stripped, or the empty list at
#   end of input.
# Dies:      when the heading row is invalid.
sub readLine {
    my $isheader = shift || 0;

    # A loop instead of recursion: runs of skipped lines no longer nest
    # calls, and the heading flag still applies after leading blank or
    # comment lines. The defined() test keeps a final "0" line without a
    # newline from being mistaken for end of input.
    while (defined(my $line = <>)) {
        chomp $line;
        $line =~ s{&((\w+;)?)}{ if ($1){ "&$1" } else { "&amp;"} }ge;

        next if $line =~ /^\s*$/;
        next if $line =~ /^#/;

        my @fields = split /$fs/, $line;
        if ($isheader and scalar @fields < 2) {
            die("Invalid header:$line\n");
        }

        # Defined-or keeps a literal "0" cell, which `or ""` would blank.
        my $source = strip($fields[$c1] // "");
        my $target = strip($fields[$c2] // "");

        if ($isheader) {
            if (length $source and length $target) {
                _log("From header:$source - $target");
            }
            else {
                die("Invalid header: $line\n");
            }
        }

        return ($source, $target) if length $source or length $target;
    }

    return ();
}

__END__

=pod

=encoding utf-8

=head1 NAME

tsv2tmx - Create a TMX from a TSV file

=head1 VERSION

version 0.39

=head1 SYNOPSIS

tsv2tmx [options] file.tsv ...

 Options:
   --help            brief help message
   --man             full documentation
   --verbose | -v    activate verbose mode
   --sl=EN --tl=PT   describe source and target language names
   --header          treat first line as a heading
   --columns=1,2     specify which columns to extract
   --fs='::'         set the field separator (here, fields separated by '::')

=head1 DESCRIPTION

Useful to create translation memories from TSV files, that can be
easily exported from spreadsheets or other applications.

=head1 OPTIONS

=over 8

=item B<--help>

Prints a brief help message and exits.

=item B<--man>

Prints the manual page and exits.

=item B<--verbose> | B<-v>

Activates the verbose mode.

=item B<--sl> | B<--tl>

Use these options to specify the names for the source and target
languages.

=item B<--header>

By default this switch is on, and it means that the TSV file
includes a first line with a heading. If no source or target
language names are specified, the first line will be used to
guess them.

=item B<--columns=1,2>

Specify which columns should be extracted. Needs to be a pair
of integers, separated by a comma. Column indexes start at 0.
Defaults to C<0,1>.

=back

=head1 SEE ALSO

XML::TMX

=head1 AUTHORS

=over 4

=item *

Alberto Simões <ambs@cpan.org>

=item *

José João Almeida <jj@di.uminho.pt>

=back

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2010-2017 by Projeto Natura <natura@di.uminho.pt>.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut
