#!/usr/bin/perl

use warnings;
use strict;
use Encode;
use utf8;

binmode STDOUT, ':utf8';

my @gridarr = ();
my @txtarr = ();
my @wordarr = ();
my @cutarr = ();
my $cmd_tykelda = 0;
my $cmd_impordi = 0;
my $cmd_gridilaused = 0;

my $help = ">textgrid2cut.pl [-i] [-c] xxx.textGrid
... xxx.textgrid olgu UCS-2
... samas kataloogis olgu xxx.txt UTF-8
... samas kataloogis olgu xxx.wav, kui on soov seda lõikuda
... -i loeme ja kasutame lõikamiseks gridis leiduvaid piire,
...     muidu arvutame ja lisame gridi uued lausepiirid
... -c tükeldab wav-i lauseteks (NB! sox peab installitud olema)
";

foreach (@ARGV) {
  if ($_ eq '-c') {
    $cmd_tykelda = 1;
    next;
  }
  if ($_ eq '-i') {
    $cmd_impordi = 1;
    next;
  }
  if ($_ eq '-gs') {
    $cmd_gridilaused = 1;
    next;
  }

  my $gridfile = $_;
  my $txtfile = '';
  my $wavfile = '';
  my $id;

  if (! -f $gridfile ) {
    print "Puudub fail $gridfile\n";
    next;
  }

  if ($gridfile =~ /\.textgrid(\.cut)?$/i ) {
    $txtfile = $` . '.txt';
    $wavfile = $` . '.wav';
    $id = $`;
    $id =~ s/.*\///;
  }
  else {
    print "Eeldasin, et sisend on kujul blabla.textgrid\n";
    next;
  }

  # edaspidi alati vajalik
  if (! -f $txtfile) {
    print "Eeldasin, et samas kataloogis on $txtfile\n";
    next;
  }

  if (! -e $wavfile) {
    $wavfile =~ s/_/./g;
  }
  if (($cmd_tykelda) and (! -e $wavfile)) {
    print "Kui tahate tükeldada, peab ka $wavfile samas kataloogis olema\n";
    next;
  }

  # kõik vajalik oli olemas
  $help = '';

  open F, '<:encoding(UCS-2)', $gridfile;
  @gridarr = (<F>);
  close F;
  open F, '<:encoding(UTF-8)', $txtfile;
  # filtreerime välja tühjad read
  while (<F>) { next unless /\pL/; push @txtarr, $_; }
  close F;

  for (my $i = 0; $i <= $#txtarr; $i++) {
    my $lause = $txtarr[$i];
    $lause = $` if $lause =~ /\+\d\s*$/;
    push @wordarr, '¤' if ($i > 0) and ($i <= $#txtarr);
    print "sisaldab numbreid: $lause" if $lause =~ /\d/;
    while ($lause =~ /[\pL\d]+/g) {
      push @wordarr, $&;
    }
  }

  if ($cmd_impordi) {
    impordi_lausevahed();
  }
  elsif ($cmd_gridilaused) {
    prindi_gridi_laused();
    exit;
  }
  else {
    arvuta_lausevahed();
  }

  if ( $#txtarr != $#cutarr + 1 ) {
    for my $x (0..3) {
      print "$txtarr[$x]\t$cutarr[$x]\n";
    }
    print "...\n";
    for my $x (-3..0) {
      print "$txtarr[$#txtarr+$x]\t$cutarr[$#cutarr+$x]\n";
    }
    print "Viga: lausete ja vahemike arv erineb: ",$#txtarr," !== ",$#cutarr+1,". Ära kasuta -i võtit!\n";
    die ();
  }

  my $cutfile = $gridfile . '.cut';
  print "Tekitan $cutfile\n";
  open F, '>:encoding(UCS-2)', $cutfile;
  prindi_koos_vahedega();
  close F;

  if ($cmd_tykelda) {
    print "Tekitan blabla10001.wav ja edasi\n";
    tykelda_wav($id, $wavfile);
  }
}

print $help;


# ----------------------------------------------------------

# gridi teksti muuta ei saa, gridi tekst ei vasta täpselt lausetes tehtud
# parandustele ja lausepiirid peavad tulema lausetest. Seega tekitame sellise
# lausete sõnade array, mis vastab gridile, aga sisaldab lausevahesid ¤
#
# vaja @wordarr puhastada ja joondada
#
sub gridi_tekst_stringina {
  # nopime gridist sõnad välja ja teeme pika stringi
  my $ort_tier = 0;
  my $text;
  my $gridwords = '';
  foreach (@gridarr) {
    $ort_tier = 1 if /name = \"ORT\"/;
    $ort_tier = 0 if /item \[/;
    next unless $ort_tier;
    next unless /text = \"(.*?)\"/;
    $text = $1;
    # praati text väljale jäänud BOM
    $text = $' if $text =~ /^\x{feff}/;
    $gridwords .= ' ' if $gridwords;
    $gridwords .= $text;
  }
  return $gridwords;
}

sub joonda_wordarr {
  my $gridwords = gridi_tekst_stringina();
  $gridwords =~ s/<CUT>//g;
  $gridwords =~ s/\s\s+/ /g;
  my $textwords = join ' ', @wordarr;
  $textwords = joonda ($gridwords, $textwords, 20);
  @wordarr = split / /, $textwords;
}

sub joonda {
  my ($g, $t, $pikkus) = @_;
  return $t if $t eq $g;
  $pikkus = 20 unless $pikkus;

  while ($pikkus > 4) {
    # leiame tekstist vähemalt nii pika sõna
    while ($t =~ / ([^ ]{$pikkus,}) /g) {
      my $ta = $`; my $tw = $&; my $to = $';
      # kas sama esineb gridis?
      if ( $g =~ /$tw/ ) {
        my $ga = $`; my $gw = $&; my $go = $';
        print "klappiv sõna $tw\n";

        # liidame klapile tagantpoolt külge
        while (1) {
          if ($to =~ /^¤ ?/) { $tw .= $&; $to = $'; next; }
          my $tnext = ($to =~ /^([^ ]+)($| )/ ? $& : '');
          my $gnext = ($go =~ /^([^ ]+)($| )/ ? $& : '');
          last unless ($tnext eq $gnext);
          last unless ($tnext or $gnext);
          $tw .= $tnext;
          $to =~ /^([^ ]+)($| )/; $to = $';
          $go =~ /^([^ ]+)($| )/; $go = $';
        }
        # liidame klapile eestpoolt külge
        while (1) {
          if ($ta =~ / ?¤$/) { $tw = $& . $tw; $ta = $`; next; }
          my $tnext = ($ta =~ /(^| )([^ ]+)$/ ? $& : '');
          my $gnext = ($ga =~ /(^| )([^ ]+)$/ ? $& : '');
          last unless ($tnext eq $gnext);
          last unless ($tnext or $gnext);
          $tw = $tnext . $tw;
          $ta =~ /(^| )([^ ]+)$/; $ta = $`;
          $ga =~ /(^| )([^ ]+)$/; $ga = $`;
        }
        print "Laiendatud klapp: [$tw]\n";
        # kui kummaltki poolt ei õnnestunud laiendada, siis pigem ei usalda
        next if ($tw eq $gw);
        # OK

        return joonda ($ga, $ta, $pikkus-1) . $tw . joonda ($go, $to, $pikkus);

      } # sama sõna leidus gridis
    } # tekstis leidus nii pikki sõnu
    $pikkus--;
  } # pikkus on 4, midagi klappivat ei leia

  print "\nEI KLAPI, muudan t -> g\n";
  print "T: $t\n";
  print "G: $g\n";

  return $g;
}

# ----------------------------------------------------------

sub arvuta_lausevahed {

  joonda_wordarr();

  my $ort_tier = 0;
  my $eelmiselopp = 0;
  my $uuealgus = 0;
  my $ootansona = shift @wordarr;
  my ($xmin, $xmax, $text);


  foreach (@gridarr) {
    $ort_tier = 1 if /name = \"ORT\"/;
    $ort_tier = 0 if /item \[/;
    next unless $ort_tier;
    if ( /(?<=xmin = )\d+(\.\d+)?/ ) { $xmin = $&; next; }
    if ( /(?<=xmax = )\d+(\.\d+)?/ ) { $xmax = $&; next; }
    next unless /text = \"(.*?)\"/;
    $text = $1;
    # praati text väljale jäänud BOM
    $text = $' if $text =~ /^\x{feff}/;

VORDLEUUT:
    # klapp lause sees?
    if ($text eq $ootansona) {
      $eelmiselopp = $xmax;
      return unless $ootansona = shift @wordarr;
      next;
    }
    # kirjavahemärk?
    next if ($text eq '');
    # kirjavahemärk?
    next if ($text eq '<CUT>');

    # lausevahe , aga tuli miski muu, s.t algas uus lause?
    if ($ootansona eq '¤') {
      $uuealgus = $xmin;
      push @cutarr, ($eelmiselopp + $uuealgus)/2;
      $ootansona = shift @wordarr;
      goto VORDLEUUT;
    }
    my $edasi = join (' ', splice(@wordarr, 0, 10));
    die "Ebaklapp TEXT $ootansona != GRID $text, EDASI $edasi";
  }
}

sub impordi_lausevahed {
  my $lause_tier = 0;
  my ($lauselopp);
  @cutarr = ();

  foreach (@gridarr) {
    $lause_tier = 1 if /name = \"Lause\"/;
    $lause_tier = 2 if $lause_tier and (/intervals: size = \d+/);
    $lause_tier = 0 if /item \[/;
    next unless $lause_tier == 2;
    next unless ( /(?<=xmax = )\d+(\.\d+)?/ );
    push @cutarr, $&;
  }
}

sub prindi_gridi_laused {
  my $ort_tier = 0;
  my ($lopp, $xmax, $text);
  my $loendur = 1;

  impordi_lausevahed();

  $lopp = shift @cutarr;
  print "1:\t", $txtarr[0] || "<puudub>\n";
  print "1:\t";

  foreach (@gridarr) {
    $ort_tier = 1 if /name = \"ORT\"/;
    $ort_tier = 0 if /item \[/;
    next unless $ort_tier;
    if ( /(?<=xmax = )\d+(\.\d+)?/ ) { $xmax = $&; next; }
    next unless /text = \"(.*?)\"/;
    $text = $1;
    # praati text väljale jäänud BOM
    $text = $' if $text =~ /^\x{feff}/;

    if ($xmax < $lopp) {
      print "$text ";
    }
    else {
      print "\n\n\n";
      $loendur++;
      print "OOTAN \t", $txtarr[$loendur-1] || "<puudub>\n";
      print "\n$loendur:\t$text ";
      $lopp = shift @cutarr;
    }
  }
}


sub prindi_koos_vahedega {
  my $lause_tier = 0;
  my ($eelmiselopp, $tekstilopp);

#  foreach (@cutarr) { print "loige: $_\n"; }

  foreach (@gridarr) {
    $lause_tier = 1 if /name = \"Lause\"/;
    $lause_tier = 0 if /item \[/;
    if ( /(?<=xmax = )\d+(\.\d+)?/ ) { $tekstilopp = $&; }

    if (($lause_tier == 1) and (/intervals: size = \d+/)) {
      my $ette = $`;
      my $taha = $';
      print F $ette, 'intervals: size = ', scalar(@cutarr)+1, $taha;

      $eelmiselopp = '0';
      my $i;
      for ($i = 0; $i <= $#cutarr; $i++) {
        my $int = $cutarr[$i];
        print F $ette, 'intervals [', $i+1, ']:', $taha;
        print F $ette, '    xmin = ', $eelmiselopp, $taha;
        print F $ette, '    xmax = ', $int, $taha;
        print F $ette, '    text = ""', $taha;
        $eelmiselopp = $int;
      }
      print F $ette, 'intervals [', $i+1, ']:', $taha;
      print F $ette, '    xmin = ', $eelmiselopp, $taha;
      print F $ette, '    xmax = ', $tekstilopp, $taha;
      print F $ette, '    text = ""', $taha;

      # et tükeldaja hiljem teaks:
      push @cutarr, $tekstilopp;

      $lause_tier = 2;
    }

    next if $lause_tier == 2;
    print F $_;
  }
}

sub tykelda_wav {
  my ($id, $wav) = @_;
  foreach (@txtarr) {
    s/\x{feff}//;
    s/[\x{0d}\x{0a} ]+$//s;
  }
  my $kust = 0;
  for (my $i = 0; $i <= $#cutarr; $i++) {
    my $kuhu = $cutarr[$i];
    my $pikkus = $kuhu - $kust;
    my $tykk = $id . '_' . ($i + 10001);
    `sox $wav $tykk.wav trim $kust $pikkus`;
    $kust = $kuhu;
    open F, '>:encoding(UTF-8)', $tykk.'.txt';
    print F $txtarr[$i];
    close (F);
  }
}
