#!/usr/bin/env perl
use strict;
use warnings;

# This script allows to inspect if any annotated spans start or end
# with whitespaces in the primary data of a KorAP-XML file.

use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use Pod::Usage;

use Mojo::File 'path';
use Mojo::DOM;
use Mojo::Util qw'encode decode';
binmode(STDOUT, ':encoding(utf-8)');

our $VERSION = '0.1.0';
our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n";

GetOptions(
  'input|i=s'  => \(my $base = ''),
  'anno|a=s'   => \(my $annotation = ''),
  'fix|f'      => \(my $fix = ''),
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  }
);

unless($base) {
  pod2usage(
    -verbose => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg => $VERSION_MSG,
    -output => '-'
  );
  exit;
};

my ($foundry, $layer) = split('[\/|#]', $annotation);
my $text_fix;

$base = path($base);

sub check_primary {
  my $text = shift;

  $text_fix = '';

  # Compare with annotation
  my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);

  my $offset = 0;
  my $problems = 0;
  my $last_from = 0;

  # Read lemma from annotation
  my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each(
    sub {
      my $span = shift;

      # Check if the primary data starts or ends with a space
      my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from'));

      my $from = $span->attr('from');
      my $to  = $span->attr('to');

      unless ($primary) {
        print "Unable to find primary data at ($from-$to)\n\n";
        exit(1);
      };

      if ($primary =~ /^(?:\s+)|(\s+$)/) {

        # Remember span position
        my $span_id = $span->attr('id');

        print ++$problems,
          ". Problem found in $base/$foundry/$layer ",
          "at span-ID #$span_id ($from-$to)!\n";

        if (my $lemma = $span->at('f[name=lemma]')) {
          print "Lemma: '", $lemma->all_text, "'\n";
        };

        print 'Snippet',
          ($offset ? ' (adjusted)' : ''),
          ': ',
          substr($text, $span->attr('from')-30-$offset, 30),
          '[['.$primary.']]',
          substr($text, $span->attr('to')-$offset, 30),
          "\n";

        if (defined $1) {

          $offset += length($1);

          if ($fix) {

            # Forecast fix
            print 'Fix',
              ': ',
              substr($text, $span->attr('from') - 30 - $offset, 30),
              "$1",
              '[[',
              substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from')),
              ']]',
              substr($text, $span->attr('to') - $offset, 30),
              "\n";

            # Rewrite primary data with fix
            $text_fix .= substr($text, $last_from, $span->attr('from') - $last_from - $offset);
            $text_fix .= "$1";
            $text_fix .= substr($text, $span->attr('from') - $offset, length($1));

            $last_from = $span->attr('from') - $offset + length($1);
          };
          print "\n";
          return;
        } else {
          if ($fix) {
            print "Unable to fix file.\n";
          };
          print "\n";
          exit(1);
        };
      };
    }
  );

  if ($fix) {
    # Finish the text data
    $text_fix .= substr($text, $last_from);
  };

  return $problems;
};

# Load normal data.xml
unless (-f path($base, 'data.xml')) {
  die 'Unable to load from ' . $base;
};

# Load data.xml
my $data = path($base, 'data.xml')->slurp;
my $text = decode('UTF-8', Mojo::DOM->new->parse($data)->at('text')->all_text);

unless (check_primary($text)) {
  print "No problem found in $base/$foundry/$layer!\n";
  exit(0);
};


# The fix flag was activated
if ($fix) {

  $|=0;

  print "Check fixed data ...\n\n";

  my $data_fix = Mojo::DOM->new->parse($data)->at('text')->child_nodes->[0]->replace($text_fix)->root;

  unless (check_primary($data_fix->at('text')->all_text)) {
    print "Fixed data is fine - overwrite data.xml? (y)\n";
    my $stdin = <STDIN>;
    chomp($stdin);
    if ($stdin eq 'y' || $stdin eq 'Y') {
      path($base, 'data.xml')->spurt(encode('UTF-8', $data_fix));
      print "File written.\n\n";
    } else {
      print "No file written.\n\n";
    };
    exit(0);
  }

  else {
    print "Unable to fix file\n\n";
    exit(1);
  };
};

exit(1);


__END__

=pod

=encoding utf8

=head1 NAME

korapxml_offset_checker - Check offsets in KorAP-XML files

=head1 SYNOPSIS

  perl korapxml_offset_checker -i NKJP/NKJP/SuperExpress -a nkjp/morpho

=head1 INSTALLATION

Requires Mojolicious.

=head1 OPTIONS

=over 2

=item B<--input>

Expects a path to the root of a KorAP-XML document
(the root being the text level, including the C<data.xml>),
e.g. C<NKJP/NKJP/SuperExpress>.

=item B<--annotation>

Expects the annotation to check for failing offsets in the form of
C<foundry/layer>, e.g. C<nkjp/morpho>.

=item B<--fix>

Binary flag to rewrite data.xml with fixed offsets.

=back

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2022, L<IDS Mannheim|https://www.ids-mannheim.de/>

Author: Nils Diewald

This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.

=cut

