Add fix option to repair broken data.xml

Change-Id: I9eeafdda9420730b8e41ea5b1ec055f9d2fbca4a
diff --git a/bin/korapxml_offset_checker b/bin/korapxml_offset_checker
index c5a449f..18c0fed 100755
--- a/bin/korapxml_offset_checker
+++ b/bin/korapxml_offset_checker
@@ -17,8 +17,9 @@
 our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n";
 
 GetOptions(
-  'input|i=s' => \(my $base = ''),
-  'anno|a=s' => \(my $annotation = ''),
+  'input|i=s'  => \(my $base = ''),
+  'anno|a=s'   => \(my $annotation = ''),
+  'fix|f'      => \(my $fix = ''),
   'help|h' => sub {
     pod2usage(
       -verbose  => 99,
@@ -39,8 +40,106 @@
   exit;
 };
 
+my ($foundry, $layer) = split('[\/|#]', $annotation);
+my $text_fix;
+
 $base = path($base);
 
+sub check_primary {
+  my $text = shift;
+
+  $text_fix = '';
+
+  # Compare with annotation
+  my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);
+
+  my $offset = 0;
+  my $problems = 0;
+  my $last_from = 0;
+
+  # Read lemma from annotation
+  my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each(
+    sub {
+      my $span = shift;
+
+      # Check if the primary data starts or ends with a space
+      my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from'));
+
+      my $from = $span->attr('from');
+      my $to  = $span->attr('to');
+
+      unless ($primary) {
+        print "Unable to find primary data at ($from-$to)\n\n";
+        exit(1);
+      };
+
+      if ($primary =~ /^(?:\s+)|(\s+$)/) {
+
+        # Remember span position
+        my $span_id = $span->attr('id');
+
+        print ++$problems,
+          ". Problem found in $base/$foundry/$layer ",
+          "at span-ID #$span_id ($from-$to)!\n";
+
+        if (my $lemma = $span->at('f[name=lemma]')) {
+          print "Lemma: '", $lemma->all_text, "'\n";
+        };
+
+        print 'Snippet',
+          ($offset ? ' (adjusted)' : ''),
+          ': ',
+          substr($text, $span->attr('from')-30-$offset, 30),
+          '[['.$primary.']]',
+          substr($text, $span->attr('to')-$offset, 30),
+          "\n";
+
+        if (defined $1) {
+
+          $offset += length($1);
+
+          if ($fix) {
+
+            # Forecast fix
+            print 'Fix',
+              ': ',
+              substr($text, $span->attr('from') - 30 - $offset, 30),
+              "$1",
+              '[[',
+              substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from')),
+              ']]',
+              substr($text, $span->attr('to') - $offset, 30),
+              "\n";
+
+            # Rewrite primary data with fix
+            $text_fix .= substr($text, $last_from, $span->attr('from') - $last_from - $offset);
+            $text_fix .= "$1";
+            $text_fix .= substr($text, $span->attr('from') - $offset, length($1));
+
+            $last_from = $span->attr('from') - $offset + length($1);
+          };
+          print "\n";
+          return;
+        } else {
+          if ($fix) {
+            print "Unable to fix file.\n";
+          };
+          print "\n";
+          exit(1);
+        };
+      };
+    }
+  );
+
+  if ($fix) {
+    # Finish the text data
+    $text_fix .= substr($text, $last_from);
+  };
+
+  return $problems;
+};
+
+# Load normal data.xml
 unless (-f path($base, 'data.xml')) {
   die 'Unable to load from ' . $base;
 };
@@ -49,57 +148,42 @@
 my $data = path($base, 'data.xml')->slurp;
 my $text = decode('UTF-8', Mojo::DOM->new->parse($data)->at('text')->all_text);
 
-# Compare with annotation
-my ($foundry, $layer) = split('[\/|#]', $annotation);
-my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);
+unless (check_primary($text)) {
+  print "No problem found in $base/$foundry/$layer!\n";
+  exit(0);
+};
 
-my $offset = 0;
-my $problems = 0;
 
-# Read lemma from annotation
-my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each(
-  sub {
-    my $span = shift;
+# The fix flag was activated
+if ($fix) {
 
-    # Check if the primary data starts or ends with a space
-    my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from'));
-    if ($primary =~ /^(?:\s+)|(\s+$)/) {
+  $|=0;
 
-      # Remember span position
-      my $span_id = $span->attr('id');
-      my $from    = $span->attr('from');
-      my $to      = $span->attr('to');
+  print "Check fixed data ...\n\n";
 
-      print ++$problems,
-        ". Problem found in $base/$foundry/$layer ",
-        "at span-ID #$span_id ($from-$to)!\n";
+  my $data_fix = Mojo::DOM->new->parse($data)->at('text')->child_nodes->[0]->replace($text_fix)->root;
 
-      if (my $lemma = $span->at('f[name=lemma]')) {
-        print "Lemma: '", $lemma->all_text, "'\n";
-      };
-
-      print 'Snippet',
-        ($offset ? ' (adjusted)' : ''),
-        ': ',
-        substr($text, $span->attr('from')-30-$offset, 30),
-        '[['.$primary.']]',
-        substr($text, $span->attr('to')-$offset, 30),
-        "\n\n";
-
-      if (defined $1) {
-        $offset += length($1);
-        return;
-      } else {
-        exit(1);
-      };
+  unless (check_primary($data_fix->at('text')->all_text)) {
+    print "Fixed data is fine - overwrite data.xml? (y)\n";
+    my $stdin = <STDIN>;
+    chomp($stdin);
+    if ($stdin eq 'y' || $stdin eq 'Y') {
+      path($base, 'data.xml')->spurt(encode('UTF-8', $data_fix));
+      print "File written.\n\n";
+    } else {
+      print "No file written.\n\n";
     };
+    exit(0);
   }
-);
 
-exit(1) if $offset;
+  else {
+    print "Unable to fix file\n\n";
+    exit(1);
+  };
+};
 
-print "No problem found in $base/$foundry/$layer!\n";
-exit(0);
+exit(1);
+
 
 __END__
 
@@ -134,6 +218,10 @@
 Expects the annotation to check for failing offsets in the form of
 C<foundry/layer>, e.g. C<nkjp/morpho>.
 
+=item B<--fix>
+
+Binary flag to rewrite data.xml with fixed offsets.
+
 =back
 
 =head1 COPYRIGHT AND LICENSE