Add fix option to repair broken data.xml
Change-Id: I9eeafdda9420730b8e41ea5b1ec055f9d2fbca4a
diff --git a/bin/korapxml_offset_checker b/bin/korapxml_offset_checker
index c5a449f..18c0fed 100755
--- a/bin/korapxml_offset_checker
+++ b/bin/korapxml_offset_checker
@@ -17,8 +17,9 @@
our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n";
GetOptions(
- 'input|i=s' => \(my $base = ''),
- 'anno|a=s' => \(my $annotation = ''),
+ 'input|i=s' => \(my $base = ''),
+ 'anno|a=s' => \(my $annotation = ''),
+ 'fix|f' => \(my $fix = ''),
'help|h' => sub {
pod2usage(
-verbose => 99,
@@ -39,8 +40,106 @@
exit;
};
+my ($foundry, $layer) = split('[\/|#]', $annotation);
+my $text_fix;
+
$base = path($base);
+sub check_primary {
+ my $text = shift;
+
+ $text_fix = '';
+
+ # Compare with annotation
+ my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);
+
+ my $offset = 0;
+ my $problems = 0;
+ my $last_from = 0;
+
+ # Read lemma from annotation
+ my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each(
+ sub {
+ my $span = shift;
+
+ # Check if the primary data starts or ends with a space
+ my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from'));
+
+ my $from = $span->attr('from');
+ my $to = $span->attr('to');
+
+ unless ($primary) {
+ print "Unable to find primary data at ($from-$to)\n\n";
+ exit(1);
+ };
+
+ if ($primary =~ /^(?:\s+)|(\s+$)/) {
+
+ # Remember span position
+ my $span_id = $span->attr('id');
+
+ print ++$problems,
+ ". Problem found in $base/$foundry/$layer ",
+ "at span-ID #$span_id ($from-$to)!\n";
+
+ if (my $lemma = $span->at('f[name=lemma]')) {
+ print "Lemma: '", $lemma->all_text, "'\n";
+ };
+
+ print 'Snippet',
+ ($offset ? ' (adjusted)' : ''),
+ ': ',
+ substr($text, $span->attr('from')-30-$offset, 30),
+ '[['.$primary.']]',
+ substr($text, $span->attr('to')-$offset, 30),
+ "\n";
+
+ if (defined $1) {
+
+ $offset += length($1);
+
+ if ($fix) {
+
+ # Forecast fix
+ print 'Fix',
+ ': ',
+ substr($text, $span->attr('from') - 30 - $offset, 30),
+ "$1",
+ '[[',
+ substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from')),
+ ']]',
+ substr($text, $span->attr('to') - $offset, 30),
+ "\n";
+
+ # Rewrite primary data with fix
+ $text_fix .= substr($text, $last_from, $span->attr('from') - $last_from - $offset);
+ $text_fix .= "$1";
+ $text_fix .= substr($text, $span->attr('from') - $offset, length($1));
+
+ $last_from = $span->attr('from') - $offset + length($1);
+ };
+ print "\n";
+ return;
+ } else {
+ if ($fix) {
+ print "Unable to fix file.\n";
+ };
+ print "\n";
+ exit(1);
+ };
+ };
+ }
+ );
+
+ if ($fix) {
+ # Finish the text data
+ $text_fix .= substr($text, $last_from);
+ };
+
+ return $problems;
+};
+
+# Load normal data.xml
unless (-f path($base, 'data.xml')) {
die 'Unable to load from ' . $base;
};
@@ -49,57 +148,42 @@
my $data = path($base, 'data.xml')->slurp;
my $text = decode('UTF-8', Mojo::DOM->new->parse($data)->at('text')->all_text);
-# Compare with annotation
-my ($foundry, $layer) = split('[\/|#]', $annotation);
-my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);
+unless (check_primary($text)) {
+ print "No problem found in $base/$foundry/$layer!\n";
+ exit(0);
+};
-my $offset = 0;
-my $problems = 0;
-# Read lemma from annotation
-my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each(
- sub {
- my $span = shift;
+# The fix flag was activated
+if ($fix) {
- # Check if the primary data starts or ends with a space
- my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from'));
- if ($primary =~ /^(?:\s+)|(\s+$)/) {
+ $|=0;
- # Remember span position
- my $span_id = $span->attr('id');
- my $from = $span->attr('from');
- my $to = $span->attr('to');
+ print "Check fixed data ...\n\n";
- print ++$problems,
- ". Problem found in $base/$foundry/$layer ",
- "at span-ID #$span_id ($from-$to)!\n";
+ my $data_fix = Mojo::DOM->new->parse($data)->at('text')->child_nodes->[0]->replace($text_fix)->root;
- if (my $lemma = $span->at('f[name=lemma]')) {
- print "Lemma: '", $lemma->all_text, "'\n";
- };
-
- print 'Snippet',
- ($offset ? ' (adjusted)' : ''),
- ': ',
- substr($text, $span->attr('from')-30-$offset, 30),
- '[['.$primary.']]',
- substr($text, $span->attr('to')-$offset, 30),
- "\n\n";
-
- if (defined $1) {
- $offset += length($1);
- return;
- } else {
- exit(1);
- };
+ unless (check_primary($data_fix->at('text')->all_text)) {
+ print "Fixed data is fine - overwrite data.xml? (y)\n";
+ my $stdin = <STDIN>;
+ chomp($stdin);
+ if ($stdin eq 'y' || $stdin eq 'Y') {
+ path($base, 'data.xml')->spurt(encode('UTF-8', $data_fix));
+ print "File written.\n\n";
+ } else {
+ print "No file written.\n\n";
};
+ exit(0);
}
-);
-exit(1) if $offset;
+ else {
+ print "Unable to fix file\n\n";
+ exit(1);
+ };
+};
-print "No problem found in $base/$foundry/$layer!\n";
-exit(0);
+exit(1);
+
__END__
@@ -134,6 +218,10 @@
Expects the annotation to check for failing offsets in the form of
C<foundry/layer>, e.g. C<nkjp/morpho>.
+=item B<--fix>
+
+Binary flag to rewrite data.xml with fixed offsets.
+
=back
=head1 COPYRIGHT AND LICENSE