Add experimental offset treatment for multiple offset errors
Change-Id: Ie452ff603f306c1e05667f191c4f2d6935f4d430
diff --git a/bin/korapxml_offset_checker b/bin/korapxml_offset_checker
index 08fc615..c5a449f 100755
--- a/bin/korapxml_offset_checker
+++ b/bin/korapxml_offset_checker
@@ -53,36 +53,51 @@
my ($foundry, $layer) = split('[\/|#]', $annotation);
my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);
+my $offset = 0; # Accumulated length of trailing whitespace matched so far; subtracted from the positions of later spans
+my $problems = 0; # Running count of reported problems, printed as a prefix of each report
+
# Read lemma from annotation
my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each(
sub {
my $span = shift;
# Check if the primary data starts or ends with a space
- my $primary = substr($text, $span->attr('from'), $span->attr('to') - $span->attr('from'));
- if ($primary =~ /^(?:\s+)|(?:\s+$)/) {
+ my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from')); # Start is shifted back by $offset; the length is the unadjusted to minus from
+ if ($primary =~ /^(?:\s+)|(\s+$)/) { # Only the trailing-whitespace alternative captures into $1; a leading-whitespace match leaves $1 undefined
# Remember span position
my $span_id = $span->attr('id');
my $from = $span->attr('from');
my $to = $span->attr('to');
- print "Problem found in $base/$foundry/$layer at span-ID #$span_id ($from-$to)!\n";
+ print ++$problems, # Number each report, starting at 1
+ ". Problem found in $base/$foundry/$layer ",
+ "at span-ID #$span_id ($from-$to)!\n";
if (my $lemma = $span->at('f[name=lemma]')) {
print "Lemma: '", $lemma->all_text, "'\n";
};
- print 'Snippet: ',
- substr($text, $span->attr('from')-30, 30),
+ print 'Snippet',
+ ($offset ? ' (adjusted)' : ''), # Mark snippets whose positions were shifted by a non-zero offset
+ ': ',
+ substr($text, $span->attr('from')-30-$offset, 30), # Left context of 30 characters; NOTE(review): a negative start makes substr count from the end of $text - confirm spans near the beginning are handled
'[['.$primary.']]',
- substr($text, $span->attr('to'), 30),
- "\n";
- exit(1);
+ substr($text, $span->attr('to')-$offset, 30), # Right context: up to 30 characters after the adjusted span end
+ "\n\n";
+
+ if (defined $1) { # Trailing whitespace was matched; NOTE(review): $1 is read long after the match - consider saving it right after the regex
+ $offset += length($1); # Grow the offset by the length of the trailing whitespace
+ return; # Leave this callback so that each() proceeds with the next span
+ } else { # Leading whitespace was matched: no adjustment is attempted
+ exit(1); # Stop at the first such problem with a failure status
+ };
};
}
);
+exit(1) if $offset; # A non-zero offset means at least one problem was reported above, so still exit with failure
+
print "No problem found in $base/$foundry/$layer!\n";
exit(0);