Add KorAP-XML offset checker
Change-Id: I556e804e9c295729fbf3e48e9529b9ca4b1b2217
diff --git a/bin/korapxml_offset_checker b/bin/korapxml_offset_checker
new file mode 100755
index 0000000..08fc615
--- /dev/null
+++ b/bin/korapxml_offset_checker
@@ -0,0 +1,134 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+# This script checks whether any annotated span starts or ends
+# with whitespace in the primary data of a KorAP-XML document.
+
+use Getopt::Long qw(GetOptions :config no_auto_abbrev);
+use Pod::Usage;
+
+use Mojo::File 'path';
+use Mojo::DOM;
+use Mojo::Util qw'encode decode';
+binmode(STDOUT, ':encoding(utf-8)'); # Everything printed to STDOUT is encoded as UTF-8
+
+our $VERSION = '0.1.0';
+our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n"; # Passed as -msg to pod2usage
+
+# Parse the command line. '--annotation' is accepted as a long alias of
+# '--anno', so both spellings (and '-a') select the layer to check.
+GetOptions(
+  'input|i=s'           => \(my $base = ''),
+  'annotation|anno|a=s' => \(my $annotation = ''),
+  'help|h' => sub {
+    pod2usage(
+      -verbose => 99, -msg => $VERSION_MSG, -output => '-',
+      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS'
+    )
+  }
+);
+
+# Without a document root and an annotation layer there is nothing to
+# check, so print the short usage instead of failing later on.
+unless ($base && $annotation) {
+  pod2usage(
+    -verbose => 99, -sections => 'NAME|SYNOPSIS',
+    -msg => $VERSION_MSG, -output => '-'
+  );
+  exit;
+};
+
+$base = path($base);
+my $data_file = path($base, 'data.xml');
+die 'Unable to load from ' . $base unless -f $data_file;
+# Load data.xml. The bytes are decoded before parsing (like the annotation
+# below), so that the parser and the span offsets operate on characters.
+my $text_elem = Mojo::DOM->new->parse(decode('UTF-8', $data_file->slurp))->at('text')
+  or die "No <text> element found in $data_file";
+my $text = $text_elem->all_text;
+
+# Load the annotation layer given as "foundry/layer" ('|' and '#' separate too)
+my ($foundry, $layer) = split('[\/|#]', $annotation);
+die "Annotation has to be given as foundry/layer\n" unless defined $layer;
+my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);
+
+# Inspect every span carrying offsets and stop (exit code 1) at the first
+# one whose primary data starts or ends with whitespace.
+for my $span (Mojo::DOM->new->parse($anno)->find('span[from]')->each) {
+  my $from = $span->attr('from');
+  my $to   = $span->attr('to');
+
+  # Primary data covered by the span
+  my $primary = substr($text, $from, $to - $from);
+  next unless $primary =~ /^(?:\s+)|(?:\s+$)/;
+
+  my $span_id = $span->attr('id');
+  print "Problem found in $base/$foundry/$layer at span-ID #$span_id ($from-$to)!\n";
+
+  # Report the lemma, if the span is annotated with one
+  if (my $lemma = $span->at('f[name=lemma]')) {
+    print "Lemma: '", $lemma->all_text, "'\n";
+  };
+
+  # Show up to 30 characters of context on both sides. The left context
+  # is clamped at the start of the text, as a negative offset would make
+  # substr() count from the end of the text instead.
+  my $left = $from > 30 ? $from - 30 : 0;
+  print 'Snippet: ',
+    substr($text, $left, $from - $left),
+    '[[' . $primary . ']]',
+    substr($text, $to, 30),
+    "\n";
+  exit(1);
+}
+
+print "No problem found in $base/$foundry/$layer!\n";
+exit(0);
+
+__END__
+
+=pod
+
+=encoding utf8
+
+=head1 NAME
+
+korapxml_offset_checker - Check offsets in KorAP-XML files
+
+=head1 SYNOPSIS
+
+ perl korapxml_offset_checker -i NKJP/NKJP/SuperExpress -a nkjp/morpho
+
+=head1 INSTALLATION
+
+Requires Mojolicious.
+
+=head1 OPTIONS
+
+=over 2
+
+=item B<--input>
+
+Expects a path to the root of a KorAP-XML document
+(the root being the text level, including the C<data.xml>),
+e.g. C<NKJP/NKJP/SuperExpress>.
+
+=item B<--anno|-a>
+
+Expects the annotation to check for failing offsets in the form of
+C<foundry/layer>, e.g. C<nkjp/morpho>.
+
+=back
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
+
+Author: Nils Diewald
+
+This program is free software published under the
+L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
+
+=cut
+