blob: c5a449f7f186fe5c000453642870ee5da66ddb22 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
# This script checks whether any annotated spans start or end
# with whitespace in the primary data of a KorAP-XML file.
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use Pod::Usage;
use Mojo::File 'path';
use Mojo::DOM;
use Mojo::Util qw'encode decode';
# Everything printed to STDOUT (including text snippets taken from the
# primary data) is encoded as UTF-8.
binmode(STDOUT, ':encoding(utf-8)');

our $VERSION = '0.1.0';
our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n";

# Parse the command line.
#
#   --input, -i               path to the root of the KorAP-XML text
#   --anno, --annotation, -a  annotation to check, given as foundry/layer
#   --help, -h                print the usage information
#
# Abbreviations are disabled (no_auto_abbrev), so the long form
# "--annotation" has to be declared as an explicit alias to be accepted.
#
# If the options cannot be parsed, the short usage is printed and the
# script terminates with exit status 2 instead of continuing with
# incomplete settings.
GetOptions(
    'input|i=s'           => \(my $base = ''),
    'anno|annotation|a=s' => \(my $annotation = ''),
    'help|h'              => sub {
        pod2usage(
            -verbose  => 99,
            -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
            -msg      => $VERSION_MSG,
            -output   => '-'
        )
    }
) or pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -exitval  => 2
);
# An input path is mandatory: without one, show the short usage
# information and stop.
if (!$base) {
    pod2usage(
        -msg      => $VERSION_MSG,
        -sections => 'NAME|SYNOPSIS',
        -verbose  => 99,
        -output   => '-'
    );
    exit;
}

# Work with the input as a Mojo::File object from here on and make
# sure the directory contains the primary data file.
$base = path($base);
die 'Unable to load from ' . $base unless -f path($base, 'data.xml');
# Load the primary data from data.xml.
#
# The file content is decoded to characters *before* it is parsed (the
# same way the annotation file is handled below). Parsing the raw bytes
# and decoding the extracted text afterwards fails as soon as the text
# contains numeric character references, because the unescaped
# characters are then mixed into the still undecoded bytes.
my $data = decode('UTF-8', path($base, 'data.xml')->slurp);
die 'Unable to decode ' . path($base, 'data.xml') . ' as UTF-8'
    unless defined $data;

my $text_node = Mojo::DOM->new->parse($data)->at('text')
    or die 'No <text> element found in ' . path($base, 'data.xml');
my $text = $text_node->all_text;

# Compare with annotation.
# The annotation is given as "foundry/layer"; '|' and '#' are accepted
# as separators as well.
my ($foundry, $layer) = split('[\/|#]', $annotation);
unless (defined $foundry && length $foundry && defined $layer && length $layer) {
    die 'The annotation has to be given in the form of foundry/layer, '
        . 'e.g. nkjp/morpho';
}

my $anno_file = path($base, $foundry, $layer . '.xml');
die 'Unable to load from ' . $anno_file unless -f $anno_file;

my $anno = decode('UTF-8', $anno_file->slurp);
die 'Unable to decode ' . $anno_file . ' as UTF-8' unless defined $anno;
# Check every span of the annotation that carries a "from" attribute:
# the primary data covered by the span must neither start nor end with
# whitespace.
#
# - A span starting with whitespace is reported and terminates the
#   script immediately with exit status 1.
# - A span ending with whitespace is reported, the length of the
#   trailing whitespace is added to $offset, and all following spans
#   are looked up with their offsets shifted back by that amount.
#
# Exit status: 0 if no problem was found, 1 otherwise.

# Accumulated length of trailing whitespace found so far
my $offset = 0;

# Number of reported problems
my $problems = 0;

# Number of characters shown before and after a reported span
my $context_width = 30;

SPAN:
for my $span (Mojo::DOM->new->parse($anno)->find('span[from]')->each) {
    my $from    = $span->attr('from');
    my $to      = $span->attr('to');
    my $span_id = $span->attr('id') // '';

    # Spans without usable numeric offsets cannot be checked
    unless (defined $from && defined $to
            && $from =~ /\A\d+\z/ && $to =~ /\A\d+\z/
            && $from <= $to) {
        warn "Skipping span #$span_id: invalid offsets\n";
        next SPAN;
    }

    # Position of the span in the primary data, taking the adjustment
    # from earlier problems into account
    my $start = $from - $offset;
    my $end   = $to - $offset;

    # A negative start would make substr() count from the end of the
    # text, a start behind the text would yield undef
    if ($start < 0 || $start > length($text)) {
        warn "Skipping span #$span_id ($from-$to): outside of the primary data\n";
        next SPAN;
    }

    # Check if the primary data starts or ends with a space
    my $primary       = substr($text, $start, $to - $from);
    my $has_leading   = $primary =~ /\A\s/;
    my ($trailing_ws) = $primary =~ /(\s+)\z/;

    next SPAN unless $has_leading || defined $trailing_ws;

    print ++$problems,
        ". Problem found in $base/$foundry/$layer ",
        "at span-ID #$span_id ($from-$to)!\n";

    if (my $lemma = $span->at('f[name=lemma]')) {
        print "Lemma: '", $lemma->all_text, "'\n";
    }

    # The left context is clamped to the beginning of the text, so
    # spans close to the start do not pull in text from the end
    my $left_start = $start > $context_width ? $start - $context_width : 0;

    print 'Snippet',
        ($offset ? ' (adjusted)' : ''),
        ': ',
        substr($text, $left_start, $start - $left_start),
        '[[' . $primary . ']]',
        ($end < length($text) ? substr($text, $end, $context_width) : ''),
        "\n\n";

    # Leading whitespace takes precedence and cannot be compensated
    exit(1) if $has_leading;

    # Trailing whitespace: shift the lookup of all following spans
    $offset += length($trailing_ws);
}

# Every reported problem that did not terminate the script increased
# the offset, so a non-zero problem count means failure
exit(1) if $problems;

print "No problem found in $base/$foundry/$layer!\n";
exit(0);
__END__
=pod

=encoding utf8

=head1 NAME

korapxml_offset_checker - Check offsets in KorAP-XML files

=head1 SYNOPSIS

  perl korapxml_offset_checker -i NKJP/NKJP/SuperExpress -a nkjp/morpho

=head1 INSTALLATION

Requires Mojolicious.

=head1 OPTIONS

=over 2

=item B<--input|-i>

Expects a path to the root of a KorAP-XML document
(the root being the text level, including the C<data.xml>),
e.g. C<NKJP/NKJP/SuperExpress>.

=item B<--anno|-a>

Expects the annotation to check for failing offsets in the form of
C<foundry/layer>, e.g. C<nkjp/morpho>.

=back

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2022, L<IDS Mannheim|https://www.ids-mannheim.de/>

Author: Nils Diewald

This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.

=cut