blob: 18c0fedd045ba536b892017764fad70a982d3ec2 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
# This script allows to inspect if any annotated spans start or end
# with whitespaces in the primary data of a KorAP-XML file.
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use Pod::Usage;
use Mojo::File 'path';
use Mojo::DOM;
use Mojo::Util qw'encode decode';
binmode(STDOUT, ':encoding(utf-8)');
our $VERSION = '0.1.0';
our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n";
GetOptions(
'input|i=s' => \(my $base = ''),
'anno|a=s' => \(my $annotation = ''),
'fix|f' => \(my $fix = ''),
'help|h' => sub {
pod2usage(
-verbose => 99,
-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
-msg => $VERSION_MSG,
-output => '-'
)
}
);
unless($base) {
pod2usage(
-verbose => 99,
-sections => 'NAME|SYNOPSIS',
-msg => $VERSION_MSG,
-output => '-'
);
exit;
};
my ($foundry, $layer) = split('[\/|#]', $annotation);
my $text_fix;
$base = path($base);
sub check_primary {
my $text = shift;
$text_fix = '';
# Compare with annotation
my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);
my $offset = 0;
my $problems = 0;
my $last_from = 0;
# Read lemma from annotation
my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each(
sub {
my $span = shift;
# Check if the primary data starts or ends with a space
my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from'));
my $from = $span->attr('from');
my $to = $span->attr('to');
unless ($primary) {
print "Unable to find primary data at ($from-$to)\n\n";
exit(1);
};
if ($primary =~ /^(?:\s+)|(\s+$)/) {
# Remember span position
my $span_id = $span->attr('id');
print ++$problems,
". Problem found in $base/$foundry/$layer ",
"at span-ID #$span_id ($from-$to)!\n";
if (my $lemma = $span->at('f[name=lemma]')) {
print "Lemma: '", $lemma->all_text, "'\n";
};
print 'Snippet',
($offset ? ' (adjusted)' : ''),
': ',
substr($text, $span->attr('from')-30-$offset, 30),
'[['.$primary.']]',
substr($text, $span->attr('to')-$offset, 30),
"\n";
if (defined $1) {
$offset += length($1);
if ($fix) {
# Forecast fix
print 'Fix',
': ',
substr($text, $span->attr('from') - 30 - $offset, 30),
"$1",
'[[',
substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from')),
']]',
substr($text, $span->attr('to') - $offset, 30),
"\n";
# Rewrite primary data with fix
$text_fix .= substr($text, $last_from, $span->attr('from') - $last_from - $offset);
$text_fix .= "$1";
$text_fix .= substr($text, $span->attr('from') - $offset, length($1));
$last_from = $span->attr('from') - $offset + length($1);
};
print "\n";
return;
} else {
if ($fix) {
print "Unable to fix file.\n";
};
print "\n";
exit(1);
};
};
}
);
if ($fix) {
# Finish the text data
$text_fix .= substr($text, $last_from);
};
return $problems;
};
# Load normal data.xml
unless (-f path($base, 'data.xml')) {
die 'Unable to load from ' . $base;
};
# Load data.xml
my $data = path($base, 'data.xml')->slurp;
my $text = decode('UTF-8', Mojo::DOM->new->parse($data)->at('text')->all_text);
unless (check_primary($text)) {
print "No problem found in $base/$foundry/$layer!\n";
exit(0);
};
# The fix flag was activated
if ($fix) {
$|=0;
print "Check fixed data ...\n\n";
my $data_fix = Mojo::DOM->new->parse($data)->at('text')->child_nodes->[0]->replace($text_fix)->root;
unless (check_primary($data_fix->at('text')->all_text)) {
print "Fixed data is fine - overwrite data.xml? (y)\n";
my $stdin = <STDIN>;
chomp($stdin);
if ($stdin eq 'y' || $stdin eq 'Y') {
path($base, 'data.xml')->spurt(encode('UTF-8', $data_fix));
print "File written.\n\n";
} else {
print "No file written.\n\n";
};
exit(0);
}
else {
print "Unable to fix file\n\n";
exit(1);
};
};
exit(1);
__END__
=pod
=encoding utf8
=head1 NAME
korapxml_offset_checker - Check offsets in KorAP-XML files
=head1 SYNOPSIS
perl korapxml_offset_checker -i NKJP/NKJP/SuperExpress -a nkjp/morpho
=head1 INSTALLATION
Requires Mojolicious.
=head1 OPTIONS
=over 2
=item B<--input>
Expects a path to the root of a KorAP-XML document
(the root being the text level, including the C<data.xml>),
e.g. C<NKJP/NKJP/SuperExpress>.
=item B<--annotation>
Expects the annotation to check for failing offsets in the form of
C<foundry/layer>, e.g. C<nkjp/morpho>.
=item B<--fix>
Binary flag to rewrite data.xml with fixed offsets.
=back
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: Nils Diewald
This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
=cut