| #!/usr/bin/env perl |
| use strict; |
| use warnings; |
| |
| # This script allows to inspect if any annotated spans start or end |
| # with whitespaces in the primary data of a KorAP-XML file. |
| |
| use Getopt::Long qw(GetOptions :config no_auto_abbrev); |
| use Pod::Usage; |
| |
| use Mojo::File 'path'; |
| use Mojo::DOM; |
| use Mojo::Util qw'encode decode'; |
| binmode(STDOUT, ':encoding(utf-8)'); |
| |
| our $VERSION = '0.1.0'; |
| our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n"; |
| |
| GetOptions( |
| 'input|i=s' => \(my $base = ''), |
| 'anno|a=s' => \(my $annotation = ''), |
| 'fix|f' => \(my $fix = ''), |
| 'help|h' => sub { |
| pod2usage( |
| -verbose => 99, |
| -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS', |
| -msg => $VERSION_MSG, |
| -output => '-' |
| ) |
| } |
| ); |
| |
| unless($base) { |
| pod2usage( |
| -verbose => 99, |
| -sections => 'NAME|SYNOPSIS', |
| -msg => $VERSION_MSG, |
| -output => '-' |
| ); |
| exit; |
| }; |
| |
| my ($foundry, $layer) = split('[\/|#]', $annotation); |
| my $text_fix; |
| |
| $base = path($base); |
| |
| sub check_primary { |
| my $text = shift; |
| |
| $text_fix = ''; |
| |
| # Compare with annotation |
| my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp); |
| |
| my $offset = 0; |
| my $problems = 0; |
| my $last_from = 0; |
| |
| # Read lemma from annotation |
| my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each( |
| sub { |
| my $span = shift; |
| |
| # Check if the primary data starts or ends with a space |
| my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from')); |
| |
| my $from = $span->attr('from'); |
| my $to = $span->attr('to'); |
| |
| unless ($primary) { |
| print "Unable to find primary data at ($from-$to)\n\n"; |
| exit(1); |
| }; |
| |
| if ($primary =~ /^(?:\s+)|(\s+$)/) { |
| |
| # Remember span position |
| my $span_id = $span->attr('id'); |
| |
| print ++$problems, |
| ". Problem found in $base/$foundry/$layer ", |
| "at span-ID #$span_id ($from-$to)!\n"; |
| |
| if (my $lemma = $span->at('f[name=lemma]')) { |
| print "Lemma: '", $lemma->all_text, "'\n"; |
| }; |
| |
| print 'Snippet', |
| ($offset ? ' (adjusted)' : ''), |
| ': ', |
| substr($text, $span->attr('from')-30-$offset, 30), |
| '[['.$primary.']]', |
| substr($text, $span->attr('to')-$offset, 30), |
| "\n"; |
| |
| if (defined $1) { |
| |
| $offset += length($1); |
| |
| if ($fix) { |
| |
| # Forecast fix |
| print 'Fix', |
| ': ', |
| substr($text, $span->attr('from') - 30 - $offset, 30), |
| "$1", |
| '[[', |
| substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from')), |
| ']]', |
| substr($text, $span->attr('to') - $offset, 30), |
| "\n"; |
| |
| # Rewrite primary data with fix |
| $text_fix .= substr($text, $last_from, $span->attr('from') - $last_from - $offset); |
| $text_fix .= "$1"; |
| $text_fix .= substr($text, $span->attr('from') - $offset, length($1)); |
| |
| $last_from = $span->attr('from') - $offset + length($1); |
| }; |
| print "\n"; |
| return; |
| } else { |
| if ($fix) { |
| print "Unable to fix file.\n"; |
| }; |
| print "\n"; |
| exit(1); |
| }; |
| }; |
| } |
| ); |
| |
| if ($fix) { |
| # Finish the text data |
| $text_fix .= substr($text, $last_from); |
| }; |
| |
| return $problems; |
| }; |
| |
| # Load normal data.xml |
| unless (-f path($base, 'data.xml')) { |
| die 'Unable to load from ' . $base; |
| }; |
| |
| # Load data.xml |
| my $data = path($base, 'data.xml')->slurp; |
| my $text = decode('UTF-8', Mojo::DOM->new->parse($data)->at('text')->all_text); |
| |
| unless (check_primary($text)) { |
| print "No problem found in $base/$foundry/$layer!\n"; |
| exit(0); |
| }; |
| |
| |
| # The fix flag was activated |
| if ($fix) { |
| |
| $|=0; |
| |
| print "Check fixed data ...\n\n"; |
| |
| my $data_fix = Mojo::DOM->new->parse($data)->at('text')->child_nodes->[0]->replace($text_fix)->root; |
| |
| unless (check_primary($data_fix->at('text')->all_text)) { |
| print "Fixed data is fine - overwrite data.xml? (y)\n"; |
| my $stdin = <STDIN>; |
| chomp($stdin); |
| if ($stdin eq 'y' || $stdin eq 'Y') { |
| path($base, 'data.xml')->spurt(encode('UTF-8', $data_fix)); |
| print "File written.\n\n"; |
| } else { |
| print "No file written.\n\n"; |
| }; |
| exit(0); |
| } |
| |
| else { |
| print "Unable to fix file\n\n"; |
| exit(1); |
| }; |
| }; |
| |
| exit(1); |
| |
| |
| __END__ |
| |
| =pod |
| |
| =encoding utf8 |
| |
| =head1 NAME |
| |
| korapxml_offset_checker - Check offsets in KorAP-XML files |
| |
| =head1 SYNOPSIS |
| |
| perl korapxml_offset_checker -i NKJP/NKJP/SuperExpress -a nkjp/morpho |
| |
| =head1 INSTALLATION |
| |
| Requires Mojolicious. |
| |
| =head1 OPTIONS |
| |
| =over 2 |
| |
| =item B<--input> |
| |
| Expects a path to the root of a KorAP-XML document |
| (the root being the text level, including the C<data.xml>), |
| e.g. C<NKJP/NKJP/SuperExpress>. |
| |
| =item B<--annotation> |
| |
| Expects the annotation to check for failing offsets in the form of |
| C<foundry/layer>, e.g. C<nkjp/morpho>. |
| |
| =item B<--fix> |
| |
| Binary flag to rewrite data.xml with fixed offsets. |
| |
| =back |
| |
| =head1 COPYRIGHT AND LICENSE |
| |
| Copyright (C) 2022, L<IDS Mannheim|https://www.ids-mannheim.de/> |
| |
| Author: Nils Diewald |
| |
| This program is free software published under the |
| L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>. |
| |
| =cut |
| |