#!/usr/bin/env perl
2use strict;
3use warnings;
4
# This script checks whether any annotated span starts or ends
# with whitespace in the primary data of a KorAP-XML file.
7
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9use Pod::Usage;
10
11use Mojo::File 'path';
12use Mojo::DOM;
13use Mojo::Util qw'encode decode';
# All reports are written to STDOUT as UTF-8.
binmode STDOUT, ':encoding(utf-8)';

# Script version and the banner printed on top of every usage message.
our $VERSION     = '0.1.0';
our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n";
18
# Parse the command line.
#   --input|-i             path to the text level of a KorAP-XML document
#   --anno|--annotation|-a annotation to check, given as foundry/layer
#   --help|-h              print the full usage message and exit
# Abbreviations are disabled (no_auto_abbrev), so the long name used in the
# POD (--annotation) has to be registered as an explicit alias.
GetOptions(
  'input|i=s' => \(my $base = ''),
  'anno|annotation|a=s' => \(my $annotation = ''),
  'help|h' => sub {
    pod2usage(
      -verbose => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg => $VERSION_MSG,
      -output => '-'
    )
  }
) or pod2usage(
  # GetOptions returns false on unknown or malformed options; stop with the
  # short usage instead of continuing with a half-parsed command line.
  -verbose => 99,
  -sections => 'NAME|SYNOPSIS',
  -msg => $VERSION_MSG,
  -exitval => 2
);
31
# Both the document root and the annotation are mandatory: without the
# annotation no foundry/layer file can be located further down.
# length() is used instead of plain truthiness so that a value of "0"
# is not mistaken for a missing argument.
unless (length($base) && length($annotation)) {

  # pod2usage() terminates the script itself; the exit status is spelled out
  # (1 is what Pod::Usage picks for a non-zero verbose level anyway).
  pod2usage(
    -verbose => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg => $VERSION_MSG,
    -output => '-',
    -exitval => 1
  );
};
41
# Wrap the document root in a Mojo::File object ...
$base = path($base);

# ... and give up early if it does not contain the primary data file.
die 'Unable to load from ' . $base
  unless -f $base->child('data.xml');
47
# Load data.xml.
# The bytes are decoded BEFORE parsing (the same order that is used for the
# annotation below), so Mojo::DOM works on characters. Parsing the raw bytes
# and decoding afterwards breaks as soon as the document contains numeric
# character references, because their expansion gets mixed with UTF-8 bytes.
my $data = decode('UTF-8', path($base, 'data.xml')->slurp)
  // die 'Unable to decode ' . path($base, 'data.xml') . ' as UTF-8';

my $text_node = Mojo::DOM->new->parse($data)->at('text')
  or die 'No <text> element found in ' . path($base, 'data.xml');

# The primary data all span offsets refer to
my $text = $text_node->all_text;

# Compare with annotation.
# The annotation is given as foundry/layer ('|' and '#' are accepted as
# separators as well).
my ($foundry, $layer) = split('[\/|#]', $annotation);

unless (defined $foundry && length $foundry && defined $layer && length $layer) {
  die 'Annotation has to be passed in the form of foundry/layer';
};

my $anno_file = path($base, $foundry, $layer . '.xml');

unless (-f $anno_file) {
  die 'Unable to load annotation from ' . $anno_file;
};

my $anno = decode('UTF-8', $anno_file->slurp)
  // die 'Unable to decode ' . $anno_file . ' as UTF-8';
55
# Check every annotated span of the layer. The first span whose primary data
# starts or ends with whitespace is reported (ID, offsets, lemma if available
# and a snippet with up to 30 characters of context on each side) and the
# script exits with status 1. If no such span exists, it exits with status 0.
my $text_length = length $text;

for my $span (Mojo::DOM->new->parse($anno)->find('span[from]')->each) {
  my $span_id = $span->attr('id')   // '';
  my $from    = $span->attr('from') // '';
  my $to      = $span->attr('to')   // '';

  # Offsets that are no plain integers, that run backwards or that start
  # beyond the text cannot be checked: substr() would warn, return undef or
  # silently count from the end of the text.
  unless ($from =~ /\A[0-9]+\z/ && $to =~ /\A[0-9]+\z/
            && $from <= $to && $from <= $text_length) {
    warn "Unable to check span-ID #$span_id ($from-$to) in $base/$foundry/$layer\n";
    next;
  };

  # Check if the primary data starts or ends with a space
  my $primary = substr($text, $from, $to - $from);
  next unless $primary =~ /\A\s|\s\z/;

  print "Problem found in $base/$foundry/$layer at span-ID #$span_id ($from-$to)!\n";

  if (my $lemma = $span->at('f[name=lemma]')) {
    print "Lemma: '", $lemma->all_text, "'\n";
  };

  # Left context: clamp at the beginning of the text. A negative offset
  # would make substr() count from the END of the text and show an unrelated
  # passage for spans within the first 30 characters.
  my $left_start = $from > 30 ? $from - 30 : 0;

  # Right context: nothing to show if the span ends at or after the text end
  my $right = $to < $text_length ? substr($text, $to, 30) : '';

  print 'Snippet: ',
    substr($text, $left_start, $from - $left_start),
    '[['.$primary.']]',
    $right,
    "\n";
  exit(1);
};

print "No problem found in $base/$foundry/$layer!\n";
exit(0);
88
89__END__
90
91=pod
92
93=encoding utf8
94
95=head1 NAME
96
97korapxml_offset_checker - Check offsets in KorAP-XML files
98
99=head1 SYNOPSIS
100
101 perl korapxml_offset_checker -i NKJP/NKJP/SuperExpress -a nkjp/morpho
102
103=head1 INSTALLATION
104
105Requires Mojolicious.
106
107=head1 OPTIONS
108
109=over 2
110
=item B<--input>, B<-i>
112
113Expects a path to the root of a KorAP-XML document
114(the root being the text level, including the C<data.xml>),
115e.g. C<NKJP/NKJP/SuperExpress>.
116
=item B<--anno>, B<-a>
118
119Expects the annotation to check for failing offsets in the form of
120C<foundry/layer>, e.g. C<nkjp/morpho>.
121
122=back
123
124=head1 COPYRIGHT AND LICENSE
125
126Copyright (C) 2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
127
128Author: Nils Diewald
129
130This program is free software published under the
131L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
132
133=cut
134