blob: 18c0fedd045ba536b892017764fad70a982d3ec2 [file] [log] [blame]
Akrona0e91ab2022-11-15 11:37:52 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4
# This script checks whether any annotated spans start or end
# with whitespace in the primary data of a KorAP-XML file.
7
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9use Pod::Usage;
10
11use Mojo::File 'path';
12use Mojo::DOM;
13use Mojo::Util qw'encode decode';
# All report output is written to STDOUT as UTF-8.
binmode(STDOUT, ':encoding(utf-8)');

our $VERSION = '0.1.0';
our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n";

# Parse the command line.
#
#   --input      | -i  path to the text level of a KorAP-XML document
#   --annotation | --anno | -a  annotation to check, given as foundry/layer
#   --fix        | -f  try to repair the primary data
#   --help       | -h  print the full usage information
#
# Automatic abbreviation is disabled on import, so every accepted spelling
# has to be listed explicitly. '--annotation' is the name used in the
# documentation; '--anno' is kept for backward compatibility.
GetOptions(
  'input|i=s' => \(my $base = ''),
  'annotation|anno|a=s' => \(my $annotation = ''),
  'fix|f' => \(my $fix = ''),
  'help|h' => sub {
    pod2usage(
      -verbose => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg => $VERSION_MSG,
      -output => '-'
    )
  }
);
32
# An input path is mandatory - print the short usage otherwise.
unless ($base) {
  pod2usage(
    -verbose => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg => $VERSION_MSG,
    -output => '-'
  );
  exit;
};

# The annotation is given as "foundry/layer"; '|' and '#' are accepted
# as alternative separators.
my ($foundry, $layer) = split('[\/|#]', $annotation);

# Both parts are needed to locate the annotation file. Without this guard
# a missing or malformed annotation would only surface later as warnings
# about undefined values and a failure to open the annotation file.
unless (defined $foundry && length $foundry && defined $layer && length $layer) {
  pod2usage(
    -verbose => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg => $VERSION_MSG . "\nPlease pass an annotation in the form of foundry/layer\n",
    -output => '-'
  );
  exit;
};

# Buffer for the repaired primary data, filled by check_primary()
my $text_fix;

# From here on the input path is a Mojo::File object
$base = path($base);
47
# Return up to $width characters of $text directly preceding position $pos.
# The window is clamped to the string boundaries, because substr() with a
# negative start would count from the end of the string instead.
sub _context_before {
  my ($text, $pos, $width) = @_;
  my $end = $pos < length($text) ? $pos : length($text);
  return '' if $end <= 0;
  my $start = $end > $width ? $end - $width : 0;
  return substr($text, $start, $end - $start);
};

# Return up to $width characters of $text starting at position $pos,
# or the empty string if $pos lies behind the end of the text.
sub _context_after {
  my ($text, $pos, $width) = @_;
  return '' if $pos >= length($text);
  return substr($text, ($pos < 0 ? 0 : $pos), $width);
};

# Check all spans of the selected annotation layer against the primary data.
#
# Expects the primary text as a character string. The annotation file is
# read from "$base/$foundry/$layer.xml". For every span whose surface form
# starts or ends with whitespace a report is printed to STDOUT.
#
# Trailing whitespace is treated as repairable: the following spans are
# read with an accumulated offset and, if the fix flag is set, the repaired
# text is collected in the file-level variable $text_fix.
# Leading whitespace is not repairable and terminates the script with
# exit code 1, as does a span without any primary data.
#
# Returns the number of problems found.
sub check_primary {
  my $text = shift;

  # Start with an empty repair buffer on every run
  $text_fix = '';

  # Load the annotation to compare with
  my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);

  my $offset = 0;     # Sum of the lengths of all trailing whitespace found so far
  my $problems = 0;   # Number of reported spans
  my $last_from = 0;  # Position in $text up to which $text_fix is complete

  # Iterate over all spans with offset information
  Mojo::DOM->new->parse($anno)->find('span[from]')->each(
    sub {
      my $span = shift;

      my $from = $span->attr('from');
      my $to = $span->attr('to');

      # Surface form of the span, shifted by the offset found so far
      my $primary = substr($text, $from - $offset, $to - $from);

      # Test for definedness and length instead of truth,
      # as a token like "0" is valid primary data
      unless (defined $primary && length $primary) {
        print "Unable to find primary data at ($from-$to)\n\n";
        exit(1);
      };

      # Nothing to report unless the surface starts or ends with whitespace
      return unless $primary =~ /^(?:\s+)|(\s+$)/;

      # Only trailing whitespace is captured. Copy it right away, so the
      # following code does not depend on $1 staying untouched.
      my $trailing = $1;

      # Remember span position
      my $span_id = $span->attr('id');

      print ++$problems,
        ". Problem found in $base/$foundry/$layer ",
        "at span-ID #$span_id ($from-$to)!\n";

      if (my $lemma = $span->at('f[name=lemma]')) {
        print "Lemma: '", $lemma->all_text, "'\n";
      };

      print 'Snippet',
        ($offset ? ' (adjusted)' : ''),
        ': ',
        _context_before($text, $from - $offset, 30),
        '[[' . $primary . ']]',
        _context_after($text, $to - $offset, 30),
        "\n";

      # Leading whitespace can't be repaired
      unless (defined $trailing) {
        if ($fix) {
          print "Unable to fix file.\n";
        };
        print "\n";
        exit(1);
      };

      $offset += length($trailing);

      if ($fix) {

        # Forecast fix
        print 'Fix',
          ': ',
          _context_before($text, $from - $offset, 30),
          $trailing,
          '[[',
          substr($text, $from - $offset, $to - $from),
          ']]',
          _context_after($text, $to - $offset, 30),
          "\n";

        # Rewrite primary data with fix: copy everything up to the shifted
        # span start, insert the whitespace and continue behind it
        $text_fix .= substr($text, $last_from, $from - $last_from - $offset);
        $text_fix .= $trailing;
        $text_fix .= substr($text, $from - $offset, length($trailing));

        $last_from = $from - $offset + length($trailing);
      };

      print "\n";
      return;
    }
  );

  if ($fix) {
    # Finish the text data
    $text_fix .= substr($text, $last_from);
  };

  return $problems;
};
141
# Locate the primary data file
my $data_file = path($base, 'data.xml');
unless (-f $data_file) {
  die 'Unable to load from ' . $base;
};

# Load data.xml and decode it before parsing, so the DOM consistently
# holds characters. Parsing the raw bytes and mixing them with decoded
# text would encode the markup outside of <text> twice on write.
my $data = decode('UTF-8', $data_file->slurp);
die 'Unable to decode ' . $data_file . ' as UTF-8' unless defined $data;

my $text_elem = Mojo::DOM->new->parse($data)->at('text')
  or die 'Unable to find text element in ' . $data_file;
my $text = $text_elem->all_text;

# First run: report all problems
unless (check_primary($text)) {
  print "No problem found in $base/$foundry/$layer!\n";
  exit(0);
};


# The fix flag was activated
if ($fix) {

  # Flush immediately, so the prompt is visible before STDIN is read
  $| = 1;

  print "Check fixed data ...\n\n";

  my $first_node = Mojo::DOM->new->parse($data)->at('text')->child_nodes->[0]
    or die 'Unable to find primary data in ' . $data_file;

  # replace() parses its argument as a markup fragment, therefore the
  # repaired plain text needs to be escaped first
  my $data_fix = $first_node->replace(Mojo::Util::xml_escape($text_fix))->root;

  # Second run: the repaired text has to pass without any problems
  unless (check_primary($data_fix->at('text')->all_text)) {
    print "Fixed data is fine - overwrite data.xml? (y)\n";

    # A closed STDIN counts as "no"
    my $answer = <STDIN> // '';
    chomp($answer);

    if ($answer eq 'y' || $answer eq 'Y') {
      $data_file->spurt(encode('UTF-8', $data_fix->to_string));
      print "File written.\n\n";
    } else {
      print "No file written.\n\n";
    };
    exit(0);
  }

  else {
    print "Unable to fix file\n\n";
    exit(1);
  };
};

# Problems were found, but no fix was requested
exit(1);
186
Akrona0e91ab2022-11-15 11:37:52 +0100187
188__END__
189
190=pod
191
192=encoding utf8
193
194=head1 NAME
195
196korapxml_offset_checker - Check offsets in KorAP-XML files
197
198=head1 SYNOPSIS
199
200 perl korapxml_offset_checker -i NKJP/NKJP/SuperExpress -a nkjp/morpho
201
202=head1 INSTALLATION
203
204Requires Mojolicious.
205
206=head1 OPTIONS
207
208=over 2
209
210=item B<--input>
211
212Expects a path to the root of a KorAP-XML document
213(the root being the text level, including the C<data.xml>),
214e.g. C<NKJP/NKJP/SuperExpress>.
215
=item B<--anno|-a>

Expects the annotation to check for failing offsets in the form of
C<foundry/layer>, e.g. C<nkjp/morpho>.
220
Akrondc92b242022-11-17 11:15:55 +0100221=item B<--fix>
222
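Boolean flag. Tries to repair the primary data in C<data.xml> so that it
matches the offsets of the annotation, re-checks the repaired data and asks
for confirmation before C<data.xml> is overwritten.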
224
Akrona0e91ab2022-11-15 11:37:52 +0100225=back
226
227=head1 COPYRIGHT AND LICENSE
228
229Copyright (C) 2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
230
231Author: Nils Diewald
232
233This program is free software published under the
234L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
235
236=cut
237