blob: c5a449f7f186fe5c000453642870ee5da66ddb22 [file] [log] [blame]
Akrona0e91ab2022-11-15 11:37:52 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4
5# This script allows to inspect if any annotated spans start or end
6# with whitespaces in the primary data of a KorAP-XML file.
7
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9use Pod::Usage;
10
11use Mojo::File 'path';
12use Mojo::DOM;
13use Mojo::Util qw'encode decode';
14binmode(STDOUT, ':encoding(utf-8)');
15
16our $VERSION = '0.1.0';
17our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n";
18
19GetOptions(
20 'input|i=s' => \(my $base = ''),
21 'anno|a=s' => \(my $annotation = ''),
22 'help|h' => sub {
23 pod2usage(
24 -verbose => 99,
25 -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
26 -msg => $VERSION_MSG,
27 -output => '-'
28 )
29 }
30);
31
32unless($base) {
33 pod2usage(
34 -verbose => 99,
35 -sections => 'NAME|SYNOPSIS',
36 -msg => $VERSION_MSG,
37 -output => '-'
38 );
39 exit;
40};
41
42$base = path($base);
43
44unless (-f path($base, 'data.xml')) {
45 die 'Unable to load from ' . $base;
46};
47
48# Load data.xml
49my $data = path($base, 'data.xml')->slurp;
50my $text = decode('UTF-8', Mojo::DOM->new->parse($data)->at('text')->all_text);
51
52# Compare with annotation
53my ($foundry, $layer) = split('[\/|#]', $annotation);
54my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp);
55
Akronc2bc8e02022-11-16 16:52:04 +010056my $offset = 0;
57my $problems = 0;
58
Akrona0e91ab2022-11-15 11:37:52 +010059# Read lemma from annotation
60my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each(
61 sub {
62 my $span = shift;
63
64 # Check if the primary data starts or ends with a space
Akronc2bc8e02022-11-16 16:52:04 +010065 my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from'));
66 if ($primary =~ /^(?:\s+)|(\s+$)/) {
Akrona0e91ab2022-11-15 11:37:52 +010067
68 # Remember span position
69 my $span_id = $span->attr('id');
70 my $from = $span->attr('from');
71 my $to = $span->attr('to');
72
Akronc2bc8e02022-11-16 16:52:04 +010073 print ++$problems,
74 ". Problem found in $base/$foundry/$layer ",
75 "at span-ID #$span_id ($from-$to)!\n";
Akrona0e91ab2022-11-15 11:37:52 +010076
77 if (my $lemma = $span->at('f[name=lemma]')) {
78 print "Lemma: '", $lemma->all_text, "'\n";
79 };
80
Akronc2bc8e02022-11-16 16:52:04 +010081 print 'Snippet',
82 ($offset ? ' (adjusted)' : ''),
83 ': ',
84 substr($text, $span->attr('from')-30-$offset, 30),
Akrona0e91ab2022-11-15 11:37:52 +010085 '[['.$primary.']]',
Akronc2bc8e02022-11-16 16:52:04 +010086 substr($text, $span->attr('to')-$offset, 30),
87 "\n\n";
88
89 if (defined $1) {
90 $offset += length($1);
91 return;
92 } else {
93 exit(1);
94 };
Akrona0e91ab2022-11-15 11:37:52 +010095 };
96 }
97);
98
Akronc2bc8e02022-11-16 16:52:04 +010099exit(1) if $offset;
100
Akrona0e91ab2022-11-15 11:37:52 +0100101print "No problem found in $base/$foundry/$layer!\n";
102exit(0);
103
104__END__
105
106=pod
107
108=encoding utf8
109
110=head1 NAME
111
112korapxml_offset_checker - Check offsets in KorAP-XML files
113
114=head1 SYNOPSIS
115
116 perl korapxml_offset_checker -i NKJP/NKJP/SuperExpress -a nkjp/morpho
117
118=head1 INSTALLATION
119
120Requires Mojolicious.
121
122=head1 OPTIONS
123
124=over 2
125
126=item B<--input>
127
128Expects a path to the root of a KorAP-XML document
129(the root being the text level, including the C<data.xml>),
130e.g. C<NKJP/NKJP/SuperExpress>.
131
132=item B<--annotation>
133
134Expects the annotation to check for failing offsets in the form of
135C<foundry/layer>, e.g. C<nkjp/morpho>.
136
137=back
138
139=head1 COPYRIGHT AND LICENSE
140
141Copyright (C) 2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
142
143Author: Nils Diewald
144
145This program is free software published under the
146L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
147
148=cut
149