Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 1 | #!/usr/bin/env perl |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | |
| 5 | # This script allows to inspect if any annotated spans start or end |
| 6 | # with whitespaces in the primary data of a KorAP-XML file. |
| 7 | |
| 8 | use Getopt::Long qw(GetOptions :config no_auto_abbrev); |
| 9 | use Pod::Usage; |
| 10 | |
| 11 | use Mojo::File 'path'; |
| 12 | use Mojo::DOM; |
| 13 | use Mojo::Util qw'encode decode'; |
| 14 | binmode(STDOUT, ':encoding(utf-8)'); |
| 15 | |
| 16 | our $VERSION = '0.1.0'; |
| 17 | our $VERSION_MSG = "\nkorapxml_offset_checker - v$VERSION\n"; |
| 18 | |
| 19 | GetOptions( |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 20 | 'input|i=s' => \(my $base = ''), |
| 21 | 'anno|a=s' => \(my $annotation = ''), |
| 22 | 'fix|f' => \(my $fix = ''), |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 23 | 'help|h' => sub { |
| 24 | pod2usage( |
| 25 | -verbose => 99, |
| 26 | -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS', |
| 27 | -msg => $VERSION_MSG, |
| 28 | -output => '-' |
| 29 | ) |
| 30 | } |
| 31 | ); |
| 32 | |
| 33 | unless($base) { |
| 34 | pod2usage( |
| 35 | -verbose => 99, |
| 36 | -sections => 'NAME|SYNOPSIS', |
| 37 | -msg => $VERSION_MSG, |
| 38 | -output => '-' |
| 39 | ); |
| 40 | exit; |
| 41 | }; |
| 42 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 43 | my ($foundry, $layer) = split('[\/|#]', $annotation); |
| 44 | my $text_fix; |
| 45 | |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 46 | $base = path($base); |
| 47 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 48 | sub check_primary { |
| 49 | my $text = shift; |
| 50 | |
| 51 | $text_fix = ''; |
| 52 | |
| 53 | # Compare with annotation |
| 54 | my $anno = decode('UTF-8', path($base, $foundry, $layer . '.xml')->slurp); |
| 55 | |
| 56 | my $offset = 0; |
| 57 | my $problems = 0; |
| 58 | my $last_from = 0; |
| 59 | |
| 60 | # Read lemma from annotation |
| 61 | my $lemma = Mojo::DOM->new->parse($anno)->find('span[from]')->each( |
| 62 | sub { |
| 63 | my $span = shift; |
| 64 | |
| 65 | # Check if the primary data starts or ends with a space |
| 66 | my $primary = substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from')); |
| 67 | |
| 68 | my $from = $span->attr('from'); |
| 69 | my $to = $span->attr('to'); |
| 70 | |
| 71 | unless ($primary) { |
| 72 | print "Unable to find primary data at ($from-$to)\n\n"; |
| 73 | exit(1); |
| 74 | }; |
| 75 | |
| 76 | if ($primary =~ /^(?:\s+)|(\s+$)/) { |
| 77 | |
| 78 | # Remember span position |
| 79 | my $span_id = $span->attr('id'); |
| 80 | |
| 81 | print ++$problems, |
| 82 | ". Problem found in $base/$foundry/$layer ", |
| 83 | "at span-ID #$span_id ($from-$to)!\n"; |
| 84 | |
| 85 | if (my $lemma = $span->at('f[name=lemma]')) { |
| 86 | print "Lemma: '", $lemma->all_text, "'\n"; |
| 87 | }; |
| 88 | |
| 89 | print 'Snippet', |
| 90 | ($offset ? ' (adjusted)' : ''), |
| 91 | ': ', |
| 92 | substr($text, $span->attr('from')-30-$offset, 30), |
| 93 | '[['.$primary.']]', |
| 94 | substr($text, $span->attr('to')-$offset, 30), |
| 95 | "\n"; |
| 96 | |
| 97 | if (defined $1) { |
| 98 | |
| 99 | $offset += length($1); |
| 100 | |
| 101 | if ($fix) { |
| 102 | |
| 103 | # Forecast fix |
| 104 | print 'Fix', |
| 105 | ': ', |
| 106 | substr($text, $span->attr('from') - 30 - $offset, 30), |
| 107 | "$1", |
| 108 | '[[', |
| 109 | substr($text, $span->attr('from') - $offset, $span->attr('to') - $span->attr('from')), |
| 110 | ']]', |
| 111 | substr($text, $span->attr('to') - $offset, 30), |
| 112 | "\n"; |
| 113 | |
| 114 | # Rewrite primary data with fix |
| 115 | $text_fix .= substr($text, $last_from, $span->attr('from') - $last_from - $offset); |
| 116 | $text_fix .= "$1"; |
| 117 | $text_fix .= substr($text, $span->attr('from') - $offset, length($1)); |
| 118 | |
| 119 | $last_from = $span->attr('from') - $offset + length($1); |
| 120 | }; |
| 121 | print "\n"; |
| 122 | return; |
| 123 | } else { |
| 124 | if ($fix) { |
| 125 | print "Unable to fix file.\n"; |
| 126 | }; |
| 127 | print "\n"; |
| 128 | exit(1); |
| 129 | }; |
| 130 | }; |
| 131 | } |
| 132 | ); |
| 133 | |
| 134 | if ($fix) { |
| 135 | # Finish the text data |
| 136 | $text_fix .= substr($text, $last_from); |
| 137 | }; |
| 138 | |
| 139 | return $problems; |
| 140 | }; |
| 141 | |
| 142 | # Load normal data.xml |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 143 | unless (-f path($base, 'data.xml')) { |
| 144 | die 'Unable to load from ' . $base; |
| 145 | }; |
| 146 | |
| 147 | # Load data.xml |
| 148 | my $data = path($base, 'data.xml')->slurp; |
| 149 | my $text = decode('UTF-8', Mojo::DOM->new->parse($data)->at('text')->all_text); |
| 150 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 151 | unless (check_primary($text)) { |
| 152 | print "No problem found in $base/$foundry/$layer!\n"; |
| 153 | exit(0); |
| 154 | }; |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 155 | |
Akron | c2bc8e0 | 2022-11-16 16:52:04 +0100 | [diff] [blame] | 156 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 157 | # The fix flag was activated |
| 158 | if ($fix) { |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 159 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 160 | $|=0; |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 161 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 162 | print "Check fixed data ...\n\n"; |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 163 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 164 | my $data_fix = Mojo::DOM->new->parse($data)->at('text')->child_nodes->[0]->replace($text_fix)->root; |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 165 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 166 | unless (check_primary($data_fix->at('text')->all_text)) { |
| 167 | print "Fixed data is fine - overwrite data.xml? (y)\n"; |
| 168 | my $stdin = <STDIN>; |
| 169 | chomp($stdin); |
| 170 | if ($stdin eq 'y' || $stdin eq 'Y') { |
| 171 | path($base, 'data.xml')->spurt(encode('UTF-8', $data_fix)); |
| 172 | print "File written.\n\n"; |
| 173 | } else { |
| 174 | print "No file written.\n\n"; |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 175 | }; |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 176 | exit(0); |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 177 | } |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 178 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 179 | else { |
| 180 | print "Unable to fix file\n\n"; |
| 181 | exit(1); |
| 182 | }; |
| 183 | }; |
Akron | c2bc8e0 | 2022-11-16 16:52:04 +0100 | [diff] [blame] | 184 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 185 | exit(1); |
| 186 | |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 187 | |
| 188 | __END__ |
| 189 | |
| 190 | =pod |
| 191 | |
| 192 | =encoding utf8 |
| 193 | |
| 194 | =head1 NAME |
| 195 | |
| 196 | korapxml_offset_checker - Check offsets in KorAP-XML files |
| 197 | |
| 198 | =head1 SYNOPSIS |
| 199 | |
| 200 | perl korapxml_offset_checker -i NKJP/NKJP/SuperExpress -a nkjp/morpho |
| 201 | |
| 202 | =head1 INSTALLATION |
| 203 | |
| 204 | Requires Mojolicious. |
| 205 | |
| 206 | =head1 OPTIONS |
| 207 | |
| 208 | =over 2 |
| 209 | |
| 210 | =item B<--input> |
| 211 | |
| 212 | Expects a path to the root of a KorAP-XML document |
| 213 | (the root being the text level, including the C<data.xml>), |
| 214 | e.g. C<NKJP/NKJP/SuperExpress>. |
| 215 | |
| 216 | =item B<--annotation> |
| 217 | |
| 218 | Expects the annotation to check for failing offsets in the form of |
| 219 | C<foundry/layer>, e.g. C<nkjp/morpho>. |
| 220 | |
Akron | dc92b24 | 2022-11-17 11:15:55 +0100 | [diff] [blame] | 221 | =item B<--fix> |
| 222 | |
| 223 | Binary flag to rewrite data.xml with fixed offsets. |
| 224 | |
Akron | a0e91ab | 2022-11-15 11:37:52 +0100 | [diff] [blame] | 225 | =back |
| 226 | |
| 227 | =head1 COPYRIGHT AND LICENSE |
| 228 | |
| 229 | Copyright (C) 2022, L<IDS Mannheim|https://www.ids-mannheim.de/> |
| 230 | |
| 231 | Author: Nils Diewald |
| 232 | |
| 233 | This program is free software published under the |
| 234 | L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>. |
| 235 | |
| 236 | =cut |
| 237 | |