blob: b20a77f282e85c0fb3bf703b12a983e1bf0b55eb [file] [log] [blame]
Akron53bc81d2020-04-27 16:24:35 +02001#!/usr/bin/env perl
2use Mojo::Base -strict;
3use Mojo::DOM;
4use Mojo::File qw'path';
Akron59a0e4b2020-04-27 17:43:29 +02005use Mojo::JSON qw'decode_json';
Akron53bc81d2020-04-27 16:24:35 +02006use Mojo::ByteStream 'b';
7use String::Random;
8use Pod::Usage;
9use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
10
11#############################################################
12# This helper tool iterates over a single KorAP-XML files #
13# and randomizes all word strings occurring following #
14# several rules. This is useful to create example files #
15# based on corpora that can't be published. #
16# (c) IDS Mannheim #
17#############################################################
18
19my %ERROR_HASH = (
20 -sections => 'NAME|SYNOPSIS',
21 -verbose => 99,
22 -output => '-',
23 -exit => 1
24);
25
26my ($orig_folder, $scr_folder);
27GetOptions(
28 'input|i=s' => \$orig_folder,
29 'output|o=s' => \$scr_folder,
Akron59a0e4b2020-04-27 17:43:29 +020030 'rules|r=s' => \(my $rule_file),
Akron53bc81d2020-04-27 16:24:35 +020031 'help|h' => sub {
32 pod2usage(
33 -sections => 'NAME|SYNOPSIS|DESCRIPTION|ARGUMENTS|OPTIONS',
34 -verbose => 99,
35 -output => '-'
36 );
37 }
38);
39
Akron59a0e4b2020-04-27 17:43:29 +020040unless ($orig_folder || $scr_folder || $rule_file) {
Akron53bc81d2020-04-27 16:24:35 +020041 pod2usage(%ERROR_HASH);
42};
43
44my $string_gen = String::Random->new;
45
46# Remember all generated pairs orig -> random
47my %replacements = ();
48my $offset = 0;
49my @offsets = ();
50
51# Turn a word into a random word with similar characteristics
52sub get_rnd_word {
53 my $o_word = shift;
54 return $o_word unless $o_word =~ /[a-z]/i;
55
56 # Return the old replacement
57 if ($replacements{$o_word}) {
58 return $replacements{$o_word};
59 };
60
61 my $word = $o_word;
62
63 # Turn the word into a pattern for String::Random
64 # c: Any Latin lowercase character [a-z]
65 # C: Any Latin uppercase character [A-Z]
66 # n: Any digit [0-9]
67 # !: A punctuation character
68 $word =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzöäü1234567890~`!@$%^&*()-_+={}[]|\\:;"'.<>?\/#,/CCCCCCCCCCCCCCCCCCCCCCCCCCccccccccccccccccccccccccccccccnnnnnnnnnn!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!/;
69 $word =~ s/[^Ccn!]/n/g;
70 $replacements{$o_word} = $string_gen->randpattern($word);
71}
72
73# 1. Load data.xml
74# replace all surface forms of /[a-z]/
75# with character strings of the same length, randomly created.
76# Create an array, accessible by offsets.
77my $data_file = $orig_folder . '/data.xml';
78# Process the data file and replace all surface words with random words
79my $data = Mojo::File->new($data_file)->slurp;
80my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
81my $new_text = b($dom->at('text')->text)->split(
82 " "
83)->map(
84 sub {
85 my $token = get_rnd_word($_);
86 $offsets[$offset] = $token;
87 # print $offset, ':', $_, ':', $token,"\n";
88 $offset += length($token);
89 $offset++; # space
90
91 # exit if $offset > 300;
92 return $token;
93 }
94)->join(
95 " "
96);
97$dom->at('text')->content($new_text);
98
99# Create folder
100path($scr_folder)->make_path->child('data.xml')->spurt(b($dom->to_string)->encode);
101
102
103# 2. Take some css selectors and rename attributes,
104# either according to the surface form ("=") or
105# somehow derived ("^"), or random as well ("~"),
106# based on the given content, that can be randomized and
107# stuffed in a hash as well.
108# If no CSS rules are parsed, the file will just be copied.
109
Akronb61d6df2021-02-16 10:52:12 +0100110if ($rule_file) {
111 $rule_file = Mojo::File->new($rule_file);
112 if (-e $rule_file) {
113 my $rules = decode_json $rule_file->slurp;
Akron53bc81d2020-04-27 16:24:35 +0200114
Akronb61d6df2021-02-16 10:52:12 +0100115 foreach my $rule (@$rules) {
116 scramble(@$rule);
117 };
Akron59a0e4b2020-04-27 17:43:29 +0200118 };
119};
Akron53bc81d2020-04-27 16:24:35 +0200120
121# Scramble an annotation file
122sub scramble {
Akron59a0e4b2020-04-27 17:43:29 +0200123 my ($input, $rules) = @_;
Akron53bc81d2020-04-27 16:24:35 +0200124 my $data_file = path($orig_folder)->child($input);
125
126 unless (-f $data_file) {
127 warn "$data_file does not exist";
128 return;
129 };
130
131 my $data = $data_file->slurp;
132
133 # Only transfer if rules exist
134 if ($rules) {
135 my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
136
137 foreach (@$rules) {
Akroncbf098a2020-04-27 17:56:42 +0200138 if ($input =~ /header\.xml$/) {
139 transform_header($dom, $_->[0]);
140 } else {
141 transform($dom, $_->[0], $_->[1]);
142 };
Akron53bc81d2020-04-27 16:24:35 +0200143 };
144
145 $data = b($dom->to_string)->encode;
Akron53bc81d2020-04-27 16:24:35 +0200146 };
147
Akron59a0e4b2020-04-27 17:43:29 +0200148 my $file = Mojo::File->new($scr_folder)->child($input);
Akron53bc81d2020-04-27 16:24:35 +0200149 path($file->dirname)->make_path;
150 $file->spurt($data);
151};
152
153
154# Iterate over an annotation document and scramble
155# all textual content based on CSS rules
156sub transform {
157 my ($dom, $selector, $rule) = @_;
158
159 $dom->find("spanList > span")->each(
160 sub {
161 my $from = $_->attr("from");
162 my $to = $_->attr("to");
163 $_->find($selector)->each(
164 sub {
165 my $word = $_->text;
166
167 unless ($offsets[$from]) {
168 # warn '!!! Unknown word at ' . $from . '!';
169 $_->content('UNKN');
170 return;
171 };
172
173 # The derive rule means that the original
174 # word is taken and appended the string 'ui'
175 if ($rule eq '^') {
176 my $deriv = $offsets[$from];
177 chop($deriv);
178 chop($deriv);
179 $_->content($deriv . 'ui');
180
181 }
182
183 # The random rule means the word is replaced by
184 # with a random word with the same characterisms.
185 elsif ($rule eq '~') {
186 $_->content(get_rnd_word($word));
187 }
188
189 # Any other rule means, that the original word
190 # from the character data is taken.
191 else {
192 $_->content($offsets[$from])
193 }
194 }
195 )
196 }
197 )
198};
199
Akron53bc81d2020-04-27 16:24:35 +0200200
Akroncbf098a2020-04-27 17:56:42 +0200201# Transform header file
202sub transform_header {
203 my ($dom, $selector) = @_;
204
205 $dom->find($selector)->each(
206 sub {
207 my $word = $_->text;
208
209 # The random rule means the word is replaced by
210 # with a random word with the same characterisms.
211 $_->content(get_rnd_word($word));
212 }
213 )
214};
215
216
217
Akron53bc81d2020-04-27 16:24:35 +0200218__END__
219
220=pod
221
222=encoding utf8
223
224=head1 NAME
225
226scramble_korapxml.pl - Merge KorAP-XML data and create Krill documents
227
228
229=head1 SYNOPSIS
230
231 scramble_korapxml.pl -i <input-directory> -o <output-directory>
232
233
234=head1 DESCRIPTION
235
236This helper tool iterates over a single KorAP-XML folder
237and randomizes all word strings occurring following
238several rules. This is useful to create example files
239based on corpora that can't be published.
240
Akron59a0e4b2020-04-27 17:43:29 +0200241
242=head1 OPTIONS
243
244=over 2
245
246=item B<--input|-i> <directory>
247
248The unscrambled KorAP-XML directory.
249
250
251=item B<--output|-o> <directory>
252
253The output directory
254
255
256=item B<--rules|-r> <file>
257
258The rule file for transformation as a json file.
259Example:
260
261 [
262 [
263 "dgd/annot.xml",
264 [
265 ["f[name=trans]", "="],
266 ["f[name=lemma]", "^"],
267 ["f[name=pos]", "~"]
268 ]
269 ],
270 ["struct/structure.xml"]
271 ]
272
273All elements of the json list are copied from the input directory to
274the output directory.
275The C<data.xml> file will be automatically coppied and scrambled.
276If the file name is followed by a rule set, these
277CSS selector rules followed by a transformation type marker
278are used to transform elements of the file.
279
Akroncbf098a2020-04-27 17:56:42 +0200280All CSS selectors for annotation files
281are nested in C<spanList > span>.
Akron59a0e4b2020-04-27 17:43:29 +0200282
283The following markers are supported:
284
285=over 4
286
287=item B<=>
288
289Take the scrambled surface form from the C<data.xml>.
290
291=item B<^>
292
293Take the scrambled surface form from the C<data.xml> and
294modify the term by appending the string C<ui>.
295
296=item B<~>
297
298Create a randomized string, keeping the characteristicts of
299the original element content.
Akron53bc81d2020-04-27 16:24:35 +0200300Two identical words in a single run will always be transfered
301to the same target word.
302
Akron59a0e4b2020-04-27 17:43:29 +0200303=back
304
Akroncbf098a2020-04-27 17:56:42 +0200305For header files, the rules are not nested and only the
306randomized marker C<~> is supported.
307
Akron59a0e4b2020-04-27 17:43:29 +0200308=back