blob: 95e727e490e81c20be110da458266f747431f78b [file] [log] [blame]
Akron53bc81d2020-04-27 16:24:35 +02001#!/usr/bin/env perl
2use Mojo::Base -strict;
3use Mojo::DOM;
4use Mojo::File qw'path';
Akron59a0e4b2020-04-27 17:43:29 +02005use Mojo::JSON qw'decode_json';
Akron53bc81d2020-04-27 16:24:35 +02006use Mojo::ByteStream 'b';
7use String::Random;
8use Pod::Usage;
9use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
10
11#############################################################
# This helper tool iterates over a single KorAP-XML folder #
13# and randomizes all word strings occurring following #
14# several rules. This is useful to create example files #
15# based on corpora that can't be published. #
16# (c) IDS Mannheim #
17#############################################################
18
# Options passed to pod2usage() whenever the command line is invalid:
# print the NAME and SYNOPSIS sections to STDOUT and exit with status 1.
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS',
  -verbose  => 99,
  -output   => '-',
  -exit     => 1
);

# Command line parameters:
#   $orig_folder - folder of the original KorAP-XML document (--input)
#   $scr_folder  - folder the scrambled document is written to (--output)
#   $rule_file   - JSON file with the transformation rules (--rules)
my ($orig_folder, $scr_folder, $rule_file);

# Unknown or malformed options make GetOptions() return false.
# Show the usage in that case instead of running with a
# half-parsed command line.
GetOptions(
  'input|i=s'  => \$orig_folder,
  'output|o=s' => \$scr_folder,
  'rules|r=s'  => \$rule_file,
  'help|h'     => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|DESCRIPTION|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -output   => '-'
    );
  }
) or pod2usage(%ERROR_HASH);

# Without an input and an output folder nothing can be done,
# so both of them are required.
unless ($orig_folder && $scr_folder) {
  pod2usage(%ERROR_HASH);
}
43
# Generator creating the random strings from patterns
my $string_gen = String::Random->new();

# Cache of all generated pairs (original word -> random word),
# so a word is always replaced by the same random word
my %replacements;

# Character offset of the next token in the scrambled primary text
my $offset = 0;

# Scrambled tokens, indexed by the character offset they start at
my @offsets;
50
# Turn a word into a random word with similar characteristics.
#
# Parameters:
#   $o_word - the original token (a decoded character string)
#
# Returns the token unchanged if it does not contain any letter.
# Otherwise a random string with one character per original
# character is returned, created by String::Random based on the
# class of every original character. The same original word is
# always turned into the same random word, as all generated pairs
# are remembered in %replacements.
sub get_rnd_word {
  my $o_word = shift;

  # Tokens without letters (numbers, punctuation) are not scrambled.
  # Letters are recognized by their Unicode property, so words written
  # in non-ASCII letters only are not leaked to the output.
  return $o_word unless defined $o_word && $o_word =~ /\p{L}/;

  # Return the old replacement
  return $replacements{$o_word} if exists $replacements{$o_word};

  # Turn the word into a pattern for String::Random
  # C: Any Latin uppercase character [A-Z]
  # c: Any Latin lowercase character [a-z]
  # n: Any digit [0-9]
  # !: A punctuation character
  my $pattern = '';
  foreach my $char (split //, $o_word) {
    $pattern .=
        $char =~ /\p{Lu}/ ? 'C'    # uppercase letters, including umlauts
      : $char =~ /\p{L}/  ? 'c'    # all other letters
                                   # ASCII punctuation and symbols
      : $char =~ /[\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]/ ? '!'
      :                     'n';   # digits and everything else
  }

  return $replacements{$o_word} = $string_gen->randpattern($pattern);
}
72
# 1. Load data.xml
#    Replace all whitespace separated tokens of the primary text
#    with the result of get_rnd_word() and remember every resulting
#    token in @offsets, accessible by the character offset the
#    token starts at in the new text.
my $data_file = path($orig_folder)->child('data.xml');

# Process the data file and replace all surface words with random words
my $data = $data_file->slurp;
my $dom  = Mojo::DOM->new->xml(1)->parse(b($data)->decode);

# Fail with a helpful message instead of calling a method on undef
my $text_elem = $dom->at('text')
  or die "No <text> element found in $data_file\n";

my @tokens;
foreach my $o_token (@{ b($text_elem->text)->split(" ") }) {
  my $token = get_rnd_word($o_token);
  $offsets[$offset] = $token;

  # The next token starts behind this token and the separating space
  $offset += length($token) + 1;

  push @tokens, $token;
}

my $new_text = join ' ', @tokens;

# content() parses its argument as an XML fragment. Random words may
# contain characters like '<' or '&', so the text has to be escaped
# to end up as plain character data.
$text_elem->content(b($new_text)->xml_escape->to_string);

# Create folder
# NOTE(review): newer Mojolicious releases seem to prefer spew() over
# spurt() - confirm against the installed version.
path($scr_folder)->make_path->child('data.xml')->spurt(b($dom->to_string)->encode);
101
102
# 2. Read the rule file and process all files listed in there.
#    Every entry is a list of a file name (relative to the input
#    folder), optionally followed by a list of CSS selector rules
#    with a marker: take the surface form ("="), derive the content
#    from the surface form ("^"), or randomize the content ("~").
#    If no CSS rules are given for a file, the file will just be copied.

# The rule file is only loaded if it was passed on the command line
if (defined $rule_file) {
  $rule_file = Mojo::File->new($rule_file);

  if (-e $rule_file) {
    my $rules = decode_json $rule_file->slurp;

    die "Rule file $rule_file has to contain a JSON list\n"
      unless ref $rules eq 'ARRAY';

    foreach my $rule (@$rules) {

      # A plain file name is accepted as a file without rules
      scramble(ref $rule eq 'ARRAY' ? @$rule : $rule);
    }
  }

  # Tell the user that no annotation file was processed
  else {
    warn "Rule file $rule_file does not exist\n";
  }
}
Akron53bc81d2020-04-27 16:24:35 +0200119
# Copy a single annotation file from the input folder to the
# output folder and transform it on the way, if rules are given.
#
# Parameters:
#   $input - file name, relative to the input folder
#   $rules - optional array reference of rules; the first element of
#            each rule is a CSS selector, the second one a marker
#
# Warns and returns nothing if the input file does not exist.
sub scramble {
  my ($input, $rules) = @_;
  my $source = path($orig_folder)->child($input);

  # Missing files are reported and skipped
  if (!-f $source) {
    warn "$source does not exist";
    return;
  }

  my $content = $source->slurp;

  # Only transfer if rules exist, otherwise the bytes are copied as they are
  if ($rules) {
    my $dom = Mojo::DOM->new->xml(1)->parse(b($content)->decode);

    # Header files are treated differently from annotation files
    my $is_header = $input =~ /header\.xml$/;

    for my $entry (@$rules) {
      my ($selector, $marker) = @$entry;

      if ($is_header) {
        transform_header($dom, $selector);
      }
      else {
        transform($dom, $selector, $marker);
      }
    }

    $content = b($dom->to_string)->encode;
  }

  # Write the result, creating missing folders on the way
  my $target = path($scr_folder)->child($input);
  $target->dirname->make_path;
  return $target->spurt($content);
}
151
152
# Iterate over an annotation document and scramble
# all textual content based on CSS rules.
#
# Parameters:
#   $dom      - Mojo::DOM object of the annotation file, modified in place
#   $selector - CSS selector, evaluated inside of every "spanList > span"
#   $rule     - marker of the transformation: '^' (derive from the
#               surface form), '~' (randomize the content), anything
#               else (take the surface form)
#
# The surface form of a span is the scrambled token in @offsets
# starting at the "from" offset of the span. If there is no such
# token, the content of the selected elements is set to 'UNKN'.
sub transform {
  my ($dom, $selector, $rule) = @_;

  # A missing marker is treated like any other unknown marker
  $rule = '=' unless defined $rule;

  $dom->find("spanList > span")->each(
    sub {
      my $from = $_->attr("from");

      # Scrambled token starting at the offset of the span, if any
      my $surface = defined $from ? $offsets[$from] : undef;

      $_->find($selector)->each(
        sub {
          unless (defined $surface) {
            # warn '!!! Unknown word at ' . $from . '!';
            $_->content('UNKN');
            return;
          }

          my $new_content;

          # The derive rule means that the last two characters
          # of the surface form are replaced by the string 'ui'
          if ($rule eq '^') {
            my $deriv = "$surface";
            chop($deriv);
            chop($deriv);
            $new_content = $deriv . 'ui';
          }

          # The random rule means the content is replaced
          # by a random word with the same characteristics.
          elsif ($rule eq '~') {
            $new_content = get_rnd_word($_->text);
          }

          # Any other rule means, that the surface form
          # from the character data is taken.
          else {
            $new_content = $surface;
          }

          # content() parses its argument as an XML fragment. Random
          # words may contain characters like '<' or '&', so they
          # have to be escaped to end up as plain character data.
          $_->content(b($new_content)->xml_escape->to_string);
        }
      );
    }
  );
}
198
Akron53bc81d2020-04-27 16:24:35 +0200199
# Transform a header file by replacing the text of all elements
# matching a CSS selector with a random word of the same
# characteristics.
#
# Parameters:
#   $dom      - Mojo::DOM object of the header file, modified in place
#   $selector - CSS selector of the elements to randomize
sub transform_header {
  my ($dom, $selector) = @_;

  $dom->find($selector)->each(
    sub {
      my $rnd_word = get_rnd_word($_->text);

      # content() parses its argument as an XML fragment. Random
      # words may contain characters like '<' or '&', so they
      # have to be escaped to end up as plain character data.
      $_->content(b($rnd_word)->xml_escape->to_string);
    }
  );
}
214
215
216
Akron53bc81d2020-04-27 16:24:35 +0200217__END__
218
219=pod
220
221=encoding utf8
222
223=head1 NAME
224
scramble_korapxml.pl - Randomize the word strings of a KorAP-XML document
226
227
228=head1 SYNOPSIS
229
  scramble_korapxml.pl -i <input-directory> -o <output-directory> [-r <rule-file>]
231
232
233=head1 DESCRIPTION
234
235This helper tool iterates over a single KorAP-XML folder
236and randomizes all word strings occurring following
237several rules. This is useful to create example files
238based on corpora that can't be published.
239
Akron59a0e4b2020-04-27 17:43:29 +0200240
241=head1 OPTIONS
242
243=over 2
244
245=item B<--input|-i> <directory>
246
247The unscrambled KorAP-XML directory.
248
249
250=item B<--output|-o> <directory>
251
252The output directory
253
254
255=item B<--rules|-r> <file>
256
257The rule file for transformation as a json file.
258Example:
259
260 [
261 [
262 "dgd/annot.xml",
263 [
264 ["f[name=trans]", "="],
265 ["f[name=lemma]", "^"],
266 ["f[name=pos]", "~"]
267 ]
268 ],
269 ["struct/structure.xml"]
270 ]
271
272All elements of the json list are copied from the input directory to
273the output directory.
The C<data.xml> file will be automatically copied and scrambled.
275If the file name is followed by a rule set, these
276CSS selector rules followed by a transformation type marker
277are used to transform elements of the file.
278
All CSS selectors for annotation files
are nested in C<< spanList > span >>.
Akron59a0e4b2020-04-27 17:43:29 +0200281
282The following markers are supported:
283
284=over 4
285
286=item B<=>
287
288Take the scrambled surface form from the C<data.xml>.
289
290=item B<^>
291
Take the scrambled surface form from the C<data.xml> and
modify the term by replacing its last two characters with the string C<ui>.
294
295=item B<~>
296
Create a randomized string, keeping the characteristics of
the original element content.
Two identical words in a single run will always be transferred
to the same target word.
301
Akron59a0e4b2020-04-27 17:43:29 +0200302=back
303
Akroncbf098a2020-04-27 17:56:42 +0200304For header files, the rules are not nested and only the
305randomized marker C<~> is supported.
306
Akron59a0e4b2020-04-27 17:43:29 +0200307=back