blob: e92095a15204f062e91599a41d8283cc1a2cada5 [file] [log] [blame]
Akron53bc81d2020-04-27 16:24:35 +02001#!/usr/bin/env perl
2use Mojo::Base -strict;
3use Mojo::DOM;
4use Mojo::File qw'path';
Akron59a0e4b2020-04-27 17:43:29 +02005use Mojo::JSON qw'decode_json';
Akron53bc81d2020-04-27 16:24:35 +02006use Mojo::ByteStream 'b';
7use String::Random;
8use Pod::Usage;
9use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
10
11#############################################################
12# This helper tool iterates over a single KorAP-XML files #
13# and randomizes all word strings occurring following #
14# several rules. This is useful to create example files #
15# based on corpora that can't be published. #
16# (c) IDS Mannheim #
17#############################################################
18
19my %ERROR_HASH = (
20 -sections => 'NAME|SYNOPSIS',
21 -verbose => 99,
22 -output => '-',
23 -exit => 1
24);
25
26my ($orig_folder, $scr_folder);
27GetOptions(
28 'input|i=s' => \$orig_folder,
29 'output|o=s' => \$scr_folder,
Akron59a0e4b2020-04-27 17:43:29 +020030 'rules|r=s' => \(my $rule_file),
Akron53bc81d2020-04-27 16:24:35 +020031 'help|h' => sub {
32 pod2usage(
33 -sections => 'NAME|SYNOPSIS|DESCRIPTION|ARGUMENTS|OPTIONS',
34 -verbose => 99,
35 -output => '-'
36 );
37 }
38);
39
Akron59a0e4b2020-04-27 17:43:29 +020040unless ($orig_folder || $scr_folder || $rule_file) {
Akron53bc81d2020-04-27 16:24:35 +020041 pod2usage(%ERROR_HASH);
42};
43
44my $string_gen = String::Random->new;
45
46# Remember all generated pairs orig -> random
47my %replacements = ();
48my $offset = 0;
49my @offsets = ();
50
51# Turn a word into a random word with similar characteristics
52sub get_rnd_word {
53 my $o_word = shift;
54 return $o_word unless $o_word =~ /[a-z]/i;
55
56 # Return the old replacement
57 if ($replacements{$o_word}) {
58 return $replacements{$o_word};
59 };
60
61 my $word = $o_word;
62
63 # Turn the word into a pattern for String::Random
64 # c: Any Latin lowercase character [a-z]
65 # C: Any Latin uppercase character [A-Z]
66 # n: Any digit [0-9]
67 # !: A punctuation character
68 $word =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzöäü1234567890~`!@$%^&*()-_+={}[]|\\:;"'.<>?\/#,/CCCCCCCCCCCCCCCCCCCCCCCCCCccccccccccccccccccccccccccccccnnnnnnnnnn!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!/;
69 $word =~ s/[^Ccn!]/n/g;
70 $replacements{$o_word} = $string_gen->randpattern($word);
71}
72
73# 1. Load data.xml
74# replace all surface forms of /[a-z]/
75# with character strings of the same length, randomly created.
76# Create an array, accessible by offsets.
77my $data_file = $orig_folder . '/data.xml';
78# Process the data file and replace all surface words with random words
79my $data = Mojo::File->new($data_file)->slurp;
80my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
81my $new_text = b($dom->at('text')->text)->split(
82 " "
83)->map(
84 sub {
85 my $token = get_rnd_word($_);
86 $offsets[$offset] = $token;
87 # print $offset, ':', $_, ':', $token,"\n";
88 $offset += length($token);
89 $offset++; # space
90
91 # exit if $offset > 300;
92 return $token;
93 }
94)->join(
95 " "
96);
97$dom->at('text')->content($new_text);
98
99# Create folder
100path($scr_folder)->make_path->child('data.xml')->spurt(b($dom->to_string)->encode);
101
102
103# 2. Take some css selectors and rename attributes,
104# either according to the surface form ("=") or
105# somehow derived ("^"), or random as well ("~"),
106# based on the given content, that can be randomized and
107# stuffed in a hash as well.
108# If no CSS rules are parsed, the file will just be copied.
109
Akron59a0e4b2020-04-27 17:43:29 +0200110$rule_file = Mojo::File->new($rule_file);
Akron53bc81d2020-04-27 16:24:35 +0200111
Akron59a0e4b2020-04-27 17:43:29 +0200112if (-e $rule_file) {
113 my $rules = decode_json $rule_file->slurp;
114
115 foreach my $rule (@$rules) {
116 scramble(@$rule);
117 };
118};
Akron53bc81d2020-04-27 16:24:35 +0200119
120# Scramble an annotation file
121sub scramble {
Akron59a0e4b2020-04-27 17:43:29 +0200122 my ($input, $rules) = @_;
Akron53bc81d2020-04-27 16:24:35 +0200123 my $data_file = path($orig_folder)->child($input);
124
125 unless (-f $data_file) {
126 warn "$data_file does not exist";
127 return;
128 };
129
130 my $data = $data_file->slurp;
131
132 # Only transfer if rules exist
133 if ($rules) {
134 my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
135
136 foreach (@$rules) {
137 transform($dom, $_->[0], $_->[1]);
138 };
139
140 $data = b($dom->to_string)->encode;
Akron53bc81d2020-04-27 16:24:35 +0200141 };
142
Akron59a0e4b2020-04-27 17:43:29 +0200143 my $file = Mojo::File->new($scr_folder)->child($input);
Akron53bc81d2020-04-27 16:24:35 +0200144 path($file->dirname)->make_path;
145 $file->spurt($data);
146};
147
148
149# Iterate over an annotation document and scramble
150# all textual content based on CSS rules
151sub transform {
152 my ($dom, $selector, $rule) = @_;
153
154 $dom->find("spanList > span")->each(
155 sub {
156 my $from = $_->attr("from");
157 my $to = $_->attr("to");
158 $_->find($selector)->each(
159 sub {
160 my $word = $_->text;
161
162 unless ($offsets[$from]) {
163 # warn '!!! Unknown word at ' . $from . '!';
164 $_->content('UNKN');
165 return;
166 };
167
168 # The derive rule means that the original
169 # word is taken and appended the string 'ui'
170 if ($rule eq '^') {
171 my $deriv = $offsets[$from];
172 chop($deriv);
173 chop($deriv);
174 $_->content($deriv . 'ui');
175
176 }
177
178 # The random rule means the word is replaced by
179 # with a random word with the same characterisms.
180 elsif ($rule eq '~') {
181 $_->content(get_rnd_word($word));
182 }
183
184 # Any other rule means, that the original word
185 # from the character data is taken.
186 else {
187 $_->content($offsets[$from])
188 }
189 }
190 )
191 }
192 )
193};
194
Akron53bc81d2020-04-27 16:24:35 +0200195
196__END__
197
198=pod
199
200=encoding utf8
201
202=head1 NAME
203
204scramble_korapxml.pl - Merge KorAP-XML data and create Krill documents
205
206
207=head1 SYNOPSIS
208
209 scramble_korapxml.pl -i <input-directory> -o <output-directory>
210
211
212=head1 DESCRIPTION
213
214This helper tool iterates over a single KorAP-XML folder
215and randomizes all word strings occurring following
216several rules. This is useful to create example files
217based on corpora that can't be published.
218
Akron59a0e4b2020-04-27 17:43:29 +0200219
220=head1 OPTIONS
221
222=over 2
223
224=item B<--input|-i> <directory>
225
226The unscrambled KorAP-XML directory.
227
228
229=item B<--output|-o> <directory>
230
231The output directory
232
233
234=item B<--rules|-r> <file>
235
236The rule file for transformation as a json file.
237Example:
238
239 [
240 [
241 "dgd/annot.xml",
242 [
243 ["f[name=trans]", "="],
244 ["f[name=lemma]", "^"],
245 ["f[name=pos]", "~"]
246 ]
247 ],
248 ["struct/structure.xml"]
249 ]
250
251All elements of the json list are copied from the input directory to
252the output directory.
253The C<data.xml> file will be automatically coppied and scrambled.
254If the file name is followed by a rule set, these
255CSS selector rules followed by a transformation type marker
256are used to transform elements of the file.
257
258All CSS selectors are nested in C<spanList > span>.
259
260The following markers are supported:
261
262=over 4
263
264=item B<=>
265
266Take the scrambled surface form from the C<data.xml>.
267
268=item B<^>
269
270Take the scrambled surface form from the C<data.xml> and
271modify the term by appending the string C<ui>.
272
273=item B<~>
274
275Create a randomized string, keeping the characteristicts of
276the original element content.
Akron53bc81d2020-04-27 16:24:35 +0200277Two identical words in a single run will always be transfered
278to the same target word.
279
Akron59a0e4b2020-04-27 17:43:29 +0200280=back
281
282=back