blob: 48d6396a10041c807f764af655d72fe5cebabd92 [file] [log] [blame]
Akron53bc81d2020-04-27 16:24:35 +02001#!/usr/bin/env perl
2use Mojo::Base -strict;
3use Mojo::DOM;
4use Mojo::File qw'path';
5use Mojo::ByteStream 'b';
6use String::Random;
7use Pod::Usage;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
9
10#############################################################
11# This helper tool iterates over a single KorAP-XML files #
12# and randomizes all word strings occurring following #
13# several rules. This is useful to create example files #
14# based on corpora that can't be published. #
15# (c) IDS Mannheim #
16#############################################################
17
18my %ERROR_HASH = (
19 -sections => 'NAME|SYNOPSIS',
20 -verbose => 99,
21 -output => '-',
22 -exit => 1
23);
24
25my ($orig_folder, $scr_folder);
26GetOptions(
27 'input|i=s' => \$orig_folder,
28 'output|o=s' => \$scr_folder,
29 'help|h' => sub {
30 pod2usage(
31 -sections => 'NAME|SYNOPSIS|DESCRIPTION|ARGUMENTS|OPTIONS',
32 -verbose => 99,
33 -output => '-'
34 );
35 }
36);
37
38unless ($orig_folder || $scr_folder) {
39 pod2usage(%ERROR_HASH);
40};
41
42my $string_gen = String::Random->new;
43
44# Remember all generated pairs orig -> random
45my %replacements = ();
46my $offset = 0;
47my @offsets = ();
48
49# Turn a word into a random word with similar characteristics
50sub get_rnd_word {
51 my $o_word = shift;
52 return $o_word unless $o_word =~ /[a-z]/i;
53
54 # Return the old replacement
55 if ($replacements{$o_word}) {
56 return $replacements{$o_word};
57 };
58
59 my $word = $o_word;
60
61 # Turn the word into a pattern for String::Random
62 # c: Any Latin lowercase character [a-z]
63 # C: Any Latin uppercase character [A-Z]
64 # n: Any digit [0-9]
65 # !: A punctuation character
66 $word =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzöäü1234567890~`!@$%^&*()-_+={}[]|\\:;"'.<>?\/#,/CCCCCCCCCCCCCCCCCCCCCCCCCCccccccccccccccccccccccccccccccnnnnnnnnnn!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!/;
67 $word =~ s/[^Ccn!]/n/g;
68 $replacements{$o_word} = $string_gen->randpattern($word);
69}
70
71# 1. Load data.xml
72# replace all surface forms of /[a-z]/
73# with character strings of the same length, randomly created.
74# Create an array, accessible by offsets.
75my $data_file = $orig_folder . '/data.xml';
76# Process the data file and replace all surface words with random words
77my $data = Mojo::File->new($data_file)->slurp;
78my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
79my $new_text = b($dom->at('text')->text)->split(
80 " "
81)->map(
82 sub {
83 my $token = get_rnd_word($_);
84 $offsets[$offset] = $token;
85 # print $offset, ':', $_, ':', $token,"\n";
86 $offset += length($token);
87 $offset++; # space
88
89 # exit if $offset > 300;
90 return $token;
91 }
92)->join(
93 " "
94);
95$dom->at('text')->content($new_text);
96
97# Create folder
98path($scr_folder)->make_path->child('data.xml')->spurt(b($dom->to_string)->encode);
99
100
101# 2. Take some css selectors and rename attributes,
102# either according to the surface form ("=") or
103# somehow derived ("^"), or random as well ("~"),
104# based on the given content, that can be randomized and
105# stuffed in a hash as well.
106# If no CSS rules are parsed, the file will just be copied.
107
108scramble('dgd/annot.xml' => [
109 ["f[name=trans]", "="],
110 ["f[name=lemma]", "^"],
111 ["f[name=pos]", "~"]
112] => 'dgd/annot.xml');
113
114scramble('struct/structure.xml');
115scramble('header.xml');
116
117# Scramble an annotation file
118sub scramble {
119 my ($input, $rules, $output) = @_;
120 my $data_file = path($orig_folder)->child($input);
121
122 unless (-f $data_file) {
123 warn "$data_file does not exist";
124 return;
125 };
126
127 my $data = $data_file->slurp;
128
129 # Only transfer if rules exist
130 if ($rules) {
131 my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode);
132
133 foreach (@$rules) {
134 transform($dom, $_->[0], $_->[1]);
135 };
136
137 $data = b($dom->to_string)->encode;
138 }
139
140 else {
141
142 # Just copy the data
143 $output = $input;
144 };
145
146 my $file = Mojo::File->new($scr_folder)->child($output);
147 path($file->dirname)->make_path;
148 $file->spurt($data);
149};
150
151
152# Iterate over an annotation document and scramble
153# all textual content based on CSS rules
154sub transform {
155 my ($dom, $selector, $rule) = @_;
156
157 $dom->find("spanList > span")->each(
158 sub {
159 my $from = $_->attr("from");
160 my $to = $_->attr("to");
161 $_->find($selector)->each(
162 sub {
163 my $word = $_->text;
164
165 unless ($offsets[$from]) {
166 # warn '!!! Unknown word at ' . $from . '!';
167 $_->content('UNKN');
168 return;
169 };
170
171 # The derive rule means that the original
172 # word is taken and appended the string 'ui'
173 if ($rule eq '^') {
174 my $deriv = $offsets[$from];
175 chop($deriv);
176 chop($deriv);
177 $_->content($deriv . 'ui');
178
179 }
180
181 # The random rule means the word is replaced by
182 # with a random word with the same characterisms.
183 elsif ($rule eq '~') {
184 $_->content(get_rnd_word($word));
185 }
186
187 # Any other rule means, that the original word
188 # from the character data is taken.
189 else {
190 $_->content($offsets[$from])
191 }
192 }
193 )
194 }
195 )
196};
197
198__END__
199
200# Config data:
201{
202 '/dgd/annot.xml' => [
203 ["f[name=norm]", "="],
204 ["f[name=lemma]", "^"],
205 ["f[name=pos]", "~"]
206 ],
207 '/dgd/morpho.xml' => [
208 ["f[name=norm]", "="],
209 ["f[name=lemma]", "^"],
210 ["f[name=pos]", "~"]
211 ],
212 '/dgd/nospeech.xml' => []
213}
214
215
216__END__
217
218=pod
219
220=encoding utf8
221
222=head1 NAME
223
224scramble_korapxml.pl - Merge KorAP-XML data and create Krill documents
225
226
227=head1 SYNOPSIS
228
229 scramble_korapxml.pl -i <input-directory> -o <output-directory>
230
231
232=head1 DESCRIPTION
233
234This helper tool iterates over a single KorAP-XML folder
235and randomizes all word strings occurring following
236several rules. This is useful to create example files
237based on corpora that can't be published.
238
239Two identical words in a single run will always be transfered
240to the same target word.
241
242The C<data.xml> file will be scrambled automatically.