Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 1 | #!/usr/bin/env perl |
| 2 | use Mojo::Base -strict; |
| 3 | use Mojo::DOM; |
| 4 | use Mojo::File qw'path'; |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 5 | use Mojo::JSON qw'decode_json'; |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 6 | use Mojo::ByteStream 'b'; |
| 7 | use String::Random; |
| 8 | use Pod::Usage; |
| 9 | use Getopt::Long qw/GetOptions :config no_auto_abbrev/; |
| 10 | |
| 11 | ############################################################# |
| 12 | # This helper tool iterates over a single KorAP-XML files # |
| 13 | # and randomizes all word strings occurring following # |
| 14 | # several rules. This is useful to create example files # |
| 15 | # based on corpora that can't be published. # |
| 16 | # (c) IDS Mannheim # |
| 17 | ############################################################# |
| 18 | |
| 19 | my %ERROR_HASH = ( |
| 20 | -sections => 'NAME|SYNOPSIS', |
| 21 | -verbose => 99, |
| 22 | -output => '-', |
| 23 | -exit => 1 |
| 24 | ); |
| 25 | |
| 26 | my ($orig_folder, $scr_folder); |
| 27 | GetOptions( |
| 28 | 'input|i=s' => \$orig_folder, |
| 29 | 'output|o=s' => \$scr_folder, |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 30 | 'rules|r=s' => \(my $rule_file), |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 31 | 'help|h' => sub { |
| 32 | pod2usage( |
| 33 | -sections => 'NAME|SYNOPSIS|DESCRIPTION|ARGUMENTS|OPTIONS', |
| 34 | -verbose => 99, |
| 35 | -output => '-' |
| 36 | ); |
| 37 | } |
| 38 | ); |
| 39 | |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 40 | unless ($orig_folder || $scr_folder || $rule_file) { |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 41 | pod2usage(%ERROR_HASH); |
| 42 | }; |
| 43 | |
| 44 | my $string_gen = String::Random->new; |
| 45 | |
| 46 | # Remember all generated pairs orig -> random |
| 47 | my %replacements = (); |
| 48 | my $offset = 0; |
| 49 | my @offsets = (); |
| 50 | |
| 51 | # Turn a word into a random word with similar characteristics |
| 52 | sub get_rnd_word { |
| 53 | my $o_word = shift; |
| 54 | return $o_word unless $o_word =~ /[a-z]/i; |
| 55 | |
| 56 | # Return the old replacement |
| 57 | if ($replacements{$o_word}) { |
| 58 | return $replacements{$o_word}; |
| 59 | }; |
| 60 | |
| 61 | my $word = $o_word; |
| 62 | |
| 63 | # Turn the word into a pattern for String::Random |
| 64 | # c: Any Latin lowercase character [a-z] |
| 65 | # C: Any Latin uppercase character [A-Z] |
| 66 | # n: Any digit [0-9] |
| 67 | # !: A punctuation character |
| 68 | $word =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzöäü1234567890~`!@$%^&*()-_+={}[]|\\:;"'.<>?\/#,/CCCCCCCCCCCCCCCCCCCCCCCCCCccccccccccccccccccccccccccccccnnnnnnnnnn!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!/; |
| 69 | $word =~ s/[^Ccn!]/n/g; |
| 70 | $replacements{$o_word} = $string_gen->randpattern($word); |
| 71 | } |
| 72 | |
| 73 | # 1. Load data.xml |
| 74 | # replace all surface forms of /[a-z]/ |
| 75 | # with character strings of the same length, randomly created. |
| 76 | # Create an array, accessible by offsets. |
| 77 | my $data_file = $orig_folder . '/data.xml'; |
| 78 | # Process the data file and replace all surface words with random words |
| 79 | my $data = Mojo::File->new($data_file)->slurp; |
| 80 | my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode); |
| 81 | my $new_text = b($dom->at('text')->text)->split( |
| 82 | " " |
| 83 | )->map( |
| 84 | sub { |
| 85 | my $token = get_rnd_word($_); |
| 86 | $offsets[$offset] = $token; |
| 87 | # print $offset, ':', $_, ':', $token,"\n"; |
| 88 | $offset += length($token); |
| 89 | $offset++; # space |
| 90 | |
| 91 | # exit if $offset > 300; |
| 92 | return $token; |
| 93 | } |
| 94 | )->join( |
| 95 | " " |
| 96 | ); |
| 97 | $dom->at('text')->content($new_text); |
| 98 | |
| 99 | # Create folder |
| 100 | path($scr_folder)->make_path->child('data.xml')->spurt(b($dom->to_string)->encode); |
| 101 | |
| 102 | |
| 103 | # 2. Take some css selectors and rename attributes, |
| 104 | # either according to the surface form ("=") or |
| 105 | # somehow derived ("^"), or random as well ("~"), |
| 106 | # based on the given content, that can be randomized and |
| 107 | # stuffed in a hash as well. |
| 108 | # If no CSS rules are parsed, the file will just be copied. |
| 109 | |
Akron | b61d6df | 2021-02-16 10:52:12 +0100 | [diff] [blame^] | 110 | if ($rule_file) { |
| 111 | $rule_file = Mojo::File->new($rule_file); |
| 112 | if (-e $rule_file) { |
| 113 | my $rules = decode_json $rule_file->slurp; |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 114 | |
Akron | b61d6df | 2021-02-16 10:52:12 +0100 | [diff] [blame^] | 115 | foreach my $rule (@$rules) { |
| 116 | scramble(@$rule); |
| 117 | }; |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 118 | }; |
| 119 | }; |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 120 | |
| 121 | # Scramble an annotation file |
| 122 | sub scramble { |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 123 | my ($input, $rules) = @_; |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 124 | my $data_file = path($orig_folder)->child($input); |
| 125 | |
| 126 | unless (-f $data_file) { |
| 127 | warn "$data_file does not exist"; |
| 128 | return; |
| 129 | }; |
| 130 | |
| 131 | my $data = $data_file->slurp; |
| 132 | |
| 133 | # Only transfer if rules exist |
| 134 | if ($rules) { |
| 135 | my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode); |
| 136 | |
| 137 | foreach (@$rules) { |
Akron | cbf098a | 2020-04-27 17:56:42 +0200 | [diff] [blame] | 138 | if ($input =~ /header\.xml$/) { |
| 139 | transform_header($dom, $_->[0]); |
| 140 | } else { |
| 141 | transform($dom, $_->[0], $_->[1]); |
| 142 | }; |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 143 | }; |
| 144 | |
| 145 | $data = b($dom->to_string)->encode; |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 146 | }; |
| 147 | |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 148 | my $file = Mojo::File->new($scr_folder)->child($input); |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 149 | path($file->dirname)->make_path; |
| 150 | $file->spurt($data); |
| 151 | }; |
| 152 | |
| 153 | |
| 154 | # Iterate over an annotation document and scramble |
| 155 | # all textual content based on CSS rules |
| 156 | sub transform { |
| 157 | my ($dom, $selector, $rule) = @_; |
| 158 | |
| 159 | $dom->find("spanList > span")->each( |
| 160 | sub { |
| 161 | my $from = $_->attr("from"); |
| 162 | my $to = $_->attr("to"); |
| 163 | $_->find($selector)->each( |
| 164 | sub { |
| 165 | my $word = $_->text; |
| 166 | |
| 167 | unless ($offsets[$from]) { |
| 168 | # warn '!!! Unknown word at ' . $from . '!'; |
| 169 | $_->content('UNKN'); |
| 170 | return; |
| 171 | }; |
| 172 | |
| 173 | # The derive rule means that the original |
| 174 | # word is taken and appended the string 'ui' |
| 175 | if ($rule eq '^') { |
| 176 | my $deriv = $offsets[$from]; |
| 177 | chop($deriv); |
| 178 | chop($deriv); |
| 179 | $_->content($deriv . 'ui'); |
| 180 | |
| 181 | } |
| 182 | |
| 183 | # The random rule means the word is replaced by |
| 184 | # with a random word with the same characterisms. |
| 185 | elsif ($rule eq '~') { |
| 186 | $_->content(get_rnd_word($word)); |
| 187 | } |
| 188 | |
| 189 | # Any other rule means, that the original word |
| 190 | # from the character data is taken. |
| 191 | else { |
| 192 | $_->content($offsets[$from]) |
| 193 | } |
| 194 | } |
| 195 | ) |
| 196 | } |
| 197 | ) |
| 198 | }; |
| 199 | |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 200 | |
Akron | cbf098a | 2020-04-27 17:56:42 +0200 | [diff] [blame] | 201 | # Transform header file |
| 202 | sub transform_header { |
| 203 | my ($dom, $selector) = @_; |
| 204 | |
| 205 | $dom->find($selector)->each( |
| 206 | sub { |
| 207 | my $word = $_->text; |
| 208 | |
| 209 | # The random rule means the word is replaced by |
| 210 | # with a random word with the same characterisms. |
| 211 | $_->content(get_rnd_word($word)); |
| 212 | } |
| 213 | ) |
| 214 | }; |
| 215 | |
| 216 | |
| 217 | |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 218 | __END__ |
| 219 | |
| 220 | =pod |
| 221 | |
| 222 | =encoding utf8 |
| 223 | |
| 224 | =head1 NAME |
| 225 | |
| 226 | scramble_korapxml.pl - Merge KorAP-XML data and create Krill documents |
| 227 | |
| 228 | |
| 229 | =head1 SYNOPSIS |
| 230 | |
| 231 | scramble_korapxml.pl -i <input-directory> -o <output-directory> |
| 232 | |
| 233 | |
| 234 | =head1 DESCRIPTION |
| 235 | |
| 236 | This helper tool iterates over a single KorAP-XML folder |
| 237 | and randomizes all word strings occurring following |
| 238 | several rules. This is useful to create example files |
| 239 | based on corpora that can't be published. |
| 240 | |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 241 | |
| 242 | =head1 OPTIONS |
| 243 | |
| 244 | =over 2 |
| 245 | |
| 246 | =item B<--input|-i> <directory> |
| 247 | |
| 248 | The unscrambled KorAP-XML directory. |
| 249 | |
| 250 | |
| 251 | =item B<--output|-o> <directory> |
| 252 | |
| 253 | The output directory |
| 254 | |
| 255 | |
| 256 | =item B<--rules|-r> <file> |
| 257 | |
| 258 | The rule file for transformation as a json file. |
| 259 | Example: |
| 260 | |
| 261 | [ |
| 262 | [ |
| 263 | "dgd/annot.xml", |
| 264 | [ |
| 265 | ["f[name=trans]", "="], |
| 266 | ["f[name=lemma]", "^"], |
| 267 | ["f[name=pos]", "~"] |
| 268 | ] |
| 269 | ], |
| 270 | ["struct/structure.xml"] |
| 271 | ] |
| 272 | |
| 273 | All elements of the json list are copied from the input directory to |
| 274 | the output directory. |
| 275 | The C<data.xml> file will be automatically coppied and scrambled. |
| 276 | If the file name is followed by a rule set, these |
| 277 | CSS selector rules followed by a transformation type marker |
| 278 | are used to transform elements of the file. |
| 279 | |
Akron | cbf098a | 2020-04-27 17:56:42 +0200 | [diff] [blame] | 280 | All CSS selectors for annotation files |
| 281 | are nested in C<spanList > span>. |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 282 | |
| 283 | The following markers are supported: |
| 284 | |
| 285 | =over 4 |
| 286 | |
| 287 | =item B<=> |
| 288 | |
| 289 | Take the scrambled surface form from the C<data.xml>. |
| 290 | |
| 291 | =item B<^> |
| 292 | |
| 293 | Take the scrambled surface form from the C<data.xml> and |
| 294 | modify the term by appending the string C<ui>. |
| 295 | |
| 296 | =item B<~> |
| 297 | |
| 298 | Create a randomized string, keeping the characteristicts of |
| 299 | the original element content. |
Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame] | 300 | Two identical words in a single run will always be transfered |
| 301 | to the same target word. |
| 302 | |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 303 | =back |
| 304 | |
Akron | cbf098a | 2020-04-27 17:56:42 +0200 | [diff] [blame] | 305 | For header files, the rules are not nested and only the |
| 306 | randomized marker C<~> is supported. |
| 307 | |
Akron | 59a0e4b | 2020-04-27 17:43:29 +0200 | [diff] [blame] | 308 | =back |