Akron | 53bc81d | 2020-04-27 16:24:35 +0200 | [diff] [blame^] | 1 | #!/usr/bin/env perl |
| 2 | use Mojo::Base -strict; |
| 3 | use Mojo::DOM; |
| 4 | use Mojo::File qw'path'; |
| 5 | use Mojo::ByteStream 'b'; |
| 6 | use String::Random; |
| 7 | use Pod::Usage; |
| 8 | use Getopt::Long qw/GetOptions :config no_auto_abbrev/; |
| 9 | |
| 10 | ############################################################# |
| 11 | # This helper tool iterates over a single KorAP-XML files # |
| 12 | # and randomizes all word strings occurring following # |
| 13 | # several rules. This is useful to create example files # |
| 14 | # based on corpora that can't be published. # |
| 15 | # (c) IDS Mannheim # |
| 16 | ############################################################# |
| 17 | |
| 18 | my %ERROR_HASH = ( |
| 19 | -sections => 'NAME|SYNOPSIS', |
| 20 | -verbose => 99, |
| 21 | -output => '-', |
| 22 | -exit => 1 |
| 23 | ); |
| 24 | |
| 25 | my ($orig_folder, $scr_folder); |
| 26 | GetOptions( |
| 27 | 'input|i=s' => \$orig_folder, |
| 28 | 'output|o=s' => \$scr_folder, |
| 29 | 'help|h' => sub { |
| 30 | pod2usage( |
| 31 | -sections => 'NAME|SYNOPSIS|DESCRIPTION|ARGUMENTS|OPTIONS', |
| 32 | -verbose => 99, |
| 33 | -output => '-' |
| 34 | ); |
| 35 | } |
| 36 | ); |
| 37 | |
| 38 | unless ($orig_folder || $scr_folder) { |
| 39 | pod2usage(%ERROR_HASH); |
| 40 | }; |
| 41 | |
| 42 | my $string_gen = String::Random->new; |
| 43 | |
| 44 | # Remember all generated pairs orig -> random |
| 45 | my %replacements = (); |
| 46 | my $offset = 0; |
| 47 | my @offsets = (); |
| 48 | |
| 49 | # Turn a word into a random word with similar characteristics |
| 50 | sub get_rnd_word { |
| 51 | my $o_word = shift; |
| 52 | return $o_word unless $o_word =~ /[a-z]/i; |
| 53 | |
| 54 | # Return the old replacement |
| 55 | if ($replacements{$o_word}) { |
| 56 | return $replacements{$o_word}; |
| 57 | }; |
| 58 | |
| 59 | my $word = $o_word; |
| 60 | |
| 61 | # Turn the word into a pattern for String::Random |
| 62 | # c: Any Latin lowercase character [a-z] |
| 63 | # C: Any Latin uppercase character [A-Z] |
| 64 | # n: Any digit [0-9] |
| 65 | # !: A punctuation character |
| 66 | $word =~ tr/ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzöäü1234567890~`!@$%^&*()-_+={}[]|\\:;"'.<>?\/#,/CCCCCCCCCCCCCCCCCCCCCCCCCCccccccccccccccccccccccccccccccnnnnnnnnnn!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!/; |
| 67 | $word =~ s/[^Ccn!]/n/g; |
| 68 | $replacements{$o_word} = $string_gen->randpattern($word); |
| 69 | } |
| 70 | |
| 71 | # 1. Load data.xml |
| 72 | # replace all surface forms of /[a-z]/ |
| 73 | # with character strings of the same length, randomly created. |
| 74 | # Create an array, accessible by offsets. |
| 75 | my $data_file = $orig_folder . '/data.xml'; |
| 76 | # Process the data file and replace all surface words with random words |
| 77 | my $data = Mojo::File->new($data_file)->slurp; |
| 78 | my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode); |
| 79 | my $new_text = b($dom->at('text')->text)->split( |
| 80 | " " |
| 81 | )->map( |
| 82 | sub { |
| 83 | my $token = get_rnd_word($_); |
| 84 | $offsets[$offset] = $token; |
| 85 | # print $offset, ':', $_, ':', $token,"\n"; |
| 86 | $offset += length($token); |
| 87 | $offset++; # space |
| 88 | |
| 89 | # exit if $offset > 300; |
| 90 | return $token; |
| 91 | } |
| 92 | )->join( |
| 93 | " " |
| 94 | ); |
| 95 | $dom->at('text')->content($new_text); |
| 96 | |
| 97 | # Create folder |
| 98 | path($scr_folder)->make_path->child('data.xml')->spurt(b($dom->to_string)->encode); |
| 99 | |
| 100 | |
| 101 | # 2. Take some css selectors and rename attributes, |
| 102 | # either according to the surface form ("=") or |
| 103 | # somehow derived ("^"), or random as well ("~"), |
| 104 | # based on the given content, that can be randomized and |
| 105 | # stuffed in a hash as well. |
| 106 | # If no CSS rules are parsed, the file will just be copied. |
| 107 | |
| 108 | scramble('dgd/annot.xml' => [ |
| 109 | ["f[name=trans]", "="], |
| 110 | ["f[name=lemma]", "^"], |
| 111 | ["f[name=pos]", "~"] |
| 112 | ] => 'dgd/annot.xml'); |
| 113 | |
| 114 | scramble('struct/structure.xml'); |
| 115 | scramble('header.xml'); |
| 116 | |
| 117 | # Scramble an annotation file |
| 118 | sub scramble { |
| 119 | my ($input, $rules, $output) = @_; |
| 120 | my $data_file = path($orig_folder)->child($input); |
| 121 | |
| 122 | unless (-f $data_file) { |
| 123 | warn "$data_file does not exist"; |
| 124 | return; |
| 125 | }; |
| 126 | |
| 127 | my $data = $data_file->slurp; |
| 128 | |
| 129 | # Only transfer if rules exist |
| 130 | if ($rules) { |
| 131 | my $dom = Mojo::DOM->new->xml(1)->parse(b($data)->decode); |
| 132 | |
| 133 | foreach (@$rules) { |
| 134 | transform($dom, $_->[0], $_->[1]); |
| 135 | }; |
| 136 | |
| 137 | $data = b($dom->to_string)->encode; |
| 138 | } |
| 139 | |
| 140 | else { |
| 141 | |
| 142 | # Just copy the data |
| 143 | $output = $input; |
| 144 | }; |
| 145 | |
| 146 | my $file = Mojo::File->new($scr_folder)->child($output); |
| 147 | path($file->dirname)->make_path; |
| 148 | $file->spurt($data); |
| 149 | }; |
| 150 | |
| 151 | |
| 152 | # Iterate over an annotation document and scramble |
| 153 | # all textual content based on CSS rules |
| 154 | sub transform { |
| 155 | my ($dom, $selector, $rule) = @_; |
| 156 | |
| 157 | $dom->find("spanList > span")->each( |
| 158 | sub { |
| 159 | my $from = $_->attr("from"); |
| 160 | my $to = $_->attr("to"); |
| 161 | $_->find($selector)->each( |
| 162 | sub { |
| 163 | my $word = $_->text; |
| 164 | |
| 165 | unless ($offsets[$from]) { |
| 166 | # warn '!!! Unknown word at ' . $from . '!'; |
| 167 | $_->content('UNKN'); |
| 168 | return; |
| 169 | }; |
| 170 | |
| 171 | # The derive rule means that the original |
| 172 | # word is taken and appended the string 'ui' |
| 173 | if ($rule eq '^') { |
| 174 | my $deriv = $offsets[$from]; |
| 175 | chop($deriv); |
| 176 | chop($deriv); |
| 177 | $_->content($deriv . 'ui'); |
| 178 | |
| 179 | } |
| 180 | |
| 181 | # The random rule means the word is replaced by |
| 182 | # with a random word with the same characterisms. |
| 183 | elsif ($rule eq '~') { |
| 184 | $_->content(get_rnd_word($word)); |
| 185 | } |
| 186 | |
| 187 | # Any other rule means, that the original word |
| 188 | # from the character data is taken. |
| 189 | else { |
| 190 | $_->content($offsets[$from]) |
| 191 | } |
| 192 | } |
| 193 | ) |
| 194 | } |
| 195 | ) |
| 196 | }; |
| 197 | |
| 198 | __END__ |
| 199 | |
| 200 | # Config data: |
| 201 | { |
| 202 | '/dgd/annot.xml' => [ |
| 203 | ["f[name=norm]", "="], |
| 204 | ["f[name=lemma]", "^"], |
| 205 | ["f[name=pos]", "~"] |
| 206 | ], |
| 207 | '/dgd/morpho.xml' => [ |
| 208 | ["f[name=norm]", "="], |
| 209 | ["f[name=lemma]", "^"], |
| 210 | ["f[name=pos]", "~"] |
| 211 | ], |
| 212 | '/dgd/nospeech.xml' => [] |
| 213 | } |
| 214 | |
| 215 | |
| 216 | __END__ |
| 217 | |
| 218 | =pod |
| 219 | |
| 220 | =encoding utf8 |
| 221 | |
| 222 | =head1 NAME |
| 223 | |
| 224 | scramble_korapxml.pl - Merge KorAP-XML data and create Krill documents |
| 225 | |
| 226 | |
| 227 | =head1 SYNOPSIS |
| 228 | |
| 229 | scramble_korapxml.pl -i <input-directory> -o <output-directory> |
| 230 | |
| 231 | |
| 232 | =head1 DESCRIPTION |
| 233 | |
| 234 | This helper tool iterates over a single KorAP-XML folder |
| 235 | and randomizes all word strings occurring following |
| 236 | several rules. This is useful to create example files |
| 237 | based on corpora that can't be published. |
| 238 | |
| 239 | Two identical words in a single run will always be transfered |
| 240 | to the same target word. |
| 241 | |
| 242 | The C<data.xml> file will be scrambled automatically. |