Support rule files for scramble_korapxml tool

Change-Id: Ifdff8e6ccdb3c858321da338e14f75085bdee795
diff --git a/tools/scramble_korapxml.pl b/tools/scramble_korapxml.pl
index 48d6396..e92095a 100644
--- a/tools/scramble_korapxml.pl
+++ b/tools/scramble_korapxml.pl
@@ -2,6 +2,7 @@
 use Mojo::Base -strict;
 use Mojo::DOM;
 use Mojo::File qw'path';
+use Mojo::JSON qw'decode_json';
 use Mojo::ByteStream 'b';
 use String::Random;
 use Pod::Usage;
@@ -26,6 +27,7 @@
 GetOptions(
   'input|i=s' => \$orig_folder,
   'output|o=s' => \$scr_folder,
+  'rules|r=s' => \(my $rule_file),
   'help|h'      => sub {
     pod2usage(
       -sections => 'NAME|SYNOPSIS|DESCRIPTION|ARGUMENTS|OPTIONS',
@@ -35,7 +37,7 @@
   }
 );
 
-unless ($orig_folder || $scr_folder) {
+unless ($orig_folder && $scr_folder && $rule_file) { # all three options are required below
   pod2usage(%ERROR_HASH);
 };
 
@@ -105,18 +107,19 @@
 # stuffed in a hash as well.
 # If no CSS rules are parsed, the file will just be copied.
 
-scramble('dgd/annot.xml' => [
-  ["f[name=trans]", "="],
-  ["f[name=lemma]", "^"],
-  ["f[name=pos]", "~"]
-] => 'dgd/annot.xml');
+$rule_file = Mojo::File->new($rule_file);
 
-scramble('struct/structure.xml');
-scramble('header.xml');
+if (-e $rule_file) {
+  my $rules = decode_json $rule_file->slurp;
+
+  foreach my $rule (@$rules) {
+    scramble(@$rule);
+  };
+};
 
 # Scramble an annotation file
 sub scramble {
-  my ($input, $rules, $output) = @_;
+  my ($input, $rules) = @_;
   my $data_file = path($orig_folder)->child($input);
 
   unless (-f $data_file) {
@@ -135,15 +138,9 @@
     };
 
     $data = b($dom->to_string)->encode;
-  }
-
-  else {
-
-    # Just copy the data
-    $output = $input;
   };
 
-  my $file = Mojo::File->new($scr_folder)->child($output);
+  my $file = Mojo::File->new($scr_folder)->child($input);
   path($file->dirname)->make_path;
   $file->spurt($data);
 };
@@ -195,23 +192,6 @@
   )
 };
 
-__END__
-
-# Config data:
-{
-  '/dgd/annot.xml' => [
-    ["f[name=norm]", "="],
-    ["f[name=lemma]", "^"],
-    ["f[name=pos]", "~"]
-  ],
-  '/dgd/morpho.xml' => [
-    ["f[name=norm]", "="],
-    ["f[name=lemma]", "^"],
-    ["f[name=pos]", "~"]
-  ],
-  '/dgd/nospeech.xml' => []
-}
-
 
 __END__
 
@@ -236,7 +216,67 @@
 several rules. This is useful to create example files
 based on corpora that can't be published.
 
+
+=head1 OPTIONS
+
+=over 2
+
+=item B<--input|-i> <directory>
+
+The unscrambled KorAP-XML directory.
+
+
+=item B<--output|-o> <directory>
+
+The output directory.
+
+
+=item B<--rules|-r> <file>
+
+The rule file for transformation as a JSON file.
+Example:
+
+  [
+    [
+      "dgd/annot.xml",
+      [
+        ["f[name=trans]", "="],
+        ["f[name=lemma]", "^"],
+        ["f[name=pos]", "~"]
+      ]
+    ],
+    ["struct/structure.xml"]
+  ]
+
+All elements of the JSON list are copied from the input directory to
+the output directory.
+The C<data.xml> file will be automatically copied and scrambled.
+If the file name is followed by a rule set, these
+CSS selector rules followed by a transformation type marker
+are used to transform elements of the file.
+
+All CSS selectors are nested in C<< spanList > span >>.
+
+The following markers are supported:
+
+=over 4
+
+=item B<=>
+
+Take the scrambled surface form from the C<data.xml>.
+
+=item B<^>
+
+Take the scrambled surface form from the C<data.xml> and
+modify the term by appending the string C<ui>.
+
+=item B<~>
+
+Create a randomized string, keeping the characteristics of
+the original element content.
 Two identical words in a single run will always be transfered
 to the same target word.
 
-The C<data.xml> file will be scrambled automatically.
+=back
+
+=back