Merge "Add tool to rewrite licenses for the sample corpus"
diff --git a/.gitignore b/.gitignore
index f2eb48a..f46ff07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,8 @@
 benchmark.csv
 docs
 todo.org
-tools
+tools/*
+!tools/rewrite_goe_licenses.pl
 build
 fixtures.txt
 node_modules
diff --git a/tools/rewrite_goe_licenses.pl b/tools/rewrite_goe_licenses.pl
new file mode 100644
index 0000000..6061547
--- /dev/null
+++ b/tools/rewrite_goe_licenses.pl
@@ -0,0 +1,126 @@
+#!/usr/bin/env perl
+use Mojo::Base -strict;
+use Mojo::File;
+use Mojo::Util qw'gzip gunzip';
+use Mojo::JSON qw'decode_json encode_json';
+
+#############################################################
+# This helper tool iterates over a list of Krill-Json files #
+# that are used as a sample corpus for Kustvakt (the works  #
+# of Goethe) and adopts license strings from a list of      #
+# strings to have variouses licenses to check for.          #
+# (c) IDS Mannheim                                          #
+#############################################################
+
+# Create a license hash with all licenses
+my %license = ();
+foreach (<DATA>) {
+  my ($file, $license) = split /\s+/, $_;
+  chomp $license;
+  $license{$file} = $license;
+};
+
+
+# Compare the license
+sub _cmp_license {
+  my ($fn, $expected, $is) = @_;
+
+  # Compare the availability fields
+  if ($expected ne $is) {
+    print 'Mismatch: ', $fn, ': ', $expected, ' vs ', $is, "\n";
+    return $expected;
+  } else {
+    print 'Match:    ', $fn, ': ', $expected, "\n";
+    return;
+  };
+};
+
+
+# Iterate over all krill json files in the directory
+Mojo::File->new('.')->list->grep(qr!\.json(?:\.gz)?!)->each(
+  sub {
+
+    my $file = $_;
+
+    # Get the base name of the file
+    my $fn = $file->basename;
+    my $gzipped = 0;
+    my $content = $file->slurp;
+    if ($fn =~ s/\.gz$//) {
+      $gzipped = 1;
+      $content = gunzip $content;
+    };
+
+    # Get the json content
+    my $json = decode_json $content;
+
+    my $modified = 0;
+
+    # KoralQuery >= 0.3
+    if ($json->{fields}) {
+
+      # Iterate over all fields
+      foreach ($json->{fields}->@*) {
+
+        # Check for license fields
+        if ($_->{key} eq 'availability') {
+          my $cmp = _cmp_license($fn, $license{$fn}, $_->{value});
+
+          # The licenses match - do nothing
+          last unless $cmp;
+
+          # Rewrite license
+          $_->{value} = $license{$fn};
+          $modified = 1;
+          last;
+        }
+      };
+    }
+
+    # KoralQuery < 0.3
+    else {
+      my $cmp = _cmp_license($fn, $license{$fn}, $json->{availability});
+
+      # The licenses match - do nothing
+      last unless $cmp;
+
+      # Rewrite license
+      $json->{availability} = $license{$fn};
+      $modified = 1;
+    };
+
+    # Store the modified file
+    if ($modified) {
+      print 'Rewrite:  ', $_->basename, "\n";
+      if ($gzipped) {
+        $_->spurt(gzip encode_json $json);
+      }
+
+      else {
+        $_->spurt(encode_json $json);
+      };
+    };
+
+    delete $license{$fn};
+  }
+);
+
+
+# Warn on missing files
+foreach (keys %license) {
+  print 'Missing:  ', $_, ': ', $license{$_}, "\n";
+};
+
+
+__DATA__
+GOE-AGA-00000.json QAO-NC
+GOE-AGA-01784.json CC-BY-SA
+GOE-AGA-02232.json ACA-NC
+GOE-AGA-02616.json ACA-NC-LC
+GOE-AGA-03828.json QAO-NC-LOC:ids
+GOE-AGD-00000.json QAO-NC-LOC:ids-NU:1
+GOE-AGD-06345.json QAO-NC
+GOE-AGF-00000.json CC-BY-SA
+GOE-AGF-02286.json QAO-NC
+GOE-AGI-00000.json ACA-NC
+GOE-AGI-04846.json QAO-NC-LOC:ids