Add tool to rewrite licenses for the sample corpus
Change-Id: I1e4794b525bd3a112484f3b18abadbeb2bebdf6d
diff --git a/.gitignore b/.gitignore
index f2eb48a..f46ff07 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,8 @@
benchmark.csv
docs
todo.org
-tools
+tools/*
+!tools/rewrite_goe_licenses.pl
build
fixtures.txt
node_modules
diff --git a/tools/rewrite_goe_licenses.pl b/tools/rewrite_goe_licenses.pl
new file mode 100644
index 0000000..6061547
--- /dev/null
+++ b/tools/rewrite_goe_licenses.pl
@@ -0,0 +1,126 @@
+#!/usr/bin/env perl
+use Mojo::Base -strict;
+use Mojo::File;
+use Mojo::Util qw'gzip gunzip';
+use Mojo::JSON qw'decode_json encode_json';
+
+#############################################################
+# This helper tool iterates over a list of Krill-Json files #
+# that are used as a sample corpus for Kustvakt (the works #
+# of Goethe) and adopts license strings from a list of #
+# strings to have variouses licenses to check for. #
+# (c) IDS Mannheim #
+#############################################################
+
+# Create a license hash with all licenses
+my %license = ();
+foreach (<DATA>) {
+ my ($file, $license) = split /\s+/, $_;
+ chomp $license;
+ $license{$file} = $license;
+};
+
+
+# Compare the license
+sub _cmp_license {
+ my ($fn, $expected, $is) = @_;
+
+ # Compare the availability fields
+ if ($expected ne $is) {
+ print 'Mismatch: ', $fn, ': ', $expected, ' vs ', $is, "\n";
+ return $expected;
+ } else {
+ print 'Match: ', $fn, ': ', $expected, "\n";
+ return;
+ };
+};
+
+
+# Iterate over all krill json files in the directory
+Mojo::File->new('.')->list->grep(qr!\.json(?:\.gz)?!)->each(
+ sub {
+
+ my $file = $_;
+
+ # Get the base name of the file
+ my $fn = $file->basename;
+ my $gzipped = 0;
+ my $content = $file->slurp;
+ if ($fn =~ s/\.gz$//) {
+ $gzipped = 1;
+ $content = gunzip $content;
+ };
+
+ # Get the json content
+ my $json = decode_json $content;
+
+ my $modified = 0;
+
+ # KoralQuery >= 0.3
+ if ($json->{fields}) {
+
+ # Iterate over all fields
+ foreach ($json->{fields}->@*) {
+
+ # Check for license fields
+ if ($_->{key} eq 'availability') {
+ my $cmp = _cmp_license($fn, $license{$fn}, $_->{value});
+
+ # The licenses match - do nothing
+ last unless $cmp;
+
+ # Rewrite license
+ $_->{value} = $license{$fn};
+ $modified = 1;
+ last;
+ }
+ };
+ }
+
+ # KoralQuery < 0.3
+ else {
+ my $cmp = _cmp_license($fn, $license{$fn}, $json->{availability});
+
+ # The licenses match - do nothing
+ last unless $cmp;
+
+ # Rewrite license
+ $json->{availability} = $license{$fn};
+ $modified = 1;
+ };
+
+ # Store the modified file
+ if ($modified) {
+ print 'Rewrite: ', $_->basename, "\n";
+ if ($gzipped) {
+ $_->spurt(gzip encode_json $json);
+ }
+
+ else {
+ $_->spurt(encode_json $json);
+ };
+ };
+
+ delete $license{$fn};
+ }
+);
+
+
+# Warn on missing files
+foreach (keys %license) {
+ print 'Missing: ', $_, ': ', $license{$_}, "\n";
+};
+
+
+__DATA__
+GOE-AGA-00000.json QAO-NC
+GOE-AGA-01784.json CC-BY-SA
+GOE-AGA-02232.json ACA-NC
+GOE-AGA-02616.json ACA-NC-LC
+GOE-AGA-03828.json QAO-NC-LOC:ids
+GOE-AGD-00000.json QAO-NC-LOC:ids-NU:1
+GOE-AGD-06345.json QAO-NC
+GOE-AGF-00000.json CC-BY-SA
+GOE-AGF-02286.json QAO-NC
+GOE-AGI-00000.json ACA-NC
+GOE-AGI-04846.json QAO-NC-LOC:ids