blob: 6061547333d605c11235cfc7cbae8b4b9724b4a7 [file] [log] [blame]
#!/usr/bin/env perl
use Mojo::Base -strict;
use Mojo::File;
use Mojo::Util qw'gzip gunzip';
use Mojo::JSON qw'decode_json encode_json';
# This helper tool iterates over a list of Krill-Json files #
# that are used as a sample corpus for Kustvakt (the works #
# of Goethe) and adopts license strings from a list of #
# strings to have various licenses to check for. #
# (c) IDS Mannheim #
# Create a license hash with all licenses.
# Every row of the DATA section has the form "<file name> <license>";
# the hash maps the file name to the license the file should carry.
my %license;
while (my $line = <DATA>) {

  # split ' ' ignores leading whitespace and the trailing newline,
  # so no separate chomp is needed
  my ($file, $license) = split ' ', $line;

  # Skip blank or incomplete rows instead of storing undefined entries
  next unless defined $file && defined $license;

  $license{$file} = $license;
}
# Compare the license a file should carry with the license it has.
#
# Parameters:
#   $fn       - name of the file, only used in the printed report line
#   $expected - license the file should carry
#   $is       - license currently found in the file
#
# Prints a 'Match', 'Mismatch' or 'Unlisted' line to the selected output
# handle. Returns $expected if the license has to be rewritten and
# nothing (false) otherwise.
sub _cmp_license {
  my ($fn, $expected, $is) = @_;

  # Without an expected license there is nothing to adopt
  unless (defined $expected) {
    print 'Unlisted: ', $fn, "\n";
    return;
  }

  # A missing availability value is treated as an empty one, so it is
  # reported as a mismatch without an "uninitialized" warning
  $is //= '';

  # Compare the availability fields
  if ($expected ne $is) {
    print 'Mismatch: ', $fn, ': ', $expected, ' vs ', $is, "\n";
    return $expected;
  }

  # Return explicitly - otherwise the true result of print would be
  # handed back and a match would look like a mismatch to the caller
  print 'Match: ', $fn, ': ', $expected, "\n";
  return;
}
# Iterate over all krill json files in the directory
# NOTE(review): the call that hands each file to this callback is not
# visible here - the callback only relies on the file object being in $_.
sub {

  # Keep the file object in a named variable, as $_ is easy to clobber
  my $file = $_;

  # Get the base name of the file; a '.gz' suffix is removed so the
  # name can be looked up in %license
  my $fn      = $file->basename;
  my $content = $file->slurp;
  my $gzipped = ($fn =~ s/\.gz$//) ? 1 : 0;
  $content = gunzip $content if $gzipped;

  # Get the json content
  my $json     = decode_json $content;
  my $modified = 0;

  # KoralQuery >= 0.3
  if ($json->{fields}) {

    # Iterate over all fields
    foreach my $field ($json->{fields}->@*) {

      # Check for license fields
      next unless ($field->{key} // '') eq 'availability';

      # The licenses match - do nothing
      last unless _cmp_license($fn, $license{$fn}, $field->{value});

      # Rewrite license
      $field->{value} = $license{$fn};
      $modified = 1;
    }
  }

  # KoralQuery < 0.3
  # This branch is not inside a loop of its own, so the result is
  # tested with a plain condition: 'last' has no loop to leave here and
  # would abort the callback before the bookkeeping below
  elsif (_cmp_license($fn, $license{$fn}, $json->{availability})) {

    # Rewrite license
    $json->{availability} = $license{$fn};
    $modified = 1;
  }

  # Store the modified file, compressed again if it was read compressed
  if ($modified) {
    print 'Rewrite: ', $file->basename, "\n";
    my $encoded = encode_json $json;
    $file->spurt($gzipped ? gzip($encoded) : $encoded);
  }

  # Mark the file as seen, so it is not reported as missing
  delete $license{$fn};
}
# Warn on missing files
# Entries are deleted from %license once their file was processed, so
# every remaining key names a listed file that was not seen. The keys
# are sorted to make the report order reproducible.
foreach my $missing (sort keys %license) {
  print 'Missing: ', $missing, ': ', $license{$missing}, "\n";
}

# The rows below are read through the DATA handle:
# one "<file name> <license>" pair per line
__DATA__
GOE-AGA-00000.json QAO-NC
GOE-AGA-01784.json CC-BY-SA
GOE-AGA-02232.json ACA-NC
GOE-AGA-02616.json ACA-NC-LC
GOE-AGA-03828.json QAO-NC-LOC:ids
GOE-AGD-00000.json QAO-NC-LOC:ids-NU:1
GOE-AGD-06345.json QAO-NC
GOE-AGF-00000.json CC-BY-SA
GOE-AGF-02286.json QAO-NC
GOE-AGI-00000.json ACA-NC
GOE-AGI-04846.json QAO-NC-LOC:ids