blob: 6061547333d605c11235cfc7cbae8b4b9724b4a7 [file] [log] [blame]
Akron3eadf2e2020-03-04 11:44:50 +01001#!/usr/bin/env perl
2use Mojo::Base -strict;
3use Mojo::File;
4use Mojo::Util qw'gzip gunzip';
5use Mojo::JSON qw'decode_json encode_json';
6
#############################################################
# This helper tool iterates over a list of Krill-Json files #
# that are used as a sample corpus for Kustvakt (the works  #
# of Goethe) and adopts license strings from a list of      #
# strings to have various licenses to check for.            #
# (c) IDS Mannheim                                          #
#############################################################
14
# Create a license hash with all licenses.
# Every line of the DATA section has the form
#   <file name> <license string>
# and is stored as $license{<file name>} = <license string>.
my %license = ();
while (my $line = <DATA>) {

  # split ' ' ignores leading whitespace and drops the trailing
  # newline, so no additional chomp is required
  my ($file, $license) = split ' ', $line;

  # Skip blank or incomplete lines instead of storing undefined values
  next unless defined $file && defined $license;

  $license{$file} = $license;
};
22
23
# Compare the expected license of a file with the license found in it.
#
# Parameters:
#   $fn       - file name, only used in the report line
#   $expected - license string the file should carry
#   $is       - license string currently found in the file
#
# Prints a 'Match: ...' or 'Mismatch: ...' line to STDOUT.
# Returns the expected license on a mismatch and nothing
# (undef in scalar context, the empty list in list context)
# on a match.
sub _cmp_license {
  my ($fn, $expected, $is) = @_;

  # A missing license on either side is treated as an empty string,
  # so neither the comparison nor the report line triggers
  # 'uninitialized value' warnings
  $expected //= '';
  $is       //= '';

  # Compare the availability fields
  if ($expected ne $is) {
    print 'Mismatch: ', $fn, ': ', $expected, ' vs ', $is, "\n";
    return $expected;
  };

  print 'Match: ', $fn, ': ', $expected, "\n";
  return;
};
37
38
# Iterate over all krill json files in the directory.
# The pattern is anchored at the end, so only names ending in
# '.json' or '.json.gz' are processed.
Mojo::File->new('.')->list->grep(qr!\.json(?:\.gz)?\z!)->each(
  sub {

    # Keep the file in a named variable, as $_ is re-aliased
    # by the loop over the fields below
    my $file = $_;

    # Get the base name of the file - without a trailing '.gz',
    # as this is the form used as key in %license
    my $fn = $file->basename;
    my $gzipped = ($fn =~ s/\.gz$//) ? 1 : 0;

    # Files without an entry in the license list are left untouched,
    # otherwise their license would be overwritten by an undefined value
    my $expected = $license{$fn};
    unless (defined $expected) {
      print 'Unlisted: ', $fn, "\n";
      return;
    };

    # Get the json content
    my $content = $file->slurp;
    $content = gunzip $content if $gzipped;
    my $json = decode_json $content;

    my $modified = 0;

    # KoralQuery >= 0.3
    if ($json->{fields}) {

      # Iterate over all fields
      foreach my $field (@{ $json->{fields} }) {

        # Check for license fields
        next unless ($field->{key} // '') eq 'availability';

        # Rewrite the license only if it does not match
        if (_cmp_license($fn, $expected, $field->{value})) {
          $field->{value} = $expected;
          $modified = 1;
        };

        # Only the first availability field is taken into account
        last;
      };
    }

    # KoralQuery < 0.3
    else {

      # No loop control is used here: this branch is not inside a loop
      # of the callback, so 'last' would leave the callback and end
      # whatever loop is running it - the remaining files would be
      # skipped and the current one would be reported as missing
      if (_cmp_license($fn, $expected, $json->{availability})) {

        # Rewrite license
        $json->{availability} = $expected;
        $modified = 1;
      };
    };

    # Store the modified file in the same (gzipped or plain) form
    if ($modified) {
      print 'Rewrite: ', $file->basename, "\n";
      my $encoded = encode_json $json;
      $file->spurt($gzipped ? gzip($encoded) : $encoded);
    };

    # Mark the file as seen, so it is not reported as missing
    delete $license{$fn};
  }
);
107
108
# Warn on missing files: every entry that is still part of
# the license hash at this point is listed with its license
for my $name (keys %license) {
  print 'Missing: ' . $name . ': ' . $license{$name} . "\n";
};
113
114
115__DATA__
116GOE-AGA-00000.json QAO-NC
117GOE-AGA-01784.json CC-BY-SA
118GOE-AGA-02232.json ACA-NC
119GOE-AGA-02616.json ACA-NC-LC
120GOE-AGA-03828.json QAO-NC-LOC:ids
121GOE-AGD-00000.json QAO-NC-LOC:ids-NU:1
122GOE-AGD-06345.json QAO-NC
123GOE-AGF-00000.json CC-BY-SA
124GOE-AGF-02286.json QAO-NC
125GOE-AGI-00000.json ACA-NC
126GOE-AGI-04846.json QAO-NC-LOC:ids