Akron | 3eadf2e | 2020-03-04 11:44:50 +0100 | [diff] [blame] | 1 | #!/usr/bin/env perl |
| 2 | use Mojo::Base -strict; |
| 3 | use Mojo::File; |
| 4 | use Mojo::Util qw'gzip gunzip'; |
| 5 | use Mojo::JSON qw'decode_json encode_json'; |
| 6 | |
| 7 | ############################################################# |
| 8 | # This helper tool iterates over a list of Krill-Json files # |
| 9 | # that are used as a sample corpus for Kustvakt (the works # |
| 10 | # of Goethe) and adopts license strings from a list of # |
| 11 | # strings to have various licenses to check for. # |
| 12 | # (c) IDS Mannheim # |
| 13 | ############################################################# |
| 14 | |
# Build the lookup table of expected licenses from the "<file> <license>"
# pairs in the __DATA__ section, keyed by the file name.
my %license = ();
while (my $line = <DATA>) {

  # split ' ' skips leading whitespace and drops the trailing newline,
  # so no chomp is needed
  my ($file, $license) = split ' ', $line;

  # Skip blank or incomplete lines instead of storing undefined entries
  next unless defined $file && defined $license;

  $license{$file} = $license;
};
| 22 | |
| 23 | |
# Compare the expected license of a file with the license found in it.
#
# Parameters:
#   $fn       - file name, only used in the report line
#   $expected - license the file should carry (undefined if no license
#               is listed for the file)
#   $is       - license currently stored in the file (may be undefined)
#
# Prints one report line to the currently selected output handle.
# Returns the expected license if the file needs to be rewritten,
# otherwise nothing (false).
sub _cmp_license {
  my ($fn, $expected, $is) = @_;

  # Treat a missing value in the file as empty, so the comparison and the
  # report line do not trigger "uninitialized" warnings
  $is //= '';

  # No license is listed for this file - there is nothing to enforce
  unless (defined $expected) {
    print 'Unknown: ', $fn, ': ', $is, "\n";
    return;
  };

  # Compare the availability fields
  if ($expected ne $is) {
    print 'Mismatch: ', $fn, ': ', $expected, ' vs ', $is, "\n";
    return $expected;
  };

  print 'Match: ', $fn, ': ', $expected, "\n";
  return;
};
| 37 | |
| 38 | |
# Iterate over all krill json files (plain or gzipped) in the current
# directory. The pattern is anchored at the end, so files that merely
# contain ".json" in their name (e.g. "x.json.bak") are not picked up.
Mojo::File->new('.')->list->grep(qr!\.json(?:\.gz)?$!)->each(
  sub {

    # The collection passes the current element as the first argument
    my $file = shift;

    # Get the base name of the file; a ".gz" suffix is stripped, so the
    # name can be used as the key into %license
    my $fn = $file->basename;
    my $gzipped = 0;
    my $content = $file->slurp;
    if ($fn =~ s/\.gz$//) {
      $gzipped = 1;
      $content = gunzip $content;
    };

    # Get the json content
    my $json = decode_json $content;

    my $modified = 0;

    # KoralQuery >= 0.3
    if ($json->{fields}) {

      # Iterate over all fields
      foreach my $field ($json->{fields}->@*) {

        # Only the license field is of interest
        next unless ($field->{key} // '') eq 'availability';

        # Rewrite the license unless it already matches
        if (_cmp_license($fn, $license{$fn}, $field->{value})) {
          $field->{value} = $license{$fn};
          $modified = 1;
        };

        # Only the first availability field is checked
        last;
      };
    }

    # KoralQuery < 0.3
    else {

      # Use a plain conditional here: there is no loop inside the callback
      # at this point, so "last" would leave the callback and abort the
      # iteration over the remaining files.
      if (_cmp_license($fn, $license{$fn}, $json->{availability})) {
        $json->{availability} = $license{$fn};
        $modified = 1;
      };
    };

    # Store the modified file in the same format it was read in
    if ($modified) {
      print 'Rewrite: ', $file->basename, "\n";
      if ($gzipped) {
        $file->spurt(gzip encode_json $json);
      }

      else {
        $file->spurt(encode_json $json);
      };
    };

    # Mark the file as seen; entries left over afterwards are missing
    delete $license{$fn};
  }
);
| 107 | |
| 108 | |
# Warn on missing files: report every entry still present in %license
# together with its license. The keys are sorted, so the report order is
# stable between runs.
foreach my $fn (sort keys %license) {
  print 'Missing: ', $fn, ': ', $license{$fn}, "\n";
};
| 113 | |
| 114 | |
| 115 | __DATA__ |
| 116 | GOE-AGA-00000.json QAO-NC |
| 117 | GOE-AGA-01784.json CC-BY-SA |
| 118 | GOE-AGA-02232.json ACA-NC |
| 119 | GOE-AGA-02616.json ACA-NC-LC |
| 120 | GOE-AGA-03828.json QAO-NC-LOC:ids |
| 121 | GOE-AGD-00000.json QAO-NC-LOC:ids-NU:1 |
| 122 | GOE-AGD-06345.json QAO-NC |
| 123 | GOE-AGF-00000.json CC-BY-SA |
| 124 | GOE-AGF-02286.json QAO-NC |
| 125 | GOE-AGI-00000.json ACA-NC |
| 126 | GOE-AGI-04846.json QAO-NC-LOC:ids |