Merge "Update lombok to 1.18.12"
diff --git a/.gitignore b/.gitignore
index bd4989d..752da96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
*.iml
dependency-reduced-pom.xml
admin_token
+/sandbox/
/bin/
/db.sqlite
/lite/liteDB.sqlite
diff --git a/core/Changes b/core/Changes
index bec0bd2..12117b3 100644
--- a/core/Changes
+++ b/core/Changes
@@ -2,7 +2,9 @@
24/01/2020
- Removed salt from config and updated config files.
05/02/2020
- - Added welcome page.
+ - Added welcome page.
+11/05/2020
+ - Added tool to create VC from list (diewald)
# version 0.62.3
03/12/2019
diff --git a/tools/list2vc.pl b/tools/list2vc.pl
new file mode 100755
index 0000000..35a205a
--- /dev/null
+++ b/tools/list2vc.pl
@@ -0,0 +1,452 @@
+#!/usr/bin/env perl
+package KorAP::VirtualCorpus;
+use strict;
+use warnings;
+
+
+# Get or set name of the VC
+sub name {
+ my $self = shift;
+ unless (@_) {
+ return $self->{name};
+ };
+ $self->{name} = shift;
+ return $self;
+};
+
+
+# Comment
+sub comment {
+ my $self = shift;
+ unless (@_) {
+ return $self->{comment};
+ };
+ $self->{comment} //= [];
+
+ push @{$self->{comment}}, shift;
+ return $self;
+};
+
+
+# Quote utility function
+sub quote {
+ shift;
+ my $str = shift;
+ $str =~ s/(["\\])/\\$1/g;
+ return qq{"$str"};
+};
+
+
+# Escaped quote utility function
+sub equote {
+ shift;
+ my $str = shift;
+ $str =~ s/(["\\])/\\$1/g;
+ $str =~ s/(["\\])/\\$1/g;
+ return '\\"' . $str . '\\"';
+};
+
+
+sub _commentparam_to_string {
+ my $self = shift;
+ my $comment = $self->_comment_to_string;
+ if ($comment) {
+ return qq!,"comment":"$comment"!;
+ };
+ return '';
+};
+
+
+sub _comment_to_string {
+ my $self = shift;
+ if (!$self->name && !$self->comment) {
+ return '';
+ };
+
+ my $json = '';
+ $json .= 'name:' . $self->equote($self->name) if $self->name;
+ if ($self->name && $self->comment) {
+ $json .= ','
+ };
+ $json .= join(',', @{$self->{comment}}) if $self->{comment};
+
+ return $json;
+};
+
+
+# Stringify globally
+sub to_string {
+ my $self = shift;
+ ## Create collection object
+
+ my $json = '{';
+ $json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
+ $json .= '"collection":';
+ $json .= $self->_to_fragment;
+ # Set at the end, when all comments are done
+ $json .= $self->_commentparam_to_string;
+ return $json .= '}';
+};
+
+
+package KorAP::VirtualCorpus::Group;
+use strict;
+use warnings;
+use base 'KorAP::VirtualCorpus';
+
+
+# Construct a new VC group
+sub new {
+ my $class = shift;
+ bless {
+ with => [],
+ with_fields => {},
+ without => [],
+ without_fields => {},
+ }, $class;
+};
+
+
+# Define an operand to be "or"ed
+sub with {
+ my $self = shift;
+ push @{$self->{with}}, shift;
+};
+
+
+# Define a field that should be "or"ed
+sub with_field {
+ my $self = shift;
+ my $field = shift;
+ push @{$self->{with_fields}->{$field}}, shift;
+};
+
+# Define an operand to be "and"ed
+sub without {
+ my $self = shift;
+ push @{$self->{without}}, shift;
+};
+
+
+# Define a field that should be "and"ed
+sub without_field {
+ my $self = shift;
+ my $field = shift;
+ push @{$self->{without_fields}->{$field}}, shift;
+};
+
+
+# VC contains only with fields
+sub only_with_fields {
+ my $self = shift;
+
+ if (keys %{$self->{without_fields}} || @{$self->{with}} || @{$self->{without}}) {
+ return 0;
+ };
+
+ return 1;
+};
+
+
+# Create a document vector field
+sub _doc_vec {
+ my $field = shift;
+ my $vec = shift;
+ my $json = '{';
+ $json .= '"@type":"koral:doc",';
+ $json .= '"key":"' . $field . '",';
+ $json .= '"match":"match:eq",';
+ $json .= '"value":[';
+ $json .= join ',', map { '"' . $_ . '"' } @$vec;
+ $json .= ']';
+ $json .= '},';
+ return $json;
+}
+
+
+# Stringify fragment
+sub _to_fragment {
+ my $self = shift;
+
+ my $json = '{';
+ $json .= '"@type":"koral:docGroup",';
+
+ # Make the outer group "and"
+ if (keys %{$self->{without_fields}}) {
+ $json .= '"operation":"operation:and",';
+ $json .= '"operands":[';
+
+ foreach my $field (sort keys %{$self->{without_fields}}) {
+ unless (@{$self->{without_fields}->{$field}}) {
+ next;
+ };
+ $json .= _doc_vec($field, $self->{without_fields}->{$field});
+ };
+
+ # Remove the last comma
+ chop $json;
+
+ $json .= ']';
+ }
+
+ elsif (keys %{$self->{with_fields}} || @{$self->{with}}) {
+ $json .= '"operation":"operation:or",';
+
+ $json .= '"operands":[';
+
+ # Flatten embedded "or"-VCs
+ foreach my $op (@{$self->{with}}) {
+
+ # The embedded VC has only extending fields
+ if ($op->only_with_fields) {
+
+ $self->comment('embed:[' . $op->_comment_to_string . ']');
+
+ foreach my $k (keys %{$op->{with_fields}}) {
+ foreach my $v (@{$op->{with_fields}->{$k}}) {
+ $self->with_field($k, $v);
+ };
+ };
+ }
+
+ # Embed complex VC
+ else {
+ $json .= $op->_to_fragment . ',';
+ };
+ };
+
+ foreach my $field (sort keys %{$self->{with_fields}}) {
+ unless (@{$self->{with_fields}->{$field}}) {
+ next;
+ };
+ $json .= _doc_vec($field, $self->{with_fields}->{$field});
+ };
+
+ # Remove the last comma
+ chop $json;
+
+ $json .= ']';
+ }
+
+ # No operands in the group
+ else {
+ # Remove the last comma after the comment
+ chop $json;
+ };
+
+ # Set at the end, when all comments are done
+ $json .= $self->_commentparam_to_string;
+ return $json . '}';
+};
+
+
+package main;
+use strict;
+use warnings;
+
+# 2020-05-20
+# Preliminary support for C2 def-files.
+
+our @ARGV;
+
+unless (@ARGV) {
+ print <<'HELP';
+Convert a line-separated list of corpus sigles, doc sigles or
+text sigles into a virtual corpus query.
+
+ $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
+ $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
+
+HELP
+exit 0;
+};
+
+
+# Shorten long strings for logging
+sub _shorten ($) {
+ my $line = shift;
+ if (length($line) < 20) {
+ return $line;
+ }
+ else {
+ return substr($line,0,17) . '...';
+ };
+};
+
+
+my $fh;
+if ($ARGV[0] eq '-') {
+ $fh = *STDIN;
+} elsif (!open($fh, '<' . $ARGV[0])) {
+ warn $ARGV[0] . " can't be opened";
+ exit(0);
+};
+
+# Initial VC group
+my $vc;
+
+# Create an intensional and an extensional VC
+my $vc_ext = KorAP::VirtualCorpus::Group->new;
+my $vc_int = KorAP::VirtualCorpus::Group->new;
+
+# Load ext initially
+$$vc = $vc_ext;
+
+# Collect all virtual corpora
+my %all_vcs;
+
+my $frozen = 0;
+
+# Iterate over the whole list
+while (!eof $fh) {
+ my $line = readline($fh);
+ chomp $line;
+
+
+ # Skip empty lines
+ if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
+ # empty
+ next;
+ };
+
+ my ($key, $value, $desc);
+
+ # Line-Type: <e>c</a>
+ if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
+ $key = $1;
+ $value = $2 // undef;
+ }
+
+ # Line-Type: <e>c
+ elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
+ $key = $1;
+ $value = $2;
+ }
+
+ # Get text sigles
+ elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
+ $key = 'text';
+ $value = $line;
+ }
+
+ # Get doc sigles
+ elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
+ $key = 'doc';
+ $value = $1;
+ }
+
+ # Get corpus sigles
+ elsif ($line !~ m!(?:\/|\s)!) {
+ $key = 'corpus';
+ $value = $line;
+ }
+
+ # Not known
+ else {
+ warn _shorten($line) . q! isn't a valid VC definition!;
+ next;
+ };
+
+ # Add text field
+ if ($key eq 'text') {
+
+ # Convert C2 sigle to KorAP form
+ $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
+ ${$vc}->with_field(textSigle => $value);
+ }
+
+ # Add doc field
+ elsif ($key eq 'doc') {
+ ${$vc}->with_field(docSigle => $value);
+ }
+
+ # Add corpus field
+ elsif ($key eq 'corpus') {
+ ${$vc}->with_field(corpusSigle => $value);
+ }
+
+ # Add corpus field
+ elsif ($key eq 'cn') {
+ # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
+ if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
+ ${$vc}->with_field(corpusSigle => $1);
+ };
+ }
+
+ # Mark the vc as frozen
+ # This means that an extended VC area is expected
+ elsif ($key eq 'frozen') {
+ $frozen = 1;
+ }
+
+ # Start/End intended VC area
+ elsif ($key eq 'intended') {
+ if ($value eq 'start') {
+ $$vc = $vc_int;
+ }
+ elsif ($value ne 'end') {
+ warn 'Unknown intension value ' . $value;
+ };
+ }
+
+ # Start/End extended VC area
+ elsif ($key eq 'extended') {
+ if ($value eq 'start') {
+ $$vc = $vc_ext;
+ }
+ elsif ($value ne 'end') {
+ warn 'Unknown extension value ' . $value;
+ };
+ }
+
+ # Set VC name
+ elsif ($key eq 'name') {
+ # "Name des virt. Korpus, der angezeigt wird.
+ # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
+ # <add>, <sub>"
+
+ # No global name defined yet
+ if ($$vc && !$$vc->name) {
+ $vc_ext->name($value);
+ $vc_int->name($value);
+ next;
+ };
+
+ ${$vc} = KorAP::VirtualCorpus::Group->new;
+ ${$vc}->name($value);
+ }
+
+ # End VC def
+ elsif ($key eq 'end') {
+ $all_vcs{${$vc}->name} = $$vc;
+ # $vc = undef;
+ }
+
+ # Add VC definition
+ elsif ($key eq 'add') {
+ unless (defined $all_vcs{$value}) {
+ # warn 'VC ' . $value . ' not defined';
+ # exit(1);
+ next;
+ };
+
+ $$vc->with($all_vcs{$value});
+ }
+
+ # Add reduction value as a comment
+ elsif ($key eq 'redabs') {
+ # "red. Anz. Texte
+ # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
+ $$vc->comment('redabs:' . $value);
+ }
+
+ # Unknown
+ else {
+ # warn $key . ' is an unknown field';
+ };
+};
+
+close($fh);
+
+# Stringify current (extended?) virtual corpus
+print $$vc->to_string;
diff --git a/tools/t/data/list1.txt b/tools/t/data/list1.txt
new file mode 100644
index 0000000..bc22c1c
--- /dev/null
+++ b/tools/t/data/list1.txt
@@ -0,0 +1,7 @@
+A02
+A01/B02/c04
+A03
+
+B04/X02
+B04/X03
+A01/B02/c05
diff --git a/tools/t/data/list2.def b/tools/t/data/list2.def
new file mode 100644
index 0000000..7b2cc1e
--- /dev/null
+++ b/tools/t/data/list2.def
@@ -0,0 +1,14 @@
+<name>Example</name>
+BRZ05/SEP Braunschweiger Zeitung, September 2005
+BRZ05/OKT Braunschweiger Zeitung, Oktober 2005
+BRZ05/NOV Braunschweiger Zeitung, November 2005
+BRZ05/DEZ Braunschweiger Zeitung, Dezember 2005
+BRZ06/JAN Braunschweiger Zeitung, Januar 2006
+<text>B19/AUG.01665</text>
+<text>B19/AUG.01666</text>
+BRZ07/SEP Braunschweiger Zeitung, September 2007
+BRZ07/OKT Braunschweiger Zeitung, Oktober 2007
+BRZ07/NOV Braunschweiger Zeitung, November 2007
+BRZ07/DEZ Braunschweiger Zeitung, Dezember 2007
+BRZ08/JAN Braunschweiger Zeitung, Januar 2008
+BRZ08/FEB Braunschweiger Zeitung, Februar 2008
diff --git a/tools/t/data/list3.def b/tools/t/data/list3.def
new file mode 100644
index 0000000..783cf23
--- /dev/null
+++ b/tools/t/data/list3.def
@@ -0,0 +1,58 @@
+<name>VAS-N91 (Stand "2013", korr. 2017)</name>
+
+<frozen></frozen>
+
+<intended>start</intended>
+
+<name>1991-2012</name>
+<date>m1=1991/1 bis 2012/12</date>
+<end></end>
+
+<name>Berliner Zeitung</name>
+<ql>Berliner Zeitung</ql>
+<and>1991-2012</and>
+<redabs>143237</redabs>
+<end></end>
+
+<name>Frankfurter Allgemeine</name>
+<cn>F97 Frankfurter Allgemeine 1997</cn>
+<cn>F99 Frankfurter Allgemeine 1999</cn>
+<cn>F01 Frankfurter Allgemeine 2001</cn>
+<cn>F03 Frankfurter Allgemeine 2003</cn>
+<cn>F05 Frankfurter Allgemeine 2005</cn>
+<redabs>301166</redabs>
+<end></end>
+
+<name>VAS N91</name>
+<add>Berliner Zeitung</add>
+<add>Braunschweiger Zeitung</add>
+<add>Hamburger Morgenpost</add>
+<add>Hannoversche Allgemeine</add>
+<add>Die Rheinpfalz</add>
+<add>Mannheimer Morgen</add>
+<add>Rhein-Zeitung</add>
+<add>Nürnberger Nachrichten</add>
+<add>Nürnberger Zeitung</add>
+<add>die tageszeitung</add>
+<add>Frankfurter Allgemeine</add>
+<add>Frankfurter Rundschau</add>
+<add>Burgenländische Volkszeitung</add>
+<add>Die Presse</add>
+<add>Kleine Zeitung</add>
+<add>Neue Kronen-Zeitung</add>
+<add>Niederösterreichische Nachrichten</add>
+<add>Salzburger Nachrichten</add>
+<add>Tiroler Tageszeitung</add>
+<add>Vorarlberger Nachrichten</add>
+<add>Die Südostschweiz</add>
+<add>St. Galler Tagblatt</add>
+<add>Zürcher Tagesanzeiger</add>
+<and>1991-2012</and>
+<end></end>
+
+<intended>end</intended>
+
+<extended>start</extended>
+<text>A00/APR.23232</text>
+<text>A00/APR.23233</text>
+<extended>end</extended>
diff --git a/tools/t/data/list4.def b/tools/t/data/list4.def
new file mode 100644
index 0000000..0cd73aa
--- /dev/null
+++ b/tools/t/data/list4.def
@@ -0,0 +1,27 @@
+<name>VAS-N91 (Stand "2013", korr. 2017)</name>
+
+<name>1991-2012</name>
+<date>m1=1991/1 bis 2012/12</date>
+<end></end>
+
+<name>Berliner Zeitung</name>
+<ql>Berliner Zeitung</ql>
+<and>1991-2012</and>
+<redabs>143237</redabs>
+<end></end>
+
+<name>Frankfurter Allgemeine</name>
+<cn>F97 Frankfurter Allgemeine 1997</cn>
+<cn>F99 Frankfurter Allgemeine 1999</cn>
+<cn>F01 Frankfurter Allgemeine 2001</cn>
+<cn>F03 Frankfurter Allgemeine 2003</cn>
+<cn>F05 Frankfurter Allgemeine 2005</cn>
+<redabs>301166</redabs>
+<end></end>
+
+<name>VAS N91</name>
+<add>Berliner Zeitung</add>
+<add>Frankfurter Allgemeine</add>
+<add>Frankfurter Rundschau</add>
+<and>1991-2012</and>
+<end></end>
diff --git a/tools/t/list2vc-def.t b/tools/t/list2vc-def.t
new file mode 100644
index 0000000..baaefda
--- /dev/null
+++ b/tools/t/list2vc-def.t
@@ -0,0 +1,81 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+use Data::Dumper;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'list2vc.pl');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list2.def');
+
+# Check STDOUT
+stdout_like(
+ sub {
+ system($script, $list1);
+ },
+ qr!^\{\"\@context\".+?\}$!,
+ "check stdout"
+);
+
+# Check JSON
+my $json = decode_json(join('', `$script $list1`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+
+my $op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'docSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "BRZ05/SEP", 'value');
+is($op1->{'value'}->[1], ,"BRZ05/OKT", 'value');
+is($op1->{'value'}->[-1], ,"BRZ08/FEB", 'value');
+
+my $op2 = $json->{'collection'}->{'operands'}->[1];
+is($op2->{'@type'}, 'koral:doc', 'type');
+is($op2->{'key'}, 'textSigle', 'key');
+is($op2->{'match'}, 'match:eq', 'match');
+is($op2->{'value'}->[0], "B19/AUG/01665", 'value');
+is($op2->{'value'}->[1], ,"B19/AUG/01666", 'value');
+
+my $list3 = catfile(dirname(__FILE__), 'data', 'list3.def');
+
+# Check JSON
+# Only return extended area
+$json = decode_json(join('', `$script $list3`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+is($json->{'collection'}->{'comment'}, 'name:"VAS-N91 (Stand \"2013\", korr. 2017)"', 'type');
+
+$op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'textSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "A00/APR/23232", 'value');
+is($op1->{'value'}->[1], "A00/APR/23233", 'value');
+
+
+my $list4 = catfile(dirname(__FILE__), 'data', 'list4.def');
+
+# Only contains intended area
+$json = decode_json(join('', `$script $list4`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+like($json->{'collection'}->{'comment'}, qr!^name:"VAS N91"!, 'name');
+like($json->{'collection'}->{'comment'}, qr!embed:\[name:"Berliner Zeitung",redabs:143237\]!, 'embed');
+like($json->{'collection'}->{'comment'}, qr!embed:\[name:"Frankfurter Allgemeine",redabs:301166\]!, 'embed');
+
+$op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'corpusSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "F97", 'value');
+is($op1->{'value'}->[1], "F99", 'value');
+
+done_testing;
diff --git a/tools/t/list2vc.t b/tools/t/list2vc.t
new file mode 100644
index 0000000..93b5226
--- /dev/null
+++ b/tools/t/list2vc.t
@@ -0,0 +1,52 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'list2vc.pl');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list1.txt');
+
+# Check STDOUT
+stdout_like(
+ sub {
+ system($script, $list1);
+ },
+ qr!^\{\"\@context\".+?\}$!,
+ "check stdout"
+);
+
+# Check JSON
+my $json = decode_json(join('', `$script $list1`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+
+my $op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'corpusSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is_deeply($op1->{'value'}, ["A02","A03"], 'value');
+
+my $op2 = $json->{'collection'}->{'operands'}->[1];
+is($op2->{'@type'}, 'koral:doc', 'type');
+is($op2->{'key'}, 'docSigle', 'key');
+is($op2->{'match'}, 'match:eq', 'match');
+is_deeply($op2->{'value'}, ["B04/X02","B04/X03"], 'value');
+
+my $op3 = $json->{'collection'}->{'operands'}->[2];
+is($op3->{'@type'}, 'koral:doc', 'type');
+is($op3->{'key'}, 'textSigle', 'key');
+is($op3->{'match'}, 'match:eq', 'match');
+is_deeply($op3->{'value'}, ["A01/B02/c04","A01/B02/c05"], 'value');
+
+
+# Check STDIN
+my $json2 = decode_json(join('', `cat $list1 | $script -`));
+is_deeply($json, $json2);
+
+done_testing;