Merge "Update lombok to 1.18.12"
diff --git a/.gitignore b/.gitignore
index bd4989d..752da96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 *.iml
 dependency-reduced-pom.xml
 admin_token
+/sandbox/
 /bin/
 /db.sqlite
 /lite/liteDB.sqlite
diff --git a/core/Changes b/core/Changes
index bec0bd2..12117b3 100644
--- a/core/Changes
+++ b/core/Changes
@@ -2,7 +2,9 @@
 24/01/2020
    - Removed salt from config and updated config files.
 05/02/2020
-   - Added welcome page.   
+   - Added welcome page.
+11/05/2020
+   - Added tool to create VC from list (diewald)
 
 # version 0.62.3
 03/12/2019
diff --git a/tools/list2vc.pl b/tools/list2vc.pl
new file mode 100755
index 0000000..35a205a
--- /dev/null
+++ b/tools/list2vc.pl
@@ -0,0 +1,452 @@
+#!/usr/bin/env perl
+package KorAP::VirtualCorpus;
+use strict;
+use warnings;
+
+
+# Get or set name of the VC
+sub name {
+  my $self = shift;
+  unless (@_) {
+    return $self->{name};
+  };
+  $self->{name} = shift;
+  return $self;
+};
+
+
+# Comment
+sub comment {
+  my $self = shift;
+  unless (@_) {
+    return $self->{comment};
+  };
+  $self->{comment} //= [];
+
+  push @{$self->{comment}}, shift;
+  return $self;
+};
+
+
+# Quote utility function
+sub quote {
+  shift;
+  my $str = shift;
+  $str =~ s/(["\\])/\\$1/g;
+  return qq{"$str"};
+};
+
+
+# Escaped quote utility function
+sub equote {
+  shift;
+  my $str = shift;
+  $str =~ s/(["\\])/\\$1/g;
+  $str =~ s/(["\\])/\\$1/g;
+  return '\\"' . $str . '\\"';
+};
+
+
+sub _commentparam_to_string {
+  my $self = shift;
+  my $comment = $self->_comment_to_string;
+  if ($comment) {
+    return qq!,"comment":"$comment"!;
+  };
+  return '';
+};
+
+
+sub _comment_to_string {
+  my $self = shift;
+  if (!$self->name && !$self->comment) {
+    return '';
+  };
+
+  my $json = '';
+  $json .= 'name:' . $self->equote($self->name) if $self->name;
+  if ($self->name && $self->comment) {
+    $json .= ','
+  };
+  $json .= join(',', @{$self->{comment}}) if $self->{comment};
+
+  return $json;
+};
+
+
+# Stringify globally
+sub to_string {
+  my $self = shift;
+  ## Create collection object
+
+  my $json = '{';
+  $json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
+  $json .= '"collection":';
+  $json .= $self->_to_fragment;
+  # Set at the end, when all comments are done
+  $json .= $self->_commentparam_to_string;
+  return $json .= '}';
+};
+
+
+package KorAP::VirtualCorpus::Group;
+use strict;
+use warnings;
+use base 'KorAP::VirtualCorpus';
+
+
+# Construct a new VC group
+sub new {
+  my $class = shift;
+  bless {
+    with => [],
+    with_fields => {},
+    without => [],
+    without_fields => {},
+  }, $class;
+};
+
+
+# Define an operand to be "or"ed
+sub with {
+  my $self = shift;
+  push @{$self->{with}}, shift;
+};
+
+
+# Define a field that should be "or"ed
+sub with_field {
+  my $self = shift;
+  my $field = shift;
+  push @{$self->{with_fields}->{$field}}, shift;
+};
+
+# Define an operand to be "and"ed
+sub without {
+  my $self = shift;
+  push @{$self->{without}}, shift;
+};
+
+
+# Define a field that should be "and"ed
+sub without_field {
+  my $self = shift;
+  my $field = shift;
+  push @{$self->{without_fields}->{$field}}, shift;
+};
+
+
+# VC contains only with fields
+sub only_with_fields {
+  my $self = shift;
+
+  if (keys %{$self->{without_fields}} || @{$self->{with}} || @{$self->{without}}) {
+    return 0;
+  };
+
+  return 1;
+};
+
+
+# Create a document vector field
+sub _doc_vec {
+  my $field = shift;
+  my $vec = shift;
+  my $json = '{';
+  $json .= '"@type":"koral:doc",';
+  $json .= '"key":"' . $field . '",';
+  $json .= '"match":"match:eq",';
+  $json .= '"value":[';
+  $json .= join ',', map { '"' . $_ . '"' } @$vec;
+  $json .=  ']';
+  $json .= '},';
+  return $json;
+}
+
+
+# Stringify fragment
+sub _to_fragment {
+  my $self = shift;
+
+  my $json = '{';
+  $json .= '"@type":"koral:docGroup",';
+
+  # Make the outer group "and"
+  if (keys %{$self->{without_fields}}) {
+    $json .= '"operation":"operation:and",';
+    $json .= '"operands":[';
+
+    foreach my $field (sort keys %{$self->{without_fields}}) {
+      unless (@{$self->{without_fields}->{$field}}) {
+        next;
+      };
+      $json .= _doc_vec($field, $self->{without_fields}->{$field});
+    };
+
+    # Remove the last comma
+    chop $json;
+
+    $json .= ']';
+  }
+
+  elsif (keys %{$self->{with_fields}} || @{$self->{with}}) {
+    $json .= '"operation":"operation:or",';
+
+    $json .= '"operands":[';
+
+    # Flatten embedded "or"-VCs
+    foreach my $op (@{$self->{with}}) {
+
+      # The embedded VC has only extending fields
+      if ($op->only_with_fields) {
+
+        $self->comment('embed:[' . $op->_comment_to_string . ']');
+
+        foreach my $k (keys %{$op->{with_fields}}) {
+          foreach my $v (@{$op->{with_fields}->{$k}}) {
+            $self->with_field($k, $v);
+          };
+        };
+      }
+
+      # Embed complex VC
+      else {
+        $json .= $op->_to_fragment . ',';
+      };
+    };
+
+    foreach my $field (sort keys %{$self->{with_fields}}) {
+      unless (@{$self->{with_fields}->{$field}}) {
+        next;
+      };
+      $json .= _doc_vec($field, $self->{with_fields}->{$field});
+    };
+
+    # Remove the last comma
+    chop $json;
+
+    $json .= ']';
+  }
+
+  # No operands in the group
+  else {
+    # Remove the last comma after the comment
+    chop $json;
+  };
+
+  # Set at the end, when all comments are done
+  $json .= $self->_commentparam_to_string;
+  return $json . '}';
+};
+
+
+package main;
+use strict;
+use warnings;
+
+# 2020-05-20
+#   Preliminary support for C2 def-files.
+
+our @ARGV;
+
+unless (@ARGV) {
+  print <<'HELP';
+Convert a line-separated list of corpus sigles, doc sigles or
+text sigles into a virtual corpus query.
+
+  $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
+  $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
+
+HELP
+exit 0;
+};
+
+
+# Shorten long strings for logging
+sub _shorten ($) {
+  my $line = shift;
+  if (length($line) < 20) {
+    return $line;
+  }
+  else {
+    return substr($line,0,17) . '...';
+  };
+};
+
+
+my $fh;
+if ($ARGV[0] eq '-') {
+  $fh = *STDIN;
+} elsif (!open($fh, '<' . $ARGV[0])) {
+  warn $ARGV[0] . " can't be opened";
+  exit(0);
+};
+
+# Initial VC group
+my $vc;
+
+# Create an intensional and an extensional VC
+my $vc_ext = KorAP::VirtualCorpus::Group->new;
+my $vc_int = KorAP::VirtualCorpus::Group->new;
+
+# Load ext initially
+$$vc = $vc_ext;
+
+# Collect all virtual corpora
+my %all_vcs;
+
+my $frozen = 0;
+
+# Iterate over the whole list
+while (!eof $fh) {
+  my $line = readline($fh);
+  chomp $line;
+
+
+  # Skip empty lines
+  if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
+    # empty
+    next;
+  };
+
+  my ($key, $value, $desc);
+
+  # Line-Type: <e>c</a>
+  if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
+    $key = $1;
+    $value = $2 // undef;
+  }
+
+  # Line-Type: <e>c
+  elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
+    $key = $1;
+    $value = $2;
+  }
+
+  # Get text sigles
+  elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
+    $key = 'text';
+    $value = $line;
+  }
+
+  # Get doc sigles
+  elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
+    $key = 'doc';
+    $value = $1;
+  }
+
+  # Get corpus sigles
+  elsif ($line !~ m!(?:\/|\s)!) {
+    $key = 'corpus';
+    $value = $line;
+  }
+
+  # Not known
+  else {
+    warn _shorten($line) . q! isn't a valid VC definition!;
+    next;
+  };
+
+  # Add text field
+  if ($key eq 'text') {
+
+    # Convert C2 sigle to KorAP form
+    $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
+    ${$vc}->with_field(textSigle => $value);
+  }
+
+  # Add doc field
+  elsif ($key eq 'doc') {
+    ${$vc}->with_field(docSigle => $value);
+  }
+
+  # Add corpus field
+  elsif ($key eq 'corpus') {
+    ${$vc}->with_field(corpusSigle => $value);
+  }
+
+  # Add corpus field
+  elsif ($key eq 'cn') {
+    # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
+    if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
+      ${$vc}->with_field(corpusSigle => $1);
+    };
+  }
+
+  # Mark the vc as frozen
+  # This means that an extended VC area is expected
+  elsif ($key eq 'frozen') {
+    $frozen = 1;
+  }
+
+  # Start/End intended VC area
+  elsif ($key eq 'intended') {
+    if ($value eq 'start') {
+      $$vc = $vc_int;
+    }
+    elsif ($value ne 'end') {
+      warn 'Unknown intension value ' . $value;
+    };
+  }
+
+  # Start/End extended VC area
+  elsif ($key eq 'extended') {
+    if ($value eq 'start') {
+      $$vc = $vc_ext;
+    }
+    elsif ($value ne 'end') {
+      warn 'Unknown extension value ' . $value;
+    };
+  }
+
+  # Set VC name
+  elsif ($key eq 'name') {
+    # "Name des virt. Korpus, der angezeigt wird.
+    # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
+    # <add>, <sub>"
+
+    # No global name defined yet
+    if ($$vc && !$$vc->name) {
+      $vc_ext->name($value);
+      $vc_int->name($value);
+      next;
+    };
+
+    ${$vc} = KorAP::VirtualCorpus::Group->new;
+    ${$vc}->name($value);
+  }
+
+  # End VC def
+  elsif ($key eq 'end') {
+    $all_vcs{${$vc}->name} = $$vc;
+    # $vc = undef;
+  }
+
+  # Add VC definition
+  elsif ($key eq 'add') {
+    unless (defined $all_vcs{$value}) {
+      #       warn 'VC ' . $value . ' not defined';
+      # exit(1);
+      next;
+    };
+
+    $$vc->with($all_vcs{$value});
+  }
+
+  # Add reduction value as a comment
+  elsif ($key eq 'redabs') {
+    # "red. Anz. Texte
+    # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
+    $$vc->comment('redabs:' . $value);
+  }
+
+  # Unknown
+  else {
+    # warn $key . ' is an unknown field';
+  };
+};
+
+close($fh);
+
+# Stringify current (extended?) virtual corpus
+print $$vc->to_string;
diff --git a/tools/t/data/list1.txt b/tools/t/data/list1.txt
new file mode 100644
index 0000000..bc22c1c
--- /dev/null
+++ b/tools/t/data/list1.txt
@@ -0,0 +1,7 @@
+A02
+A01/B02/c04
+A03
+
+B04/X02
+B04/X03
+A01/B02/c05
diff --git a/tools/t/data/list2.def b/tools/t/data/list2.def
new file mode 100644
index 0000000..7b2cc1e
--- /dev/null
+++ b/tools/t/data/list2.def
@@ -0,0 +1,14 @@
+<name>Example</name>
+BRZ05/SEP Braunschweiger Zeitung, September 2005
+BRZ05/OKT Braunschweiger Zeitung, Oktober 2005
+BRZ05/NOV Braunschweiger Zeitung, November 2005
+BRZ05/DEZ Braunschweiger Zeitung, Dezember 2005
+BRZ06/JAN Braunschweiger Zeitung, Januar 2006
+<text>B19/AUG.01665</text>
+<text>B19/AUG.01666</text>
+BRZ07/SEP Braunschweiger Zeitung, September 2007
+BRZ07/OKT Braunschweiger Zeitung, Oktober 2007
+BRZ07/NOV Braunschweiger Zeitung, November 2007
+BRZ07/DEZ Braunschweiger Zeitung, Dezember 2007
+BRZ08/JAN Braunschweiger Zeitung, Januar 2008
+BRZ08/FEB Braunschweiger Zeitung, Februar 2008
diff --git a/tools/t/data/list3.def b/tools/t/data/list3.def
new file mode 100644
index 0000000..783cf23
--- /dev/null
+++ b/tools/t/data/list3.def
@@ -0,0 +1,58 @@
+<name>VAS-N91 (Stand "2013", korr. 2017)</name>
+
+<frozen></frozen>
+
+<intended>start</intended>
+
+<name>1991-2012</name>
+<date>m1=1991/1 bis 2012/12</date>
+<end></end>
+
+<name>Berliner Zeitung</name>
+<ql>Berliner Zeitung</ql>
+<and>1991-2012</and>
+<redabs>143237</redabs>
+<end></end>
+
+<name>Frankfurter Allgemeine</name>
+<cn>F97 Frankfurter Allgemeine 1997</cn>
+<cn>F99 Frankfurter Allgemeine 1999</cn>
+<cn>F01 Frankfurter Allgemeine 2001</cn>
+<cn>F03 Frankfurter Allgemeine 2003</cn>
+<cn>F05 Frankfurter Allgemeine 2005</cn>
+<redabs>301166</redabs>
+<end></end>
+
+<name>VAS N91</name>
+<add>Berliner Zeitung</add>
+<add>Braunschweiger Zeitung</add>
+<add>Hamburger Morgenpost</add>
+<add>Hannoversche Allgemeine</add>
+<add>Die Rheinpfalz</add>
+<add>Mannheimer Morgen</add>
+<add>Rhein-Zeitung</add>
+<add>Nürnberger Nachrichten</add>
+<add>Nürnberger Zeitung</add>
+<add>die tageszeitung</add>
+<add>Frankfurter Allgemeine</add>
+<add>Frankfurter Rundschau</add>
+<add>Burgenländische Volkszeitung</add>
+<add>Die Presse</add>
+<add>Kleine Zeitung</add>
+<add>Neue Kronen-Zeitung</add>
+<add>Niederösterreichische Nachrichten</add>
+<add>Salzburger Nachrichten</add>
+<add>Tiroler Tageszeitung</add>
+<add>Vorarlberger Nachrichten</add>
+<add>Die Südostschweiz</add>
+<add>St. Galler Tagblatt</add>
+<add>Zürcher Tagesanzeiger</add>
+<and>1991-2012</and>
+<end></end>
+
+<intended>end</intended>
+
+<extended>start</extended>
+<text>A00/APR.23232</text>
+<text>A00/APR.23233</text>
+<extended>end</extended>
diff --git a/tools/t/data/list4.def b/tools/t/data/list4.def
new file mode 100644
index 0000000..0cd73aa
--- /dev/null
+++ b/tools/t/data/list4.def
@@ -0,0 +1,27 @@
+<name>VAS-N91 (Stand "2013", korr. 2017)</name>
+
+<name>1991-2012</name>
+<date>m1=1991/1 bis 2012/12</date>
+<end></end>
+
+<name>Berliner Zeitung</name>
+<ql>Berliner Zeitung</ql>
+<and>1991-2012</and>
+<redabs>143237</redabs>
+<end></end>
+
+<name>Frankfurter Allgemeine</name>
+<cn>F97 Frankfurter Allgemeine 1997</cn>
+<cn>F99 Frankfurter Allgemeine 1999</cn>
+<cn>F01 Frankfurter Allgemeine 2001</cn>
+<cn>F03 Frankfurter Allgemeine 2003</cn>
+<cn>F05 Frankfurter Allgemeine 2005</cn>
+<redabs>301166</redabs>
+<end></end>
+
+<name>VAS N91</name>
+<add>Berliner Zeitung</add>
+<add>Frankfurter Allgemeine</add>
+<add>Frankfurter Rundschau</add>
+<and>1991-2012</and>
+<end></end>
diff --git a/tools/t/list2vc-def.t b/tools/t/list2vc-def.t
new file mode 100644
index 0000000..baaefda
--- /dev/null
+++ b/tools/t/list2vc-def.t
@@ -0,0 +1,81 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+use Data::Dumper;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'list2vc.pl');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list2.def');
+
+# Check STDOUT
+stdout_like(
+  sub {
+    system($script, $list1);
+  },
+  qr!^\{\"\@context\".+?\}$!,
+  "check stdout"
+);
+
+# Check JSON
+my $json = decode_json(join('', `$script $list1`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+
+my $op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'docSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "BRZ05/SEP", 'value');
+is($op1->{'value'}->[1], ,"BRZ05/OKT", 'value');
+is($op1->{'value'}->[-1], ,"BRZ08/FEB", 'value');
+
+my $op2 = $json->{'collection'}->{'operands'}->[1];
+is($op2->{'@type'}, 'koral:doc', 'type');
+is($op2->{'key'}, 'textSigle', 'key');
+is($op2->{'match'}, 'match:eq', 'match');
+is($op2->{'value'}->[0], "B19/AUG/01665", 'value');
+is($op2->{'value'}->[1], ,"B19/AUG/01666", 'value');
+
+my $list3 = catfile(dirname(__FILE__), 'data', 'list3.def');
+
+# Check JSON
+# Only return extended area
+$json = decode_json(join('', `$script $list3`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+is($json->{'collection'}->{'comment'}, 'name:"VAS-N91 (Stand \"2013\", korr. 2017)"', 'type');
+
+$op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'textSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "A00/APR/23232", 'value');
+is($op1->{'value'}->[1], "A00/APR/23233", 'value');
+
+
+my $list4 = catfile(dirname(__FILE__), 'data', 'list4.def');
+
+# Only contains intended area
+$json = decode_json(join('', `$script $list4`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+like($json->{'collection'}->{'comment'}, qr!^name:"VAS N91"!, 'name');
+like($json->{'collection'}->{'comment'}, qr!embed:\[name:"Berliner Zeitung",redabs:143237\]!, 'embed');
+like($json->{'collection'}->{'comment'}, qr!embed:\[name:"Frankfurter Allgemeine",redabs:301166\]!, 'embed');
+
+$op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'corpusSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "F97", 'value');
+is($op1->{'value'}->[1], "F99", 'value');
+
+done_testing;
diff --git a/tools/t/list2vc.t b/tools/t/list2vc.t
new file mode 100644
index 0000000..93b5226
--- /dev/null
+++ b/tools/t/list2vc.t
@@ -0,0 +1,52 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'list2vc.pl');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list1.txt');
+
+# Check STDOUT
+stdout_like(
+  sub {
+    system($script, $list1);
+  },
+  qr!^\{\"\@context\".+?\}$!,
+  "check stdout"
+);
+
+# Check JSON
+my $json = decode_json(join('', `$script $list1`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+
+my $op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'corpusSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is_deeply($op1->{'value'}, ["A02","A03"], 'value');
+
+my $op2 = $json->{'collection'}->{'operands'}->[1];
+is($op2->{'@type'}, 'koral:doc', 'type');
+is($op2->{'key'}, 'docSigle', 'key');
+is($op2->{'match'}, 'match:eq', 'match');
+is_deeply($op2->{'value'}, ["B04/X02","B04/X03"], 'value');
+
+my $op3 = $json->{'collection'}->{'operands'}->[2];
+is($op3->{'@type'}, 'koral:doc', 'type');
+is($op3->{'key'}, 'textSigle', 'key');
+is($op3->{'match'}, 'match:eq', 'match');
+is_deeply($op3->{'value'}, ["A01/B02/c04","A01/B02/c05"], 'value');
+
+
+# Check STDIN
+my $json2 = decode_json(join('', `cat $list1 | $script -`));
+is_deeply($json, $json2);
+
+done_testing;