Make script installable

Change-Id: I88ccc2f661eb6758a91ed6cf9c325083fd708325
diff --git a/script/cosmasvc2koralquery b/script/cosmasvc2koralquery
new file mode 100755
index 0000000..85986a9
--- /dev/null
+++ b/script/cosmasvc2koralquery
@@ -0,0 +1,305 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use lib 'lib';
+use KorAP::VirtualCorpus::Group;
+
+# 2020-05-20
+#   Preliminary support for C2 def-files.
+# 2020-05-29
+#   Introduce optimizable object system.
+
+our $VERSION = 0.1;
+
+our @ARGV;
+
+unless (@ARGV) {
+  print <<'HELP';
+Convert a line-separated list of corpus sigles, doc sigles or
+text sigles into a virtual corpus query.
+
+  $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
+  $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
+
+HELP
+exit 0;
+};
+
+
+# Shorten long strings for logging
+sub _shorten ($) {
+  my $line = shift;
+  if (length($line) < 20) {
+    return $line;
+  }
+  else {
+    return substr($line,0,17) . '...';
+  };
+};
+
+
+my $fh;
+if ($ARGV[0] eq '-') {
+  $fh = *STDIN;
+} elsif (!open($fh, '<' . $ARGV[0])) {
+  warn $ARGV[0] . " can't be opened";
+  exit(0);
+};
+
+# Initial VC group
+my $vc;
+
+# Create an intensional and an extensional VC
+my $vc_ext = KorAP::VirtualCorpus::Group->new;
+my $vc_int = KorAP::VirtualCorpus::Group->new;
+
+# Load ext initially
+$$vc = $vc_ext;
+
+# Collect all virtual corpora
+my %all_vcs;
+
+my $frozen = 0;
+
+# Iterate over the whole list
+while (!eof $fh) {
+  my $line = readline($fh);
+  chomp $line;
+
+
+  # Skip empty lines
+  if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
+    # empty
+    next;
+  };
+
+  my ($key, $value, $desc);
+
+  # Line-Type: <e>c</a>
+  if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
+    $key = $1;
+    $value = $2 // undef;
+  }
+
+  # Line-Type: <e>c
+  elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
+    $key = $1;
+    $value = $2;
+  }
+
+  # Get text sigles
+  elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
+    $key = 'text';
+    $value = $line;
+  }
+
+  # Get doc sigles
+  elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
+    $key = 'doc';
+    $value = $1;
+  }
+
+  # Get corpus sigles
+  elsif ($line !~ m!(?:\/|\s)!) {
+    $key = 'corpus';
+    $value = $line;
+  }
+
+  # Not known
+  else {
+    warn _shorten($line) . q! isn't a valid VC definition!;
+    next;
+  };
+
+  # Add text field
+  if ($key eq 'text') {
+
+    # Convert C2 sigle to KorAP form
+    $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
+    ${$vc}->union_field(textSigle => $value);
+  }
+
+  # Add doc field
+  elsif ($key eq 'doc') {
+    ${$vc}->union_field(docSigle => $value);
+  }
+
+  # Add corpus field
+  elsif ($key eq 'corpus') {
+    ${$vc}->union_field(corpusSigle => $value);
+  }
+
+  # Add corpus field
+  elsif ($key eq 'cn') {
+    # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
+    if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
+      ${$vc}->union_field(corpusSigle => $1);
+    };
+  }
+
+  # Mark the vc as frozen
+  # This means that an extended VC area is expected
+  elsif ($key eq 'frozen') {
+    $frozen = 1;
+  }
+
+  # Start/End intended VC area
+  elsif ($key eq 'intended') {
+    if ($value eq 'start') {
+      $$vc = $vc_int;
+    }
+    elsif ($value ne 'end') {
+      warn 'Unknown intension value ' . $value;
+    };
+  }
+
+  # Start/End extended VC area
+  elsif ($key eq 'extended') {
+    if ($value eq 'start') {
+      $$vc = $vc_ext;
+    }
+    elsif ($value ne 'end') {
+      warn 'Unknown extension value ' . $value;
+    };
+  }
+
+  # Set VC name
+  elsif ($key eq 'name') {
+    # "Name des virt. Korpus, der angezeigt wird.
+    # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
+    # <add>, <sub>"
+
+    # No global name defined yet
+    if ($$vc && !$$vc->name) {
+      $vc_ext->name($value);
+      $vc_int->name($value);
+      next;
+    };
+
+    ${$vc} = KorAP::VirtualCorpus::Group->new;
+    ${$vc}->name($value);
+  }
+
+  # End VC def
+  elsif ($key eq 'end') {
+    $all_vcs{${$vc}->name} = $$vc;
+    # $vc = undef;
+  }
+
+  # Add VC definition
+  elsif ($key eq 'add') {
+    unless (defined $all_vcs{$value}) {
+      #       warn 'VC ' . $value . ' not defined';
+      # exit(1);
+      next;
+    };
+
+    $$vc->union($all_vcs{$value}->clone->to_koral);
+  }
+
+  # AND definition
+  elsif ($key eq 'and') {
+    unless (defined $all_vcs{$value}) {
+      #       warn 'VC ' . $value . ' not defined';
+      # exit(1);
+      next;
+    };
+
+    $$vc->joint($all_vcs{$value}->clone->to_koral);
+  }
+
+  # Source of the corpus
+  elsif ($key eq 'ql') {
+    # Quellenname, z.B. "Neue Zürcher Zeitung"
+    $$vc->union_field(corpusTitle => $value);
+  }
+
+  elsif ($key eq 'sub') {
+    # "Sub" is the difference - it is the "and not" operation.
+    warn $key . ' is not yet supported';
+  }
+
+  elsif ($key eq 'co') {
+    # Country,	z.B. DE für Text in Deutschland erschienen
+    warn $key . ' is not yet supported';
+  }
+
+  elsif ($key eq 'tl') {
+    # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern]
+    warn $key . ' is not yet supported';
+  }
+
+  elsif ($key eq 'ts') {
+    # Textsorte, 	z.B. "Bericht"
+    warn $key . ' is not yet supported';
+  }
+
+  elsif ($key eq 'th') {
+    # Thema, z.B. "Sport - Fußball"
+    warn $key . ' is not yet supported';
+  }
+
+  elsif ($key eq 'red') {
+    # Reduktionsfaktor
+    # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert
+    # reduziert. Modus: feste Reduzierung, nicht variabel.
+    warn $key . ' is not yet supported';
+  }
+
+  elsif ($key eq 'thprob') {
+    # ThemaProbability
+    # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem
+    # Zuverläßigkeitswert ein Thema übernommen wird
+  }
+
+
+  # Add reduction value as a comment
+  elsif ($key eq 'redabs') {
+    # "red. Anz. Texte
+    # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
+    $$vc->comment('redabs:' . $value);
+    warn $key . ' is not yet supported';
+  }
+
+  # Add reduction value as a comment
+  elsif ($key eq 'date') {
+    # Supports two pattern schemes:
+    # m1=Year1/Month1 bis Year2/Month2
+    #   Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12"
+
+    # Schema 1
+    if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
+      my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
+      if ($m1 < 10) {
+        $m1 = '0' . (0+$m1);
+      };
+      if ($m2 < 10) {
+        $m2 = '0' . (0+$m2);
+      };
+      $$vc->from($y1, $m1);
+      $$vc->to($y2, $m2);
+    }
+
+    # Scheme 2
+    elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
+      # m2=Year1-Year2 und Month1-Month2
+      #   Datumsbereich Schema 2: z.B. "1990-2000 und 06-06"
+
+      warn 'Second date scheme not yet supported!'
+    }
+
+    else {
+      warn 'Unknown date scheme ' . $value;
+    };
+  }
+
+  # Unknown
+  else {
+    warn $key . ' is an unknown field';
+  };
+};
+
+close($fh);
+
+# Stringify current (extended?) virtual corpus
+print $$vc->to_string;