Support regex definition for virtual corpora

Change-Id: Iecf55d050f02b019c2591f100cd4d45cb90488a7
diff --git a/lib/KorAP/DefList.pm b/lib/KorAP/DefList.pm
new file mode 100644
index 0000000..09fd1cb
--- /dev/null
+++ b/lib/KorAP/DefList.pm
@@ -0,0 +1,135 @@
+package KorAP::DefList;
+use KorAP::Def;
+use KorAP::VirtualCorpus::Group;
+use KorAP::VirtualCorpus::Doc;
+use strict;
+use warnings;
+
+# Constructor
+sub new {
+  my $class = shift;
+  my $file = shift;
+
+  warn 'xx';
+
+  if (ref $file && ref $file eq 'GLOB') {
+    return bless {
+      file => '',
+      fh => $file
+    }, $class;
+  };
+
+  my $self = {
+    file => $file
+  };
+
+  # Open def file
+  if (!open($self->{fh}, '<' . $file)) {
+    warn $ARGV[0] . " can't be opened";
+    exit(0);
+  };
+
+  return bless $self, $class;
+};
+
+
+# Parse list file
+sub parse {
+  my $self = shift;
+
+  while (!eof($self->{fh})) {
+    my $line = readline($self->{fh});
+
+    next if $line =~ /^\#/;
+
+    if ($line =~ /^(.+?)\t+\@COPY\@\s*$/) {
+
+      # Take from Def file!
+      $self->copy_vc($1);
+    }
+
+    elsif ($line =~ /^([^\t]+?)\t+(.*?)\t(.+?)\/?$/) {
+      $self->regex_to_vc($1,$2,$3);
+      # print $1,': /'.$3.'/',"\n";
+    }
+
+    elsif ($line =~ /^\s*$/) {
+      # Ignore
+    }
+
+    else {
+      warn 'Unknown VC definition: ', $line,"\n";
+    };
+  };
+
+  close($self->{fh});
+};
+
+sub copy_vc {
+  my $self = shift;
+  my $file = shift;
+  print "Convert ", $file, " from def file\n";
+};
+
+sub regex_to_vc {
+  my $self = shift;
+  my ($name, $desc, $regex) = @_;
+
+  print "Convert ", $name, " from regex\n";
+  return from_regex($regex, $name, $desc);
+};
+
+sub from_regex {
+  my $regex = shift;
+  my $name = shift;
+  my $desc = pop;
+
+  my $vc;
+
+  # Group with negation
+  if ($regex =~ m!^([^~]+?)~(.+?)$!) {
+    $vc = KorAP::VirtualCorpus::Group->new;
+
+    my $pos = KorAP::VirtualCorpus::Doc->new;
+    _regex_to_doc($pos, $1);
+
+    $vc->add('and', $pos);
+
+    my $neg = KorAP::VirtualCorpus::Doc->new;
+    _regex_to_doc($neg, $2);
+
+    $neg->match('ne');
+    $vc->joint($neg);
+  }
+
+  # Simle doc
+  else {
+    $vc = KorAP::VirtualCorpus::Doc->new;
+    _regex_to_doc($vc, $regex);
+  };
+
+  $vc->name($name) if $name;
+  $vc->comment('desc:' . $vc->equote($desc)) if $desc;
+
+  return $vc;
+};
+
+sub _regex_to_doc {
+  my ($vc, $regex) = @_;
+
+  if ($regex =~ m!^([^/]+)\/?$!) {
+    $vc->key('corpusSigle');
+    $vc->value($1);
+    my $value = $1;
+    if ($1 !~ /^[a-zA-Z0-9]+?$/) {
+      $vc->type('regex');
+    };
+    return;
+  }
+
+  $vc->key('docSigle');
+  $vc->value($regex);
+  $vc->type('regex');
+}
+
+1;
diff --git a/lib/KorAP/VirtualCorpus.pm b/lib/KorAP/VirtualCorpus.pm
index 58adc6b..933e4b7 100644
--- a/lib/KorAP/VirtualCorpus.pm
+++ b/lib/KorAP/VirtualCorpus.pm
@@ -42,7 +42,7 @@
 # Quote utility function
 sub quote {
   shift;
-  my $str = shift;
+  my $str = shift or return '""';
   $str =~ s/(["\\])/\\$1/g;
   return qq{"$str"};
 };
diff --git a/script/cosmasvc2koralquery b/script/cosmasvc2koralquery
index c0b3ed2..a95605f 100755
--- a/script/cosmasvc2koralquery
+++ b/script/cosmasvc2koralquery
@@ -2,7 +2,9 @@
 use strict;
 use warnings;
 use KorAP::Def;
+use KorAP::DefList;
 use lib 'lib';
+use Getopt::Long;
 
 # 2020-05-20
 #   Preliminary support for C2 def-files.
@@ -12,25 +14,47 @@
 #   Add KorAP::Def.
 
 our $VERSION = 0.2;
-
 our @ARGV;
 
-unless (@ARGV) {
-  print <<'HELP';
-Convert a line-separated list of corpus sigles, doc sigles or
-text sigles into a virtual corpus query.
+my $cmd = shift @ARGV;
 
-  $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
-  $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
+
+my $input;
+
+GetOptions (
+  "input|i=s" => \$input
+)
+or die("Error in command line arguments\n");
+
+if ($cmd ne 'def' && $cmd ne 'list') {
+  print <<'HELP';
+Convert a list of C2 VC definitions or a single definition into
+KoralQuery VCs.
+
+  $ perl cosmasvc2koralquery def my_vc.txt | gzip -vc > my_vc.jsonld.gz
+  $ cat my_vc.txt | perl cosmasvc2koralquery def - | gzip -vc > my_vc.jsonld.gz
+
+Commands: def, list
 
 HELP
-exit 0;
+exit 1;
 };
 
+
+# Process a list
+if ($cmd eq 'list') {
+  KorAP::DefList->new($input || $ARGV[0])->parse;
+  exit(0);
+};
+
+# Parse a single def
 my $def_parser;
 if ($ARGV[0] eq '-') {
   $def_parser = KorAP::Def->new(\*STDIN);
 }
+elsif ($input) {
+  $def_parser = KorAP::Def->new($input);
+}
 else {
   $def_parser = KorAP::Def->new($ARGV[0]);
 };
diff --git a/t/data/list-example.ls b/t/data/list-example.ls
new file mode 100644
index 0000000..6878eef
--- /dev/null
+++ b/t/data/list-example.ls
@@ -0,0 +1,20 @@
+# BEMERKUNG:
+# - alle Korpusdateinamen klein schreiben (wegen des Skripte in doIndexIds)
+
+bih	Herausgebertexte zum Korpus bio	BIH
+bio	Biografische Literatur	BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)
+bio-pub	Biografische Literatur	BIO/~BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)
+l	Berliner Morgenpost	L[0-9][0-9]/
+#l-n	l-n	L20/
+dpa	Meldungen der Deutschen Presse-Agentur	DPA[0-9][0-9]/
+fsp	Fachsprachenkorpus	FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)
+fsp-pub	Fachsprachenkorpus	FSP/~FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)
+kjl	Kinder- und Jugendliteratur	KJL/
+
+thm-lit	Belletristik/Trivialliteratur: Thomas-Mann-Korpus	THM/(AMB|AMD|AME|AMF|AMH|AMJ|AMK|AML|AMN|AMZ)
+wxx11	Wikipedia Artikel und Diskussionen	W[PD]D11/
+zca	Zeit Campus (Feb.-Apr.;Jun.;Aug.;Okt.-Dez.; Dez. 2013 n.v.)	ZCA[0-9][0-9]/
+corp-w-gesamt.2023-i.16.03.23	@COPY@
+corp-a	@COPY@
+misc-lit	Belletristik/Trivialliteratur (öffentlich)	(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)
+
diff --git a/t/list2vc-def.t b/t/list2vc-def.t
index 6a5fcec..37adadb 100644
--- a/t/list2vc-def.t
+++ b/t/list2vc-def.t
@@ -15,14 +15,14 @@
 # Check STDOUT
 stdout_like(
   sub {
-    system($script, $list1);
+    system($script, 'def', $list1);
   },
   qr!^\{\"\@context\".+?\}$!,
   "check stdout"
 );
 
 # Check JSON
-my $json = decode_json(join('', `$script $list1`));
+my $json = decode_json(join('', `$script def $list1`));
 
 is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
 is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
@@ -47,7 +47,7 @@
 
 # Check JSON
 # Only return extended area
-$json = decode_json(join('', `$script $list3`));
+$json = decode_json(join('', `$script def $list3`));
 
 is($json->{'collection'}->{'@type'}, 'koral:doc', 'type');
 
@@ -64,7 +64,7 @@
 my $list4 = catfile(dirname(__FILE__), 'data', 'list4.def');
 
 # Only contains intended area
-$json = decode_json(join('', `$script $list4`));
+$json = decode_json(join('', `$script def $list4`));
 
 is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
 is($json->{'collection'}->{'comment'}, 'name:"VAS N91"', 'name');
diff --git a/t/list2vc-deflist.t b/t/list2vc-deflist.t
new file mode 100644
index 0000000..41c2176
--- /dev/null
+++ b/t/list2vc-deflist.t
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+use Data::Dumper;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'script', 'cosmasvc2koralquery');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list-example.ls');
+
+# Check STDOUT
+stdout_like(
+  sub {
+    system($script, 'list', $list1);
+  },
+  qr!Convert!,
+  "check stdout"
+);
+
+# Check JSON
+# my $protocol = join('', `$script list $list1`);
+
+done_testing;
+__END__
diff --git a/t/list2vc.t b/t/list2vc.t
index fd7ea52..1dc3c4c 100644
--- a/t/list2vc.t
+++ b/t/list2vc.t
@@ -4,7 +4,6 @@
 use Test::More;
 use File::Basename;
 use File::Spec::Functions;
-
 use Test::Output;
 use Mojo::JSON 'decode_json';
 
@@ -14,14 +13,14 @@
 # Check STDOUT
 stdout_like(
   sub {
-    system($script, $list1);
+    system($script, 'def', $list1);
   },
   qr!^\{\"\@context\".+?\}$!,
   "check stdout"
 );
 
 # Check JSON
-my $json = decode_json(join('', `$script $list1`));
+my $json = decode_json(join('', `$script def $list1`));
 
 is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
 is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
@@ -46,7 +45,7 @@
 
 
 # Check STDIN
-my $json2 = decode_json(join('', `cat $list1 | $script -`));
+my $json2 = decode_json(join('', `cat $list1 | $script def -`));
 is_deeply($json, $json2);
 
 done_testing;
diff --git a/t/regex2vc.t b/t/regex2vc.t
new file mode 100644
index 0000000..fbdb721
--- /dev/null
+++ b/t/regex2vc.t
@@ -0,0 +1,59 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use KorAP::DefList;
+use Mojo::JSON 'decode_json';
+
+my $rf = \&KorAP::DefList::from_regex;
+
+sub _collection {
+  return decode_json($rf->(@_)->to_string)->{collection}
+}
+
+my $doc = _collection("x",'Name','Beschreibung');
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x');
+is($doc->{type},'type:string');
+is($doc->{comment},'name:"Name",desc:"Beschreibung"');
+
+$doc = _collection("x/");
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x');
+is($doc->{type},'type:string');
+
+$doc = _collection("x[0-3]",'Na"me','Besch"re\'ibung');
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x[0-3]');
+is($doc->{type},'type:regex');
+is($doc->{comment},'name:"Na\"me",desc:"Besch\"re\'ibung"');
+
+$doc = _collection('x[0-3]/');
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x[0-3]');
+is($doc->{type},'type:regex');
+
+$doc = _collection('BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)');
+is($doc->{key},'docSigle');
+is($doc->{value},'BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)');
+is($doc->{type},'type:regex');
+
+$doc = _collection('(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)');
+is($doc->{key},'docSigle');
+is($doc->{value},'(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)');
+is($doc->{type},'type:regex');
+
+$doc = _collection('FSP/~FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)','fsp-pub','Fachsprachenkorpus');
+
+is($doc->{'@type'},'koral:docGroup');
+is($doc->{'operation'},'operation:and');
+is($doc->{operands}->[0]->{type},'type:string');
+is($doc->{operands}->[0]->{match},'match:eq');
+is($doc->{operands}->[0]->{key},'corpusSigle');
+is($doc->{operands}->[0]->{value},'FSP');
+is($doc->{operands}->[1]->{type},'type:regex');
+is($doc->{operands}->[1]->{match},'match:ne');
+is($doc->{operands}->[1]->{key},'docSigle');
+is($doc->{operands}->[1]->{value},'FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)');
+
+done_testing;