Support regex definition for virtual corpora

Change-Id: Iecf55d050f02b019c2591f100cd4d45cb90488a7
diff --git a/t/data/list-example.ls b/t/data/list-example.ls
new file mode 100644
index 0000000..6878eef
--- /dev/null
+++ b/t/data/list-example.ls
@@ -0,0 +1,20 @@
+# BEMERKUNG:
+# - alle Korpusdateinamen klein schreiben (wegen des Skripte in doIndexIds)
+
+bih	Herausgebertexte zum Korpus bio	BIH
+bio	Biografische Literatur	BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)
+bio-pub	Biografische Literatur	BIO/~BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)
+l	Berliner Morgenpost	L[0-9][0-9]/
+#l-n	l-n	L20/
+dpa	Meldungen der Deutschen Presse-Agentur	DPA[0-9][0-9]/
+fsp	Fachsprachenkorpus	FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)
+fsp-pub	Fachsprachenkorpus	FSP/~FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)
+kjl	Kinder- und Jugendliteratur	KJL/
+
+thm-lit	Belletristik/Trivialliteratur: Thomas-Mann-Korpus	THM/(AMB|AMD|AME|AMF|AMH|AMJ|AMK|AML|AMN|AMZ)
+wxx11	Wikipedia Artikel und Diskussionen	W[PD]D11/
+zca	Zeit Campus (Feb.-Apr.;Jun.;Aug.;Okt.-Dez.; Dez. 2013 n.v.)	ZCA[0-9][0-9]/
+corp-w-gesamt.2023-i.16.03.23	@COPY@
+corp-a	@COPY@
+misc-lit	Belletristik/Trivialliteratur (öffentlich)	(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)
+
diff --git a/t/list2vc-def.t b/t/list2vc-def.t
index 6a5fcec..37adadb 100644
--- a/t/list2vc-def.t
+++ b/t/list2vc-def.t
@@ -15,14 +15,14 @@
 # Check STDOUT
 stdout_like(
   sub {
-    system($script, $list1);
+    system($script, 'def', $list1);
   },
   qr!^\{\"\@context\".+?\}$!,
   "check stdout"
 );
 
 # Check JSON
-my $json = decode_json(join('', `$script $list1`));
+my $json = decode_json(join('', `$script def $list1`));
 
 is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
 is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
@@ -47,7 +47,7 @@
 
 # Check JSON
 # Only return extended area
-$json = decode_json(join('', `$script $list3`));
+$json = decode_json(join('', `$script def $list3`));
 
 is($json->{'collection'}->{'@type'}, 'koral:doc', 'type');
 
@@ -64,7 +64,7 @@
 my $list4 = catfile(dirname(__FILE__), 'data', 'list4.def');
 
 # Only contains intended area
-$json = decode_json(join('', `$script $list4`));
+$json = decode_json(join('', `$script def $list4`));
 
 is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
 is($json->{'collection'}->{'comment'}, 'name:"VAS N91"', 'name');
diff --git a/t/list2vc-deflist.t b/t/list2vc-deflist.t
new file mode 100644
index 0000000..41c2176
--- /dev/null
+++ b/t/list2vc-deflist.t
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+use Data::Dumper;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'script', 'cosmasvc2koralquery');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list-example.ls');
+
+# Check STDOUT
+stdout_like(
+  sub {
+    system($script, 'list', $list1);
+  },
+  qr!Convert!,
+  "check stdout"
+);
+
+# Check JSON
+# my $protocol = join('', `$script list $list1`);
+
+done_testing;
+__END__
diff --git a/t/list2vc.t b/t/list2vc.t
index fd7ea52..1dc3c4c 100644
--- a/t/list2vc.t
+++ b/t/list2vc.t
@@ -4,7 +4,6 @@
 use Test::More;
 use File::Basename;
 use File::Spec::Functions;
-
 use Test::Output;
 use Mojo::JSON 'decode_json';
 
@@ -14,14 +13,14 @@
 # Check STDOUT
 stdout_like(
   sub {
-    system($script, $list1);
+    system($script, 'def', $list1);
   },
   qr!^\{\"\@context\".+?\}$!,
   "check stdout"
 );
 
 # Check JSON
-my $json = decode_json(join('', `$script $list1`));
+my $json = decode_json(join('', `$script def $list1`));
 
 is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
 is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
@@ -46,7 +45,7 @@
 
 
 # Check STDIN
-my $json2 = decode_json(join('', `cat $list1 | $script -`));
+my $json2 = decode_json(join('', `cat $list1 | $script def -`));
 is_deeply($json, $json2);
 
 done_testing;
diff --git a/t/regex2vc.t b/t/regex2vc.t
new file mode 100644
index 0000000..fbdb721
--- /dev/null
+++ b/t/regex2vc.t
@@ -0,0 +1,59 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use KorAP::DefList;
+use Mojo::JSON 'decode_json';
+
+my $rf = \&KorAP::DefList::from_regex;
+
+sub _collection {
+  return decode_json($rf->(@_)->to_string)->{collection}
+}
+
+my $doc = _collection("x",'Name','Beschreibung');
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x');
+is($doc->{type},'type:string');
+is($doc->{comment},'name:"Name",desc:"Beschreibung"');
+
+$doc = _collection("x/");
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x');
+is($doc->{type},'type:string');
+
+$doc = _collection("x[0-3]",'Na"me','Besch"re\'ibung');
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x[0-3]');
+is($doc->{type},'type:regex');
+is($doc->{comment},'name:"Na\"me",desc:"Besch\"re\'ibung"');
+
+$doc = _collection('x[0-3]/');
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x[0-3]');
+is($doc->{type},'type:regex');
+
+$doc = _collection('BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)');
+is($doc->{key},'docSigle');
+is($doc->{value},'BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)');
+is($doc->{type},'type:regex');
+
+$doc = _collection('(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)');
+is($doc->{key},'docSigle');
+is($doc->{value},'(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)');
+is($doc->{type},'type:regex');
+
+$doc = _collection('FSP/~FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)','fsp-pub','Fachsprachenkorpus');
+
+is($doc->{'@type'},'koral:docGroup');
+is($doc->{'operation'},'operation:and');
+is($doc->{operands}->[0]->{type},'type:string');
+is($doc->{operands}->[0]->{match},'match:eq');
+is($doc->{operands}->[0]->{key},'corpusSigle');
+is($doc->{operands}->[0]->{value},'FSP');
+is($doc->{operands}->[1]->{type},'type:regex');
+is($doc->{operands}->[1]->{match},'match:ne');
+is($doc->{operands}->[1]->{key},'docSigle');
+is($doc->{operands}->[1]->{value},'FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)');
+
+done_testing;