Support regex definition for virtual corpora
Change-Id: Iecf55d050f02b019c2591f100cd4d45cb90488a7
diff --git a/t/data/list-example.ls b/t/data/list-example.ls
new file mode 100644
index 0000000..6878eef
--- /dev/null
+++ b/t/data/list-example.ls
@@ -0,0 +1,20 @@
+# BEMERKUNG:
+# - alle Korpusdateinamen klein schreiben (wegen des Skripte in doIndexIds)
+
+bih Herausgebertexte zum Korpus bio BIH
+bio Biografische Literatur BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)
+bio-pub Biografische Literatur BIO/~BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)
+l Berliner Morgenpost L[0-9][0-9]/
+#l-n l-n L20/
+dpa Meldungen der Deutschen Presse-Agentur DPA[0-9][0-9]/
+fsp Fachsprachenkorpus FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)
+fsp-pub Fachsprachenkorpus FSP/~FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)
+kjl Kinder- und Jugendliteratur KJL/
+
+thm-lit Belletristik/Trivialliteratur: Thomas-Mann-Korpus THM/(AMB|AMD|AME|AMF|AMH|AMJ|AMK|AML|AMN|AMZ)
+wxx11 Wikipedia Artikel und Diskussionen W[PD]D11/
+zca Zeit Campus (Feb.-Apr.;Jun.;Aug.;Okt.-Dez.; Dez. 2013 n.v.) ZCA[0-9][0-9]/
+corp-w-gesamt.2023-i.16.03.23 @COPY@
+corp-a @COPY@
+misc-lit Belletristik/Trivialliteratur (öffentlich) (GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)
+
diff --git a/t/list2vc-def.t b/t/list2vc-def.t
index 6a5fcec..37adadb 100644
--- a/t/list2vc-def.t
+++ b/t/list2vc-def.t
@@ -15,14 +15,14 @@
# Check STDOUT
stdout_like(
sub {
- system($script, $list1);
+ system($script, 'def', $list1);
},
qr!^\{\"\@context\".+?\}$!,
"check stdout"
);
# Check JSON
-my $json = decode_json(join('', `$script $list1`));
+my $json = decode_json(join('', `$script def $list1`));
is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
@@ -47,7 +47,7 @@
# Check JSON
# Only return extended area
-$json = decode_json(join('', `$script $list3`));
+$json = decode_json(join('', `$script def $list3`));
is($json->{'collection'}->{'@type'}, 'koral:doc', 'type');
@@ -64,7 +64,7 @@
my $list4 = catfile(dirname(__FILE__), 'data', 'list4.def');
# Only contains intended area
-$json = decode_json(join('', `$script $list4`));
+$json = decode_json(join('', `$script def $list4`));
is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
is($json->{'collection'}->{'comment'}, 'name:"VAS N91"', 'name');
diff --git a/t/list2vc-deflist.t b/t/list2vc-deflist.t
new file mode 100644
index 0000000..41c2176
--- /dev/null
+++ b/t/list2vc-deflist.t
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+use Data::Dumper;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'script', 'cosmasvc2koralquery');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list-example.ls');
+
+# Run the script's 'list' subcommand on the example list file; its STDOUT must contain "Convert"
+stdout_like(
+ sub {
+ system($script, 'list', $list1);
+ },
+ qr!Convert!,
+ "check stdout"
+);
+
+# TODO(review): the capture below is disabled, so nothing beyond the STDOUT match above is verified yet
+# my $protocol = join('', `$script list $list1`);
+
+done_testing;
+__END__
diff --git a/t/list2vc.t b/t/list2vc.t
index fd7ea52..1dc3c4c 100644
--- a/t/list2vc.t
+++ b/t/list2vc.t
@@ -4,7 +4,6 @@
use Test::More;
use File::Basename;
use File::Spec::Functions;
-
use Test::Output;
use Mojo::JSON 'decode_json';
@@ -14,14 +13,14 @@
# Check STDOUT
stdout_like(
sub {
- system($script, $list1);
+ system($script, 'def', $list1);
},
qr!^\{\"\@context\".+?\}$!,
"check stdout"
);
# Check JSON
-my $json = decode_json(join('', `$script $list1`));
+my $json = decode_json(join('', `$script def $list1`));
is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
@@ -46,7 +45,7 @@
# Check STDIN
-my $json2 = decode_json(join('', `cat $list1 | $script -`));
+my $json2 = decode_json(join('', `cat $list1 | $script def -`));
is_deeply($json, $json2);
done_testing;
diff --git a/t/regex2vc.t b/t/regex2vc.t
new file mode 100644
index 0000000..fbdb721
--- /dev/null
+++ b/t/regex2vc.t
@@ -0,0 +1,59 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use KorAP::DefList;
+use Mojo::JSON 'decode_json';
+
+my $rf = \&KorAP::DefList::from_regex;
+# Helper: pass all arguments to from_regex(), serialize the result with to_string() and return the decoded JSON's "collection" entry
+sub _collection {
+ return decode_json($rf->(@_)->to_string)->{collection}
+}
+
+my $doc = _collection("x",'Name','Beschreibung'); # plain sigle plus name and description
+is($doc->{key},'corpusSigle','plain sigle: key');
+is($doc->{value},'x','plain sigle: value');
+is($doc->{type},'type:string','plain sigle: type');
+is($doc->{comment},'name:"Name",desc:"Beschreibung"','plain sigle: comment');
+
+$doc = _collection("x/"); # the trailing slash is not expected in the value
+is($doc->{key},'corpusSigle','plain sigle with trailing slash: key');
+is($doc->{value},'x','plain sigle with trailing slash: value');
+is($doc->{type},'type:string','plain sigle with trailing slash: type');
+
+$doc = _collection("x[0-3]",'Na"me','Besch"re\'ibung'); # character class; double quotes are expected escaped in the comment
+is($doc->{key},'corpusSigle','regex sigle: key');
+is($doc->{value},'x[0-3]','regex sigle: value');
+is($doc->{type},'type:regex','regex sigle: type');
+is($doc->{comment},'name:"Na\"me",desc:"Besch\"re\'ibung"','regex sigle: comment with escaped quotes');
+
+$doc = _collection('x[0-3]/'); # the trailing slash is not expected in the value
+is($doc->{key},'corpusSigle','regex sigle with trailing slash: key');
+is($doc->{value},'x[0-3]','regex sigle with trailing slash: value');
+is($doc->{type},'type:regex','regex sigle with trailing slash: type');
+
+$doc = _collection('BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)'); # corpus/document pattern is expected on docSigle
+is($doc->{key},'docSigle','doc sigle regex: key');
+is($doc->{value},'BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)','doc sigle regex: value');
+is($doc->{type},'type:regex','doc sigle regex: type');
+
+$doc = _collection('(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)'); # alternation over several corpora
+is($doc->{key},'docSigle','doc sigle alternation: key');
+is($doc->{value},'(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)','doc sigle alternation: value');
+is($doc->{type},'type:regex','doc sigle alternation: type');
+
+$doc = _collection('FSP/~FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)','fsp-pub','Fachsprachenkorpus'); # "corpus/~pattern": corpus match AND pattern non-match expected
+
+is($doc->{'@type'},'koral:docGroup','exclusion: group type');
+is($doc->{'operation'},'operation:and','exclusion: group operation');
+is($doc->{operands}->[0]->{type},'type:string','exclusion: first operand type');
+is($doc->{operands}->[0]->{match},'match:eq','exclusion: first operand match');
+is($doc->{operands}->[0]->{key},'corpusSigle','exclusion: first operand key');
+is($doc->{operands}->[0]->{value},'FSP','exclusion: first operand value');
+is($doc->{operands}->[1]->{type},'type:regex','exclusion: second operand type');
+is($doc->{operands}->[1]->{match},'match:ne','exclusion: second operand match');
+is($doc->{operands}->[1]->{key},'docSigle','exclusion: second operand key');
+is($doc->{operands}->[1]->{value},'FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)','exclusion: second operand value');
+
+done_testing;