Support regex definition for virtual corpora
Change-Id: Iecf55d050f02b019c2591f100cd4d45cb90488a7
diff --git a/lib/KorAP/DefList.pm b/lib/KorAP/DefList.pm
new file mode 100644
index 0000000..09fd1cb
--- /dev/null
+++ b/lib/KorAP/DefList.pm
@@ -0,0 +1,135 @@
+package KorAP::DefList;
+use KorAP::Def;
+use KorAP::VirtualCorpus::Group;
+use KorAP::VirtualCorpus::Doc;
+use strict;
+use warnings;
+
+# Constructor
+sub new {
+ my $class = shift;
+ my $file = shift;
+
+ warn 'xx';
+
+ if (ref $file && ref $file eq 'GLOB') {
+ return bless {
+ file => '',
+ fh => $file
+ }, $class;
+ };
+
+ my $self = {
+ file => $file
+ };
+
+ # Open def file
+ if (!open($self->{fh}, '<' . $file)) {
+ warn $ARGV[0] . " can't be opened";
+ exit(0);
+ };
+
+ return bless $self, $class;
+};
+
+
+# Parse list file
+sub parse {
+ my $self = shift;
+
+ while (!eof($self->{fh})) {
+ my $line = readline($self->{fh});
+
+ next if $line =~ /^\#/;
+
+ if ($line =~ /^(.+?)\t+\@COPY\@\s*$/) {
+
+ # Take from Def file!
+ $self->copy_vc($1);
+ }
+
+ elsif ($line =~ /^([^\t]+?)\t+(.*?)\t(.+?)\/?$/) {
+ $self->regex_to_vc($1,$2,$3);
+ # print $1,': /'.$3.'/',"\n";
+ }
+
+ elsif ($line =~ /^\s*$/) {
+ # Ignore
+ }
+
+ else {
+ warn 'Unknown VC definition: ', $line,"\n";
+ };
+ };
+
+ close($self->{fh});
+};
+
+sub copy_vc {
+ my $self = shift;
+ my $file = shift;
+ print "Convert ", $file, " from def file\n";
+};
+
+sub regex_to_vc {
+ my $self = shift;
+ my ($name, $desc, $regex) = @_;
+
+ print "Convert ", $name, " from regex\n";
+ return from_regex($regex, $name, $desc);
+};
+
+sub from_regex {
+ my $regex = shift;
+ my $name = shift;
+ my $desc = pop;
+
+ my $vc;
+
+ # Group with negation
+ if ($regex =~ m!^([^~]+?)~(.+?)$!) {
+ $vc = KorAP::VirtualCorpus::Group->new;
+
+ my $pos = KorAP::VirtualCorpus::Doc->new;
+ _regex_to_doc($pos, $1);
+
+ $vc->add('and', $pos);
+
+ my $neg = KorAP::VirtualCorpus::Doc->new;
+ _regex_to_doc($neg, $2);
+
+ $neg->match('ne');
+ $vc->joint($neg);
+ }
+
+ # Simle doc
+ else {
+ $vc = KorAP::VirtualCorpus::Doc->new;
+ _regex_to_doc($vc, $regex);
+ };
+
+ $vc->name($name) if $name;
+ $vc->comment('desc:' . $vc->equote($desc)) if $desc;
+
+ return $vc;
+};
+
+sub _regex_to_doc {
+ my ($vc, $regex) = @_;
+
+ if ($regex =~ m!^([^/]+)\/?$!) {
+ $vc->key('corpusSigle');
+ $vc->value($1);
+ my $value = $1;
+ if ($1 !~ /^[a-zA-Z0-9]+?$/) {
+ $vc->type('regex');
+ };
+ return;
+ }
+
+ $vc->key('docSigle');
+ $vc->value($regex);
+ $vc->type('regex');
+}
+
+1;
diff --git a/lib/KorAP/VirtualCorpus.pm b/lib/KorAP/VirtualCorpus.pm
index 58adc6b..933e4b7 100644
--- a/lib/KorAP/VirtualCorpus.pm
+++ b/lib/KorAP/VirtualCorpus.pm
@@ -42,7 +42,7 @@
# Quote utility function
sub quote {
shift;
- my $str = shift;
+ my $str = shift or return '""';
$str =~ s/(["\\])/\\$1/g;
return qq{"$str"};
};
diff --git a/script/cosmasvc2koralquery b/script/cosmasvc2koralquery
index c0b3ed2..a95605f 100755
--- a/script/cosmasvc2koralquery
+++ b/script/cosmasvc2koralquery
@@ -2,7 +2,9 @@
use strict;
use warnings;
use KorAP::Def;
+use KorAP::DefList;
use lib 'lib';
+use Getopt::Long;
# 2020-05-20
# Preliminary support for C2 def-files.
@@ -12,25 +14,47 @@
# Add KorAP::Def.
our $VERSION = 0.2;
-
our @ARGV;
-unless (@ARGV) {
- print <<'HELP';
-Convert a line-separated list of corpus sigles, doc sigles or
-text sigles into a virtual corpus query.
+my $cmd = shift @ARGV;
- $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
- $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
+
+my $input;
+
+GetOptions (
+ "input|i=s" => \$input
+)
+or die("Error in command line arguments\n");
+
+if ($cmd ne 'def' && $cmd ne 'list') {
+ print <<'HELP';
+Convert a list of C2 VC definitions or a single definition into
+KoralQuery VCs.
+
+ $ perl cosmasvc2koralquery def my_vc.txt | gzip -vc > my_vc.jsonld.gz
+ $ cat my_vc.txt | perl cosmasvc2koralquery def - | gzip -vc > my_vc.jsonld.gz
+
+Commands: def, list
HELP
-exit 0;
+exit 1;
};
+
+# Process a list
+if ($cmd eq 'list') {
+ KorAP::DefList->new($input || $ARGV[0])->parse;
+ exit(0);
+};
+
+# Parse a single def
my $def_parser;
if ($ARGV[0] eq '-') {
$def_parser = KorAP::Def->new(\*STDIN);
}
+elsif ($input) {
+ $def_parser = KorAP::Def->new($input);
+}
else {
$def_parser = KorAP::Def->new($ARGV[0]);
};
diff --git a/t/data/list-example.ls b/t/data/list-example.ls
new file mode 100644
index 0000000..6878eef
--- /dev/null
+++ b/t/data/list-example.ls
@@ -0,0 +1,20 @@
+# BEMERKUNG:
+# - alle Korpusdateinamen klein schreiben (wegen des Skripte in doIndexIds)
+
+bih Herausgebertexte zum Korpus bio BIH
+bio Biografische Literatur BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)
+bio-pub Biografische Literatur BIO/~BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)
+l Berliner Morgenpost L[0-9][0-9]/
+#l-n l-n L20/
+dpa Meldungen der Deutschen Presse-Agentur DPA[0-9][0-9]/
+fsp Fachsprachenkorpus FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)
+fsp-pub Fachsprachenkorpus FSP/~FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)
+kjl Kinder- und Jugendliteratur KJL/
+
+thm-lit Belletristik/Trivialliteratur: Thomas-Mann-Korpus THM/(AMB|AMD|AME|AMF|AMH|AMJ|AMK|AML|AMN|AMZ)
+wxx11 Wikipedia Artikel und Diskussionen W[PD]D11/
+zca Zeit Campus (Feb.-Apr.;Jun.;Aug.;Okt.-Dez.; Dez. 2013 n.v.) ZCA[0-9][0-9]/
+corp-w-gesamt.2023-i.16.03.23 @COPY@
+corp-a @COPY@
+misc-lit Belletristik/Trivialliteratur (öffentlich) (GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)
+
diff --git a/t/list2vc-def.t b/t/list2vc-def.t
index 6a5fcec..37adadb 100644
--- a/t/list2vc-def.t
+++ b/t/list2vc-def.t
@@ -15,14 +15,14 @@
# Check STDOUT
stdout_like(
sub {
- system($script, $list1);
+ system($script, 'def', $list1);
},
qr!^\{\"\@context\".+?\}$!,
"check stdout"
);
# Check JSON
-my $json = decode_json(join('', `$script $list1`));
+my $json = decode_json(join('', `$script def $list1`));
is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
@@ -47,7 +47,7 @@
# Check JSON
# Only return extended area
-$json = decode_json(join('', `$script $list3`));
+$json = decode_json(join('', `$script def $list3`));
is($json->{'collection'}->{'@type'}, 'koral:doc', 'type');
@@ -64,7 +64,7 @@
my $list4 = catfile(dirname(__FILE__), 'data', 'list4.def');
# Only contains intended area
-$json = decode_json(join('', `$script $list4`));
+$json = decode_json(join('', `$script def $list4`));
is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
is($json->{'collection'}->{'comment'}, 'name:"VAS N91"', 'name');
diff --git a/t/list2vc-deflist.t b/t/list2vc-deflist.t
new file mode 100644
index 0000000..41c2176
--- /dev/null
+++ b/t/list2vc-deflist.t
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+use Data::Dumper;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'script', 'cosmasvc2koralquery');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list-example.ls');
+
+# Check STDOUT
+stdout_like(
+ sub {
+ system($script, 'list', $list1);
+ },
+ qr!Convert!,
+ "check stdout"
+);
+
+# Check JSON
+# my $protocol = join('', `$script list $list1`);
+
+done_testing;
+__END__
diff --git a/t/list2vc.t b/t/list2vc.t
index fd7ea52..1dc3c4c 100644
--- a/t/list2vc.t
+++ b/t/list2vc.t
@@ -4,7 +4,6 @@
use Test::More;
use File::Basename;
use File::Spec::Functions;
-
use Test::Output;
use Mojo::JSON 'decode_json';
@@ -14,14 +13,14 @@
# Check STDOUT
stdout_like(
sub {
- system($script, $list1);
+ system($script, 'def', $list1);
},
qr!^\{\"\@context\".+?\}$!,
"check stdout"
);
# Check JSON
-my $json = decode_json(join('', `$script $list1`));
+my $json = decode_json(join('', `$script def $list1`));
is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
@@ -46,7 +45,7 @@
# Check STDIN
-my $json2 = decode_json(join('', `cat $list1 | $script -`));
+my $json2 = decode_json(join('', `cat $list1 | $script def -`));
is_deeply($json, $json2);
done_testing;
diff --git a/t/regex2vc.t b/t/regex2vc.t
new file mode 100644
index 0000000..fbdb721
--- /dev/null
+++ b/t/regex2vc.t
@@ -0,0 +1,59 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use KorAP::DefList;
+use Mojo::JSON 'decode_json';
+
+my $rf = \&KorAP::DefList::from_regex;
+
+sub _collection {
+ return decode_json($rf->(@_)->to_string)->{collection}
+}
+
+my $doc = _collection("x",'Name','Beschreibung');
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x');
+is($doc->{type},'type:string');
+is($doc->{comment},'name:"Name",desc:"Beschreibung"');
+
+$doc = _collection("x/");
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x');
+is($doc->{type},'type:string');
+
+$doc = _collection("x[0-3]",'Na"me','Besch"re\'ibung');
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x[0-3]');
+is($doc->{type},'type:regex');
+is($doc->{comment},'name:"Na\"me",desc:"Besch\"re\'ibung"');
+
+$doc = _collection('x[0-3]/');
+is($doc->{key},'corpusSigle');
+is($doc->{value},'x[0-3]');
+is($doc->{type},'type:regex');
+
+$doc = _collection('BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)');
+is($doc->{key},'docSigle');
+is($doc->{value},'BIO/(BKA|LTI|TK1|TK2|TK3|TK4|TK5|TK6)');
+is($doc->{type},'type:regex');
+
+$doc = _collection('(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)');
+is($doc->{key},'docSigle');
+is($doc->{value},'(GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)');
+is($doc->{type},'type:regex');
+
+$doc = _collection('FSP/~FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)','fsp-pub','Fachsprachenkorpus');
+
+is($doc->{'@type'},'koral:docGroup');
+is($doc->{'operation'},'operation:and');
+is($doc->{operands}->[0]->{type},'type:string');
+is($doc->{operands}->[0]->{match},'match:eq');
+is($doc->{operands}->[0]->{key},'corpusSigle');
+is($doc->{operands}->[0]->{value},'FSP');
+is($doc->{operands}->[1]->{type},'type:regex');
+is($doc->{operands}->[1]->{match},'match:ne');
+is($doc->{operands}->[1]->{key},'docSigle');
+is($doc->{operands}->[1]->{value},'FSP/(ANG|ANR|EIN|GEB|KAR|REI|SCH|TYP|VER|VID)');
+
+done_testing;