Support referencing embedded virtual corpora in vc conversion tool
Change-Id: I71d4243b948a8f235b249f98fb7a0050ec59b911
diff --git a/tools/list2vc.pl b/tools/list2vc.pl
index d68e22d..a25bf29 100755
--- a/tools/list2vc.pl
+++ b/tools/list2vc.pl
@@ -39,9 +39,10 @@
## Create collection object
my $json = '{';
$json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
- $json .= '"collection":{';
+ $json .= '"comment":"Name: ' . $self->equote($self->name) . '",' if $self->name;
+ $json .= '"collection":';
$json .= $self->_to_fragment;
- return $json .= '}}';
+ return $json .= '}';
};
@@ -55,52 +56,114 @@
sub new {
my $class = shift;
bless {
- op => shift,
- fields => {}
+ with => [],
+ with_fields => {},
+ without => [],
+ without_fields => {},
}, $class;
};
+# Define an operand to be "or"ed
+sub with {
+ my $self = shift;
+ push @{$self->{with}}, shift;
+};
-# Add field information to group
-sub add_field {
+
+# Define a field that should be "or"ed
+sub with_field {
my $self = shift;
my $field = shift;
- push @{$self->{fields}->{$field}}, shift;
+ push @{$self->{with_fields}->{$field}}, shift;
};
+# Define an operand to be "and"ed
+sub without {
+ my $self = shift;
+ push @{$self->{without}}, shift;
+};
+
+
+# Define a field that should be "and"ed
+sub without_field {
+ my $self = shift;
+ my $field = shift;
+ push @{$self->{without_fields}->{$field}}, shift;
+};
+
+# Create a document vector field
+sub _doc_vec {
+ my $field = shift;
+ my $vec = shift;
+ my $json = '{';
+ $json .= '"@type":"koral:doc",';
+ $json .= '"key":"' . $field . '",';
+ $json .= '"match":"match:eq",';
+ $json .= '"value":[';
+ $json .= join ',', map { '"' . $_ . '"' } @$vec;
+ $json .= ']';
+ $json .= '},';
+ return $json;
+}
+
# Stringify fragment
sub _to_fragment {
my $self = shift;
- my $json = '';
- unless (keys %{$self->{fields}}) {
- return $json . '}}';
- };
-
+ my $json = '{';
$json .= '"@type":"koral:docGroup",';
$json .= '"comment":"Name: ' . $self->equote($self->name) . '",' if $self->name;
- $json .= '"operation":"operation:' . $self->{op} . '",';
- $json .= '"operands":[';
- foreach my $field (sort keys %{$self->{fields}}) {
- unless (@{$self->{fields}->{$field}}) {
- next;
+ # Make the outer group "and"
+ if (keys %{$self->{without_fields}}) {
+ $json .= '"operation":"operation:and",';
+ $json .= '"operands":[';
+
+ foreach my $field (sort keys %{$self->{without_fields}}) {
+ unless (@{$self->{without_fields}->{$field}}) {
+ next;
+ };
+ $json .= _doc_vec($field, $self->{without_fields}->{$field});
};
- $json .= '{';
- $json .= '"@type":"koral:doc",';
- $json .= '"key":"' . $field . '",';
- $json .= '"match":"match:eq",';
- $json .= '"value":[';
- $json .= join ',', map { '"' . $_ . '"' } @{$self->{fields}->{$field}};
- $json .= ']';
- $json .= '},';
+
+ # Remove the last comma
+ chop $json;
+
+ $json .= ']';
+ }
+
+ elsif (keys %{$self->{with_fields}} || @{$self->{with}}) {
+ $json .= '"operation":"operation:or",';
+
+ # TODO:
+ # Flatten embedded or-VCs!
+ $json .= '"operands":[';
+
+ foreach my $field (sort keys %{$self->{with_fields}}) {
+ unless (@{$self->{with_fields}->{$field}}) {
+ next;
+ };
+ $json .= _doc_vec($field, $self->{with_fields}->{$field});
+ };
+
+ foreach my $op (@{$self->{with}}) {
+ $json .= $op->_to_fragment . ',';
+ };
+
+ # Remove the last comma
+ chop $json;
+
+ $json .= ']';
+ }
+
+ # No operands in the group
+ else {
+ # Remove the trailing comma left by the last emitted member (type or comment)
+ chop $json;
};
- # Remove the last comma
- chop $json;
-
- return $json . ']';
+ return $json . '}';
};
@@ -126,6 +189,7 @@
};
+# Shorten long strings for logging
sub _shorten ($) {
my $line = shift;
if (length($line) < 20) {
@@ -147,12 +211,15 @@
# Create an intensional and an extensional VC
-my $vc_ext = KorAP::VirtualCorpus::Group->new('or');
-my $vc_int = KorAP::VirtualCorpus::Group->new('or');
+my $vc_ext = KorAP::VirtualCorpus::Group->new;
+my $vc_int = KorAP::VirtualCorpus::Group->new;
# Initial VC group
my $vc = \$vc_ext;
+# Collect all virtual corpora
+my %all_vcs;
+
my $frozen = 0;
# Iterate over the whole list
@@ -210,17 +277,25 @@
# Convert C2 sigle to KorAP form
$value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
- ${$vc}->add_field(textSigle => $value);
+ ${$vc}->with_field(textSigle => $value);
}
# Add doc field
elsif ($key eq 'doc') {
- ${$vc}->add_field(docSigle => $value);
+ ${$vc}->with_field(docSigle => $value);
}
# Add corpus field
elsif ($key eq 'corpus') {
- ${$vc}->add_field(corpusSigle => $value);
+ ${$vc}->with_field(corpusSigle => $value);
+ }
+
+ # Add corpus field from a corpus name ('cn') entry
+ elsif ($key eq 'cn') {
+ # Corpus sigle, e.g. 'F97 Frankfurter Allgemeine 1997'
+ if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
+ ${$vc}->with_field(corpusSigle => $1);
+ };
}
# Mark the vc as frozen
@@ -256,11 +331,31 @@
# <add>, <sub>"
# No global name defined yet
- unless ($$vc->name) {
+ if ($$vc && !$$vc->name) {
$vc_ext->name($value);
$vc_int->name($value);
next;
};
+
+ ${$vc} = KorAP::VirtualCorpus::Group->new;
+ ${$vc}->name($value);
+ }
+
+ # End VC def
+ elsif ($key eq 'end') {
+ $all_vcs{${$vc}->name} = $$vc;
+ # $vc = undef;
+ }
+
+ # Add VC definition
+ elsif ($key eq 'add') {
+ unless (defined $all_vcs{$value}) {
+ # warn 'VC ' . $value . ' not defined';
+ # exit(1);
+ next;
+ };
+
+ $$vc->with($all_vcs{$value});
}
# Unknown
@@ -271,5 +366,5 @@
close($fh);
-# Stringify current (extended) virtual corpus
+# Stringify current (extended?) virtual corpus
print $$vc->to_string;
diff --git a/tools/t/data/list4.def b/tools/t/data/list4.def
new file mode 100644
index 0000000..0cd73aa
--- /dev/null
+++ b/tools/t/data/list4.def
@@ -0,0 +1,27 @@
+<name>VAS-N91 (Stand "2013", korr. 2017)</name>
+
+<name>1991-2012</name>
+<date>m1=1991/1 bis 2012/12</date>
+<end></end>
+
+<name>Berliner Zeitung</name>
+<ql>Berliner Zeitung</ql>
+<and>1991-2012</and>
+<redabs>143237</redabs>
+<end></end>
+
+<name>Frankfurter Allgemeine</name>
+<cn>F97 Frankfurter Allgemeine 1997</cn>
+<cn>F99 Frankfurter Allgemeine 1999</cn>
+<cn>F01 Frankfurter Allgemeine 2001</cn>
+<cn>F03 Frankfurter Allgemeine 2003</cn>
+<cn>F05 Frankfurter Allgemeine 2005</cn>
+<redabs>301166</redabs>
+<end></end>
+
+<name>VAS N91</name>
+<add>Berliner Zeitung</add>
+<add>Frankfurter Allgemeine</add>
+<add>Frankfurter Rundschau</add>
+<and>1991-2012</and>
+<end></end>
diff --git a/tools/t/list2vc-def.t b/tools/t/list2vc-def.t
index 508c8f9..d60ca8d 100644
--- a/tools/t/list2vc-def.t
+++ b/tools/t/list2vc-def.t
@@ -4,6 +4,7 @@
use Test::More;
use File::Basename;
use File::Spec::Functions;
+use Data::Dumper;
use Test::Output;
use Mojo::JSON 'decode_json';
@@ -41,17 +42,16 @@
is($op2->{'value'}->[0], "B19/AUG/01665", 'value');
is($op2->{'value'}->[1], ,"B19/AUG/01666", 'value');
-
-my $list2 = catfile(dirname(__FILE__), 'data', 'list3.def');
+my $list3 = catfile(dirname(__FILE__), 'data', 'list3.def');
# Check JSON
# Only return extended area
-$json = decode_json(join('', `$script $list2`));
+$json = decode_json(join('', `$script $list3`));
is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
-is($json->{'collection'}->{'comment'}, 'Name: "VAS-N91 (Stand \"2013\", korr. 2017)"', 'type');
-
+# is($json->{'collection'}->{'comment'}, 'Name: "VAS-N91 (Stand \"2013\", korr. 2017)"', 'type');
+is($json->{'collection'}->{'comment'}, 'Name: "VAS N91"', 'type');
$op1 = $json->{'collection'}->{'operands'}->[0];
is($op1->{'@type'}, 'koral:doc', 'type');
@@ -60,4 +60,16 @@
is($op1->{'value'}->[0], "A00/APR/23232", 'value');
is($op1->{'value'}->[1], ,"A00/APR/23233", 'value');
+
+my $list4 = catfile(dirname(__FILE__), 'data', 'list4.def');
+
+# Only contains intended area
+$json = decode_json(join('', `$script $list4`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+# is($json->{'collection'}->{'comment'}, 'Name: "VAS-N91 (Stand \"2013\", korr. 2017)"', 'type');
+is($json->{'collection'}->{'comment'}, 'Name: "VAS N91"', 'type');
+
+
done_testing;