Support intended/extended C2 definitions in vc conversion tool
Change-Id: I340cc892e2355ad73d6396651db90611c3b6df75
diff --git a/tools/list2vc.pl b/tools/list2vc.pl
index ac73ab9..95259cc 100755
--- a/tools/list2vc.pl
+++ b/tools/list2vc.pl
@@ -101,9 +101,14 @@
};
-# Initial VC group
-my $vc = KorAP::VirtualCorpus::Group->new('or');
+# Create an intensional and an extensional VC (both are 'or' groups)
+my $vc_ext = KorAP::VirtualCorpus::Group->new('or');
+my $vc_int = KorAP::VirtualCorpus::Group->new('or');
# Initial VC group
+my $vc = \$vc_ext; # Scalar reference; fields are added via ${$vc}
+
+my $frozen = 0; # Set to 1 once a 'frozen' key is read
# Iterate over the whole list
while (!eof $fh) {
@@ -132,13 +137,13 @@
}
# Get text sigles
- elsif ($line =~ m!^(?:[^\/\s]+\/){2}[^\/\s]+$!) {
+ elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
$key = 'text';
$value = $line;
}
# Get doc sigles
- elsif ($line =~ m!^([^\/\s]+\/[^\/\s]+?)(?:\s.+?)?$!) {
+ elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
$key = 'doc';
$value = $1;
}
@@ -160,21 +165,47 @@
# Convert C2 sigle to KorAP form
$value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
- $vc->add_field(textSigle => $value);
+ ${$vc}->add_field(textSigle => $value);
}
# Add doc field
elsif ($key eq 'doc') {
- $vc->add_field(docSigle => $value);
+ ${$vc}->add_field(docSigle => $value);
}
# Add corpus field
elsif ($key eq 'corpus') {
- $vc->add_field(corpusSigle => $value);
- };
+ ${$vc}->add_field(corpusSigle => $value);
+ }
+
+ # Mark the vc as frozen
+ # This means that an extended VC area is expected
+ elsif ($key eq 'frozen') {
+ $frozen = 1;
+ }
+
+ # Start/End intended VC area: fields that follow go to the intensional VC
+ elsif ($key eq 'intended') {
+ if ($value eq 'start') {
+ $vc = \$vc_int; # Repoint only; assigning through $$vc would overwrite $vc_ext
+ }
+ elsif ($value ne 'end') {
+ warn 'Unknown intension value ' . $value;
+ };
+ }
+
+ # Start/End extended VC area: fields that follow go to the extensional VC
+ elsif ($key eq 'extended') {
+ if ($value eq 'start') {
+ $vc = \$vc_ext; # Switch back without touching either group object
+ }
+ elsif ($value ne 'end') {
+ warn 'Unknown extension value ' . $value;
+ };
+ }
};
close($fh);
-# Stringify virtual corpus
-print $vc->to_string;
+# Stringify current (extended) virtual corpus
+print $$vc->to_string;
diff --git a/tools/t/data/list3.def b/tools/t/data/list3.def
new file mode 100644
index 0000000..b6543a6
--- /dev/null
+++ b/tools/t/data/list3.def
@@ -0,0 +1,58 @@
+<name>VAS-N91 (Stand 2013, korr. 2017)</name>
+
+<frozen></frozen>
+
+<intended>start</intended>
+
+<name>1991-2012</name>
+<date>m1=1991/1 bis 2012/12</date>
+<end></end>
+
+<name>Berliner Zeitung</name>
+<ql>Berliner Zeitung</ql>
+<and>1991-2012</and>
+<redabs>143237</redabs>
+<end></end>
+
+<name>Frankfurter Allgemeine</name>
+<cn>F97 Frankfurter Allgemeine 1997</cn>
+<cn>F99 Frankfurter Allgemeine 1999</cn>
+<cn>F01 Frankfurter Allgemeine 2001</cn>
+<cn>F03 Frankfurter Allgemeine 2003</cn>
+<cn>F05 Frankfurter Allgemeine 2005</cn>
+<redabs>301166</redabs>
+<end></end>
+
+<name>VAS N91</name>
+<add>Berliner Zeitung</add>
+<add>Braunschweiger Zeitung</add>
+<add>Hamburger Morgenpost</add>
+<add>Hannoversche Allgemeine</add>
+<add>Die Rheinpfalz</add>
+<add>Mannheimer Morgen</add>
+<add>Rhein-Zeitung</add>
+<add>Nürnberger Nachrichten</add>
+<add>Nürnberger Zeitung</add>
+<add>die tageszeitung</add>
+<add>Frankfurter Allgemeine</add>
+<add>Frankfurter Rundschau</add>
+<add>Burgenländische Volkszeitung</add>
+<add>Die Presse</add>
+<add>Kleine Zeitung</add>
+<add>Neue Kronen-Zeitung</add>
+<add>Niederösterreichische Nachrichten</add>
+<add>Salzburger Nachrichten</add>
+<add>Tiroler Tageszeitung</add>
+<add>Vorarlberger Nachrichten</add>
+<add>Die Südostschweiz</add>
+<add>St. Galler Tagblatt</add>
+<add>Zürcher Tagesanzeiger</add>
+<and>1991-2012</and>
+<end></end>
+
+<intended>end</intended>
+
+<extended>start</extended>
+<text>A00/APR.23232</text>
+<text>A00/APR.23233</text>
+<extended>end</extended>
diff --git a/tools/t/list2vc-def.t b/tools/t/list2vc-def.t
index 7810408..d38d9ee 100644
--- a/tools/t/list2vc-def.t
+++ b/tools/t/list2vc-def.t
@@ -41,4 +41,21 @@
is($op2->{'value'}->[0], "B19/AUG/01665", 'value');
is($op2->{'value'}->[1], ,"B19/AUG/01666", 'value');
+
+my $list2 = catfile(dirname(__FILE__), 'data', 'list3.def');
+
+# Convert a definition file that has both an intended and an extended area
+# Only the text sigles of the extended area are expected in the JSON
+$json = decode_json(join('', `$script $list2`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+
+$op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'textSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "A00/APR/23232", 'value');
+is($op1->{'value'}->[1], "A00/APR/23233", 'value');
+
done_testing;