Support intended/extended C2 definitions in vc conversion tool

Change-Id: I340cc892e2355ad73d6396651db90611c3b6df75
diff --git a/list2vc.pl b/list2vc.pl
index ac73ab9..95259cc 100755
--- a/list2vc.pl
+++ b/list2vc.pl
@@ -101,9 +101,14 @@
 };
 
 
-# Initial VC group
-my $vc = KorAP::VirtualCorpus::Group->new('or');
+# Create an intensional and an extensional VC
+my $vc_ext = KorAP::VirtualCorpus::Group->new('or');
+my $vc_int = KorAP::VirtualCorpus::Group->new('or');
 
+# Reference to the VC group that currently receives fields
+my $vc = \$vc_ext;
+
+my $frozen = 0;
 
 # Iterate over the whole list
 while (!eof $fh) {
@@ -132,13 +137,13 @@
   }
 
   # Get text sigles
-  elsif ($line =~ m!^(?:[^\/\s]+\/){2}[^\/\s]+$!) {
+  elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
     $key = 'text';
     $value = $line;
   }
 
   # Get doc sigles
-  elsif ($line =~ m!^([^\/\s]+\/[^\/\s]+?)(?:\s.+?)?$!) {
+  elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
     $key = 'doc';
     $value = $1;
   }
@@ -160,21 +165,47 @@
 
     # Convert C2 sigle to KorAP form
     $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
-    $vc->add_field(textSigle => $value);
+    ${$vc}->add_field(textSigle => $value);
   }
 
   # Add doc field
   elsif ($key eq 'doc') {
-    $vc->add_field(docSigle => $value);
+    ${$vc}->add_field(docSigle => $value);
   }
 
   # Add corpus field
   elsif ($key eq 'corpus') {
-    $vc->add_field(corpusSigle => $value);
-  };
+    ${$vc}->add_field(corpusSigle => $value);
+  }
+
+  # Mark the vc as frozen
+  # This means that an extended VC area is expected
+  elsif ($key eq 'frozen') {
+    $frozen = 1;
+  }
+
+  # Start/End intended VC area: re-point $vc, never overwrite $vc_ext
+  elsif ($key eq 'intended') {
+    if ($value eq 'start') {
+      $vc = \$vc_int;
+    }
+    elsif ($value ne 'end') {
+      warn 'Unknown intension value ' . $value;
+    };
+  }
+
+  # Start/End extended VC area: re-point $vc back to the extensional VC
+  elsif ($key eq 'extended') {
+    if ($value eq 'start') {
+      $vc = \$vc_ext;
+    }
+    elsif ($value ne 'end') {
+      warn 'Unknown extension value ' . $value;
+    };
+  }
 };
 
 close($fh);
 
-# Stringify virtual corpus
-print $vc->to_string;
+# Stringify the extended virtual corpus only (intended fields stay out)
+print $vc_ext->to_string;
diff --git a/t/data/list3.def b/t/data/list3.def
new file mode 100644
index 0000000..b6543a6
--- /dev/null
+++ b/t/data/list3.def
@@ -0,0 +1,58 @@
+<name>VAS-N91 (Stand 2013, korr. 2017)</name>
+
+<frozen></frozen>
+
+<intended>start</intended>
+
+<name>1991-2012</name>
+<date>m1=1991/1 bis 2012/12</date>
+<end></end>
+
+<name>Berliner Zeitung</name>
+<ql>Berliner Zeitung</ql>
+<and>1991-2012</and>
+<redabs>143237</redabs>
+<end></end>
+
+<name>Frankfurter Allgemeine</name>
+<cn>F97 Frankfurter Allgemeine 1997</cn>
+<cn>F99 Frankfurter Allgemeine 1999</cn>
+<cn>F01 Frankfurter Allgemeine 2001</cn>
+<cn>F03 Frankfurter Allgemeine 2003</cn>
+<cn>F05 Frankfurter Allgemeine 2005</cn>
+<redabs>301166</redabs>
+<end></end>
+
+<name>VAS N91</name>
+<add>Berliner Zeitung</add>
+<add>Braunschweiger Zeitung</add>
+<add>Hamburger Morgenpost</add>
+<add>Hannoversche Allgemeine</add>
+<add>Die Rheinpfalz</add>
+<add>Mannheimer Morgen</add>
+<add>Rhein-Zeitung</add>
+<add>Nürnberger Nachrichten</add>
+<add>Nürnberger Zeitung</add>
+<add>die tageszeitung</add>
+<add>Frankfurter Allgemeine</add>
+<add>Frankfurter Rundschau</add>
+<add>Burgenländische Volkszeitung</add>
+<add>Die Presse</add>
+<add>Kleine Zeitung</add>
+<add>Neue Kronen-Zeitung</add>
+<add>Niederösterreichische Nachrichten</add>
+<add>Salzburger Nachrichten</add>
+<add>Tiroler Tageszeitung</add>
+<add>Vorarlberger Nachrichten</add>
+<add>Die Südostschweiz</add>
+<add>St. Galler Tagblatt</add>
+<add>Zürcher Tagesanzeiger</add>
+<and>1991-2012</and>
+<end></end>
+
+<intended>end</intended>
+
+<extended>start</extended>
+<text>A00/APR.23232</text>
+<text>A00/APR.23233</text>
+<extended>end</extended>
diff --git a/t/list2vc-def.t b/t/list2vc-def.t
index 7810408..d38d9ee 100644
--- a/t/list2vc-def.t
+++ b/t/list2vc-def.t
@@ -41,4 +41,21 @@
 is($op2->{'value'}->[0], "B19/AUG/01665", 'value');
 is($op2->{'value'}->[1], ,"B19/AUG/01666", 'value');
 
+
+my $list2 = catfile(dirname(__FILE__), 'data', 'list3.def');
+
+# Check JSON
+# Only return extended area
+$json = decode_json(join('', `$script $list2`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+
+$op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'textSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "A00/APR/23232", 'value');
+is($op1->{'value'}->[1], ,"A00/APR/23233", 'value');
+
 done_testing;