Support referencing embedded virtual corpora in vc conversion tool

Change-Id: I71d4243b948a8f235b249f98fb7a0050ec59b911
diff --git a/list2vc.pl b/list2vc.pl
index d68e22d..a25bf29 100755
--- a/list2vc.pl
+++ b/list2vc.pl
@@ -39,9 +39,10 @@
   ## Create collection object
   my $json = '{';
   $json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
-  $json .= '"collection":{';
+  $json .= '"comment":"Name: ' . $self->equote($self->name) .  '",' if $self->name;
+  $json .= '"collection":';
   $json .= $self->_to_fragment;
-  return $json .= '}}';
+  return $json .= '}';
 };
 
 
@@ -55,52 +56,114 @@
 sub new {
   my $class = shift;
   bless {
-    op => shift,
-    fields => {}
+    with => [],            # Operands collected by with()
+    with_fields => {},     # Field name => list of values, collected by with_field()
+    without => [],         # Operands collected by without()
+    without_fields => {},  # Field name => list of values, collected by without_field()
   }, $class;
 };
 
+# Register an operand (e.g. an embedded group) that is "or"ed with the others
+sub with {
+  my ($self, $operand) = @_;
+  push @{$self->{with}}, $operand;
+};
 
-# Add field information to group
-sub add_field {
+
+# Add a value for the given field; collected field values are "or"ed
+sub with_field {
   my $self = shift;
   my $field = shift;
-  push @{$self->{fields}->{$field}}, shift;
+  push @{$self->{with_fields}->{$field}}, shift;  # The value list springs into existence on first use
 };
 
+# Register an operand for the "and"ed part of the group (kept in the "without" list)
+sub without {
+  my ($self, $operand) = @_;
+  push @{$self->{without}}, $operand;
+};
+
+
+# Collect a value for a field that belongs to the "and"ed part of the group
+sub without_field {
+  my ($self, $field, $value) = @_;
+  my $values = $self->{without_fields}->{$field} ||= [];
+  push @$values, $value;
+};
+
+# Serialize one koral:doc operand that matches $field against the values in @$vec.
+# Returns the JSON object as a string, followed by a comma (callers chop the last one).
+sub _doc_vec {
+  my ($field, $vec) = @_;
+
+  # Escape backslashes and double quotes, so no value can break out of its JSON string
+  my @values = map { (my $v = $_) =~ s/(["\\])/\\$1/g; '"' . $v . '"' } @$vec;
+  my $json = '{';
+  $json .= '"@type":"koral:doc",';
+  $json .= '"key":"' . $field . '",';
+  $json .= '"match":"match:eq",';
+  $json .= '"value":[' . join(',', @values) . ']';
+  return $json . '},';
+}
+
 
 # Stringify fragment
 sub _to_fragment {
   my $self = shift;
-  my $json = '';
 
-  unless (keys %{$self->{fields}}) {
-    return $json . '}}';
-  };
-
+  my $json = '{';  # The fragment opens (and below closes) its own JSON object
   $json .= '"@type":"koral:docGroup",';
   $json .= '"comment":"Name: ' . $self->equote($self->name) .  '",' if $self->name;
-  $json .= '"operation":"operation:' . $self->{op} . '",';
-  $json .= '"operands":[';
 
-  foreach my $field (sort keys %{$self->{fields}}) {
-    unless (@{$self->{fields}->{$field}}) {
-      next;
+  # "without" fields take precedence: emit an "and" group built from without_fields only
+  if (keys %{$self->{without_fields}}) {
+    $json .= '"operation":"operation:and",';
+    $json .= '"operands":[';
+    # NOTE(review): with, with_fields and without are not serialized in this branch
+    foreach my $field (sort keys %{$self->{without_fields}}) {
+      unless (@{$self->{without_fields}->{$field}}) {
+        next;  # Skip fields without any value
+      };
+      $json .= _doc_vec($field, $self->{without_fields}->{$field});
     };
-    $json .= '{';
-    $json .= '"@type":"koral:doc",';
-    $json .= '"key":"' . $field . '",';
-    $json .= '"match":"match:eq",';
-    $json .= '"value":[';
-    $json .= join ',', map { '"' . $_ . '"' } @{$self->{fields}->{$field}};
-    $json .=  ']';
-    $json .= '},';
+
+    # Remove the trailing comma (NOTE(review): chops the "[" if no operand was emitted)
+    chop $json;
+
+    $json .= ']';
+  }
+  # Otherwise "or" all with_fields and all embedded operands registered via with()
+  elsif (keys %{$self->{with_fields}} || @{$self->{with}}) {
+    $json .= '"operation":"operation:or",';
+
+    # TODO:
+    #   Flatten embedded or-VCs!
+    $json .= '"operands":[';
+
+    foreach my $field (sort keys %{$self->{with_fields}}) {
+      unless (@{$self->{with_fields}->{$field}}) {
+        next;  # Skip fields without any value
+      };
+      $json .= _doc_vec($field, $self->{with_fields}->{$field});
+    };
+
+    foreach my $op (@{$self->{with}}) {
+      $json .= $op->_to_fragment . ',';  # Embedded operands are serialized recursively
+    };
+
+    # Remove the trailing comma left by the last operand
+    chop $json;
+
+    $json .= ']';
+  }
+
+  # No operands in the group
+  else {
+    # Remove the trailing comma after the last member ("comment", or "@type" if unnamed)
+    chop $json;
   };
 
-  # Remove the last comma
-  chop $json;
-
-  return $json . ']';
+  return $json . '}';  # Close the object opened at the top
 };
 
 
@@ -126,6 +189,7 @@
 };
 
 
+# Shorten long strings for logging
 sub _shorten ($) {
   my $line = shift;
   if (length($line) < 20) {
@@ -147,12 +211,15 @@
 
 
 # Create an intensional and an extensional VC
-my $vc_ext = KorAP::VirtualCorpus::Group->new('or');
-my $vc_int = KorAP::VirtualCorpus::Group->new('or');
+my $vc_ext = KorAP::VirtualCorpus::Group->new;
+my $vc_int = KorAP::VirtualCorpus::Group->new;
 
 # Initial VC group
 my $vc = \$vc_ext;
 
+# Collect all virtual corpora
+my %all_vcs;
+
 my $frozen = 0;
 
 # Iterate over the whole list
@@ -210,17 +277,25 @@
 
     # Convert C2 sigle to KorAP form
     $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
-    ${$vc}->add_field(textSigle => $value);
+    ${$vc}->with_field(textSigle => $value);
   }
 
   # Add doc field
   elsif ($key eq 'doc') {
-    ${$vc}->add_field(docSigle => $value);
+    ${$vc}->with_field(docSigle => $value);
   }
 
   # Add corpus field
   elsif ($key eq 'corpus') {
-    ${$vc}->add_field(corpusSigle => $value);
+    ${$vc}->with_field(corpusSigle => $value);
+  }
+
+  # Add corpus field taken from a corpus name (<cn>) line
+  elsif ($key eq 'cn') {
+    # Corpus sigle followed by a title, e.g. 'F97 Frankfurter Allgemeine 1997'
+    if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
+      ${$vc}->with_field(corpusSigle => $1);
+    };
   }
 
   # Mark the vc as frozen
@@ -256,11 +331,31 @@
     # <add>, <sub>"
 
     # No global name defined yet
-    unless ($$vc->name) {
+    if ($$vc && !$$vc->name) {
       $vc_ext->name($value);
       $vc_int->name($value);
       next;
     };
+
+    ${$vc} = KorAP::VirtualCorpus::Group->new;
+    ${$vc}->name($value);
+  }
+
+  # End VC def
+  elsif ($key eq 'end') {
+    $all_vcs{${$vc}->name} = $$vc;
+    # $vc = undef;
+  }
+
+  # Embed a previously defined VC as an operand
+  elsif ($key eq 'add') {
+    unless (defined $all_vcs{$value}) {
+      #       warn 'VC ' . $value . ' not defined';
+      # exit(1);
+      next;
+    };
+
+    $$vc->with($all_vcs{$value});
   }
 
   # Unknown
@@ -271,5 +366,5 @@
 
 close($fh);
 
-# Stringify current (extended) virtual corpus
+# Stringify current (extended?) virtual corpus
 print $$vc->to_string;
diff --git a/t/data/list4.def b/t/data/list4.def
new file mode 100644
index 0000000..0cd73aa
--- /dev/null
+++ b/t/data/list4.def
@@ -0,0 +1,27 @@
+<name>VAS-N91 (Stand "2013", korr. 2017)</name>
+
+<name>1991-2012</name>
+<date>m1=1991/1 bis 2012/12</date>
+<end></end>
+
+<name>Berliner Zeitung</name>
+<ql>Berliner Zeitung</ql>
+<and>1991-2012</and>
+<redabs>143237</redabs>
+<end></end>
+
+<name>Frankfurter Allgemeine</name>
+<cn>F97 Frankfurter Allgemeine 1997</cn>
+<cn>F99 Frankfurter Allgemeine 1999</cn>
+<cn>F01 Frankfurter Allgemeine 2001</cn>
+<cn>F03 Frankfurter Allgemeine 2003</cn>
+<cn>F05 Frankfurter Allgemeine 2005</cn>
+<redabs>301166</redabs>
+<end></end>
+
+<name>VAS N91</name>
+<add>Berliner Zeitung</add>
+<add>Frankfurter Allgemeine</add>
+<add>Frankfurter Rundschau</add>
+<and>1991-2012</and>
+<end></end>
diff --git a/t/list2vc-def.t b/t/list2vc-def.t
index 508c8f9..d60ca8d 100644
--- a/t/list2vc-def.t
+++ b/t/list2vc-def.t
@@ -4,6 +4,7 @@
 use Test::More;
 use File::Basename;
 use File::Spec::Functions;
+use Data::Dumper;
 
 use Test::Output;
 use Mojo::JSON 'decode_json';
@@ -41,17 +42,16 @@
 is($op2->{'value'}->[0], "B19/AUG/01665", 'value');
 is($op2->{'value'}->[1], ,"B19/AUG/01666", 'value');
 
-
-my $list2 = catfile(dirname(__FILE__), 'data', 'list3.def');
+my $list3 = catfile(dirname(__FILE__), 'data', 'list3.def');
 
 # Check JSON
 # Only return extended area
-$json = decode_json(join('', `$script $list2`));
+$json = decode_json(join('', `$script $list3`));
 
 is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
 is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
-is($json->{'collection'}->{'comment'}, 'Name: "VAS-N91 (Stand \"2013\", korr. 2017)"', 'type');
-
+# is($json->{'collection'}->{'comment'}, 'Name: "VAS-N91 (Stand \"2013\", korr. 2017)"', 'type');
+is($json->{'collection'}->{'comment'}, 'Name: "VAS N91"', 'type');
 
 $op1 = $json->{'collection'}->{'operands'}->[0];
 is($op1->{'@type'}, 'koral:doc', 'type');
@@ -60,4 +60,16 @@
 is($op1->{'value'}->[0], "A00/APR/23232", 'value');
 is($op1->{'value'}->[1], ,"A00/APR/23233", 'value');
 
+
+my $list4 = catfile(dirname(__FILE__), 'data', 'list4.def');
+
+# Only contains intensional definitions (named VCs, no text sigle lists)
+$json = decode_json(join('', `$script $list4`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+# is($json->{'collection'}->{'comment'}, 'Name: "VAS-N91 (Stand \"2013\", korr. 2017)"', 'type');
+is($json->{'collection'}->{'comment'}, 'Name: "VAS N91"', 'type');
+
+
 done_testing;