Support referencing embedded virtual corpora in vc conversion tool

Change-Id: I71d4243b948a8f235b249f98fb7a0050ec59b911
diff --git a/list2vc.pl b/list2vc.pl
index d68e22d..a25bf29 100755
--- a/list2vc.pl
+++ b/list2vc.pl
@@ -39,9 +39,10 @@
   ## Create collection object
   my $json = '{';
   $json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
-  $json .= '"collection":{';
+  $json .= '"comment":"Name: ' . $self->equote($self->name) .  '",' if $self->name;
+  $json .= '"collection":';
   $json .= $self->_to_fragment;
-  return $json .= '}}';
+  return $json .= '}';
 };
 
 
@@ -55,52 +56,114 @@
 sub new {
   my $class = shift;
   bless {
-    op => shift,
-    fields => {}
+    with => [],
+    with_fields => {},
+    without => [],
+    without_fields => {},
   }, $class;
 };
 
+# Define an operand to be "or"ed
+sub with {
+  my $self = shift;
+  push @{$self->{with}}, shift;
+};
 
-# Add field information to group
-sub add_field {
+
+# Define a field that should be "or"ed
+sub with_field {
   my $self = shift;
   my $field = shift;
-  push @{$self->{fields}->{$field}}, shift;
+  push @{$self->{with_fields}->{$field}}, shift;
 };
 
+# Define an operand to be "and"ed
+sub without {
+  my $self = shift;
+  push @{$self->{without}}, shift;
+};
+
+
+# Define a field that should be "and"ed
+sub without_field {
+  my $self = shift;
+  my $field = shift;
+  push @{$self->{without_fields}->{$field}}, shift;
+};
+
+# Create a koral:doc fragment matching a field against a list of values (ends with a comma)
+sub _doc_vec {
+  my $field = shift;
+  my $vec = shift;
+  my $json = '{';
+  $json .= '"@type":"koral:doc",';
+  $json .= '"key":"' . $field . '",';
+  $json .= '"match":"match:eq",';
+  $json .= '"value":[';
+  $json .= join ',', map { '"' . $_ . '"' } @$vec;
+  $json .=  ']';
+  $json .= '},';
+  return $json;
+}
+
 
 # Stringify fragment
 sub _to_fragment {
   my $self = shift;
-  my $json = '';
 
-  unless (keys %{$self->{fields}}) {
-    return $json . '}}';
-  };
-
+  my $json = '{';
   $json .= '"@type":"koral:docGroup",';
   $json .= '"comment":"Name: ' . $self->equote($self->name) .  '",' if $self->name;
-  $json .= '"operation":"operation:' . $self->{op} . '",';
-  $json .= '"operands":[';
 
-  foreach my $field (sort keys %{$self->{fields}}) {
-    unless (@{$self->{fields}->{$field}}) {
-      next;
+  # Make the outer group "and"
+  if (keys %{$self->{without_fields}}) {
+    $json .= '"operation":"operation:and",';
+    $json .= '"operands":[';
+
+    foreach my $field (sort keys %{$self->{without_fields}}) {
+      unless (@{$self->{without_fields}->{$field}}) {
+        next;
+      };
+      $json .= _doc_vec($field, $self->{without_fields}->{$field});
     };
-    $json .= '{';
-    $json .= '"@type":"koral:doc",';
-    $json .= '"key":"' . $field . '",';
-    $json .= '"match":"match:eq",';
-    $json .= '"value":[';
-    $json .= join ',', map { '"' . $_ . '"' } @{$self->{fields}->{$field}};
-    $json .=  ']';
-    $json .= '},';
+
+    # Remove the last comma
+    chop $json;
+
+    $json .= ']';
+  }
+
+  elsif (keys %{$self->{with_fields}} || @{$self->{with}}) {
+    $json .= '"operation":"operation:or",';
+
+    # TODO:
+    #   Flatten embedded or-VCs!
+    $json .= '"operands":[';
+
+    foreach my $field (sort keys %{$self->{with_fields}}) {
+      unless (@{$self->{with_fields}->{$field}}) {
+        next;
+      };
+      $json .= _doc_vec($field, $self->{with_fields}->{$field});
+    };
+
+    foreach my $op (@{$self->{with}}) {
+      $json .= $op->_to_fragment . ',';
+    };
+
+    # Remove the last comma
+    chop $json;
+
+    $json .= ']';
+  }
+
+  # No operands in the group
+  else {
+    # Remove the trailing comma (after the comment or, if unnamed, after "@type")
+    chop $json;
   };
 
-  # Remove the last comma
-  chop $json;
-
-  return $json . ']';
+  return $json . '}';
 };
 
 
@@ -126,6 +189,7 @@
 };
 
 
+# Shorten long strings for logging
 sub _shorten ($) {
   my $line = shift;
   if (length($line) < 20) {
@@ -147,12 +211,15 @@
 
 
 # Create an intensional and an extensional VC
-my $vc_ext = KorAP::VirtualCorpus::Group->new('or');
-my $vc_int = KorAP::VirtualCorpus::Group->new('or');
+my $vc_ext = KorAP::VirtualCorpus::Group->new;
+my $vc_int = KorAP::VirtualCorpus::Group->new;
 
 # Initial VC group
 my $vc = \$vc_ext;
 
+# Collect all defined virtual corpora, keyed by name
+my %all_vcs;
+
 my $frozen = 0;
 
 # Iterate over the whole list
@@ -210,17 +277,25 @@
 
     # Convert C2 sigle to KorAP form
     $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
-    ${$vc}->add_field(textSigle => $value);
+    ${$vc}->with_field(textSigle => $value);
   }
 
   # Add doc field
   elsif ($key eq 'doc') {
-    ${$vc}->add_field(docSigle => $value);
+    ${$vc}->with_field(docSigle => $value);
   }
 
   # Add corpus field
   elsif ($key eq 'corpus') {
-    ${$vc}->add_field(corpusSigle => $value);
+    ${$vc}->with_field(corpusSigle => $value);
+  }
+
+  # Add corpus field from a "cn" value (the sigle is its first token)
+  elsif ($key eq 'cn') {
+    # Corpus sigle, e.g. 'F97 Frankfurter Allgemeine 1997'
+    if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
+      ${$vc}->with_field(corpusSigle => $1);
+    };
   }
 
   # Mark the vc as frozen
@@ -256,11 +331,31 @@
     # <add>, <sub>"
 
     # No global name defined yet
-    unless ($$vc->name) {
+    if ($$vc && !$$vc->name) {
       $vc_ext->name($value);
       $vc_int->name($value);
       next;
     };
+
+    ${$vc} = KorAP::VirtualCorpus::Group->new;
+    ${$vc}->name($value);
+  }
+
+  # End of a VC definition: register the current VC under its name
+  elsif ($key eq 'end') {
+    $all_vcs{${$vc}->name} = $$vc;
+    # $vc = undef;
+  }
+
+  # Add a previously defined VC as an operand of the current VC
+  elsif ($key eq 'add') {
+    unless (defined $all_vcs{$value}) {
+      # TODO: Undefined VCs are skipped silently; consider:
+      #   warn 'VC ' . $value . ' not defined'; exit(1);
+      next;
+    };
+
+    $$vc->with($all_vcs{$value});
   }
 
   # Unknown
@@ -271,5 +366,5 @@
 
 close($fh);
 
-# Stringify current (extended) virtual corpus
+# Stringify current (extended?) virtual corpus
 print $$vc->to_string;