Added preliminary support for C2 def-files in VC conversion tool

Change-Id: If2a6a24e7401bc1222597670fb38b5cba7e3aa80
diff --git a/tools/list2vc.pl b/tools/list2vc.pl
index c632ec7..508f88f 100755
--- a/tools/list2vc.pl
+++ b/tools/list2vc.pl
@@ -1,24 +1,13 @@
 #!/usr/bin/env perl
-
-
-
-package main;
 use strict;
 use warnings;
 
+# 2020-05-20
+#   Preliminary support for C2 def-files.
+
+
 our @ARGV;
 
-sub shorten ($) {
-  my $line = shift;
-  if (length($line) < 20) {
-    return $line;
-  }
-  else {
-    return substr($line,0,17) . '...';
-  };
-};
-
-
 unless (@ARGV) {
   print <<'HELP';
 Convert a line-separated list of corpus sigles, doc sigles or
@@ -31,6 +20,18 @@
 exit 0;
 };
 
+
+sub shorten ($) {
+  my $line = shift;
+  if (length($line) < 20) {
+    return $line;
+  }
+  else {
+    return substr($line,0,17) . '...';
+  };
+};
+
+
 my $fh;
 if ($ARGV[0] eq '-') {
   $fh = *STDIN;
@@ -58,23 +59,54 @@
     next;
   };
 
+  my ($key, $value, $desc);
+
+  # Line-Type: <e>c</a>
+  if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
+    $key = $1;
+    $value = $2 // undef;
+  }
+
+  # Line-Type: <e>c
+  elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
+    $key = $1;
+    $value = $2;
+  }
+
   # Get text sigles
-  if ($line =~ m!^([^\/]+\/){2}[^\/]+$!) {
-    push @{$data{text}}, $line;
+  elsif ($line =~ m!^(?:[^\/\s]+\/){2}[^\/\s]+$!) {
+    $key = 'text';
+    $value = $line;
   }
 
   # Get doc sigles
-  elsif ($line =~ m!^[^\/]+\/[^\/]+$!) {
-    push @{$data{doc}}, $line;
+  elsif ($line =~ m!^([^\/\s]+\/[^\/\s]+?)(?:\s.+?)?$!) {
+    $key = 'doc';
+    $value = $1;
   }
 
   # Get corpus sigles
-  elsif ($line !~ m!\/!) {
-    push @{$data{corpus}}, $line;
+  elsif ($line !~ m!(?:\/|\s)!) {
+    $key = 'corpus';
+    $value = $line;
   }
 
+  # Not known
   else {
     warn shorten($line) . q! isn't a valid sigle!;
+    next;
+  };
+
+  if ($key eq 'text') {
+    push @{$data{text}}, $value;
+  }
+
+  elsif ($key eq 'doc') {
+    push @{$data{doc}}, $value;
+  }
+
+  elsif ($key eq 'corpus') {
+    push @{$data{corpus}}, $value;
   };
 };
 
diff --git a/tools/t/data/list2.def b/tools/t/data/list2.def
new file mode 100644
index 0000000..8cf6da9
--- /dev/null
+++ b/tools/t/data/list2.def
@@ -0,0 +1,31 @@
+<name>Example</name>
+BRZ05/SEP Braunschweiger Zeitung, September 2005
+BRZ05/OKT Braunschweiger Zeitung, Oktober 2005
+BRZ05/NOV Braunschweiger Zeitung, November 2005
+BRZ05/DEZ Braunschweiger Zeitung, Dezember 2005
+BRZ06/JAN Braunschweiger Zeitung, Januar 2006
+BRZ06/FEB Braunschweiger Zeitung, Februar 2006
+BRZ06/MAR Braunschweiger Zeitung, März 2006
+BRZ06/APR Braunschweiger Zeitung, April 2006
+BRZ06/MAI Braunschweiger Zeitung, Mai 2006
+BRZ06/JUN Braunschweiger Zeitung, Juni 2006
+BRZ06/JUL Braunschweiger Zeitung, Juli 2006
+BRZ06/AUG Braunschweiger Zeitung, August 2006
+BRZ06/SEP Braunschweiger Zeitung, September 2006
+BRZ06/OKT Braunschweiger Zeitung, Oktober 2006
+BRZ06/NOV Braunschweiger Zeitung, November 2006
+BRZ06/DEZ Braunschweiger Zeitung, Dezember 2006
+BRZ07/JAN Braunschweiger Zeitung, Januar 2007
+BRZ07/FEB Braunschweiger Zeitung, Februar 2007
+BRZ07/MAR Braunschweiger Zeitung, März 2007
+BRZ07/APR Braunschweiger Zeitung, April 2007
+BRZ07/MAI Braunschweiger Zeitung, Mai 2007
+BRZ07/JUN Braunschweiger Zeitung, Juni 2007
+BRZ07/JUL Braunschweiger Zeitung, Juli 2007
+BRZ07/AUG Braunschweiger Zeitung, August 2007
+BRZ07/SEP Braunschweiger Zeitung, September 2007
+BRZ07/OKT Braunschweiger Zeitung, Oktober 2007
+BRZ07/NOV Braunschweiger Zeitung, November 2007
+BRZ07/DEZ Braunschweiger Zeitung, Dezember 2007
+BRZ08/JAN Braunschweiger Zeitung, Januar 2008
+BRZ08/FEB Braunschweiger Zeitung, Februar 2008
diff --git a/tools/t/list2vc-def.t b/tools/t/list2vc-def.t
new file mode 100644
index 0000000..8c24fa3
--- /dev/null
+++ b/tools/t/list2vc-def.t
@@ -0,0 +1,36 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'list2vc.pl');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list2.def');
+
+# Check STDOUT
+stdout_like(
+  sub {
+    system($script, $list1);
+  },
+  qr!^\{\"\@context\".+?\}$!,
+  "check stdout"
+);
+
+# Check JSON
+my $json = decode_json(join('', `$script $list1`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+
+my $op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'docSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "BRZ05/SEP", 'value');
+is($op1->{'value'}->[1], ,"BRZ05/OKT", 'value');
+is($op1->{'value'}->[-1], ,"BRZ08/FEB", 'value');
+done_testing;