Added preliminary support for C2 def-files in VC conversion tool
Change-Id: If2a6a24e7401bc1222597670fb38b5cba7e3aa80
diff --git a/tools/list2vc.pl b/tools/list2vc.pl
index c632ec7..508f88f 100755
--- a/tools/list2vc.pl
+++ b/tools/list2vc.pl
@@ -1,24 +1,13 @@
#!/usr/bin/env perl
-
-
-
-package main;
use strict;
use warnings;
+# 2020-05-20
+# Preliminary support for C2 def-files.
+
+
our @ARGV;
-sub shorten ($) {
- my $line = shift;
- if (length($line) < 20) {
- return $line;
- }
- else {
- return substr($line,0,17) . '...';
- };
-};
-
-
unless (@ARGV) {
print <<'HELP';
Convert a line-separated list of corpus sigles, doc sigles or
@@ -31,6 +20,18 @@
exit 0;
};
+
+sub shorten ($) {
+ my $line = shift;
+ if (length($line) < 20) {
+ return $line;
+ }
+ else {
+ return substr($line,0,17) . '...';
+ };
+};
+
+
my $fh;
if ($ARGV[0] eq '-') {
$fh = *STDIN;
@@ -58,23 +59,54 @@
next;
};
+ my ($key, $value, $desc);
+
+ # Line-Type: <e>c</a>
+ if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
+ $key = $1;
+ $value = $2 // undef;
+ }
+
+ # Line-Type: <e>c
+ elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
+ $key = $1;
+ $value = $2;
+ }
+
# Get text sigles
- if ($line =~ m!^([^\/]+\/){2}[^\/]+$!) {
- push @{$data{text}}, $line;
+ elsif ($line =~ m!^(?:[^\/\s]+\/){2}[^\/\s]+$!) {
+ $key = 'text';
+ $value = $line;
}
# Get doc sigles
- elsif ($line =~ m!^[^\/]+\/[^\/]+$!) {
- push @{$data{doc}}, $line;
+ elsif ($line =~ m!^([^\/\s]+\/[^\/\s]+?)(?:\s.+?)?$!) {
+ $key = 'doc';
+ $value = $1;
}
# Get corpus sigles
- elsif ($line !~ m!\/!) {
- push @{$data{corpus}}, $line;
+ elsif ($line !~ m!(?:\/|\s)!) {
+ $key = 'corpus';
+ $value = $line;
}
+ # Not known
else {
warn shorten($line) . q! isn't a valid sigle!;
+ next;
+ };
+
+ if ($key eq 'text') {
+ push @{$data{text}}, $value;
+ }
+
+ elsif ($key eq 'doc') {
+ push @{$data{doc}}, $value;
+ }
+
+ elsif ($key eq 'corpus') {
+ push @{$data{corpus}}, $value;
};
};
diff --git a/tools/t/data/list2.def b/tools/t/data/list2.def
new file mode 100644
index 0000000..8cf6da9
--- /dev/null
+++ b/tools/t/data/list2.def
@@ -0,0 +1,31 @@
+<name>Example</name>
+BRZ05/SEP Braunschweiger Zeitung, September 2005
+BRZ05/OKT Braunschweiger Zeitung, Oktober 2005
+BRZ05/NOV Braunschweiger Zeitung, November 2005
+BRZ05/DEZ Braunschweiger Zeitung, Dezember 2005
+BRZ06/JAN Braunschweiger Zeitung, Januar 2006
+BRZ06/FEB Braunschweiger Zeitung, Februar 2006
+BRZ06/MAR Braunschweiger Zeitung, März 2006
+BRZ06/APR Braunschweiger Zeitung, April 2006
+BRZ06/MAI Braunschweiger Zeitung, Mai 2006
+BRZ06/JUN Braunschweiger Zeitung, Juni 2006
+BRZ06/JUL Braunschweiger Zeitung, Juli 2006
+BRZ06/AUG Braunschweiger Zeitung, August 2006
+BRZ06/SEP Braunschweiger Zeitung, September 2006
+BRZ06/OKT Braunschweiger Zeitung, Oktober 2006
+BRZ06/NOV Braunschweiger Zeitung, November 2006
+BRZ06/DEZ Braunschweiger Zeitung, Dezember 2006
+BRZ07/JAN Braunschweiger Zeitung, Januar 2007
+BRZ07/FEB Braunschweiger Zeitung, Februar 2007
+BRZ07/MAR Braunschweiger Zeitung, März 2007
+BRZ07/APR Braunschweiger Zeitung, April 2007
+BRZ07/MAI Braunschweiger Zeitung, Mai 2007
+BRZ07/JUN Braunschweiger Zeitung, Juni 2007
+BRZ07/JUL Braunschweiger Zeitung, Juli 2007
+BRZ07/AUG Braunschweiger Zeitung, August 2007
+BRZ07/SEP Braunschweiger Zeitung, September 2007
+BRZ07/OKT Braunschweiger Zeitung, Oktober 2007
+BRZ07/NOV Braunschweiger Zeitung, November 2007
+BRZ07/DEZ Braunschweiger Zeitung, Dezember 2007
+BRZ08/JAN Braunschweiger Zeitung, Januar 2008
+BRZ08/FEB Braunschweiger Zeitung, Februar 2008
diff --git a/tools/t/list2vc-def.t b/tools/t/list2vc-def.t
new file mode 100644
index 0000000..8c24fa3
--- /dev/null
+++ b/tools/t/list2vc-def.t
@@ -0,0 +1,36 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename;
+use File::Spec::Functions;
+
+use Test::Output;
+use Mojo::JSON 'decode_json';
+
+my $script = catfile(dirname(__FILE__), '..', 'list2vc.pl');
+my $list1 = catfile(dirname(__FILE__), 'data', 'list2.def');
+
+# Check STDOUT
+stdout_like(
+ sub {
+ system($script, $list1);
+ },
+ qr!^\{\"\@context\".+?\}$!,
+ "check stdout"
+);
+
+# Check JSON
+my $json = decode_json(join('', `$script $list1`));
+
+is($json->{'collection'}->{'@type'}, 'koral:docGroup', 'type');
+is($json->{'collection'}->{'operation'}, 'operation:or', 'operation');
+
+my $op1 = $json->{'collection'}->{'operands'}->[0];
+is($op1->{'@type'}, 'koral:doc', 'type');
+is($op1->{'key'}, 'docSigle', 'key');
+is($op1->{'match'}, 'match:eq', 'match');
+is($op1->{'value'}->[0], "BRZ05/SEP", 'value');
+is($op1->{'value'}->[1], ,"BRZ05/OKT", 'value');
+is($op1->{'value'}->[-1], ,"BRZ08/FEB", 'value');
+done_testing;