Added preliminary support for C2 def-files in VC conversion tool
Change-Id: If2a6a24e7401bc1222597670fb38b5cba7e3aa80
diff --git a/tools/list2vc.pl b/tools/list2vc.pl
index c632ec7..508f88f 100755
--- a/tools/list2vc.pl
+++ b/tools/list2vc.pl
@@ -1,24 +1,13 @@
#!/usr/bin/env perl
-
-
-
-package main;
use strict;
use warnings;
+# 2020-05-20
+# Preliminary support for C2 def-files.
+
+
our @ARGV;
-sub shorten ($) {
- my $line = shift;
- if (length($line) < 20) {
- return $line;
- }
- else {
- return substr($line,0,17) . '...';
- };
-};
-
-
unless (@ARGV) {
print <<'HELP';
Convert a line-separated list of corpus sigles, doc sigles or
@@ -31,6 +20,18 @@
exit 0;
};
+
+sub shorten ($) { # Abbreviate a string for warnings: the result is never longer than 20 characters
+ my $line = shift;
+ if (length($line) < 20) { # Fewer than 20 characters: return the string unchanged
+ return $line;
+ }
+ else {
+ return substr($line,0,17) . '...'; # Keep the first 17 characters and append an ellipsis
+ };
+};
+
+
my $fh;
if ($ARGV[0] eq '-') {
$fh = *STDIN;
@@ -58,23 +59,54 @@
next;
};
+ my ($key, $value, $desc);
+
+ # Line-Type: <e>c</e>
+ if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
+ $key = $1;
+ $value = $2 // undef;
+ }
+
+ # Line-Type: <e>c
+ elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
+ $key = $1;
+ $value = $2;
+ }
+
# Get text sigles
- if ($line =~ m!^([^\/]+\/){2}[^\/]+$!) {
- push @{$data{text}}, $line;
+ elsif ($line =~ m!^(?:[^\/\s]+\/){2}[^\/\s]+$!) {
+ $key = 'text';
+ $value = $line;
}
# Get doc sigles
- elsif ($line =~ m!^[^\/]+\/[^\/]+$!) {
- push @{$data{doc}}, $line;
+ elsif ($line =~ m!^([^\/\s]+\/[^\/\s]+?)(?:\s.+?)?$!) {
+ $key = 'doc';
+ $value = $1;
}
# Get corpus sigles
- elsif ($line !~ m!\/!) {
- push @{$data{corpus}}, $line;
+ elsif ($line !~ m!(?:\/|\s)!) {
+ $key = 'corpus';
+ $value = $line;
}
+ # Not known
else {
warn shorten($line) . q! isn't a valid sigle!;
+ next;
+ };
+
+ if ($key eq 'text') {
+ push @{$data{text}}, $value;
+ }
+
+ elsif ($key eq 'doc') {
+ push @{$data{doc}}, $value;
+ }
+
+ elsif ($key eq 'corpus') {
+ push @{$data{corpus}}, $value;
};
};