blob: 85986a979e6c438f957a67678be4e7f088722f05 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use lib 'lib';
use KorAP::VirtualCorpus::Group;
# 2020-05-20
# Preliminary support for C2 def-files.
# 2020-05-29
# Introduce optimizable object system.
our $VERSION = 0.1;
our @ARGV;
unless (@ARGV) {
print <<'HELP';
Convert a line-separated list of corpus sigles, doc sigles or
text sigles into a virtual corpus query.
$ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
$ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
HELP
exit 0;
};
# Shorten long strings for logging
sub _shorten ($) {
my $line = shift;
if (length($line) < 20) {
return $line;
}
else {
return substr($line,0,17) . '...';
};
};
my $fh;
if ($ARGV[0] eq '-') {
$fh = *STDIN;
} elsif (!open($fh, '<' . $ARGV[0])) {
warn $ARGV[0] . " can't be opened";
exit(0);
};
# Initial VC group
my $vc;
# Create an intensional and an extensional VC
my $vc_ext = KorAP::VirtualCorpus::Group->new;
my $vc_int = KorAP::VirtualCorpus::Group->new;
# Load ext initially
$$vc = $vc_ext;
# Collect all virtual corpora
my %all_vcs;
my $frozen = 0;
# Iterate over the whole list
while (!eof $fh) {
my $line = readline($fh);
chomp $line;
# Skip empty lines
if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
# empty
next;
};
my ($key, $value, $desc);
# Line-Type: <e>c</a>
if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
$key = $1;
$value = $2 // undef;
}
# Line-Type: <e>c
elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
$key = $1;
$value = $2;
}
# Get text sigles
elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
$key = 'text';
$value = $line;
}
# Get doc sigles
elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
$key = 'doc';
$value = $1;
}
# Get corpus sigles
elsif ($line !~ m!(?:\/|\s)!) {
$key = 'corpus';
$value = $line;
}
# Not known
else {
warn _shorten($line) . q! isn't a valid VC definition!;
next;
};
# Add text field
if ($key eq 'text') {
# Convert C2 sigle to KorAP form
$value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
${$vc}->union_field(textSigle => $value);
}
# Add doc field
elsif ($key eq 'doc') {
${$vc}->union_field(docSigle => $value);
}
# Add corpus field
elsif ($key eq 'corpus') {
${$vc}->union_field(corpusSigle => $value);
}
# Add corpus field
elsif ($key eq 'cn') {
# Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
${$vc}->union_field(corpusSigle => $1);
};
}
# Mark the vc as frozen
# This means that an extended VC area is expected
elsif ($key eq 'frozen') {
$frozen = 1;
}
# Start/End intended VC area
elsif ($key eq 'intended') {
if ($value eq 'start') {
$$vc = $vc_int;
}
elsif ($value ne 'end') {
warn 'Unknown intension value ' . $value;
};
}
# Start/End extended VC area
elsif ($key eq 'extended') {
if ($value eq 'start') {
$$vc = $vc_ext;
}
elsif ($value ne 'end') {
warn 'Unknown extension value ' . $value;
};
}
# Set VC name
elsif ($key eq 'name') {
# "Name des virt. Korpus, der angezeigt wird.
# Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
# <add>, <sub>"
# No global name defined yet
if ($$vc && !$$vc->name) {
$vc_ext->name($value);
$vc_int->name($value);
next;
};
${$vc} = KorAP::VirtualCorpus::Group->new;
${$vc}->name($value);
}
# End VC def
elsif ($key eq 'end') {
$all_vcs{${$vc}->name} = $$vc;
# $vc = undef;
}
# Add VC definition
elsif ($key eq 'add') {
unless (defined $all_vcs{$value}) {
# warn 'VC ' . $value . ' not defined';
# exit(1);
next;
};
$$vc->union($all_vcs{$value}->clone->to_koral);
}
# AND definition
elsif ($key eq 'and') {
unless (defined $all_vcs{$value}) {
# warn 'VC ' . $value . ' not defined';
# exit(1);
next;
};
$$vc->joint($all_vcs{$value}->clone->to_koral);
}
# Source of the corpus
elsif ($key eq 'ql') {
# Quellenname, z.B. "Neue Zürcher Zeitung"
$$vc->union_field(corpusTitle => $value);
}
elsif ($key eq 'sub') {
# "Sub" is the difference - it is the "and not" operation.
warn $key . ' is not yet supported';
}
elsif ($key eq 'co') {
# Country, z.B. DE für Text in Deutschland erschienen
warn $key . ' is not yet supported';
}
elsif ($key eq 'tl') {
# Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern]
warn $key . ' is not yet supported';
}
elsif ($key eq 'ts') {
# Textsorte, z.B. "Bericht"
warn $key . ' is not yet supported';
}
elsif ($key eq 'th') {
# Thema, z.B. "Sport - Fußball"
warn $key . ' is not yet supported';
}
elsif ($key eq 'red') {
# Reduktionsfaktor
# Wert zw. 1-99%: virt. Korpus wird auf diesen Wert
# reduziert. Modus: feste Reduzierung, nicht variabel.
warn $key . ' is not yet supported';
}
elsif ($key eq 'thprob') {
# ThemaProbability
# Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem
# Zuverläßigkeitswert ein Thema übernommen wird
}
# Add reduction value as a comment
elsif ($key eq 'redabs') {
# "red. Anz. Texte
# absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
$$vc->comment('redabs:' . $value);
warn $key . ' is not yet supported';
}
# Add reduction value as a comment
elsif ($key eq 'date') {
# Supports two pattern schemes:
# m1=Year1/Month1 bis Year2/Month2
# Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12"
# Schema 1
if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
if ($m1 < 10) {
$m1 = '0' . (0+$m1);
};
if ($m2 < 10) {
$m2 = '0' . (0+$m2);
};
$$vc->from($y1, $m1);
$$vc->to($y2, $m2);
}
# Scheme 2
elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
# m2=Year1-Year2 und Month1-Month2
# Datumsbereich Schema 2: z.B. "1990-2000 und 06-06"
warn 'Second date scheme not yet supported!'
}
else {
warn 'Unknown date scheme ' . $value;
};
}
# Unknown
else {
warn $key . ' is an unknown field';
};
};
close($fh);
# Stringify current (extended?) virtual corpus
print $$vc->to_string;