blob: a25bf299f3954d266bb4a86f56a431cb00366f3e [file] [log] [blame]
#!/usr/bin/env perl
package KorAP::VirtualCorpus;
use strict;
use warnings;
# Accessor for the name of the VC.
#
# Without an argument the current name is returned (undef if none
# was set). With an argument the name is stored and the object itself
# is returned, so that calls can be chained.
sub name {
  my ($self, @args) = @_;

  # Getter
  return $self->{name} unless @args;

  # Setter
  $self->{name} = $args[0];
  return $self;
}
# Quote utility method.
#
# Escapes every double quote and backslash in the passed string with a
# backslash and returns the result wrapped in double quotes.
# The invocant is ignored.
sub quote {
  my (undef, $str) = @_;
  $str =~ s/(["\\])/\\$1/g;
  return '"' . $str . '"';
}
# Escaped quote utility method.
#
# Produces a quoted string that can itself be embedded in a double
# quoted string: double quotes and backslashes of the passed string are
# escaped twice, and the surrounding quotes are emitted as \".
# The invocant is ignored.
sub equote {
  my (undef, $str) = @_;

  # One escaping pass per quoting level
  for my $level (1 .. 2) {
    $str =~ s/(["\\])/\\$1/g;
  }

  my $escaped_quote = '\\"';
  return $escaped_quote . $str . $escaped_quote;
}
# Stringify the VC as a complete document.
#
# The returned string is an object consisting of the "@context", an
# optional "comment" carrying the name of the VC (only if a name is
# set) and the "collection", which is the fragment returned by
# _to_fragment().
sub to_string {
  my $self = shift;

  my @parts = (
    '{',
    '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",'
  );

  if ($self->name) {
    push @parts, '"comment":"Name: ' . $self->equote($self->name) . '",';
  }

  push @parts, '"collection":' . $self->_to_fragment;
  push @parts, '}';

  return join '', @parts;
}
package KorAP::VirtualCorpus::Group;
use strict;
use warnings;
use base 'KorAP::VirtualCorpus';
# Construct a new, empty VC group.
#
# The object starts with empty lists for "with" and "without" operands
# and empty hashes for "with_fields" and "without_fields".
sub new {
  my ($class) = @_;

  my %self = (
    with           => [],
    with_fields    => {},
    without        => [],
    without_fields => {},
  );

  return bless \%self, $class;
}
# Define an operand to be "or"ed.
#
# Appends the passed operand to the "with" list and returns the new
# length of that list.
sub with {
  my ($self, $operand) = @_;
  return push @{ $self->{with} }, $operand;
}
# Define a field value that should be "or"ed.
#
# Appends the value to the list kept for the field in "with_fields"
# (the list is created on first use) and returns the new length of
# that list.
sub with_field {
  my ($self, $field, $value) = @_;
  return push @{ $self->{with_fields}{$field} }, $value;
}
# Define an operand to be "and"ed.
#
# Appends the passed operand to the "without" list and returns the new
# length of that list.
sub without {
  my ($self, $operand) = @_;
  return push @{ $self->{without} }, $operand;
}
# Define a field value that should be "and"ed.
#
# Appends the value to the list kept for the field in "without_fields"
# (the list is created on first use) and returns the new length of
# that list.
sub without_field {
  my ($self, $field, $value) = @_;
  return push @{ $self->{without_fields}{$field} }, $value;
}
# Create a document vector field.
#
# Arguments:
#   $field - name of the field, used as the "key"
#   $vec   - array reference of the values for that field
#
# Returns the serialised "koral:doc" object that matches the field
# against the list of values. The string ends with a comma, because
# callers concatenate several operands and strip the last comma.
#
# Double quotes and backslashes in the field name and in the values
# are escaped, so that arbitrary input can't break out of the
# generated string literals.
sub _doc_vec {
  my ($field, $vec) = @_;

  # Works on a copy, so the caller's data is left untouched
  my $escape = sub {
    my $str = shift;
    $str =~ s/(["\\])/\\$1/g;
    return $str;
  };

  my $values = join ',', map { '"' . $escape->($_) . '"' } @$vec;

  return '{'
    . '"@type":"koral:doc",'
    . '"key":"' . $escape->($field) . '",'
    . '"match":"match:eq",'
    . '"value":[' . $values . ']'
    . '},';
}
# Stringify the group as a "koral:docGroup" fragment.
#
# The fragment carries the name of the group as a "comment", if a name
# is set. The operands are chosen as follows:
#
#   - If "without_fields" has entries, the group is an "and" group of
#     the document vectors of these fields.
#   - Otherwise, if "with_fields" or "with" have entries, the group is
#     an "or" group of the document vectors of the "with_fields",
#     followed by the fragments of the "with" operands.
#   - Otherwise the group has no operation and no operands.
#
# Fields are serialised in sorted order; fields without values are
# skipped. If no operand is left after skipping, the group is emitted
# without operation and operands instead of with a malformed list.
#
# NOTE(review): as soon as "without_fields" is set, "with_fields" and
# "with" are not serialised, and the "without" list is never read
# here - confirm that this is the intended preliminary behaviour.
#
# Returns the fragment as a string.
sub _to_fragment {
  my $self = shift;

  my $json = '{';
  $json .= '"@type":"koral:docGroup",';
  $json .= '"comment":"Name: ' . $self->equote($self->name) . '",' if $self->name;

  # Serialise all non-empty vectors of a field hash, sorted by field
  my $field_docs = sub {
    my $fields = shift;
    my @docs;
    foreach my $field (sort keys %$fields) {
      next unless @{$fields->{$field}};

      # _doc_vec() ends its result with a comma,
      # but the operands are joined below
      (my $doc = _doc_vec($field, $fields->{$field})) =~ s/,\z//;
      push @docs, $doc;
    }
    return @docs;
  };

  my ($operation, @operands);

  # Make the outer group "and"
  if (keys %{$self->{without_fields}}) {
    $operation = 'operation:and';
    @operands = $field_docs->($self->{without_fields});
  }

  # Make the outer group "or"
  # TODO:
  #   Flatten embedded or-VCs!
  elsif (keys %{$self->{with_fields}} || @{$self->{with}}) {
    $operation = 'operation:or';
    @operands = (
      $field_docs->($self->{with_fields}),
      map { $_->_to_fragment } @{$self->{with}}
    );
  }

  if (@operands) {
    $json .= '"operation":"' . $operation . '",';
    $json .= '"operands":[' . join(',', @operands) . ']';
  }

  # No operands in the group
  else {
    # Remove the last comma (after the type or the comment)
    chop $json;
  }

  return $json . '}';
}
package main;
use strict;
use warnings;
# 2020-05-20
# Preliminary support for C2 def-files.
our @ARGV;

# Without any argument there is nothing to convert:
# print the usage and leave successfully
if (!@ARGV) {
  print <<'HELP';
Convert a line-separated list of corpus sigles, doc sigles or
text sigles into a virtual corpus query.
$ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
$ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
HELP
  exit 0;
}
# Shorten long strings for logging.
#
# Strings of less than 20 characters are returned unchanged. Longer
# strings are cut to their first 17 characters followed by '...'.
sub _shorten ($) {
  my $text = shift;
  return length($text) < 20
    ? $text
    : substr($text, 0, 17) . '...';
}
# Open the input: '-' reads the list from standard input, everything
# else is treated as the name of the file to read.
my $fh;
if ($ARGV[0] eq '-') {
  $fh = \*STDIN;
}

# The three-argument form keeps the mode apart from the file name, so
# a name starting with e.g. '>' or ending in '|' can't alter the way
# the file is opened
elsif (!open($fh, '<', $ARGV[0])) {
  warn $ARGV[0] . " can't be opened: $!";

  # Report the failure to the calling shell
  exit(1);
}
# Prepare two VC groups up front: an extensional and an intensional one
my ($vc_ext, $vc_int) = (
  KorAP::VirtualCorpus::Group->new,
  KorAP::VirtualCorpus::Group->new
);

# Handle to the VC group that is currently filled.
# Initially this refers to the extensional VC.
my $vc = \$vc_ext;

# All virtual corpora collected so far
my %all_vcs = ();

# Set as soon as the list marks the VC as frozen
my $frozen = 0;
# Iterate over the whole list.
#
# Every line is classified as either a tagged line (<key>value</key>
# or <key>value) or as a bare text, doc or corpus sigle. The resulting
# key/value pair is then applied to the VC that $vc currently refers to.
while (defined(my $line = readline($fh))) {
  chomp $line;

  # Lists with CRLF line endings keep a carriage return after chomp,
  # which would prevent the anchored sigle patterns from matching
  $line =~ s/\r\z//;

  # Skip empty and whitespace-only lines. This is deliberately not a
  # truthiness check, so a line consisting of "0" is still processed.
  next if $line =~ /^\s*$/;

  my ($key, $value);

  # Line-Type: <e>c</e>
  # The value is matched non-greedily, so whitespace in front of the
  # closing tag does not become part of the value
  if ($line =~ /^\s*<([^>]+)>\s*([^<]*?)\s*<\/\1>\s*$/) {
    $key = $1;
    $value = $2;
  }

  # Line-Type: <e>c
  # Trailing whitespace is not part of the value here either
  elsif ($line =~ /^\s*<([^>]+)>\s*([^<]+?)\s*$/) {
    $key = $1;
    $value = $2;
  }

  # Get text sigles
  elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
    $key = 'text';
    $value = $line;
  }

  # Get doc sigles (anything following the sigle after whitespace
  # is ignored)
  elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
    $key = 'doc';
    $value = $1;
  }

  # Get corpus sigles
  elsif ($line !~ m!(?:\/|\s)!) {
    $key = 'corpus';
    $value = $line;
  }

  # Not known
  else {
    warn _shorten($line) . q! isn't a valid VC definition!;
    next;
  };

  # Add text field
  if ($key eq 'text') {

    # Convert C2 sigle to KorAP form
    $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
    ${$vc}->with_field(textSigle => $value);
  }

  # Add doc field
  elsif ($key eq 'doc') {
    ${$vc}->with_field(docSigle => $value);
  }

  # Add corpus field
  elsif ($key eq 'corpus') {
    ${$vc}->with_field(corpusSigle => $value);
  }

  # Add corpus field
  elsif ($key eq 'cn') {

    # Corpus sigle, e.g. 'F97 Frankfurter Allgemeine 1997'.
    # Only the part in front of the first whitespace is used.
    if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
      ${$vc}->with_field(corpusSigle => $1);
    };
  }

  # Mark the vc as frozen
  # This means that an extended VC area is expected
  elsif ($key eq 'frozen') {
    $frozen = 1;
  }

  # Start/End intended VC area
  elsif ($key eq 'intended') {
    if ($value eq 'start') {

      # NOTE(review): $vc is initialised as a reference to $vc_ext,
      # so this assignment replaces the content of the referenced
      # variable instead of re-pointing $vc - confirm this is intended
      $$vc = $vc_int;
    }
    elsif ($value ne 'end') {
      warn 'Unknown intension value ' . $value;
    };
  }

  # Start/End extended VC area
  elsif ($key eq 'extended') {
    if ($value eq 'start') {
      $$vc = $vc_ext;
    }
    elsif ($value ne 'end') {
      warn 'Unknown extension value ' . $value;
    };
  }

  # Set VC name
  elsif ($key eq 'name') {

    # "Name of the virtual corpus that is displayed.
    #  It is also referenced internally for corpus construction,
    #  e.g. for <and>, <add>, <sub>"

    # No global name defined yet:
    # the name is set on the extensional and the intensional VC
    if ($$vc && !$$vc->name) {
      $vc_ext->name($value);
      $vc_int->name($value);
      next;
    };

    # The current VC is already named: start a new named VC group
    ${$vc} = KorAP::VirtualCorpus::Group->new;
    ${$vc}->name($value);
  }

  # End VC def: remember the current VC under its name
  elsif ($key eq 'end') {
    $all_vcs{${$vc}->name} = $$vc;
    # $vc = undef;
  }

  # Add VC definition
  elsif ($key eq 'add') {

    # References to VCs that were not defined before
    # are skipped without notice
    unless (defined $all_vcs{$value}) {
      # warn 'VC ' . $value . ' not defined';
      # exit(1);
      next;
    };

    $$vc->with($all_vcs{$value});
  }

  # Unknown keys are ignored without notice
  else {
    # warn $key . ' is an unknown field';
  };
};
# The input is exhausted
close $fh;

# Write the serialisation of the VC that $vc refers to at the end of
# the list (extended?) to the selected output handle
print(${$vc}->to_string);