Introduce object mechanism to VCs in vc conversion tool
Change-Id: If5d06a0b9b08b9c39700a0eb471a0edfb0d5daf5
diff --git a/list2vc.pl b/list2vc.pl
index 35a205a..3e31e50 100755
--- a/list2vc.pl
+++ b/list2vc.pl
@@ -1,251 +1,13 @@
#!/usr/bin/env perl
-package KorAP::VirtualCorpus;
use strict;
use warnings;
-
-
-# Get or set name of the VC
-sub name {
- my $self = shift;
- unless (@_) {
- return $self->{name};
- };
- $self->{name} = shift;
- return $self;
-};
-
-
-# Comment
-sub comment {
- my $self = shift;
- unless (@_) {
- return $self->{comment};
- };
- $self->{comment} //= [];
-
- push @{$self->{comment}}, shift;
- return $self;
-};
-
-
-# Quote utility function
-sub quote {
- shift;
- my $str = shift;
- $str =~ s/(["\\])/\\$1/g;
- return qq{"$str"};
-};
-
-
-# Escaped quote utility function
-sub equote {
- shift;
- my $str = shift;
- $str =~ s/(["\\])/\\$1/g;
- $str =~ s/(["\\])/\\$1/g;
- return '\\"' . $str . '\\"';
-};
-
-
-sub _commentparam_to_string {
- my $self = shift;
- my $comment = $self->_comment_to_string;
- if ($comment) {
- return qq!,"comment":"$comment"!;
- };
- return '';
-};
-
-
-sub _comment_to_string {
- my $self = shift;
- if (!$self->name && !$self->comment) {
- return '';
- };
-
- my $json = '';
- $json .= 'name:' . $self->equote($self->name) if $self->name;
- if ($self->name && $self->comment) {
- $json .= ','
- };
- $json .= join(',', @{$self->{comment}}) if $self->{comment};
-
- return $json;
-};
-
-
-# Stringify globally
-sub to_string {
- my $self = shift;
- ## Create collection object
-
- my $json = '{';
- $json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
- $json .= '"collection":';
- $json .= $self->_to_fragment;
- # Set at the end, when all comments are done
- $json .= $self->_commentparam_to_string;
- return $json .= '}';
-};
-
-
-package KorAP::VirtualCorpus::Group;
-use strict;
-use warnings;
-use base 'KorAP::VirtualCorpus';
-
-
-# Construct a new VC group
-sub new {
- my $class = shift;
- bless {
- with => [],
- with_fields => {},
- without => [],
- without_fields => {},
- }, $class;
-};
-
-
-# Define an operand to be "or"ed
-sub with {
- my $self = shift;
- push @{$self->{with}}, shift;
-};
-
-
-# Define a field that should be "or"ed
-sub with_field {
- my $self = shift;
- my $field = shift;
- push @{$self->{with_fields}->{$field}}, shift;
-};
-
-# Define an operand to be "and"ed
-sub without {
- my $self = shift;
- push @{$self->{without}}, shift;
-};
-
-
-# Define a field that should be "and"ed
-sub without_field {
- my $self = shift;
- my $field = shift;
- push @{$self->{without_fields}->{$field}}, shift;
-};
-
-
-# VC contains only with fields
-sub only_with_fields {
- my $self = shift;
-
- if (keys %{$self->{without_fields}} || @{$self->{with}} || @{$self->{without}}) {
- return 0;
- };
-
- return 1;
-};
-
-
-# Create a document vector field
-sub _doc_vec {
- my $field = shift;
- my $vec = shift;
- my $json = '{';
- $json .= '"@type":"koral:doc",';
- $json .= '"key":"' . $field . '",';
- $json .= '"match":"match:eq",';
- $json .= '"value":[';
- $json .= join ',', map { '"' . $_ . '"' } @$vec;
- $json .= ']';
- $json .= '},';
- return $json;
-}
-
-
-# Stringify fragment
-sub _to_fragment {
- my $self = shift;
-
- my $json = '{';
- $json .= '"@type":"koral:docGroup",';
-
- # Make the outer group "and"
- if (keys %{$self->{without_fields}}) {
- $json .= '"operation":"operation:and",';
- $json .= '"operands":[';
-
- foreach my $field (sort keys %{$self->{without_fields}}) {
- unless (@{$self->{without_fields}->{$field}}) {
- next;
- };
- $json .= _doc_vec($field, $self->{without_fields}->{$field});
- };
-
- # Remove the last comma
- chop $json;
-
- $json .= ']';
- }
-
- elsif (keys %{$self->{with_fields}} || @{$self->{with}}) {
- $json .= '"operation":"operation:or",';
-
- $json .= '"operands":[';
-
- # Flatten embedded "or"-VCs
- foreach my $op (@{$self->{with}}) {
-
- # The embedded VC has only extending fields
- if ($op->only_with_fields) {
-
- $self->comment('embed:[' . $op->_comment_to_string . ']');
-
- foreach my $k (keys %{$op->{with_fields}}) {
- foreach my $v (@{$op->{with_fields}->{$k}}) {
- $self->with_field($k, $v);
- };
- };
- }
-
- # Embed complex VC
- else {
- $json .= $op->_to_fragment . ',';
- };
- };
-
- foreach my $field (sort keys %{$self->{with_fields}}) {
- unless (@{$self->{with_fields}->{$field}}) {
- next;
- };
- $json .= _doc_vec($field, $self->{with_fields}->{$field});
- };
-
- # Remove the last comma
- chop $json;
-
- $json .= ']';
- }
-
- # No operands in the group
- else {
- # Remove the last comma after the comment
- chop $json;
- };
-
- # Set at the end, when all comments are done
- $json .= $self->_commentparam_to_string;
- return $json . '}';
-};
-
-
-package main;
-use strict;
-use warnings;
+use lib 'lib';
+use KorAP::VirtualCorpus::Group;
# 2020-05-20
# Preliminary support for C2 def-files.
+# 2020-05-29
+# Introduce optimizable object system.
our @ARGV;
@@ -352,24 +114,24 @@
# Convert C2 sigle to KorAP form
$value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
- ${$vc}->with_field(textSigle => $value);
+ ${$vc}->union_field(textSigle => $value);
}
# Add doc field
elsif ($key eq 'doc') {
- ${$vc}->with_field(docSigle => $value);
+ ${$vc}->union_field(docSigle => $value);
}
# Add corpus field
elsif ($key eq 'corpus') {
- ${$vc}->with_field(corpusSigle => $value);
+ ${$vc}->union_field(corpusSigle => $value);
}
# Add corpus field
elsif ($key eq 'cn') {
# Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
- ${$vc}->with_field(corpusSigle => $1);
+ ${$vc}->union_field(corpusSigle => $1);
};
}
@@ -430,19 +192,108 @@
next;
};
- $$vc->with($all_vcs{$value});
+ $$vc->union($all_vcs{$value}->clone->to_koral);
}
+ # AND definition
+ elsif ($key eq 'and') {
+ unless (defined $all_vcs{$value}) {
+ # warn 'VC ' . $value . ' not defined';
+ # exit(1);
+ next;
+ };
+
+ $$vc->joint($all_vcs{$value}->clone->to_koral);
+ }
+
+ # Source of the corpus
+ elsif ($key eq 'ql') {
+ # Quellenname, z.B. "Neue Zürcher Zeitung"
+ $$vc->union_field(corpusTitle => $value);
+ }
+
+ elsif ($key eq 'sub') {
+ # "Sub" is the difference - it is the "and not" operation.
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'co') {
+ # Country, z.B. DE für Text in Deutschland erschienen
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'tl') {
+ # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern]
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'ts') {
+ # Textsorte, z.B. "Bericht"
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'th') {
+ # Thema, z.B. "Sport - Fußball"
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'red') {
+ # Reduktionsfaktor
+ # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert
+ # reduziert. Modus: feste Reduzierung, nicht variabel.
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'thprob') {
+ # ThemaProbability
+ # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem
+ # Zuverläßigkeitswert ein Thema übernommen wird
+ }
+
+
# Add reduction value as a comment
elsif ($key eq 'redabs') {
# "red. Anz. Texte
# absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
$$vc->comment('redabs:' . $value);
+ warn $key . ' is not yet supported';
+ }
+
+ # Add reduction value as a comment
+ elsif ($key eq 'date') {
+ # Supports two pattern schemes:
+ # m1=Year1/Month1 bis Year2/Month2
+ # Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12"
+
+ # Schema 1
+ if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
+ my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
+ if ($m1 < 10) {
+ $m1 = '0' . (0+$m1);
+ };
+ if ($m2 < 10) {
+ $m2 = '0' . (0+$m2);
+ };
+ $$vc->from($y1, $m1);
+ $$vc->to($y2, $m2);
+ }
+
+ # Scheme 2
+ elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
+ # m2=Year1-Year2 und Month1-Month2
+ # Datumsbereich Schema 2: z.B. "1990-2000 und 06-06"
+
+ warn 'Second date scheme not yet supported!'
+ }
+
+ else {
+ warn 'Unknown date scheme ' . $value;
+ };
}
# Unknown
else {
- # warn $key . ' is an unknown field';
+ warn $key . ' is an unknown field';
};
};