Turn Def parser into a package
Change-Id: I2afb9d79f3f4a9c8bb9afce728c162af0507bf31
diff --git a/lib/KorAP/Def.pm b/lib/KorAP/Def.pm
new file mode 100644
index 0000000..5a4591d
--- /dev/null
+++ b/lib/KorAP/Def.pm
@@ -0,0 +1,310 @@
+package KorAP::Def;
+use KorAP::VirtualCorpus::Group;
+use strict;
+use warnings;
+
+# Create a parser for a def file. $file is either a GLOB reference to an
+# already opened handle (e.g. \*STDIN) or a path. For paths ending in .def.bz2
+# only the name is stored (no handle yet - TODO); other paths are opened for
+# reading if they are plain files, otherwise the object has no handle.
+sub new {
+  my ($class, $file) = @_;
+  my $self = {};
+  # Already opened handle
+  if (ref $file && ref $file eq 'GLOB') {
+    $self->{file} = '';
+    $self->{fh} = $file;
+  }
+  # Compressed def file
+  elsif ($file =~ /\.def\.bz2$/) {
+    $self->{file} = $file;  # TODO: open a decompressing handle
+  }
+  # Plain def file
+  elsif (-f $file) {
+    $self->{file} = $file;
+    # Three-argument open keeps the path from being parsed as part of the mode
+    if (!open($self->{fh}, '<', $file)) {
+      # Report the path handed to new(), not the script's $ARGV[0]
+      warn $file . " can't be opened";
+      exit(0);
+    };
+  }; # or guess
+
+  return bless $self, $class;
+};
+
+sub parse {
+ my $self = shift;
+ # Reads $self->{fh} line by line, fills the VC groups, stores a reference to the active group in $self->{vc} and closes the handle
+ # Reference to the currently active VC group (always accessed as $$vc)
+ my $vc;
+
+ # Create an intensional and an extensional VC
+ my $vc_ext = KorAP::VirtualCorpus::Group->new;
+ my $vc_int = KorAP::VirtualCorpus::Group->new;
+
+ # Make the extensional VC the active group initially
+ $$vc = $vc_ext;
+
+ # Collect all virtual corpora closed by <end>, keyed by their name
+ my %all_vcs;
+
+ my $frozen = 0; # set by <frozen>, but not evaluated anywhere in this sub
+
+ # Iterate over the whole list
+ while (!eof($self->{fh})) {
+ my $line = readline($self->{fh});
+ chomp $line;
+
+ # Skip empty and whitespace-only lines (a line consisting only of "0" is false and skipped as well)
+ if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
+ # empty
+ next;
+ };
+
+ my ($key, $value, $desc); # $desc is never assigned or read below
+
+ # Line-Type: <e>c</e>
+ if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
+ $key = $1;
+ $value = $2 // undef;
+ }
+
+ # Line-Type: <e>c (element without a closing tag)
+ elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
+ $key = $1;
+ $value = $2;
+ }
+
+ # Get text sigles (three word parts separated by slashes)
+ elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
+ $key = 'text';
+ $value = $line;
+ }
+
+ # Get doc sigles (two word parts separated by a slash, optionally followed by whitespace and further text)
+ elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
+ $key = 'doc';
+ $value = $1;
+ }
+
+ # Get corpus sigles (the line contains neither a slash nor whitespace)
+ elsif ($line !~ m!(?:\/|\s)!) {
+ $key = 'corpus';
+ $value = $line;
+ }
+
+ # Not known
+ else {
+ warn _shorten($line) . q! isn't a valid VC definition!;
+ next;
+ };
+
+ # Add text field
+ if ($key eq 'text') {
+
+ # Convert C2 sigle to KorAP form (the dot following the first slash becomes a slash)
+ $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
+ ${$vc}->union_field(textSigle => $value);
+ }
+
+ # Add doc field
+ elsif ($key eq 'doc') {
+ ${$vc}->union_field(docSigle => $value);
+ }
+
+ # Add corpus field
+ elsif ($key eq 'corpus') {
+ ${$vc}->union_field(corpusSigle => $value);
+ }
+
+ # Add corpus field from a <cn> value
+ elsif ($key eq 'cn') {
+ # Corpus sigle, e.g. 'F97 Frankfurter Allgemeine 1997' - only the part before the first whitespace is used
+ if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
+ ${$vc}->union_field(corpusSigle => $1);
+ };
+ }
+
+ # Mark the vc as frozen
+ # This means that an extended VC area is expected
+ elsif ($key eq 'frozen') {
+ $frozen = 1;
+ }
+
+ # Start/End intended VC area
+ elsif ($key eq 'intended') {
+ if ($value eq 'start') {
+ $$vc = $vc_int;
+ }
+ elsif ($value ne 'end') {
+ warn 'Unknown intension value ' . $value;
+ };
+ }
+
+ # Start/End extended VC area
+ elsif ($key eq 'extended') {
+ if ($value eq 'start') {
+ $$vc = $vc_ext;
+ }
+ elsif ($value ne 'end') {
+ warn 'Unknown extension value ' . $value;
+ };
+ }
+
+ # Set VC name
+ elsif ($key eq 'name') {
+ # "Name of the virtual corpus that is displayed.
+ # It is also referenced internally for building corpora, e.g. for <and>,
+ # <add>, <sub>"
+
+ # The active VC has no name yet: name both predefined VCs and go on with the next line
+ if ($$vc && !$$vc->name) {
+ $vc_ext->name($value);
+ $vc_int->name($value);
+ next;
+ };
+ # Otherwise the name starts a new VC group that becomes the active one
+ ${$vc} = KorAP::VirtualCorpus::Group->new;
+ ${$vc}->name($value);
+ }
+
+ # End VC def: register the active VC under its name
+ elsif ($key eq 'end') {
+ $all_vcs{${$vc}->name} = $$vc;
+ # $vc = undef;
+ }
+
+ # Add VC definition (references to VCs that were not registered are skipped silently)
+ elsif ($key eq 'add') {
+ unless (defined $all_vcs{$value}) {
+ # warn 'VC ' . $value . ' not defined';
+ # exit(1);
+ next;
+ };
+
+ $$vc->union($all_vcs{$value}->clone->to_koral);
+ }
+
+ # AND definition (references to VCs that were not registered are skipped silently)
+ elsif ($key eq 'and') {
+ unless (defined $all_vcs{$value}) {
+ # warn 'VC ' . $value . ' not defined';
+ # exit(1);
+ next;
+ };
+
+ $$vc->joint($all_vcs{$value}->clone->to_koral);
+ }
+
+ # Source of the corpus
+ elsif ($key eq 'ql') {
+ # Source name, e.g. "Neue Zürcher Zeitung"
+ $$vc->union_field(corpusTitle => $value);
+ }
+
+ elsif ($key eq 'sub') {
+ # "Sub" is the difference - it is the "and not" operation.
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'co') {
+ # Country, e.g. DE for texts published in Germany
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'tl') {
+ # Text length, range of texts of the given length [in number of words]
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'ts') {
+ # Text type, e.g. "Bericht"
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'th') {
+ # Topic, e.g. "Sport - Fußball"
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'red') {
+ # Reduction factor
+ # Value between 1-99%: the virtual corpus is reduced to this value.
+ # Mode: fixed reduction, not variable.
+ warn $key . ' is not yet supported';
+ }
+
+ elsif ($key eq 'thprob') {
+ # Topic probability (accepted, but ignored without a warning)
+ # Value used for <th> topics to determine the reliability
+ # threshold from which a topic is adopted
+ }
+
+ # Add reduction value as a comment
+ elsif ($key eq 'redabs') {
+ # "Reduced number of texts:
+ # absolute number of texts to be reached by the reduction"
+ $$vc->comment('redabs:' . $value);
+ warn $key . ' is not yet supported';
+ }
+
+ # Restrict the VC to a date range
+ elsif ($key eq 'date') {
+ # Supports two pattern schemes:
+ # m1=Year1/Month1 bis Year2/Month2
+ # Date range scheme 1: e.g. "2000/01 bis 2010/12"
+
+ # Scheme 1
+ if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
+ my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
+ if ($m1 < 10) {
+ $m1 = '0' . (0+$m1);
+ };
+ if ($m2 < 10) {
+ $m2 = '0' . (0+$m2);
+ };
+ $$vc->from($y1, $m1);
+ $$vc->to($y2, $m2);
+ }
+
+ # Scheme 2
+ elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
+ # m2=Year1-Year2 und Month1-Month2
+ # Date range scheme 2: e.g. "1990-2000 und 06-06"
+
+ warn 'Second date scheme not yet supported!'
+ }
+
+ else {
+ warn 'Unknown date scheme ' . $value;
+ };
+ }
+
+ # Unknown
+ else {
+ warn $key . ' is an unknown field';
+ };
+ };
+ $self->{vc} = $vc;
+
+ close($self->{fh});
+};
+# Serialize the virtual corpus referenced by $self->{vc} (stored by parse)
+sub to_string {
+  my $self = shift; return ${ $self->{vc} }->to_string;
+};
+
+# Shorten long strings for logging.
+# Strings shorter than 20 characters are returned unchanged; longer ones are
+# cut to their first 17 characters plus '...' (20 characters in total).
+# No prototype: parse() calls this sub before it is defined, so a ($) prototype
+# could not be applied there and only raised a "called too early" warning.
+sub _shorten {
+  my $line = shift;
+  return $line if length($line) < 20;
+  return substr($line, 0, 17) . '...';
+};
+
+
+1;
diff --git a/script/cosmasvc2koralquery b/script/cosmasvc2koralquery
index 85986a9..c0b3ed2 100755
--- a/script/cosmasvc2koralquery
+++ b/script/cosmasvc2koralquery
@@ -1,15 +1,17 @@
#!/usr/bin/env perl
use strict;
use warnings;
use lib 'lib';
-use KorAP::VirtualCorpus::Group;
+use KorAP::Def;  # loaded after 'use lib' so that ./lib is already in @INC
# 2020-05-20
# Preliminary support for C2 def-files.
# 2020-05-29
# Introduce optimizable object system.
+# 2024-07-17
+# Add KorAP::Def.
-our $VERSION = 0.1;
+our $VERSION = 0.2;
our @ARGV;
@@ -25,281 +27,15 @@
exit 0;
};
-
-# Shorten long strings for logging
-sub _shorten ($) {
- my $line = shift;
- if (length($line) < 20) {
- return $line;
- }
- else {
- return substr($line,0,17) . '...';
- };
-};
-
-
-my $fh;
+my $def_parser;  # KorAP::Def object for the input named by the first argument
if ($ARGV[0] eq '-') {
- $fh = *STDIN;
-} elsif (!open($fh, '<' . $ARGV[0])) {
- warn $ARGV[0] . " can't be opened";
- exit(0);
+ $def_parser = KorAP::Def->new(\*STDIN);  # '-' means: read the definition from STDIN
+}
+else {
+ $def_parser = KorAP::Def->new($ARGV[0]);  # otherwise the argument is passed on as a file path
};
-# Initial VC group
-my $vc;
-
-# Create an intensional and an extensional VC
-my $vc_ext = KorAP::VirtualCorpus::Group->new;
-my $vc_int = KorAP::VirtualCorpus::Group->new;
-
-# Load ext initially
-$$vc = $vc_ext;
-
-# Collect all virtual corpora
-my %all_vcs;
-
-my $frozen = 0;
-
-# Iterate over the whole list
-while (!eof $fh) {
- my $line = readline($fh);
- chomp $line;
-
-
- # Skip empty lines
- if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
- # empty
- next;
- };
-
- my ($key, $value, $desc);
-
- # Line-Type: <e>c</a>
- if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
- $key = $1;
- $value = $2 // undef;
- }
-
- # Line-Type: <e>c
- elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
- $key = $1;
- $value = $2;
- }
-
- # Get text sigles
- elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
- $key = 'text';
- $value = $line;
- }
-
- # Get doc sigles
- elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
- $key = 'doc';
- $value = $1;
- }
-
- # Get corpus sigles
- elsif ($line !~ m!(?:\/|\s)!) {
- $key = 'corpus';
- $value = $line;
- }
-
- # Not known
- else {
- warn _shorten($line) . q! isn't a valid VC definition!;
- next;
- };
-
- # Add text field
- if ($key eq 'text') {
-
- # Convert C2 sigle to KorAP form
- $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
- ${$vc}->union_field(textSigle => $value);
- }
-
- # Add doc field
- elsif ($key eq 'doc') {
- ${$vc}->union_field(docSigle => $value);
- }
-
- # Add corpus field
- elsif ($key eq 'corpus') {
- ${$vc}->union_field(corpusSigle => $value);
- }
-
- # Add corpus field
- elsif ($key eq 'cn') {
- # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
- if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
- ${$vc}->union_field(corpusSigle => $1);
- };
- }
-
- # Mark the vc as frozen
- # This means that an extended VC area is expected
- elsif ($key eq 'frozen') {
- $frozen = 1;
- }
-
- # Start/End intended VC area
- elsif ($key eq 'intended') {
- if ($value eq 'start') {
- $$vc = $vc_int;
- }
- elsif ($value ne 'end') {
- warn 'Unknown intension value ' . $value;
- };
- }
-
- # Start/End extended VC area
- elsif ($key eq 'extended') {
- if ($value eq 'start') {
- $$vc = $vc_ext;
- }
- elsif ($value ne 'end') {
- warn 'Unknown extension value ' . $value;
- };
- }
-
- # Set VC name
- elsif ($key eq 'name') {
- # "Name des virt. Korpus, der angezeigt wird.
- # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
- # <add>, <sub>"
-
- # No global name defined yet
- if ($$vc && !$$vc->name) {
- $vc_ext->name($value);
- $vc_int->name($value);
- next;
- };
-
- ${$vc} = KorAP::VirtualCorpus::Group->new;
- ${$vc}->name($value);
- }
-
- # End VC def
- elsif ($key eq 'end') {
- $all_vcs{${$vc}->name} = $$vc;
- # $vc = undef;
- }
-
- # Add VC definition
- elsif ($key eq 'add') {
- unless (defined $all_vcs{$value}) {
- # warn 'VC ' . $value . ' not defined';
- # exit(1);
- next;
- };
-
- $$vc->union($all_vcs{$value}->clone->to_koral);
- }
-
- # AND definition
- elsif ($key eq 'and') {
- unless (defined $all_vcs{$value}) {
- # warn 'VC ' . $value . ' not defined';
- # exit(1);
- next;
- };
-
- $$vc->joint($all_vcs{$value}->clone->to_koral);
- }
-
- # Source of the corpus
- elsif ($key eq 'ql') {
- # Quellenname, z.B. "Neue Zürcher Zeitung"
- $$vc->union_field(corpusTitle => $value);
- }
-
- elsif ($key eq 'sub') {
- # "Sub" is the difference - it is the "and not" operation.
- warn $key . ' is not yet supported';
- }
-
- elsif ($key eq 'co') {
- # Country, z.B. DE für Text in Deutschland erschienen
- warn $key . ' is not yet supported';
- }
-
- elsif ($key eq 'tl') {
- # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern]
- warn $key . ' is not yet supported';
- }
-
- elsif ($key eq 'ts') {
- # Textsorte, z.B. "Bericht"
- warn $key . ' is not yet supported';
- }
-
- elsif ($key eq 'th') {
- # Thema, z.B. "Sport - Fußball"
- warn $key . ' is not yet supported';
- }
-
- elsif ($key eq 'red') {
- # Reduktionsfaktor
- # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert
- # reduziert. Modus: feste Reduzierung, nicht variabel.
- warn $key . ' is not yet supported';
- }
-
- elsif ($key eq 'thprob') {
- # ThemaProbability
- # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem
- # Zuverläßigkeitswert ein Thema übernommen wird
- }
-
-
- # Add reduction value as a comment
- elsif ($key eq 'redabs') {
- # "red. Anz. Texte
- # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
- $$vc->comment('redabs:' . $value);
- warn $key . ' is not yet supported';
- }
-
- # Add reduction value as a comment
- elsif ($key eq 'date') {
- # Supports two pattern schemes:
- # m1=Year1/Month1 bis Year2/Month2
- # Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12"
-
- # Schema 1
- if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
- my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
- if ($m1 < 10) {
- $m1 = '0' . (0+$m1);
- };
- if ($m2 < 10) {
- $m2 = '0' . (0+$m2);
- };
- $$vc->from($y1, $m1);
- $$vc->to($y2, $m2);
- }
-
- # Scheme 2
- elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
- # m2=Year1-Year2 und Month1-Month2
- # Datumsbereich Schema 2: z.B. "1990-2000 und 06-06"
-
- warn 'Second date scheme not yet supported!'
- }
-
- else {
- warn 'Unknown date scheme ' . $value;
- };
- }
-
- # Unknown
- else {
- warn $key . ' is an unknown field';
- };
-};
-
-close($fh);
+$def_parser->parse;  # reads the input up to EOF and closes the handle
# Stringify current (extended?) virtual corpus
-print $$vc->to_string;
+print $def_parser->to_string;