blob: c9dfeb91f8b12398353d6d6a5cfd6b734adeca4c [file] [log] [blame]
package KorAP::Def;
use KorAP::VirtualCorpus::Group;
use IO::Uncompress::Bunzip2 qw($Bunzip2Error);
use Mojo::Util qw!decode!;
use IO::File;
use strict;
use warnings;
sub new {
my $class = shift;
my $file = shift;
my $self = {};
if (ref $file && ref $file eq 'GLOB') {
$self->{file} = '';
$self->{fh} = $file;
return bless $self, $class;
}
if (!-f $file) {
if (-f $file . '.def') {
$file .= '.def';
}
elsif (-f $file . '.txt') {
$file .= '.txt';
}
elsif (-f $file . '.def.bz2') {
$file .= '.def.bz2';
}
else {
warn 'Unable to load def file from ' . $file;
return;
};
};
if ($file =~ /\.def\.bz2$/) {
$self->{file} = $file;
$self->{fh} = IO::Uncompress::Bunzip2->new($file)
or die "bunzip2 failed: $Bunzip2Error\n";;
}
elsif (-f $file) {
$self->{file} = $file;
# Open def file
if (!open($self->{fh}, '<' . $file)) {
warn $ARGV[0] . " can't be opened";
exit(0);
};
}; # or guess
return bless $self, $class;
};
# Check if the def is frozen
sub frozen_check {
my ($self, $key) = @_;
my $fh = $self->{fh};
while (!eof($fh)) {
my $line = <$fh>;
if (index($line, '<frozen>') > 0) {
return 1;
}
};
return 0;
};
# Parse def file
sub parse {
my $self = shift;
# Initial VC group
my $vc;
# Create an intensional and an extensional VC
my $vc_ext = KorAP::VirtualCorpus::Group->new;
my $vc_int = KorAP::VirtualCorpus::Group->new;
# Load ext initially
$$vc = $vc_ext;
# Collect all virtual corpora
my %all_vcs;
my $frozen = 0;
my $fh = $self->{fh};
# Iterate over the whole list
while (!eof($fh)) {
my $line = <$fh>;
chomp $line;
$line = decode 'latin-1', $line;
# Skip empty lines
if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
# empty
next;
};
my ($key, $value, $desc);
# Line-Type: <e>c</a>
#if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<[^>]+>\s*$/) {
$key = $1;
$value = $2 // undef;
}
# Line-Type: <e>c
elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
$key = $1;
$value = $2;
}
# Get text sigles
elsif ($line =~ m!^\w+\/\w+[\/\.]\w+$!) {
$key = 'text';
$value = $line;
}
# Get doc sigles
elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
$key = 'doc';
$value = $1;
}
# Get corpus sigles
elsif ($line !~ m!(?:\/|\s)!) {
$key = 'corpus';
$value = $line;
}
# Not known
else {
warn _shorten($line) . q! isn't a valid VC definition!;
next;
};
# Add text field
if ($key eq 'text') {
# Convert C2 sigle to KorAP form
$value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
${$vc}->union_field(textSigle => $value);
}
# Add doc field
elsif ($key eq 'doc') {
${$vc}->union_field(docSigle => $value);
}
# Add corpus field
elsif ($key eq 'corpus') {
${$vc}->union_field(corpusSigle => $value);
}
# Add corpus field
elsif ($key eq 'cn') {
# Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
${$vc}->union_field(corpusSigle => $1);
};
}
# Mark the vc as frozen
# This means that an extended VC area is expected
elsif ($key eq 'frozen') {
$frozen = 1;
}
# Start/End intended VC area
elsif ($key eq 'intended') {
if ($value eq 'start') {
$$vc = $vc_int;
}
elsif ($value ne 'end') {
warn 'Unknown intension value ' . $value;
};
}
# Start/End extended VC area
elsif ($key eq 'extended') {
if ($value eq 'start') {
$$vc = $vc_ext;
}
elsif ($value ne 'end') {
warn 'Unknown extension value ' . $value;
};
}
# Set VC name
elsif ($key eq 'name') {
# "Name des virt. Korpus, der angezeigt wird.
# Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
# <add>, <sub>"
# No global name defined yet
if ($$vc && !$$vc->name) {
$vc_ext->name($value);
$vc_int->name($value);
next;
};
${$vc} = KorAP::VirtualCorpus::Group->new;
${$vc}->name($value);
}
# End VC def
elsif ($key eq 'end') {
$all_vcs{${$vc}->name} = $$vc;
# $vc = undef;
}
# Add VC definition
elsif ($key eq 'add') {
unless (defined $all_vcs{$value}) {
# warn 'VC ' . $value . ' not defined';
# exit(1);
next;
};
$$vc->union($all_vcs{$value}->clone->to_koral);
}
# AND definition
elsif ($key eq 'and') {
unless (defined $all_vcs{$value}) {
# warn 'VC ' . $value . ' not defined';
# exit(1);
next;
};
$$vc->joint($all_vcs{$value}->clone->to_koral);
}
# Source of the corpus
elsif ($key eq 'ql') {
# Quellenname, z.B. "Neue Zürcher Zeitung"
$$vc->union_field(corpusTitle => $value);
}
elsif ($key eq 'sub') {
# "Sub" is the difference - it is the "and not" operation.
warn $key . ' is not yet supported';
}
elsif ($key eq 'co') {
# Country, z.B. DE für Text in Deutschland erschienen
${$vc}->union_field(pubPlaceKey => $value);
}
elsif ($key eq 'tl') {
# Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern]
warn $key . ' is not yet supported';
}
elsif ($key eq 'ts') {
# Textsorte, z.B. "Bericht"
warn $key . ' is not yet supported';
}
elsif ($key eq 'th') {
# Thema, z.B. "Sport - Fußball"
warn $key . ' is not yet supported';
}
elsif ($key eq 'red') {
# Reduktionsfaktor
# Wert zw. 1-99%: virt. Korpus wird auf diesen Wert
# reduziert. Modus: feste Reduzierung, nicht variabel.
warn $key . ' is not yet supported';
}
elsif ($key eq 'thprob') {
# ThemaProbability
# Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem
# Zuverläßigkeitswert ein Thema übernommen wird
}
# Add reduction value as a comment
elsif ($key eq 'redabs') {
# "red. Anz. Texte
# absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
$$vc->comment('redabs:' . $value);
warn $key . ' is not yet supported';
}
# Add reduction value as a comment
elsif ($key eq 'date') {
# Supports two pattern schemes:
# m1=Year1/Month1 bis Year2/Month2
# Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12"
# Schema 1
if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
if ($m1 < 10) {
$m1 = '0' . (0+$m1);
};
if ($m2 < 10) {
$m2 = '0' . (0+$m2);
};
$$vc->from($y1, $m1);
$$vc->to($y2, $m2);
}
# Scheme 2
elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
# m2=Year1-Year2 und Month1-Month2
# Datumsbereich Schema 2: z.B. "1990-2000 und 06-06"
warn 'Second date scheme not yet supported!'
}
else {
warn 'Unknown date scheme ' . $value;
};
}
# Unknown
else {
warn $key . ' is an unknown field';
};
};
if ($frozen) {
warn '' . ($self->{file} ? $self->{file} : 'Input') . ' is frozen';
}
$self->{vc} = $vc;
close($fh);
};
sub to_string {
return ${shift->{vc}}->to_string;
};
# Shorten long strings for logging
sub _shorten ($) {
my $line = shift;
if (length($line) < 20) {
return $line;
}
else {
return substr($line,0,17) . '...';
};
};
1;