| package KorAP::Def; |
| use KorAP::VirtualCorpus::Group; |
| use IO::Uncompress::Bunzip2 qw($Bunzip2Error); |
| use Mojo::Util qw!decode!; |
| use IO::File; |
| use strict; |
| use warnings; |
| |
| sub new { |
| my $class = shift; |
| my $file = shift; |
| |
| my $self = {}; |
| |
| if (ref $file && ref $file eq 'GLOB') { |
| $self->{file} = ''; |
| $self->{fh} = $file; |
| return bless $self, $class; |
| } |
| |
| if (!-f $file) { |
| if (-f $file . '.def') { |
| $file .= '.def'; |
| } |
| |
| elsif (-f $file . '.txt') { |
| $file .= '.txt'; |
| } |
| |
| elsif (-f $file . '.def.bz2') { |
| $file .= '.def.bz2'; |
| } |
| |
| else { |
| warn 'Unable to load def file from ' . $file; |
| return; |
| }; |
| }; |
| |
| if ($file =~ /\.def\.bz2$/) { |
| $self->{file} = $file; |
| $self->{fh} = IO::Uncompress::Bunzip2->new($file) |
| or die "bunzip2 failed: $Bunzip2Error\n";; |
| } |
| |
| elsif (-f $file) { |
| $self->{file} = $file; |
| |
| # Open def file |
| if (!open($self->{fh}, '<' . $file)) { |
| warn $ARGV[0] . " can't be opened"; |
| exit(0); |
| }; |
| }; # or guess |
| |
| return bless $self, $class; |
| }; |
| |
| |
| # Check if the def is frozen |
| sub frozen_check { |
| my ($self, $key) = @_; |
| my $fh = $self->{fh}; |
| while (!eof($fh)) { |
| my $line = <$fh>; |
| if (index($line, '<frozen>') > 0) { |
| return 1; |
| } |
| }; |
| return 0; |
| }; |
| |
| |
| # Parse def file |
| sub parse { |
| my $self = shift; |
| |
| # Initial VC group |
| my $vc; |
| |
| # Create an intensional and an extensional VC |
| my $vc_ext = KorAP::VirtualCorpus::Group->new; |
| my $vc_int = KorAP::VirtualCorpus::Group->new; |
| |
| # Load ext initially |
| $$vc = $vc_ext; |
| |
| # Collect all virtual corpora |
| my %all_vcs; |
| |
| my $frozen = 0; |
| |
| my $fh = $self->{fh}; |
| |
| # Iterate over the whole list |
| while (!eof($fh)) { |
| my $line = <$fh>; |
| chomp $line; |
| $line = decode 'latin-1', $line; |
| |
| # Skip empty lines |
| if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) { |
| # empty |
| next; |
| }; |
| |
| my ($key, $value, $desc); |
| |
| # Line-Type: <e>c</a> |
| #if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) { |
| if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<[^>]+>\s*$/) { |
| $key = $1; |
| $value = $2 // undef; |
| } |
| |
| # Line-Type: <e>c |
| elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) { |
| $key = $1; |
| $value = $2; |
| } |
| |
| # Get text sigles |
| elsif ($line =~ m!^\w+\/\w+[\/\.]\w+$!) { |
| $key = 'text'; |
| $value = $line; |
| } |
| |
| # Get doc sigles |
| elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) { |
| $key = 'doc'; |
| $value = $1; |
| } |
| |
| # Get corpus sigles |
| elsif ($line !~ m!(?:\/|\s)!) { |
| $key = 'corpus'; |
| $value = $line; |
| } |
| |
| # Not known |
| else { |
| warn _shorten($line) . q! isn't a valid VC definition!; |
| next; |
| }; |
| |
| # Add text field |
| if ($key eq 'text') { |
| |
| # Convert C2 sigle to KorAP form |
| $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!; |
| ${$vc}->union_field(textSigle => $value); |
| } |
| |
| # Add doc field |
| elsif ($key eq 'doc') { |
| ${$vc}->union_field(docSigle => $value); |
| } |
| |
| # Add corpus field |
| elsif ($key eq 'corpus') { |
| ${$vc}->union_field(corpusSigle => $value); |
| } |
| |
| # Add corpus field |
| elsif ($key eq 'cn') { |
| # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997' |
| if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) { |
| ${$vc}->union_field(corpusSigle => $1); |
| }; |
| } |
| |
| # Mark the vc as frozen |
| # This means that an extended VC area is expected |
| elsif ($key eq 'frozen') { |
| $frozen = 1; |
| } |
| |
| # Start/End intended VC area |
| elsif ($key eq 'intended') { |
| if ($value eq 'start') { |
| $$vc = $vc_int; |
| } |
| elsif ($value ne 'end') { |
| warn 'Unknown intension value ' . $value; |
| }; |
| } |
| |
| # Start/End extended VC area |
| elsif ($key eq 'extended') { |
| if ($value eq 'start') { |
| $$vc = $vc_ext; |
| } |
| elsif ($value ne 'end') { |
| warn 'Unknown extension value ' . $value; |
| }; |
| } |
| |
| # Set VC name |
| elsif ($key eq 'name') { |
| # "Name des virt. Korpus, der angezeigt wird. |
| # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>, |
| # <add>, <sub>" |
| |
| # No global name defined yet |
| if ($$vc && !$$vc->name) { |
| $vc_ext->name($value); |
| $vc_int->name($value); |
| next; |
| }; |
| |
| ${$vc} = KorAP::VirtualCorpus::Group->new; |
| ${$vc}->name($value); |
| } |
| |
| # End VC def |
| elsif ($key eq 'end') { |
| $all_vcs{${$vc}->name} = $$vc; |
| # $vc = undef; |
| } |
| |
| # Add VC definition |
| elsif ($key eq 'add') { |
| unless (defined $all_vcs{$value}) { |
| # warn 'VC ' . $value . ' not defined'; |
| # exit(1); |
| next; |
| }; |
| |
| $$vc->union($all_vcs{$value}->clone->to_koral); |
| } |
| |
| # AND definition |
| elsif ($key eq 'and') { |
| unless (defined $all_vcs{$value}) { |
| # warn 'VC ' . $value . ' not defined'; |
| # exit(1); |
| next; |
| }; |
| |
| $$vc->joint($all_vcs{$value}->clone->to_koral); |
| } |
| |
| # Source of the corpus |
| elsif ($key eq 'ql') { |
| # Quellenname, z.B. "Neue Zürcher Zeitung" |
| $$vc->union_field(corpusTitle => $value); |
| } |
| |
| elsif ($key eq 'sub') { |
| # "Sub" is the difference - it is the "and not" operation. |
| warn $key . ' is not yet supported'; |
| } |
| |
| elsif ($key eq 'co') { |
| # Country, z.B. DE für Text in Deutschland erschienen |
| ${$vc}->union_field(pubPlaceKey => $value); |
| } |
| |
| elsif ($key eq 'tl') { |
| # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern] |
| warn $key . ' is not yet supported'; |
| } |
| |
| elsif ($key eq 'ts') { |
| # Textsorte, z.B. "Bericht" |
| warn $key . ' is not yet supported'; |
| } |
| |
| elsif ($key eq 'th') { |
| # Thema, z.B. "Sport - Fußball" |
| warn $key . ' is not yet supported'; |
| } |
| |
| elsif ($key eq 'red') { |
| # Reduktionsfaktor |
| # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert |
| # reduziert. Modus: feste Reduzierung, nicht variabel. |
| warn $key . ' is not yet supported'; |
| } |
| |
| elsif ($key eq 'thprob') { |
| # ThemaProbability |
| # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem |
| # Zuverläßigkeitswert ein Thema übernommen wird |
| } |
| |
| # Add reduction value as a comment |
| elsif ($key eq 'redabs') { |
| # "red. Anz. Texte |
| # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte" |
| $$vc->comment('redabs:' . $value); |
| warn $key . ' is not yet supported'; |
| } |
| |
| # Add reduction value as a comment |
| elsif ($key eq 'date') { |
| # Supports two pattern schemes: |
| # m1=Year1/Month1 bis Year2/Month2 |
| # Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12" |
| |
| # Schema 1 |
| if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) { |
| my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4); |
| if ($m1 < 10) { |
| $m1 = '0' . (0+$m1); |
| }; |
| if ($m2 < 10) { |
| $m2 = '0' . (0+$m2); |
| }; |
| $$vc->from($y1, $m1); |
| $$vc->to($y2, $m2); |
| } |
| |
| # Scheme 2 |
| elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) { |
| # m2=Year1-Year2 und Month1-Month2 |
| # Datumsbereich Schema 2: z.B. "1990-2000 und 06-06" |
| |
| warn 'Second date scheme not yet supported!' |
| } |
| |
| else { |
| warn 'Unknown date scheme ' . $value; |
| }; |
| } |
| |
| # Unknown |
| else { |
| warn $key . ' is an unknown field'; |
| }; |
| }; |
| |
| if ($frozen) { |
| warn '' . ($self->{file} ? $self->{file} : 'Input') . ' is frozen'; |
| } |
| $self->{vc} = $vc; |
| |
| close($fh); |
| }; |
| |
| sub to_string { |
| return ${shift->{vc}}->to_string; |
| }; |
| |
| # Shorten long strings for logging |
| sub _shorten ($) { |
| my $line = shift; |
| if (length($line) < 20) { |
| return $line; |
| } |
| else { |
| return substr($line,0,17) . '...'; |
| }; |
| }; |
| |
| |
| 1; |