|  | #!/usr/bin/env perl | 
|  | use strict; | 
|  | use warnings; | 
|  | use lib 'lib'; | 
|  | use KorAP::VirtualCorpus::Group; | 
|  |  | 
|  | # 2020-05-20 | 
|  | #   Preliminary support for C2 def-files. | 
|  | # 2020-05-29 | 
|  | #   Introduce optimizable object system. | 
|  |  | 
|  | our @ARGV; | 
|  |  | 
|  | unless (@ARGV) { | 
|  | print <<'HELP'; | 
|  | Convert a line-separated list of corpus sigles, doc sigles or | 
|  | text sigles into a virtual corpus query. | 
|  |  | 
|  | $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz | 
|  | $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz | 
|  |  | 
|  | HELP | 
|  | exit 0; | 
|  | }; | 
|  |  | 
|  |  | 
|  | # Shorten long strings for logging | 
|  | sub _shorten ($) { | 
|  | my $line = shift; | 
|  | if (length($line) < 20) { | 
|  | return $line; | 
|  | } | 
|  | else { | 
|  | return substr($line,0,17) . '...'; | 
|  | }; | 
|  | }; | 
|  |  | 
|  |  | 
|  | my $fh; | 
|  | if ($ARGV[0] eq '-') { | 
|  | $fh = *STDIN; | 
|  | } elsif (!open($fh, '<' . $ARGV[0])) { | 
|  | warn $ARGV[0] . " can't be opened"; | 
|  | exit(0); | 
|  | }; | 
|  |  | 
|  | # Initial VC group | 
|  | my $vc; | 
|  |  | 
|  | # Create an intensional and an extensional VC | 
|  | my $vc_ext = KorAP::VirtualCorpus::Group->new; | 
|  | my $vc_int = KorAP::VirtualCorpus::Group->new; | 
|  |  | 
|  | # Load ext initially | 
|  | $$vc = $vc_ext; | 
|  |  | 
|  | # Collect all virtual corpora | 
|  | my %all_vcs; | 
|  |  | 
|  | my $frozen = 0; | 
|  |  | 
|  | # Iterate over the whole list | 
|  | while (!eof $fh) { | 
|  | my $line = readline($fh); | 
|  | chomp $line; | 
|  |  | 
|  |  | 
|  | # Skip empty lines | 
|  | if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) { | 
|  | # empty | 
|  | next; | 
|  | }; | 
|  |  | 
|  | my ($key, $value, $desc); | 
|  |  | 
|  | # Line-Type: <e>c</a> | 
|  | if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) { | 
|  | $key = $1; | 
|  | $value = $2 // undef; | 
|  | } | 
|  |  | 
|  | # Line-Type: <e>c | 
|  | elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) { | 
|  | $key = $1; | 
|  | $value = $2; | 
|  | } | 
|  |  | 
|  | # Get text sigles | 
|  | elsif ($line =~ m!^(?:\w+\/){2}\w+$!) { | 
|  | $key = 'text'; | 
|  | $value = $line; | 
|  | } | 
|  |  | 
|  | # Get doc sigles | 
|  | elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) { | 
|  | $key = 'doc'; | 
|  | $value = $1; | 
|  | } | 
|  |  | 
|  | # Get corpus sigles | 
|  | elsif ($line !~ m!(?:\/|\s)!) { | 
|  | $key = 'corpus'; | 
|  | $value = $line; | 
|  | } | 
|  |  | 
|  | # Not known | 
|  | else { | 
|  | warn _shorten($line) . q! isn't a valid VC definition!; | 
|  | next; | 
|  | }; | 
|  |  | 
|  | # Add text field | 
|  | if ($key eq 'text') { | 
|  |  | 
|  | # Convert C2 sigle to KorAP form | 
|  | $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!; | 
|  | ${$vc}->union_field(textSigle => $value); | 
|  | } | 
|  |  | 
|  | # Add doc field | 
|  | elsif ($key eq 'doc') { | 
|  | ${$vc}->union_field(docSigle => $value); | 
|  | } | 
|  |  | 
|  | # Add corpus field | 
|  | elsif ($key eq 'corpus') { | 
|  | ${$vc}->union_field(corpusSigle => $value); | 
|  | } | 
|  |  | 
|  | # Add corpus field | 
|  | elsif ($key eq 'cn') { | 
|  | # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997' | 
|  | if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) { | 
|  | ${$vc}->union_field(corpusSigle => $1); | 
|  | }; | 
|  | } | 
|  |  | 
|  | # Mark the vc as frozen | 
|  | # This means that an extended VC area is expected | 
|  | elsif ($key eq 'frozen') { | 
|  | $frozen = 1; | 
|  | } | 
|  |  | 
|  | # Start/End intended VC area | 
|  | elsif ($key eq 'intended') { | 
|  | if ($value eq 'start') { | 
|  | $$vc = $vc_int; | 
|  | } | 
|  | elsif ($value ne 'end') { | 
|  | warn 'Unknown intension value ' . $value; | 
|  | }; | 
|  | } | 
|  |  | 
|  | # Start/End extended VC area | 
|  | elsif ($key eq 'extended') { | 
|  | if ($value eq 'start') { | 
|  | $$vc = $vc_ext; | 
|  | } | 
|  | elsif ($value ne 'end') { | 
|  | warn 'Unknown extension value ' . $value; | 
|  | }; | 
|  | } | 
|  |  | 
|  | # Set VC name | 
|  | elsif ($key eq 'name') { | 
|  | # "Name des virt. Korpus, der angezeigt wird. | 
|  | # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>, | 
|  | # <add>, <sub>" | 
|  |  | 
|  | # No global name defined yet | 
|  | if ($$vc && !$$vc->name) { | 
|  | $vc_ext->name($value); | 
|  | $vc_int->name($value); | 
|  | next; | 
|  | }; | 
|  |  | 
|  | ${$vc} = KorAP::VirtualCorpus::Group->new; | 
|  | ${$vc}->name($value); | 
|  | } | 
|  |  | 
|  | # End VC def | 
|  | elsif ($key eq 'end') { | 
|  | $all_vcs{${$vc}->name} = $$vc; | 
|  | # $vc = undef; | 
|  | } | 
|  |  | 
|  | # Add VC definition | 
|  | elsif ($key eq 'add') { | 
|  | unless (defined $all_vcs{$value}) { | 
|  | #       warn 'VC ' . $value . ' not defined'; | 
|  | # exit(1); | 
|  | next; | 
|  | }; | 
|  |  | 
|  | $$vc->union($all_vcs{$value}->clone->to_koral); | 
|  | } | 
|  |  | 
|  | # AND definition | 
|  | elsif ($key eq 'and') { | 
|  | unless (defined $all_vcs{$value}) { | 
|  | #       warn 'VC ' . $value . ' not defined'; | 
|  | # exit(1); | 
|  | next; | 
|  | }; | 
|  |  | 
|  | $$vc->joint($all_vcs{$value}->clone->to_koral); | 
|  | } | 
|  |  | 
|  | # Source of the corpus | 
|  | elsif ($key eq 'ql') { | 
|  | # Quellenname, z.B. "Neue Zürcher Zeitung" | 
|  | $$vc->union_field(corpusTitle => $value); | 
|  | } | 
|  |  | 
|  | elsif ($key eq 'sub') { | 
|  | # "Sub" is the difference - it is the "and not" operation. | 
|  | warn $key . ' is not yet supported'; | 
|  | } | 
|  |  | 
|  | elsif ($key eq 'co') { | 
|  | # Country,	z.B. DE für Text in Deutschland erschienen | 
|  | warn $key . ' is not yet supported'; | 
|  | } | 
|  |  | 
|  | elsif ($key eq 'tl') { | 
|  | # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern] | 
|  | warn $key . ' is not yet supported'; | 
|  | } | 
|  |  | 
|  | elsif ($key eq 'ts') { | 
|  | # Textsorte, 	z.B. "Bericht" | 
|  | warn $key . ' is not yet supported'; | 
|  | } | 
|  |  | 
|  | elsif ($key eq 'th') { | 
|  | # Thema, z.B. "Sport - Fußball" | 
|  | warn $key . ' is not yet supported'; | 
|  | } | 
|  |  | 
|  | elsif ($key eq 'red') { | 
|  | # Reduktionsfaktor | 
|  | # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert | 
|  | # reduziert. Modus: feste Reduzierung, nicht variabel. | 
|  | warn $key . ' is not yet supported'; | 
|  | } | 
|  |  | 
|  | elsif ($key eq 'thprob') { | 
|  | # ThemaProbability | 
|  | # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem | 
|  | # Zuverläßigkeitswert ein Thema übernommen wird | 
|  | } | 
|  |  | 
|  |  | 
|  | # Add reduction value as a comment | 
|  | elsif ($key eq 'redabs') { | 
|  | # "red. Anz. Texte | 
|  | # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte" | 
|  | $$vc->comment('redabs:' . $value); | 
|  | warn $key . ' is not yet supported'; | 
|  | } | 
|  |  | 
|  | # Add reduction value as a comment | 
|  | elsif ($key eq 'date') { | 
|  | # Supports two pattern schemes: | 
|  | # m1=Year1/Month1 bis Year2/Month2 | 
|  | #   Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12" | 
|  |  | 
|  | # Schema 1 | 
|  | if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) { | 
|  | my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4); | 
|  | if ($m1 < 10) { | 
|  | $m1 = '0' . (0+$m1); | 
|  | }; | 
|  | if ($m2 < 10) { | 
|  | $m2 = '0' . (0+$m2); | 
|  | }; | 
|  | $$vc->from($y1, $m1); | 
|  | $$vc->to($y2, $m2); | 
|  | } | 
|  |  | 
|  | # Scheme 2 | 
|  | elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) { | 
|  | # m2=Year1-Year2 und Month1-Month2 | 
|  | #   Datumsbereich Schema 2: z.B. "1990-2000 und 06-06" | 
|  |  | 
|  | warn 'Second date scheme not yet supported!' | 
|  | } | 
|  |  | 
|  | else { | 
|  | warn 'Unknown date scheme ' . $value; | 
|  | }; | 
|  | } | 
|  |  | 
|  | # Unknown | 
|  | else { | 
|  | warn $key . ' is an unknown field'; | 
|  | }; | 
|  | }; | 
|  |  | 
|  | close($fh); | 
|  |  | 
|  | # Stringify current (extended?) virtual corpus | 
|  | print $$vc->to_string; |