| #!/usr/bin/env perl | 
 | use strict; | 
 | use warnings; | 
 | use lib 'lib'; | 
 | use KorAP::VirtualCorpus::Group; | 
 |  | 
 | # 2020-05-20 | 
 | #   Preliminary support for C2 def-files. | 
 | # 2020-05-29 | 
 | #   Introduce optimizable object system. | 
 |  | 
 | our $VERSION = 0.1; | 
 |  | 
 | our @ARGV; | 
 |  | 
 | unless (@ARGV) { | 
 |   print <<'HELP'; | 
 | Convert a line-separated list of corpus sigles, doc sigles or | 
 | text sigles into a virtual corpus query. | 
 |  | 
 |   $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz | 
 |   $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz | 
 |  | 
 | HELP | 
 | exit 0; | 
 | }; | 
 |  | 
 |  | 
 | # Shorten long strings for logging | 
 | sub _shorten ($) { | 
 |   my $line = shift; | 
 |   if (length($line) < 20) { | 
 |     return $line; | 
 |   } | 
 |   else { | 
 |     return substr($line,0,17) . '...'; | 
 |   }; | 
 | }; | 
 |  | 
 |  | 
 | my $fh; | 
 | if ($ARGV[0] eq '-') { | 
 |   $fh = *STDIN; | 
 | } elsif (!open($fh, '<' . $ARGV[0])) { | 
 |   warn $ARGV[0] . " can't be opened"; | 
 |   exit(0); | 
 | }; | 
 |  | 
 | # Initial VC group | 
 | my $vc; | 
 |  | 
 | # Create an intensional and an extensional VC | 
 | my $vc_ext = KorAP::VirtualCorpus::Group->new; | 
 | my $vc_int = KorAP::VirtualCorpus::Group->new; | 
 |  | 
 | # Load ext initially | 
 | $$vc = $vc_ext; | 
 |  | 
 | # Collect all virtual corpora | 
 | my %all_vcs; | 
 |  | 
 | my $frozen = 0; | 
 |  | 
 | # Iterate over the whole list | 
 | while (!eof $fh) { | 
 |   my $line = readline($fh); | 
 |   chomp $line; | 
 |  | 
 |  | 
 |   # Skip empty lines | 
 |   if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) { | 
 |     # empty | 
 |     next; | 
 |   }; | 
 |  | 
 |   my ($key, $value, $desc); | 
 |  | 
 |   # Line-Type: <e>c</a> | 
 |   if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) { | 
 |     $key = $1; | 
 |     $value = $2 // undef; | 
 |   } | 
 |  | 
 |   # Line-Type: <e>c | 
 |   elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) { | 
 |     $key = $1; | 
 |     $value = $2; | 
 |   } | 
 |  | 
 |   # Get text sigles | 
 |   elsif ($line =~ m!^(?:\w+\/){2}\w+$!) { | 
 |     $key = 'text'; | 
 |     $value = $line; | 
 |   } | 
 |  | 
 |   # Get doc sigles | 
 |   elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) { | 
 |     $key = 'doc'; | 
 |     $value = $1; | 
 |   } | 
 |  | 
 |   # Get corpus sigles | 
 |   elsif ($line !~ m!(?:\/|\s)!) { | 
 |     $key = 'corpus'; | 
 |     $value = $line; | 
 |   } | 
 |  | 
 |   # Not known | 
 |   else { | 
 |     warn _shorten($line) . q! isn't a valid VC definition!; | 
 |     next; | 
 |   }; | 
 |  | 
 |   # Add text field | 
 |   if ($key eq 'text') { | 
 |  | 
 |     # Convert C2 sigle to KorAP form | 
 |     $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!; | 
 |     ${$vc}->union_field(textSigle => $value); | 
 |   } | 
 |  | 
 |   # Add doc field | 
 |   elsif ($key eq 'doc') { | 
 |     ${$vc}->union_field(docSigle => $value); | 
 |   } | 
 |  | 
 |   # Add corpus field | 
 |   elsif ($key eq 'corpus') { | 
 |     ${$vc}->union_field(corpusSigle => $value); | 
 |   } | 
 |  | 
 |   # Add corpus field | 
 |   elsif ($key eq 'cn') { | 
 |     # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997' | 
 |     if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) { | 
 |       ${$vc}->union_field(corpusSigle => $1); | 
 |     }; | 
 |   } | 
 |  | 
 |   # Mark the vc as frozen | 
 |   # This means that an extended VC area is expected | 
 |   elsif ($key eq 'frozen') { | 
 |     $frozen = 1; | 
 |   } | 
 |  | 
 |   # Start/End intended VC area | 
 |   elsif ($key eq 'intended') { | 
 |     if ($value eq 'start') { | 
 |       $$vc = $vc_int; | 
 |     } | 
 |     elsif ($value ne 'end') { | 
 |       warn 'Unknown intension value ' . $value; | 
 |     }; | 
 |   } | 
 |  | 
 |   # Start/End extended VC area | 
 |   elsif ($key eq 'extended') { | 
 |     if ($value eq 'start') { | 
 |       $$vc = $vc_ext; | 
 |     } | 
 |     elsif ($value ne 'end') { | 
 |       warn 'Unknown extension value ' . $value; | 
 |     }; | 
 |   } | 
 |  | 
 |   # Set VC name | 
 |   elsif ($key eq 'name') { | 
 |     # "Name des virt. Korpus, der angezeigt wird. | 
 |     # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>, | 
 |     # <add>, <sub>" | 
 |  | 
 |     # No global name defined yet | 
 |     if ($$vc && !$$vc->name) { | 
 |       $vc_ext->name($value); | 
 |       $vc_int->name($value); | 
 |       next; | 
 |     }; | 
 |  | 
 |     ${$vc} = KorAP::VirtualCorpus::Group->new; | 
 |     ${$vc}->name($value); | 
 |   } | 
 |  | 
 |   # End VC def | 
 |   elsif ($key eq 'end') { | 
 |     $all_vcs{${$vc}->name} = $$vc; | 
 |     # $vc = undef; | 
 |   } | 
 |  | 
 |   # Add VC definition | 
 |   elsif ($key eq 'add') { | 
 |     unless (defined $all_vcs{$value}) { | 
 |       #       warn 'VC ' . $value . ' not defined'; | 
 |       # exit(1); | 
 |       next; | 
 |     }; | 
 |  | 
 |     $$vc->union($all_vcs{$value}->clone->to_koral); | 
 |   } | 
 |  | 
 |   # AND definition | 
 |   elsif ($key eq 'and') { | 
 |     unless (defined $all_vcs{$value}) { | 
 |       #       warn 'VC ' . $value . ' not defined'; | 
 |       # exit(1); | 
 |       next; | 
 |     }; | 
 |  | 
 |     $$vc->joint($all_vcs{$value}->clone->to_koral); | 
 |   } | 
 |  | 
 |   # Source of the corpus | 
 |   elsif ($key eq 'ql') { | 
 |     # Quellenname, z.B. "Neue Zürcher Zeitung" | 
 |     $$vc->union_field(corpusTitle => $value); | 
 |   } | 
 |  | 
 |   elsif ($key eq 'sub') { | 
 |     # "Sub" is the difference - it is the "and not" operation. | 
 |     warn $key . ' is not yet supported'; | 
 |   } | 
 |  | 
 |   elsif ($key eq 'co') { | 
 |     # Country,	z.B. DE für Text in Deutschland erschienen | 
 |     warn $key . ' is not yet supported'; | 
 |   } | 
 |  | 
 |   elsif ($key eq 'tl') { | 
 |     # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern] | 
 |     warn $key . ' is not yet supported'; | 
 |   } | 
 |  | 
 |   elsif ($key eq 'ts') { | 
 |     # Textsorte, 	z.B. "Bericht" | 
 |     warn $key . ' is not yet supported'; | 
 |   } | 
 |  | 
 |   elsif ($key eq 'th') { | 
 |     # Thema, z.B. "Sport - Fußball" | 
 |     warn $key . ' is not yet supported'; | 
 |   } | 
 |  | 
 |   elsif ($key eq 'red') { | 
 |     # Reduktionsfaktor | 
 |     # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert | 
 |     # reduziert. Modus: feste Reduzierung, nicht variabel. | 
 |     warn $key . ' is not yet supported'; | 
 |   } | 
 |  | 
 |   elsif ($key eq 'thprob') { | 
 |     # ThemaProbability | 
 |     # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem | 
 |     # Zuverläßigkeitswert ein Thema übernommen wird | 
 |   } | 
 |  | 
 |  | 
 |   # Add reduction value as a comment | 
 |   elsif ($key eq 'redabs') { | 
 |     # "red. Anz. Texte | 
 |     # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte" | 
 |     $$vc->comment('redabs:' . $value); | 
 |     warn $key . ' is not yet supported'; | 
 |   } | 
 |  | 
 |   # Add reduction value as a comment | 
 |   elsif ($key eq 'date') { | 
 |     # Supports two pattern schemes: | 
 |     # m1=Year1/Month1 bis Year2/Month2 | 
 |     #   Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12" | 
 |  | 
 |     # Schema 1 | 
 |     if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) { | 
 |       my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4); | 
 |       if ($m1 < 10) { | 
 |         $m1 = '0' . (0+$m1); | 
 |       }; | 
 |       if ($m2 < 10) { | 
 |         $m2 = '0' . (0+$m2); | 
 |       }; | 
 |       $$vc->from($y1, $m1); | 
 |       $$vc->to($y2, $m2); | 
 |     } | 
 |  | 
 |     # Scheme 2 | 
 |     elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) { | 
 |       # m2=Year1-Year2 und Month1-Month2 | 
 |       #   Datumsbereich Schema 2: z.B. "1990-2000 und 06-06" | 
 |  | 
 |       warn 'Second date scheme not yet supported!' | 
 |     } | 
 |  | 
 |     else { | 
 |       warn 'Unknown date scheme ' . $value; | 
 |     }; | 
 |   } | 
 |  | 
 |   # Unknown | 
 |   else { | 
 |     warn $key . ' is an unknown field'; | 
 |   }; | 
 | }; | 
 |  | 
 | close($fh); | 
 |  | 
 | # Stringify current (extended?) virtual corpus | 
 | print $$vc->to_string; |