| Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
| Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 2 | use strict; | 
 | 3 | use warnings; | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 4 | use lib 'lib'; | 
 | 5 | use KorAP::VirtualCorpus::Group; | 
| Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 6 |  | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 7 | # 2020-05-20 | 
 | 8 | #   Preliminary support for C2 def-files. | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 9 | # 2020-05-29 | 
 | 10 | #   Introduce optimizable object system. | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 11 |  | 
| Akron | dd0aa3a | 2024-04-10 11:03:38 +0200 | [diff] [blame^] | 12 | our $VERSION = 0.1; | 
 | 13 |  | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 14 | our @ARGV; | 
 | 15 |  | 
| Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 16 | unless (@ARGV) { | 
 | 17 |   print <<'HELP'; | 
 | 18 | Convert a line-separated list of corpus sigles, doc sigles or | 
 | 19 | text sigles into a virtual corpus query. | 
 | 20 |  | 
 | 21 |   $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 22 |   $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz | 
| Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 23 |  | 
 | 24 | HELP | 
 | 25 | exit 0; | 
 | 26 | }; | 
 | 27 |  | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 28 |  | 
| Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 29 | # Shorten long strings for logging | 
| Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 30 | sub _shorten ($) { | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 31 |   my $line = shift; | 
 | 32 |   if (length($line) < 20) { | 
 | 33 |     return $line; | 
 | 34 |   } | 
 | 35 |   else { | 
 | 36 |     return substr($line,0,17) . '...'; | 
 | 37 |   }; | 
 | 38 | }; | 
 | 39 |  | 
 | 40 |  | 
| Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 41 | my $fh; | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 42 | if ($ARGV[0] eq '-') { | 
 | 43 |   $fh = *STDIN; | 
 | 44 | } elsif (!open($fh, '<' . $ARGV[0])) { | 
| Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 45 |   warn $ARGV[0] . " can't be opened"; | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 46 |   exit(0); | 
| Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 47 | }; | 
 | 48 |  | 
| Akron | 286b46e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 49 | # Initial VC group | 
 | 50 | my $vc; | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 51 |  | 
| Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 52 | # Create an intensional and an extensional VC | 
| Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 53 | my $vc_ext = KorAP::VirtualCorpus::Group->new; | 
 | 54 | my $vc_int = KorAP::VirtualCorpus::Group->new; | 
| Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 55 |  | 
| Akron | 286b46e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 56 | # Load ext initially | 
 | 57 | $$vc = $vc_ext; | 
| Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 58 |  | 
| Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 59 | # Collect all virtual corpora | 
 | 60 | my %all_vcs; | 
 | 61 |  | 
| Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 62 | my $frozen = 0; | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 63 |  | 
 | 64 | # Iterate over the whole list | 
 | 65 | while (!eof $fh) { | 
 | 66 |   my $line = readline($fh); | 
 | 67 |   chomp $line; | 
 | 68 |  | 
| Akron | e2645ec | 2020-05-20 12:37:25 +0200 | [diff] [blame] | 69 |  | 
 | 70 |   # Skip empty lines | 
 | 71 |   if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) { | 
 | 72 |     # empty | 
 | 73 |     next; | 
 | 74 |   }; | 
 | 75 |  | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 76 |   my ($key, $value, $desc); | 
 | 77 |  | 
 | 78 |   # Line-Type: <e>c</a> | 
 | 79 |   if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) { | 
 | 80 |     $key = $1; | 
 | 81 |     $value = $2 // undef; | 
 | 82 |   } | 
 | 83 |  | 
 | 84 |   # Line-Type: <e>c | 
 | 85 |   elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) { | 
 | 86 |     $key = $1; | 
 | 87 |     $value = $2; | 
 | 88 |   } | 
 | 89 |  | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 90 |   # Get text sigles | 
| Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 91 |   elsif ($line =~ m!^(?:\w+\/){2}\w+$!) { | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 92 |     $key = 'text'; | 
 | 93 |     $value = $line; | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 94 |   } | 
 | 95 |  | 
 | 96 |   # Get doc sigles | 
| Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 97 |   elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) { | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 98 |     $key = 'doc'; | 
 | 99 |     $value = $1; | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 100 |   } | 
 | 101 |  | 
 | 102 |   # Get corpus sigles | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 103 |   elsif ($line !~ m!(?:\/|\s)!) { | 
 | 104 |     $key = 'corpus'; | 
 | 105 |     $value = $line; | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 106 |   } | 
 | 107 |  | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 108 |   # Not known | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 109 |   else { | 
| Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 110 |     warn _shorten($line) . q! isn't a valid VC definition!; | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 111 |     next; | 
 | 112 |   }; | 
 | 113 |  | 
| Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 114 |   # Add text field | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 115 |   if ($key eq 'text') { | 
| Akron | 68746a1 | 2020-05-20 15:19:55 +0200 | [diff] [blame] | 116 |  | 
 | 117 |     # Convert C2 sigle to KorAP form | 
 | 118 |     $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!; | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 119 |     ${$vc}->union_field(textSigle => $value); | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 120 |   } | 
 | 121 |  | 
| Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 122 |   # Add doc field | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 123 |   elsif ($key eq 'doc') { | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 124 |     ${$vc}->union_field(docSigle => $value); | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 125 |   } | 
 | 126 |  | 
| Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 127 |   # Add corpus field | 
| Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 128 |   elsif ($key eq 'corpus') { | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 129 |     ${$vc}->union_field(corpusSigle => $value); | 
| Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 130 |   } | 
 | 131 |  | 
 | 132 |   # Add corpus field | 
 | 133 |   elsif ($key eq 'cn') { | 
 | 134 |     # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997' | 
 | 135 |     if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) { | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 136 |       ${$vc}->union_field(corpusSigle => $1); | 
| Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 137 |     }; | 
| Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 138 |   } | 
 | 139 |  | 
 | 140 |   # Mark the vc as frozen | 
 | 141 |   # This means that an extended VC area is expected | 
 | 142 |   elsif ($key eq 'frozen') { | 
 | 143 |     $frozen = 1; | 
 | 144 |   } | 
 | 145 |  | 
 | 146 |   # Start/End intended VC area | 
 | 147 |   elsif ($key eq 'intended') { | 
 | 148 |     if ($value eq 'start') { | 
 | 149 |       $$vc = $vc_int; | 
 | 150 |     } | 
 | 151 |     elsif ($value ne 'end') { | 
 | 152 |       warn 'Unknown intension value ' . $value; | 
 | 153 |     }; | 
 | 154 |   } | 
 | 155 |  | 
 | 156 |   # Start/End extended VC area | 
 | 157 |   elsif ($key eq 'extended') { | 
 | 158 |     if ($value eq 'start') { | 
 | 159 |       $$vc = $vc_ext; | 
 | 160 |     } | 
 | 161 |     elsif ($value ne 'end') { | 
 | 162 |       warn 'Unknown extension value ' . $value; | 
 | 163 |     }; | 
 | 164 |   } | 
| Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 165 |  | 
 | 166 |   # Set VC name | 
 | 167 |   elsif ($key eq 'name') { | 
 | 168 |     # "Name des virt. Korpus, der angezeigt wird. | 
 | 169 |     # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>, | 
 | 170 |     # <add>, <sub>" | 
 | 171 |  | 
 | 172 |     # No global name defined yet | 
| Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 173 |     if ($$vc && !$$vc->name) { | 
| Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 174 |       $vc_ext->name($value); | 
 | 175 |       $vc_int->name($value); | 
 | 176 |       next; | 
 | 177 |     }; | 
| Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 178 |  | 
 | 179 |     ${$vc} = KorAP::VirtualCorpus::Group->new; | 
 | 180 |     ${$vc}->name($value); | 
 | 181 |   } | 
 | 182 |  | 
 | 183 |   # End VC def | 
 | 184 |   elsif ($key eq 'end') { | 
 | 185 |     $all_vcs{${$vc}->name} = $$vc; | 
 | 186 |     # $vc = undef; | 
 | 187 |   } | 
 | 188 |  | 
 | 189 |   # Add VC definition | 
 | 190 |   elsif ($key eq 'add') { | 
 | 191 |     unless (defined $all_vcs{$value}) { | 
 | 192 |       #       warn 'VC ' . $value . ' not defined'; | 
 | 193 |       # exit(1); | 
 | 194 |       next; | 
 | 195 |     }; | 
 | 196 |  | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 197 |     $$vc->union($all_vcs{$value}->clone->to_koral); | 
| Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 198 |   } | 
 | 199 |  | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 200 |   # AND definition | 
 | 201 |   elsif ($key eq 'and') { | 
 | 202 |     unless (defined $all_vcs{$value}) { | 
 | 203 |       #       warn 'VC ' . $value . ' not defined'; | 
 | 204 |       # exit(1); | 
 | 205 |       next; | 
 | 206 |     }; | 
 | 207 |  | 
 | 208 |     $$vc->joint($all_vcs{$value}->clone->to_koral); | 
 | 209 |   } | 
 | 210 |  | 
 | 211 |   # Source of the corpus | 
 | 212 |   elsif ($key eq 'ql') { | 
 | 213 |     # Quellenname, z.B. "Neue Zürcher Zeitung" | 
 | 214 |     $$vc->union_field(corpusTitle => $value); | 
 | 215 |   } | 
 | 216 |  | 
 | 217 |   elsif ($key eq 'sub') { | 
 | 218 |     # "Sub" is the difference - it is the "and not" operation. | 
 | 219 |     warn $key . ' is not yet supported'; | 
 | 220 |   } | 
 | 221 |  | 
 | 222 |   elsif ($key eq 'co') { | 
 | 223 |     # Country,	z.B. DE für Text in Deutschland erschienen | 
 | 224 |     warn $key . ' is not yet supported'; | 
 | 225 |   } | 
 | 226 |  | 
 | 227 |   elsif ($key eq 'tl') { | 
 | 228 |     # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern] | 
 | 229 |     warn $key . ' is not yet supported'; | 
 | 230 |   } | 
 | 231 |  | 
 | 232 |   elsif ($key eq 'ts') { | 
 | 233 |     # Textsorte, 	z.B. "Bericht" | 
 | 234 |     warn $key . ' is not yet supported'; | 
 | 235 |   } | 
 | 236 |  | 
 | 237 |   elsif ($key eq 'th') { | 
 | 238 |     # Thema, z.B. "Sport - Fußball" | 
 | 239 |     warn $key . ' is not yet supported'; | 
 | 240 |   } | 
 | 241 |  | 
 | 242 |   elsif ($key eq 'red') { | 
 | 243 |     # Reduktionsfaktor | 
 | 244 |     # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert | 
 | 245 |     # reduziert. Modus: feste Reduzierung, nicht variabel. | 
 | 246 |     warn $key . ' is not yet supported'; | 
 | 247 |   } | 
 | 248 |  | 
 | 249 |   elsif ($key eq 'thprob') { | 
 | 250 |     # ThemaProbability | 
 | 251 |     # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem | 
 | 252 |     # Zuverläßigkeitswert ein Thema übernommen wird | 
 | 253 |   } | 
 | 254 |  | 
 | 255 |  | 
| Akron | 286b46e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 256 |   # Add reduction value as a comment | 
 | 257 |   elsif ($key eq 'redabs') { | 
 | 258 |     # "red. Anz. Texte | 
 | 259 |     # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte" | 
 | 260 |     $$vc->comment('redabs:' . $value); | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 261 |     warn $key . ' is not yet supported'; | 
 | 262 |   } | 
 | 263 |  | 
 | 264 |   # Add reduction value as a comment | 
 | 265 |   elsif ($key eq 'date') { | 
 | 266 |     # Supports two pattern schemes: | 
 | 267 |     # m1=Year1/Month1 bis Year2/Month2 | 
 | 268 |     #   Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12" | 
 | 269 |  | 
 | 270 |     # Schema 1 | 
 | 271 |     if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) { | 
 | 272 |       my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4); | 
 | 273 |       if ($m1 < 10) { | 
 | 274 |         $m1 = '0' . (0+$m1); | 
 | 275 |       }; | 
 | 276 |       if ($m2 < 10) { | 
 | 277 |         $m2 = '0' . (0+$m2); | 
 | 278 |       }; | 
 | 279 |       $$vc->from($y1, $m1); | 
 | 280 |       $$vc->to($y2, $m2); | 
 | 281 |     } | 
 | 282 |  | 
 | 283 |     # Scheme 2 | 
 | 284 |     elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) { | 
 | 285 |       # m2=Year1-Year2 und Month1-Month2 | 
 | 286 |       #   Datumsbereich Schema 2: z.B. "1990-2000 und 06-06" | 
 | 287 |  | 
 | 288 |       warn 'Second date scheme not yet supported!' | 
 | 289 |     } | 
 | 290 |  | 
 | 291 |     else { | 
 | 292 |       warn 'Unknown date scheme ' . $value; | 
 | 293 |     }; | 
| Akron | 286b46e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 294 |   } | 
 | 295 |  | 
| Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 296 |   # Unknown | 
 | 297 |   else { | 
| Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 298 |     warn $key . ' is an unknown field'; | 
| Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 299 |   }; | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 300 | }; | 
 | 301 |  | 
| Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 302 | close($fh); | 
 | 303 |  | 
| Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 304 | # Stringify current (extended?) virtual corpus | 
| Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 305 | print $$vc->to_string; |