Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 1 | #!/usr/bin/env perl |
Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 2 | use strict; |
| 3 | use warnings; |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 4 | use lib 'lib'; |
| 5 | use KorAP::VirtualCorpus::Group; |
Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 6 | |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 7 | # 2020-05-20 |
| 8 | # Preliminary support for C2 def-files. |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 9 | # 2020-05-29 |
| 10 | # Introduce optimizable object system. |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 11 | |
Akron | dd0aa3a | 2024-04-10 11:03:38 +0200 | [diff] [blame] | 12 | our $VERSION = 0.1; |
| 13 | |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 14 | our @ARGV; |
| 15 | |
Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 16 | unless (@ARGV) { |
| 17 | print <<'HELP'; |
| 18 | Convert a line-separated list of corpus sigles, doc sigles or |
| 19 | text sigles into a virtual corpus query. |
| 20 | |
| 21 | $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 22 | $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz |
Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 23 | |
| 24 | HELP |
| 25 | exit 0; |
| 26 | }; |
| 27 | |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 28 | |
Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 29 | # Shorten long strings for logging |
Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 30 | sub _shorten ($) { |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 31 | my $line = shift; |
| 32 | if (length($line) < 20) { |
| 33 | return $line; |
| 34 | } |
| 35 | else { |
| 36 | return substr($line,0,17) . '...'; |
| 37 | }; |
| 38 | }; |
| 39 | |
| 40 | |
Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 41 | my $fh; |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 42 | if ($ARGV[0] eq '-') { |
| 43 | $fh = *STDIN; |
| 44 | } elsif (!open($fh, '<' . $ARGV[0])) { |
Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 45 | warn $ARGV[0] . " can't be opened"; |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 46 | exit(0); |
Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 47 | }; |
| 48 | |
Akron | 286b46e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 49 | # Initial VC group |
| 50 | my $vc; |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 51 | |
Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 52 | # Create an intensional and an extensional VC |
Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 53 | my $vc_ext = KorAP::VirtualCorpus::Group->new; |
| 54 | my $vc_int = KorAP::VirtualCorpus::Group->new; |
Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 55 | |
Akron | 286b46e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 56 | # Load ext initially |
| 57 | $$vc = $vc_ext; |
Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 58 | |
Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 59 | # Collect all virtual corpora |
| 60 | my %all_vcs; |
| 61 | |
Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 62 | my $frozen = 0; |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 63 | |
| 64 | # Iterate over the whole list |
| 65 | while (!eof $fh) { |
| 66 | my $line = readline($fh); |
| 67 | chomp $line; |
| 68 | |
Akron | e2645ec | 2020-05-20 12:37:25 +0200 | [diff] [blame] | 69 | |
| 70 | # Skip empty lines |
| 71 | if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) { |
| 72 | # empty |
| 73 | next; |
| 74 | }; |
| 75 | |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 76 | my ($key, $value, $desc); |
| 77 | |
| 78 | # Line-Type: <e>c</a> |
| 79 | if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) { |
| 80 | $key = $1; |
| 81 | $value = $2 // undef; |
| 82 | } |
| 83 | |
| 84 | # Line-Type: <e>c |
| 85 | elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) { |
| 86 | $key = $1; |
| 87 | $value = $2; |
| 88 | } |
| 89 | |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 90 | # Get text sigles |
Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 91 | elsif ($line =~ m!^(?:\w+\/){2}\w+$!) { |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 92 | $key = 'text'; |
| 93 | $value = $line; |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 94 | } |
| 95 | |
| 96 | # Get doc sigles |
Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 97 | elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) { |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 98 | $key = 'doc'; |
| 99 | $value = $1; |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 100 | } |
| 101 | |
| 102 | # Get corpus sigles |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 103 | elsif ($line !~ m!(?:\/|\s)!) { |
| 104 | $key = 'corpus'; |
| 105 | $value = $line; |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 106 | } |
| 107 | |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 108 | # Not known |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 109 | else { |
Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 110 | warn _shorten($line) . q! isn't a valid VC definition!; |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 111 | next; |
| 112 | }; |
| 113 | |
Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 114 | # Add text field |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 115 | if ($key eq 'text') { |
Akron | 68746a1 | 2020-05-20 15:19:55 +0200 | [diff] [blame] | 116 | |
| 117 | # Convert C2 sigle to KorAP form |
| 118 | $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!; |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 119 | ${$vc}->union_field(textSigle => $value); |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 120 | } |
| 121 | |
Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 122 | # Add doc field |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 123 | elsif ($key eq 'doc') { |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 124 | ${$vc}->union_field(docSigle => $value); |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 125 | } |
| 126 | |
Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 127 | # Add corpus field |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 128 | elsif ($key eq 'corpus') { |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 129 | ${$vc}->union_field(corpusSigle => $value); |
Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 130 | } |
| 131 | |
| 132 | # Add corpus field |
| 133 | elsif ($key eq 'cn') { |
| 134 | # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997' |
| 135 | if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) { |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 136 | ${$vc}->union_field(corpusSigle => $1); |
Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 137 | }; |
Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 138 | } |
| 139 | |
| 140 | # Mark the vc as frozen |
| 141 | # This means that an extended VC area is expected |
| 142 | elsif ($key eq 'frozen') { |
| 143 | $frozen = 1; |
| 144 | } |
| 145 | |
| 146 | # Start/End intended VC area |
| 147 | elsif ($key eq 'intended') { |
| 148 | if ($value eq 'start') { |
| 149 | $$vc = $vc_int; |
| 150 | } |
| 151 | elsif ($value ne 'end') { |
| 152 | warn 'Unknown intension value ' . $value; |
| 153 | }; |
| 154 | } |
| 155 | |
| 156 | # Start/End extended VC area |
| 157 | elsif ($key eq 'extended') { |
| 158 | if ($value eq 'start') { |
| 159 | $$vc = $vc_ext; |
| 160 | } |
| 161 | elsif ($value ne 'end') { |
| 162 | warn 'Unknown extension value ' . $value; |
| 163 | }; |
| 164 | } |
Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 165 | |
| 166 | # Set VC name |
| 167 | elsif ($key eq 'name') { |
| 168 | # "Name des virt. Korpus, der angezeigt wird. |
| 169 | # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>, |
| 170 | # <add>, <sub>" |
| 171 | |
| 172 | # No global name defined yet |
Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 173 | if ($$vc && !$$vc->name) { |
Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 174 | $vc_ext->name($value); |
| 175 | $vc_int->name($value); |
| 176 | next; |
| 177 | }; |
Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 178 | |
| 179 | ${$vc} = KorAP::VirtualCorpus::Group->new; |
| 180 | ${$vc}->name($value); |
| 181 | } |
| 182 | |
| 183 | # End VC def |
| 184 | elsif ($key eq 'end') { |
| 185 | $all_vcs{${$vc}->name} = $$vc; |
| 186 | # $vc = undef; |
| 187 | } |
| 188 | |
| 189 | # Add VC definition |
| 190 | elsif ($key eq 'add') { |
| 191 | unless (defined $all_vcs{$value}) { |
| 192 | # warn 'VC ' . $value . ' not defined'; |
| 193 | # exit(1); |
| 194 | next; |
| 195 | }; |
| 196 | |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 197 | $$vc->union($all_vcs{$value}->clone->to_koral); |
Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 198 | } |
| 199 | |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 200 | # AND definition |
| 201 | elsif ($key eq 'and') { |
| 202 | unless (defined $all_vcs{$value}) { |
| 203 | # warn 'VC ' . $value . ' not defined'; |
| 204 | # exit(1); |
| 205 | next; |
| 206 | }; |
| 207 | |
| 208 | $$vc->joint($all_vcs{$value}->clone->to_koral); |
| 209 | } |
| 210 | |
| 211 | # Source of the corpus |
| 212 | elsif ($key eq 'ql') { |
| 213 | # Quellenname, z.B. "Neue Zürcher Zeitung" |
| 214 | $$vc->union_field(corpusTitle => $value); |
| 215 | } |
| 216 | |
| 217 | elsif ($key eq 'sub') { |
| 218 | # "Sub" is the difference - it is the "and not" operation. |
| 219 | warn $key . ' is not yet supported'; |
| 220 | } |
| 221 | |
| 222 | elsif ($key eq 'co') { |
| 223 | # Country, z.B. DE für Text in Deutschland erschienen |
| 224 | warn $key . ' is not yet supported'; |
| 225 | } |
| 226 | |
| 227 | elsif ($key eq 'tl') { |
| 228 | # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern] |
| 229 | warn $key . ' is not yet supported'; |
| 230 | } |
| 231 | |
| 232 | elsif ($key eq 'ts') { |
| 233 | # Textsorte, z.B. "Bericht" |
| 234 | warn $key . ' is not yet supported'; |
| 235 | } |
| 236 | |
| 237 | elsif ($key eq 'th') { |
| 238 | # Thema, z.B. "Sport - Fußball" |
| 239 | warn $key . ' is not yet supported'; |
| 240 | } |
| 241 | |
| 242 | elsif ($key eq 'red') { |
| 243 | # Reduktionsfaktor |
| 244 | # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert |
| 245 | # reduziert. Modus: feste Reduzierung, nicht variabel. |
| 246 | warn $key . ' is not yet supported'; |
| 247 | } |
| 248 | |
| 249 | elsif ($key eq 'thprob') { |
| 250 | # ThemaProbability |
| 251 | # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem |
| 252 | # Zuverläßigkeitswert ein Thema übernommen wird |
| 253 | } |
| 254 | |
| 255 | |
Akron | 286b46e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 256 | # Add reduction value as a comment |
| 257 | elsif ($key eq 'redabs') { |
| 258 | # "red. Anz. Texte |
| 259 | # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte" |
| 260 | $$vc->comment('redabs:' . $value); |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 261 | warn $key . ' is not yet supported'; |
| 262 | } |
| 263 | |
| 264 | # Add reduction value as a comment |
| 265 | elsif ($key eq 'date') { |
| 266 | # Supports two pattern schemes: |
| 267 | # m1=Year1/Month1 bis Year2/Month2 |
| 268 | # Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12" |
| 269 | |
| 270 | # Schema 1 |
| 271 | if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) { |
| 272 | my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4); |
| 273 | if ($m1 < 10) { |
| 274 | $m1 = '0' . (0+$m1); |
| 275 | }; |
| 276 | if ($m2 < 10) { |
| 277 | $m2 = '0' . (0+$m2); |
| 278 | }; |
| 279 | $$vc->from($y1, $m1); |
| 280 | $$vc->to($y2, $m2); |
| 281 | } |
| 282 | |
| 283 | # Scheme 2 |
| 284 | elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) { |
| 285 | # m2=Year1-Year2 und Month1-Month2 |
| 286 | # Datumsbereich Schema 2: z.B. "1990-2000 und 06-06" |
| 287 | |
| 288 | warn 'Second date scheme not yet supported!' |
| 289 | } |
| 290 | |
| 291 | else { |
| 292 | warn 'Unknown date scheme ' . $value; |
| 293 | }; |
Akron | 286b46e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 294 | } |
| 295 | |
Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 296 | # Unknown |
| 297 | else { |
Akron | 34a4f58 | 2020-05-27 12:18:57 +0200 | [diff] [blame] | 298 | warn $key . ' is an unknown field'; |
Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 299 | }; |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 300 | }; |
| 301 | |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 302 | close($fh); |
| 303 | |
Akron | 1c07045 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 304 | # Stringify current (extended?) virtual corpus |
Akron | 323881c | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 305 | print $$vc->to_string; |