Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 1 | #!/usr/bin/env perl |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 2 | use strict; |
| 3 | use warnings; |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 4 | use lib 'lib'; |
| 5 | use KorAP::VirtualCorpus::Group; |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 6 | |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 7 | # 2020-05-20 |
| 8 | # Preliminary support for C2 def-files. |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 9 | # 2020-05-29 |
| 10 | # Introduce optimizable object system. |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 11 | |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 12 | our @ARGV; |
| 13 | |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 14 | unless (@ARGV) { |
| 15 | print <<'HELP'; |
| 16 | Convert a line-separated list of corpus sigles, doc sigles or |
| 17 | text sigles into a virtual corpus query. |
| 18 | |
| 19 | $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 20 | $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 21 | |
| 22 | HELP |
| 23 | exit 0; |
| 24 | }; |
| 25 | |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 26 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 27 | # Shorten long strings for logging |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 28 | sub _shorten ($) { |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 29 | my $line = shift; |
| 30 | if (length($line) < 20) { |
| 31 | return $line; |
| 32 | } |
| 33 | else { |
| 34 | return substr($line,0,17) . '...'; |
| 35 | }; |
| 36 | }; |
| 37 | |
| 38 | |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 39 | my $fh; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 40 | if ($ARGV[0] eq '-') { |
| 41 | $fh = *STDIN; |
| 42 | } elsif (!open($fh, '<' . $ARGV[0])) { |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 43 | warn $ARGV[0] . " can't be opened"; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 44 | exit(0); |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 45 | }; |
| 46 | |
Akron | 701139e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 47 | # Initial VC group |
| 48 | my $vc; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 49 | |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 50 | # Create an intensional and an extensional VC |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 51 | my $vc_ext = KorAP::VirtualCorpus::Group->new; |
| 52 | my $vc_int = KorAP::VirtualCorpus::Group->new; |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 53 | |
Akron | 701139e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 54 | # Load ext initially |
| 55 | $$vc = $vc_ext; |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 56 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 57 | # Collect all virtual corpora |
| 58 | my %all_vcs; |
| 59 | |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 60 | my $frozen = 0; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 61 | |
| 62 | # Iterate over the whole list |
| 63 | while (!eof $fh) { |
| 64 | my $line = readline($fh); |
| 65 | chomp $line; |
| 66 | |
Akron | 23e9e3c | 2020-05-20 12:37:25 +0200 | [diff] [blame] | 67 | |
| 68 | # Skip empty lines |
| 69 | if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) { |
| 70 | # empty |
| 71 | next; |
| 72 | }; |
| 73 | |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 74 | my ($key, $value, $desc); |
| 75 | |
| 76 | # Line-Type: <e>c</a> |
| 77 | if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) { |
| 78 | $key = $1; |
| 79 | $value = $2 // undef; |
| 80 | } |
| 81 | |
| 82 | # Line-Type: <e>c |
| 83 | elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) { |
| 84 | $key = $1; |
| 85 | $value = $2; |
| 86 | } |
| 87 | |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 88 | # Get text sigles |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 89 | elsif ($line =~ m!^(?:\w+\/){2}\w+$!) { |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 90 | $key = 'text'; |
| 91 | $value = $line; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 92 | } |
| 93 | |
| 94 | # Get doc sigles |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 95 | elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) { |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 96 | $key = 'doc'; |
| 97 | $value = $1; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 98 | } |
| 99 | |
| 100 | # Get corpus sigles |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 101 | elsif ($line !~ m!(?:\/|\s)!) { |
| 102 | $key = 'corpus'; |
| 103 | $value = $line; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 104 | } |
| 105 | |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 106 | # Not known |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 107 | else { |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 108 | warn _shorten($line) . q! isn't a valid VC definition!; |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 109 | next; |
| 110 | }; |
| 111 | |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 112 | # Add text field |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 113 | if ($key eq 'text') { |
Akron | ee2073d | 2020-05-20 15:19:55 +0200 | [diff] [blame] | 114 | |
| 115 | # Convert C2 sigle to KorAP form |
| 116 | $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!; |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 117 | ${$vc}->union_field(textSigle => $value); |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 118 | } |
| 119 | |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 120 | # Add doc field |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 121 | elsif ($key eq 'doc') { |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 122 | ${$vc}->union_field(docSigle => $value); |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 123 | } |
| 124 | |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 125 | # Add corpus field |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 126 | elsif ($key eq 'corpus') { |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 127 | ${$vc}->union_field(corpusSigle => $value); |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 128 | } |
| 129 | |
| 130 | # Add corpus field |
| 131 | elsif ($key eq 'cn') { |
| 132 | # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997' |
| 133 | if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) { |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 134 | ${$vc}->union_field(corpusSigle => $1); |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 135 | }; |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 136 | } |
| 137 | |
| 138 | # Mark the vc as frozen |
| 139 | # This means that an extended VC area is expected |
| 140 | elsif ($key eq 'frozen') { |
| 141 | $frozen = 1; |
| 142 | } |
| 143 | |
| 144 | # Start/End intended VC area |
| 145 | elsif ($key eq 'intended') { |
| 146 | if ($value eq 'start') { |
| 147 | $$vc = $vc_int; |
| 148 | } |
| 149 | elsif ($value ne 'end') { |
| 150 | warn 'Unknown intension value ' . $value; |
| 151 | }; |
| 152 | } |
| 153 | |
| 154 | # Start/End extended VC area |
| 155 | elsif ($key eq 'extended') { |
| 156 | if ($value eq 'start') { |
| 157 | $$vc = $vc_ext; |
| 158 | } |
| 159 | elsif ($value ne 'end') { |
| 160 | warn 'Unknown extension value ' . $value; |
| 161 | }; |
| 162 | } |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 163 | |
| 164 | # Set VC name |
| 165 | elsif ($key eq 'name') { |
| 166 | # "Name des virt. Korpus, der angezeigt wird. |
| 167 | # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>, |
| 168 | # <add>, <sub>" |
| 169 | |
| 170 | # No global name defined yet |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 171 | if ($$vc && !$$vc->name) { |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 172 | $vc_ext->name($value); |
| 173 | $vc_int->name($value); |
| 174 | next; |
| 175 | }; |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 176 | |
| 177 | ${$vc} = KorAP::VirtualCorpus::Group->new; |
| 178 | ${$vc}->name($value); |
| 179 | } |
| 180 | |
| 181 | # End VC def |
| 182 | elsif ($key eq 'end') { |
| 183 | $all_vcs{${$vc}->name} = $$vc; |
| 184 | # $vc = undef; |
| 185 | } |
| 186 | |
| 187 | # Add VC definition |
| 188 | elsif ($key eq 'add') { |
| 189 | unless (defined $all_vcs{$value}) { |
| 190 | # warn 'VC ' . $value . ' not defined'; |
| 191 | # exit(1); |
| 192 | next; |
| 193 | }; |
| 194 | |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 195 | $$vc->union($all_vcs{$value}->clone->to_koral); |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 196 | } |
| 197 | |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 198 | # AND definition |
| 199 | elsif ($key eq 'and') { |
| 200 | unless (defined $all_vcs{$value}) { |
| 201 | # warn 'VC ' . $value . ' not defined'; |
| 202 | # exit(1); |
| 203 | next; |
| 204 | }; |
| 205 | |
| 206 | $$vc->joint($all_vcs{$value}->clone->to_koral); |
| 207 | } |
| 208 | |
| 209 | # Source of the corpus |
| 210 | elsif ($key eq 'ql') { |
| 211 | # Quellenname, z.B. "Neue Zürcher Zeitung" |
| 212 | $$vc->union_field(corpusTitle => $value); |
| 213 | } |
| 214 | |
| 215 | elsif ($key eq 'sub') { |
| 216 | # "Sub" is the difference - it is the "and not" operation. |
| 217 | warn $key . ' is not yet supported'; |
| 218 | } |
| 219 | |
| 220 | elsif ($key eq 'co') { |
| 221 | # Country, z.B. DE für Text in Deutschland erschienen |
| 222 | warn $key . ' is not yet supported'; |
| 223 | } |
| 224 | |
| 225 | elsif ($key eq 'tl') { |
| 226 | # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern] |
| 227 | warn $key . ' is not yet supported'; |
| 228 | } |
| 229 | |
| 230 | elsif ($key eq 'ts') { |
| 231 | # Textsorte, z.B. "Bericht" |
| 232 | warn $key . ' is not yet supported'; |
| 233 | } |
| 234 | |
| 235 | elsif ($key eq 'th') { |
| 236 | # Thema, z.B. "Sport - Fußball" |
| 237 | warn $key . ' is not yet supported'; |
| 238 | } |
| 239 | |
| 240 | elsif ($key eq 'red') { |
| 241 | # Reduktionsfaktor |
| 242 | # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert |
| 243 | # reduziert. Modus: feste Reduzierung, nicht variabel. |
| 244 | warn $key . ' is not yet supported'; |
| 245 | } |
| 246 | |
| 247 | elsif ($key eq 'thprob') { |
| 248 | # ThemaProbability |
| 249 | # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem |
| 250 | # Zuverläßigkeitswert ein Thema übernommen wird |
| 251 | } |
| 252 | |
| 253 | |
Akron | 701139e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 254 | # Add reduction value as a comment |
| 255 | elsif ($key eq 'redabs') { |
| 256 | # "red. Anz. Texte |
| 257 | # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte" |
| 258 | $$vc->comment('redabs:' . $value); |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 259 | warn $key . ' is not yet supported'; |
| 260 | } |
| 261 | |
| 262 | # Add reduction value as a comment |
| 263 | elsif ($key eq 'date') { |
| 264 | # Supports two pattern schemes: |
| 265 | # m1=Year1/Month1 bis Year2/Month2 |
| 266 | # Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12" |
| 267 | |
| 268 | # Schema 1 |
| 269 | if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) { |
| 270 | my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4); |
| 271 | if ($m1 < 10) { |
| 272 | $m1 = '0' . (0+$m1); |
| 273 | }; |
| 274 | if ($m2 < 10) { |
| 275 | $m2 = '0' . (0+$m2); |
| 276 | }; |
| 277 | $$vc->from($y1, $m1); |
| 278 | $$vc->to($y2, $m2); |
| 279 | } |
| 280 | |
| 281 | # Scheme 2 |
| 282 | elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) { |
| 283 | # m2=Year1-Year2 und Month1-Month2 |
| 284 | # Datumsbereich Schema 2: z.B. "1990-2000 und 06-06" |
| 285 | |
| 286 | warn 'Second date scheme not yet supported!' |
| 287 | } |
| 288 | |
| 289 | else { |
| 290 | warn 'Unknown date scheme ' . $value; |
| 291 | }; |
Akron | 701139e | 2020-05-25 17:07:48 +0200 | [diff] [blame] | 292 | } |
| 293 | |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 294 | # Unknown |
| 295 | else { |
Akron | 7dacd01 | 2020-05-27 12:18:57 +0200 | [diff] [blame^] | 296 | warn $key . ' is an unknown field'; |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 297 | }; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 298 | }; |
| 299 | |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 300 | close($fh); |
| 301 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame] | 302 | # Stringify current (extended?) virtual corpus |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 303 | print $$vc->to_string; |