Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 1 | #!/usr/bin/env perl |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 2 | package KorAP::VirtualCorpus; |
| 3 | use strict; |
| 4 | use warnings; |
| 5 | |
| 6 | # Get or set name of the VC |
| 7 | sub name { |
| 8 | my $self = shift; |
| 9 | unless (@_) { |
| 10 | return $self->{name}; |
| 11 | }; |
| 12 | $self->{name} = shift; |
| 13 | return $self; |
| 14 | }; |
| 15 | |
| 16 | |
| 17 | # Quote utility function |
| 18 | sub quote { |
| 19 | shift; |
| 20 | my $str = shift; |
| 21 | $str =~ s/(["\\])/\\$1/g; |
| 22 | return qq{"$str"}; |
| 23 | }; |
| 24 | |
| 25 | |
| 26 | # Escaped quote utility function |
| 27 | sub equote { |
| 28 | shift; |
| 29 | my $str = shift; |
| 30 | $str =~ s/(["\\])/\\$1/g; |
| 31 | $str =~ s/(["\\])/\\$1/g; |
| 32 | return '\\"' . $str . '\\"'; |
| 33 | }; |
| 34 | |
| 35 | |
| 36 | # Stringify globally |
| 37 | sub to_string { |
| 38 | my $self = shift; |
| 39 | ## Create collection object |
| 40 | my $json = '{'; |
| 41 | $json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",'; |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 42 | $json .= '"comment":"Name: ' . $self->equote($self->name) . '",' if $self->name; |
| 43 | $json .= '"collection":'; |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 44 | $json .= $self->_to_fragment; |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 45 | return $json .= '}'; |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 46 | }; |
| 47 | |
| 48 | |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 49 | package KorAP::VirtualCorpus::Group; |
| 50 | use strict; |
| 51 | use warnings; |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 52 | use base 'KorAP::VirtualCorpus'; |
| 53 | |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 54 | |
| 55 | # Construct a new VC group |
| 56 | sub new { |
| 57 | my $class = shift; |
| 58 | bless { |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 59 | with => [], |
| 60 | with_fields => {}, |
| 61 | without => [], |
| 62 | without_fields => {}, |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 63 | }, $class; |
| 64 | }; |
| 65 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 66 | # Define an operand to be "or"ed |
| 67 | sub with { |
| 68 | my $self = shift; |
| 69 | push @{$self->{with}}, shift; |
| 70 | }; |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 71 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 72 | |
| 73 | # Define a field that should be "or"ed |
| 74 | sub with_field { |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 75 | my $self = shift; |
| 76 | my $field = shift; |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 77 | push @{$self->{with_fields}->{$field}}, shift; |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 78 | }; |
| 79 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 80 | # Define an operand to be "and"ed |
| 81 | sub without { |
| 82 | my $self = shift; |
| 83 | push @{$self->{without}}, shift; |
| 84 | }; |
| 85 | |
| 86 | |
| 87 | # Define a field that should be "and"ed |
| 88 | sub without_field { |
| 89 | my $self = shift; |
| 90 | my $field = shift; |
| 91 | push @{$self->{without_fields}->{$field}}, shift; |
| 92 | }; |
| 93 | |
| 94 | # Create a document vector field |
| 95 | sub _doc_vec { |
| 96 | my $field = shift; |
| 97 | my $vec = shift; |
| 98 | my $json = '{'; |
| 99 | $json .= '"@type":"koral:doc",'; |
| 100 | $json .= '"key":"' . $field . '",'; |
| 101 | $json .= '"match":"match:eq",'; |
| 102 | $json .= '"value":['; |
| 103 | $json .= join ',', map { '"' . $_ . '"' } @$vec; |
| 104 | $json .= ']'; |
| 105 | $json .= '},'; |
| 106 | return $json; |
| 107 | } |
| 108 | |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 109 | |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 110 | # Stringify fragment |
| 111 | sub _to_fragment { |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 112 | my $self = shift; |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 113 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 114 | my $json = '{'; |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 115 | $json .= '"@type":"koral:docGroup",'; |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 116 | $json .= '"comment":"Name: ' . $self->equote($self->name) . '",' if $self->name; |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 117 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 118 | # Make the outer group "and" |
| 119 | if (keys %{$self->{without_fields}}) { |
| 120 | $json .= '"operation":"operation:and",'; |
| 121 | $json .= '"operands":['; |
| 122 | |
| 123 | foreach my $field (sort keys %{$self->{without_fields}}) { |
| 124 | unless (@{$self->{without_fields}->{$field}}) { |
| 125 | next; |
| 126 | }; |
| 127 | $json .= _doc_vec($field, $self->{without_fields}->{$field}); |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 128 | }; |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 129 | |
| 130 | # Remove the last comma |
| 131 | chop $json; |
| 132 | |
| 133 | $json .= ']'; |
| 134 | } |
| 135 | |
| 136 | elsif (keys %{$self->{with_fields}} || @{$self->{with}}) { |
| 137 | $json .= '"operation":"operation:or",'; |
| 138 | |
| 139 | # TODO: |
| 140 | # Flatten embedded or-VCs! |
| 141 | $json .= '"operands":['; |
| 142 | |
| 143 | foreach my $field (sort keys %{$self->{with_fields}}) { |
| 144 | unless (@{$self->{with_fields}->{$field}}) { |
| 145 | next; |
| 146 | }; |
| 147 | $json .= _doc_vec($field, $self->{with_fields}->{$field}); |
| 148 | }; |
| 149 | |
| 150 | foreach my $op (@{$self->{with}}) { |
| 151 | $json .= $op->_to_fragment . ','; |
| 152 | }; |
| 153 | |
| 154 | # Remove the last comma |
| 155 | chop $json; |
| 156 | |
| 157 | $json .= ']'; |
| 158 | } |
| 159 | |
| 160 | # No operands in the group |
| 161 | else { |
| 162 | # Remove the last comma after the comment |
| 163 | chop $json; |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 164 | }; |
| 165 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 166 | return $json . '}'; |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 167 | }; |
| 168 | |
| 169 | |
| 170 | package main; |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 171 | use strict; |
| 172 | use warnings; |
| 173 | |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 174 | # 2020-05-20 |
| 175 | # Preliminary support for C2 def-files. |
| 176 | |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 177 | our @ARGV; |
| 178 | |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 179 | unless (@ARGV) { |
| 180 | print <<'HELP'; |
| 181 | Convert a line-separated list of corpus sigles, doc sigles or |
| 182 | text sigles into a virtual corpus query. |
| 183 | |
| 184 | $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 185 | $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 186 | |
| 187 | HELP |
| 188 | exit 0; |
| 189 | }; |
| 190 | |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 191 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 192 | # Shorten long strings for logging |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 193 | sub _shorten ($) { |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 194 | my $line = shift; |
| 195 | if (length($line) < 20) { |
| 196 | return $line; |
| 197 | } |
| 198 | else { |
| 199 | return substr($line,0,17) . '...'; |
| 200 | }; |
| 201 | }; |
| 202 | |
| 203 | |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 204 | my $fh; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 205 | if ($ARGV[0] eq '-') { |
| 206 | $fh = *STDIN; |
| 207 | } elsif (!open($fh, '<' . $ARGV[0])) { |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 208 | warn $ARGV[0] . " can't be opened"; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 209 | exit(0); |
Akron | 18e407a | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 210 | }; |
| 211 | |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 212 | |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 213 | # Create an intensional and an extensional VC |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 214 | my $vc_ext = KorAP::VirtualCorpus::Group->new; |
| 215 | my $vc_int = KorAP::VirtualCorpus::Group->new; |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 216 | |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 217 | # Initial VC group |
| 218 | my $vc = \$vc_ext; |
| 219 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 220 | # Collect all virtual corpora |
| 221 | my %all_vcs; |
| 222 | |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 223 | my $frozen = 0; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 224 | |
| 225 | # Iterate over the whole list |
| 226 | while (!eof $fh) { |
| 227 | my $line = readline($fh); |
| 228 | chomp $line; |
| 229 | |
Akron | 23e9e3c | 2020-05-20 12:37:25 +0200 | [diff] [blame] | 230 | |
| 231 | # Skip empty lines |
| 232 | if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) { |
| 233 | # empty |
| 234 | next; |
| 235 | }; |
| 236 | |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 237 | my ($key, $value, $desc); |
| 238 | |
| 239 | # Line-Type: <e>c</a> |
| 240 | if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) { |
| 241 | $key = $1; |
| 242 | $value = $2 // undef; |
| 243 | } |
| 244 | |
| 245 | # Line-Type: <e>c |
| 246 | elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) { |
| 247 | $key = $1; |
| 248 | $value = $2; |
| 249 | } |
| 250 | |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 251 | # Get text sigles |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 252 | elsif ($line =~ m!^(?:\w+\/){2}\w+$!) { |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 253 | $key = 'text'; |
| 254 | $value = $line; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 255 | } |
| 256 | |
| 257 | # Get doc sigles |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 258 | elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) { |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 259 | $key = 'doc'; |
| 260 | $value = $1; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 261 | } |
| 262 | |
| 263 | # Get corpus sigles |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 264 | elsif ($line !~ m!(?:\/|\s)!) { |
| 265 | $key = 'corpus'; |
| 266 | $value = $line; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 267 | } |
| 268 | |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 269 | # Not known |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 270 | else { |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 271 | warn _shorten($line) . q! isn't a valid VC definition!; |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 272 | next; |
| 273 | }; |
| 274 | |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 275 | # Add text field |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 276 | if ($key eq 'text') { |
Akron | ee2073d | 2020-05-20 15:19:55 +0200 | [diff] [blame] | 277 | |
| 278 | # Convert C2 sigle to KorAP form |
| 279 | $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!; |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 280 | ${$vc}->with_field(textSigle => $value); |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 281 | } |
| 282 | |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 283 | # Add doc field |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 284 | elsif ($key eq 'doc') { |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 285 | ${$vc}->with_field(docSigle => $value); |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 286 | } |
| 287 | |
Akron | fe58a6c | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 288 | # Add corpus field |
Akron | 1839cb1 | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 289 | elsif ($key eq 'corpus') { |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 290 | ${$vc}->with_field(corpusSigle => $value); |
| 291 | } |
| 292 | |
| 293 | # Add corpus field |
| 294 | elsif ($key eq 'cn') { |
| 295 | # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997' |
| 296 | if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) { |
| 297 | ${$vc}->with_field(corpusSigle => $1); |
| 298 | }; |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 299 | } |
| 300 | |
| 301 | # Mark the vc as frozen |
| 302 | # This means that an extended VC area is expected |
| 303 | elsif ($key eq 'frozen') { |
| 304 | $frozen = 1; |
| 305 | } |
| 306 | |
| 307 | # Start/End intended VC area |
| 308 | elsif ($key eq 'intended') { |
| 309 | if ($value eq 'start') { |
| 310 | $$vc = $vc_int; |
| 311 | } |
| 312 | elsif ($value ne 'end') { |
| 313 | warn 'Unknown intension value ' . $value; |
| 314 | }; |
| 315 | } |
| 316 | |
| 317 | # Start/End extended VC area |
| 318 | elsif ($key eq 'extended') { |
| 319 | if ($value eq 'start') { |
| 320 | $$vc = $vc_ext; |
| 321 | } |
| 322 | elsif ($value ne 'end') { |
| 323 | warn 'Unknown extension value ' . $value; |
| 324 | }; |
| 325 | } |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 326 | |
| 327 | # Set VC name |
| 328 | elsif ($key eq 'name') { |
| 329 | # "Name des virt. Korpus, der angezeigt wird. |
| 330 | # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>, |
| 331 | # <add>, <sub>" |
| 332 | |
| 333 | # No global name defined yet |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 334 | if ($$vc && !$$vc->name) { |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 335 | $vc_ext->name($value); |
| 336 | $vc_int->name($value); |
| 337 | next; |
| 338 | }; |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 339 | |
| 340 | ${$vc} = KorAP::VirtualCorpus::Group->new; |
| 341 | ${$vc}->name($value); |
| 342 | } |
| 343 | |
| 344 | # End VC def |
| 345 | elsif ($key eq 'end') { |
| 346 | $all_vcs{${$vc}->name} = $$vc; |
| 347 | # $vc = undef; |
| 348 | } |
| 349 | |
| 350 | # Add VC definition |
| 351 | elsif ($key eq 'add') { |
| 352 | unless (defined $all_vcs{$value}) { |
| 353 | # warn 'VC ' . $value . ' not defined'; |
| 354 | # exit(1); |
| 355 | next; |
| 356 | }; |
| 357 | |
| 358 | $$vc->with($all_vcs{$value}); |
Akron | 5368b6c | 2020-05-20 17:50:38 +0200 | [diff] [blame] | 359 | } |
| 360 | |
| 361 | # Unknown |
| 362 | else { |
| 363 | # warn $key . ' is an unknown field'; |
| 364 | }; |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 365 | }; |
| 366 | |
Akron | 1e6f4d4 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 367 | close($fh); |
| 368 | |
Akron | 36a9b87 | 2020-05-25 11:28:30 +0200 | [diff] [blame^] | 369 | # Stringify current (extended?) virtual corpus |
Akron | 1d3bd4a | 2020-05-20 17:15:42 +0200 | [diff] [blame] | 370 | print $$vc->to_string; |