blob: 35a205a068d159a85d86e8ab721d2fd826ca696d [file] [log] [blame]
Akron3f875be2020-05-11 14:57:19 +02001#!/usr/bin/env perl
Akron3587f362020-05-20 17:50:38 +02002package KorAP::VirtualCorpus;
3use strict;
4use warnings;
5
Akron286b46e2020-05-25 17:07:48 +02006
Akron3587f362020-05-20 17:50:38 +02007# Get or set name of the VC
8sub name {
9 my $self = shift;
10 unless (@_) {
11 return $self->{name};
12 };
13 $self->{name} = shift;
14 return $self;
15};
16
17
Akron286b46e2020-05-25 17:07:48 +020018# Comment
19sub comment {
20 my $self = shift;
21 unless (@_) {
22 return $self->{comment};
23 };
24 $self->{comment} //= [];
25
26 push @{$self->{comment}}, shift;
27 return $self;
28};
29
30
Akron3587f362020-05-20 17:50:38 +020031# Quote utility function
32sub quote {
33 shift;
34 my $str = shift;
35 $str =~ s/(["\\])/\\$1/g;
36 return qq{"$str"};
37};
38
39
40# Escaped quote utility function
41sub equote {
42 shift;
43 my $str = shift;
44 $str =~ s/(["\\])/\\$1/g;
45 $str =~ s/(["\\])/\\$1/g;
46 return '\\"' . $str . '\\"';
47};
48
49
Akron286b46e2020-05-25 17:07:48 +020050sub _commentparam_to_string {
51 my $self = shift;
52 my $comment = $self->_comment_to_string;
53 if ($comment) {
54 return qq!,"comment":"$comment"!;
55 };
56 return '';
57};
58
59
60sub _comment_to_string {
61 my $self = shift;
62 if (!$self->name && !$self->comment) {
63 return '';
64 };
65
66 my $json = '';
67 $json .= 'name:' . $self->equote($self->name) if $self->name;
68 if ($self->name && $self->comment) {
69 $json .= ','
70 };
71 $json .= join(',', @{$self->{comment}}) if $self->{comment};
72
73 return $json;
74};
75
76
Akron3587f362020-05-20 17:50:38 +020077# Stringify globally
78sub to_string {
79 my $self = shift;
80 ## Create collection object
Akron286b46e2020-05-25 17:07:48 +020081
Akron3587f362020-05-20 17:50:38 +020082 my $json = '{';
83 $json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
Akron1c070452020-05-25 11:28:30 +020084 $json .= '"collection":';
Akron3587f362020-05-20 17:50:38 +020085 $json .= $self->_to_fragment;
Akron286b46e2020-05-25 17:07:48 +020086 # Set at the end, when all comments are done
87 $json .= $self->_commentparam_to_string;
Akron1c070452020-05-25 11:28:30 +020088 return $json .= '}';
Akron3587f362020-05-20 17:50:38 +020089};
90
91
Akron49c765f2020-05-20 16:41:22 +020092package KorAP::VirtualCorpus::Group;
93use strict;
94use warnings;
Akron3587f362020-05-20 17:50:38 +020095use base 'KorAP::VirtualCorpus';
96
Akron49c765f2020-05-20 16:41:22 +020097
98# Construct a new VC group
99sub new {
100 my $class = shift;
101 bless {
Akron1c070452020-05-25 11:28:30 +0200102 with => [],
103 with_fields => {},
104 without => [],
105 without_fields => {},
Akron49c765f2020-05-20 16:41:22 +0200106 }, $class;
107};
108
Akron286b46e2020-05-25 17:07:48 +0200109
Akron1c070452020-05-25 11:28:30 +0200110# Define an operand to be "or"ed
111sub with {
112 my $self = shift;
113 push @{$self->{with}}, shift;
114};
Akron49c765f2020-05-20 16:41:22 +0200115
Akron1c070452020-05-25 11:28:30 +0200116
117# Define a field that should be "or"ed
118sub with_field {
Akron49c765f2020-05-20 16:41:22 +0200119 my $self = shift;
120 my $field = shift;
Akron1c070452020-05-25 11:28:30 +0200121 push @{$self->{with_fields}->{$field}}, shift;
Akron49c765f2020-05-20 16:41:22 +0200122};
123
Akron1c070452020-05-25 11:28:30 +0200124# Define an operand to be "and"ed
125sub without {
126 my $self = shift;
127 push @{$self->{without}}, shift;
128};
129
130
131# Define a field that should be "and"ed
132sub without_field {
133 my $self = shift;
134 my $field = shift;
135 push @{$self->{without_fields}->{$field}}, shift;
136};
137
Akron286b46e2020-05-25 17:07:48 +0200138
139# VC contains only with fields
140sub only_with_fields {
141 my $self = shift;
142
143 if (keys %{$self->{without_fields}} || @{$self->{with}} || @{$self->{without}}) {
144 return 0;
145 };
146
147 return 1;
148};
149
150
Akron1c070452020-05-25 11:28:30 +0200151# Create a document vector field
152sub _doc_vec {
153 my $field = shift;
154 my $vec = shift;
155 my $json = '{';
156 $json .= '"@type":"koral:doc",';
157 $json .= '"key":"' . $field . '",';
158 $json .= '"match":"match:eq",';
159 $json .= '"value":[';
160 $json .= join ',', map { '"' . $_ . '"' } @$vec;
161 $json .= ']';
162 $json .= '},';
163 return $json;
164}
165
Akron49c765f2020-05-20 16:41:22 +0200166
Akron3587f362020-05-20 17:50:38 +0200167# Stringify fragment
168sub _to_fragment {
Akron49c765f2020-05-20 16:41:22 +0200169 my $self = shift;
Akron49c765f2020-05-20 16:41:22 +0200170
Akron1c070452020-05-25 11:28:30 +0200171 my $json = '{';
Akron49c765f2020-05-20 16:41:22 +0200172 $json .= '"@type":"koral:docGroup",';
Akron49c765f2020-05-20 16:41:22 +0200173
Akron1c070452020-05-25 11:28:30 +0200174 # Make the outer group "and"
175 if (keys %{$self->{without_fields}}) {
176 $json .= '"operation":"operation:and",';
177 $json .= '"operands":[';
178
179 foreach my $field (sort keys %{$self->{without_fields}}) {
180 unless (@{$self->{without_fields}->{$field}}) {
181 next;
182 };
183 $json .= _doc_vec($field, $self->{without_fields}->{$field});
Akron49c765f2020-05-20 16:41:22 +0200184 };
Akron1c070452020-05-25 11:28:30 +0200185
186 # Remove the last comma
187 chop $json;
188
189 $json .= ']';
190 }
191
192 elsif (keys %{$self->{with_fields}} || @{$self->{with}}) {
193 $json .= '"operation":"operation:or",';
194
Akron1c070452020-05-25 11:28:30 +0200195 $json .= '"operands":[';
196
Akron286b46e2020-05-25 17:07:48 +0200197 # Flatten embedded "or"-VCs
198 foreach my $op (@{$self->{with}}) {
199
200 # The embedded VC has only extending fields
201 if ($op->only_with_fields) {
202
203 $self->comment('embed:[' . $op->_comment_to_string . ']');
204
205 foreach my $k (keys %{$op->{with_fields}}) {
206 foreach my $v (@{$op->{with_fields}->{$k}}) {
207 $self->with_field($k, $v);
208 };
209 };
210 }
211
212 # Embed complex VC
213 else {
214 $json .= $op->_to_fragment . ',';
215 };
216 };
217
Akron1c070452020-05-25 11:28:30 +0200218 foreach my $field (sort keys %{$self->{with_fields}}) {
219 unless (@{$self->{with_fields}->{$field}}) {
220 next;
221 };
222 $json .= _doc_vec($field, $self->{with_fields}->{$field});
223 };
224
Akron1c070452020-05-25 11:28:30 +0200225 # Remove the last comma
226 chop $json;
227
228 $json .= ']';
229 }
230
231 # No operands in the group
232 else {
233 # Remove the last comma after the comment
234 chop $json;
Akron49c765f2020-05-20 16:41:22 +0200235 };
236
Akron286b46e2020-05-25 17:07:48 +0200237 # Set at the end, when all comments are done
238 $json .= $self->_commentparam_to_string;
Akron1c070452020-05-25 11:28:30 +0200239 return $json . '}';
Akron49c765f2020-05-20 16:41:22 +0200240};
241
242
243package main;
Akron3f875be2020-05-11 14:57:19 +0200244use strict;
245use warnings;
246
Akron340a9cb2020-05-20 12:55:22 +0200247# 2020-05-20
248# Preliminary support for C2 def-files.
249
Akron26b59702020-05-19 12:14:41 +0200250our @ARGV;
251
Akron3f875be2020-05-11 14:57:19 +0200252unless (@ARGV) {
253 print <<'HELP';
254Convert a line-separated list of corpus sigles, doc sigles or
255text sigles into a virtual corpus query.
256
257 $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
Akron26b59702020-05-19 12:14:41 +0200258 $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
Akron3f875be2020-05-11 14:57:19 +0200259
260HELP
261exit 0;
262};
263
Akron340a9cb2020-05-20 12:55:22 +0200264
Akron1c070452020-05-25 11:28:30 +0200265# Shorten long strings for logging
Akron49c765f2020-05-20 16:41:22 +0200266sub _shorten ($) {
Akron340a9cb2020-05-20 12:55:22 +0200267 my $line = shift;
268 if (length($line) < 20) {
269 return $line;
270 }
271 else {
272 return substr($line,0,17) . '...';
273 };
274};
275
276
Akron3f875be2020-05-11 14:57:19 +0200277my $fh;
Akron26b59702020-05-19 12:14:41 +0200278if ($ARGV[0] eq '-') {
279 $fh = *STDIN;
280} elsif (!open($fh, '<' . $ARGV[0])) {
Akron3f875be2020-05-11 14:57:19 +0200281 warn $ARGV[0] . " can't be opened";
Akron26b59702020-05-19 12:14:41 +0200282 exit(0);
Akron3f875be2020-05-11 14:57:19 +0200283};
284
Akron286b46e2020-05-25 17:07:48 +0200285# Initial VC group
286my $vc;
Akron26b59702020-05-19 12:14:41 +0200287
Akron323881c2020-05-20 17:15:42 +0200288# Create an intensional and an extensional VC
Akron1c070452020-05-25 11:28:30 +0200289my $vc_ext = KorAP::VirtualCorpus::Group->new;
290my $vc_int = KorAP::VirtualCorpus::Group->new;
Akron49c765f2020-05-20 16:41:22 +0200291
Akron286b46e2020-05-25 17:07:48 +0200292# Load ext initially
293$$vc = $vc_ext;
Akron323881c2020-05-20 17:15:42 +0200294
Akron1c070452020-05-25 11:28:30 +0200295# Collect all virtual corpora
296my %all_vcs;
297
Akron323881c2020-05-20 17:15:42 +0200298my $frozen = 0;
Akron26b59702020-05-19 12:14:41 +0200299
300# Iterate over the whole list
301while (!eof $fh) {
302 my $line = readline($fh);
303 chomp $line;
304
Akrone2645ec2020-05-20 12:37:25 +0200305
306 # Skip empty lines
307 if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
308 # empty
309 next;
310 };
311
Akron340a9cb2020-05-20 12:55:22 +0200312 my ($key, $value, $desc);
313
314 # Line-Type: <e>c</a>
315 if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
316 $key = $1;
317 $value = $2 // undef;
318 }
319
320 # Line-Type: <e>c
321 elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
322 $key = $1;
323 $value = $2;
324 }
325
Akron26b59702020-05-19 12:14:41 +0200326 # Get text sigles
Akron323881c2020-05-20 17:15:42 +0200327 elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
Akron340a9cb2020-05-20 12:55:22 +0200328 $key = 'text';
329 $value = $line;
Akron26b59702020-05-19 12:14:41 +0200330 }
331
332 # Get doc sigles
Akron323881c2020-05-20 17:15:42 +0200333 elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
Akron340a9cb2020-05-20 12:55:22 +0200334 $key = 'doc';
335 $value = $1;
Akron26b59702020-05-19 12:14:41 +0200336 }
337
338 # Get corpus sigles
Akron340a9cb2020-05-20 12:55:22 +0200339 elsif ($line !~ m!(?:\/|\s)!) {
340 $key = 'corpus';
341 $value = $line;
Akron26b59702020-05-19 12:14:41 +0200342 }
343
Akron340a9cb2020-05-20 12:55:22 +0200344 # Not known
Akron26b59702020-05-19 12:14:41 +0200345 else {
Akron49c765f2020-05-20 16:41:22 +0200346 warn _shorten($line) . q! isn't a valid VC definition!;
Akron340a9cb2020-05-20 12:55:22 +0200347 next;
348 };
349
Akron49c765f2020-05-20 16:41:22 +0200350 # Add text field
Akron340a9cb2020-05-20 12:55:22 +0200351 if ($key eq 'text') {
Akron68746a12020-05-20 15:19:55 +0200352
353 # Convert C2 sigle to KorAP form
354 $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
Akron1c070452020-05-25 11:28:30 +0200355 ${$vc}->with_field(textSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200356 }
357
Akron49c765f2020-05-20 16:41:22 +0200358 # Add doc field
Akron340a9cb2020-05-20 12:55:22 +0200359 elsif ($key eq 'doc') {
Akron1c070452020-05-25 11:28:30 +0200360 ${$vc}->with_field(docSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200361 }
362
Akron49c765f2020-05-20 16:41:22 +0200363 # Add corpus field
Akron340a9cb2020-05-20 12:55:22 +0200364 elsif ($key eq 'corpus') {
Akron1c070452020-05-25 11:28:30 +0200365 ${$vc}->with_field(corpusSigle => $value);
366 }
367
368 # Add corpus field
369 elsif ($key eq 'cn') {
370 # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
371 if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
372 ${$vc}->with_field(corpusSigle => $1);
373 };
Akron323881c2020-05-20 17:15:42 +0200374 }
375
376 # Mark the vc as frozen
377 # This means that an extended VC area is expected
378 elsif ($key eq 'frozen') {
379 $frozen = 1;
380 }
381
382 # Start/End intended VC area
383 elsif ($key eq 'intended') {
384 if ($value eq 'start') {
385 $$vc = $vc_int;
386 }
387 elsif ($value ne 'end') {
388 warn 'Unknown intension value ' . $value;
389 };
390 }
391
392 # Start/End extended VC area
393 elsif ($key eq 'extended') {
394 if ($value eq 'start') {
395 $$vc = $vc_ext;
396 }
397 elsif ($value ne 'end') {
398 warn 'Unknown extension value ' . $value;
399 };
400 }
Akron3587f362020-05-20 17:50:38 +0200401
402 # Set VC name
403 elsif ($key eq 'name') {
404 # "Name des virt. Korpus, der angezeigt wird.
405 # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
406 # <add>, <sub>"
407
408 # No global name defined yet
Akron1c070452020-05-25 11:28:30 +0200409 if ($$vc && !$$vc->name) {
Akron3587f362020-05-20 17:50:38 +0200410 $vc_ext->name($value);
411 $vc_int->name($value);
412 next;
413 };
Akron1c070452020-05-25 11:28:30 +0200414
415 ${$vc} = KorAP::VirtualCorpus::Group->new;
416 ${$vc}->name($value);
417 }
418
419 # End VC def
420 elsif ($key eq 'end') {
421 $all_vcs{${$vc}->name} = $$vc;
422 # $vc = undef;
423 }
424
425 # Add VC definition
426 elsif ($key eq 'add') {
427 unless (defined $all_vcs{$value}) {
428 # warn 'VC ' . $value . ' not defined';
429 # exit(1);
430 next;
431 };
432
433 $$vc->with($all_vcs{$value});
Akron3587f362020-05-20 17:50:38 +0200434 }
435
Akron286b46e2020-05-25 17:07:48 +0200436 # Add reduction value as a comment
437 elsif ($key eq 'redabs') {
438 # "red. Anz. Texte
439 # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
440 $$vc->comment('redabs:' . $value);
441 }
442
Akron3587f362020-05-20 17:50:38 +0200443 # Unknown
444 else {
445 # warn $key . ' is an unknown field';
446 };
Akron26b59702020-05-19 12:14:41 +0200447};
448
Akron26b59702020-05-19 12:14:41 +0200449close($fh);
450
Akron1c070452020-05-25 11:28:30 +0200451# Stringify current (extended?) virtual corpus
Akron323881c2020-05-20 17:15:42 +0200452print $$vc->to_string;