blob: a25bf299f3954d266bb4a86f56a431cb00366f3e [file] [log] [blame]
Akron3f875be2020-05-11 14:57:19 +02001#!/usr/bin/env perl
Akron3587f362020-05-20 17:50:38 +02002package KorAP::VirtualCorpus;
3use strict;
4use warnings;
5
6# Get or set name of the VC
7sub name {
8 my $self = shift;
9 unless (@_) {
10 return $self->{name};
11 };
12 $self->{name} = shift;
13 return $self;
14};
15
16
17# Quote utility function
18sub quote {
19 shift;
20 my $str = shift;
21 $str =~ s/(["\\])/\\$1/g;
22 return qq{"$str"};
23};
24
25
26# Escaped quote utility function
27sub equote {
28 shift;
29 my $str = shift;
30 $str =~ s/(["\\])/\\$1/g;
31 $str =~ s/(["\\])/\\$1/g;
32 return '\\"' . $str . '\\"';
33};
34
35
36# Stringify globally
37sub to_string {
38 my $self = shift;
39 ## Create collection object
40 my $json = '{';
41 $json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
Akron1c070452020-05-25 11:28:30 +020042 $json .= '"comment":"Name: ' . $self->equote($self->name) . '",' if $self->name;
43 $json .= '"collection":';
Akron3587f362020-05-20 17:50:38 +020044 $json .= $self->_to_fragment;
Akron1c070452020-05-25 11:28:30 +020045 return $json .= '}';
Akron3587f362020-05-20 17:50:38 +020046};
47
48
Akron49c765f2020-05-20 16:41:22 +020049package KorAP::VirtualCorpus::Group;
50use strict;
51use warnings;
Akron3587f362020-05-20 17:50:38 +020052use base 'KorAP::VirtualCorpus';
53
Akron49c765f2020-05-20 16:41:22 +020054
55# Construct a new VC group
56sub new {
57 my $class = shift;
58 bless {
Akron1c070452020-05-25 11:28:30 +020059 with => [],
60 with_fields => {},
61 without => [],
62 without_fields => {},
Akron49c765f2020-05-20 16:41:22 +020063 }, $class;
64};
65
Akron1c070452020-05-25 11:28:30 +020066# Define an operand to be "or"ed
67sub with {
68 my $self = shift;
69 push @{$self->{with}}, shift;
70};
Akron49c765f2020-05-20 16:41:22 +020071
Akron1c070452020-05-25 11:28:30 +020072
73# Define a field that should be "or"ed
74sub with_field {
Akron49c765f2020-05-20 16:41:22 +020075 my $self = shift;
76 my $field = shift;
Akron1c070452020-05-25 11:28:30 +020077 push @{$self->{with_fields}->{$field}}, shift;
Akron49c765f2020-05-20 16:41:22 +020078};
79
Akron1c070452020-05-25 11:28:30 +020080# Define an operand to be "and"ed
81sub without {
82 my $self = shift;
83 push @{$self->{without}}, shift;
84};
85
86
87# Define a field that should be "and"ed
88sub without_field {
89 my $self = shift;
90 my $field = shift;
91 push @{$self->{without_fields}->{$field}}, shift;
92};
93
94# Create a document vector field
95sub _doc_vec {
96 my $field = shift;
97 my $vec = shift;
98 my $json = '{';
99 $json .= '"@type":"koral:doc",';
100 $json .= '"key":"' . $field . '",';
101 $json .= '"match":"match:eq",';
102 $json .= '"value":[';
103 $json .= join ',', map { '"' . $_ . '"' } @$vec;
104 $json .= ']';
105 $json .= '},';
106 return $json;
107}
108
Akron49c765f2020-05-20 16:41:22 +0200109
Akron3587f362020-05-20 17:50:38 +0200110# Stringify fragment
111sub _to_fragment {
Akron49c765f2020-05-20 16:41:22 +0200112 my $self = shift;
Akron49c765f2020-05-20 16:41:22 +0200113
Akron1c070452020-05-25 11:28:30 +0200114 my $json = '{';
Akron49c765f2020-05-20 16:41:22 +0200115 $json .= '"@type":"koral:docGroup",';
Akron3587f362020-05-20 17:50:38 +0200116 $json .= '"comment":"Name: ' . $self->equote($self->name) . '",' if $self->name;
Akron49c765f2020-05-20 16:41:22 +0200117
Akron1c070452020-05-25 11:28:30 +0200118 # Make the outer group "and"
119 if (keys %{$self->{without_fields}}) {
120 $json .= '"operation":"operation:and",';
121 $json .= '"operands":[';
122
123 foreach my $field (sort keys %{$self->{without_fields}}) {
124 unless (@{$self->{without_fields}->{$field}}) {
125 next;
126 };
127 $json .= _doc_vec($field, $self->{without_fields}->{$field});
Akron49c765f2020-05-20 16:41:22 +0200128 };
Akron1c070452020-05-25 11:28:30 +0200129
130 # Remove the last comma
131 chop $json;
132
133 $json .= ']';
134 }
135
136 elsif (keys %{$self->{with_fields}} || @{$self->{with}}) {
137 $json .= '"operation":"operation:or",';
138
139 # TODO:
140 # Flatten embedded or-VCs!
141 $json .= '"operands":[';
142
143 foreach my $field (sort keys %{$self->{with_fields}}) {
144 unless (@{$self->{with_fields}->{$field}}) {
145 next;
146 };
147 $json .= _doc_vec($field, $self->{with_fields}->{$field});
148 };
149
150 foreach my $op (@{$self->{with}}) {
151 $json .= $op->_to_fragment . ',';
152 };
153
154 # Remove the last comma
155 chop $json;
156
157 $json .= ']';
158 }
159
160 # No operands in the group
161 else {
162 # Remove the last comma after the comment
163 chop $json;
Akron49c765f2020-05-20 16:41:22 +0200164 };
165
Akron1c070452020-05-25 11:28:30 +0200166 return $json . '}';
Akron49c765f2020-05-20 16:41:22 +0200167};
168
169
170package main;
Akron3f875be2020-05-11 14:57:19 +0200171use strict;
172use warnings;
173
Akron340a9cb2020-05-20 12:55:22 +0200174# 2020-05-20
175# Preliminary support for C2 def-files.
176
Akron26b59702020-05-19 12:14:41 +0200177our @ARGV;
178
Akron3f875be2020-05-11 14:57:19 +0200179unless (@ARGV) {
180 print <<'HELP';
181Convert a line-separated list of corpus sigles, doc sigles or
182text sigles into a virtual corpus query.
183
184 $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
Akron26b59702020-05-19 12:14:41 +0200185 $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
Akron3f875be2020-05-11 14:57:19 +0200186
187HELP
188exit 0;
189};
190
Akron340a9cb2020-05-20 12:55:22 +0200191
Akron1c070452020-05-25 11:28:30 +0200192# Shorten long strings for logging
Akron49c765f2020-05-20 16:41:22 +0200193sub _shorten ($) {
Akron340a9cb2020-05-20 12:55:22 +0200194 my $line = shift;
195 if (length($line) < 20) {
196 return $line;
197 }
198 else {
199 return substr($line,0,17) . '...';
200 };
201};
202
203
Akron3f875be2020-05-11 14:57:19 +0200204my $fh;
Akron26b59702020-05-19 12:14:41 +0200205if ($ARGV[0] eq '-') {
206 $fh = *STDIN;
207} elsif (!open($fh, '<' . $ARGV[0])) {
Akron3f875be2020-05-11 14:57:19 +0200208 warn $ARGV[0] . " can't be opened";
Akron26b59702020-05-19 12:14:41 +0200209 exit(0);
Akron3f875be2020-05-11 14:57:19 +0200210};
211
Akron26b59702020-05-19 12:14:41 +0200212
Akron323881c2020-05-20 17:15:42 +0200213# Create an intensional and an extensional VC
Akron1c070452020-05-25 11:28:30 +0200214my $vc_ext = KorAP::VirtualCorpus::Group->new;
215my $vc_int = KorAP::VirtualCorpus::Group->new;
Akron49c765f2020-05-20 16:41:22 +0200216
Akron323881c2020-05-20 17:15:42 +0200217# Initial VC group
218my $vc = \$vc_ext;
219
Akron1c070452020-05-25 11:28:30 +0200220# Collect all virtual corpora
221my %all_vcs;
222
Akron323881c2020-05-20 17:15:42 +0200223my $frozen = 0;
Akron26b59702020-05-19 12:14:41 +0200224
225# Iterate over the whole list
226while (!eof $fh) {
227 my $line = readline($fh);
228 chomp $line;
229
Akrone2645ec2020-05-20 12:37:25 +0200230
231 # Skip empty lines
232 if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
233 # empty
234 next;
235 };
236
Akron340a9cb2020-05-20 12:55:22 +0200237 my ($key, $value, $desc);
238
239 # Line-Type: <e>c</a>
240 if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
241 $key = $1;
242 $value = $2 // undef;
243 }
244
245 # Line-Type: <e>c
246 elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
247 $key = $1;
248 $value = $2;
249 }
250
Akron26b59702020-05-19 12:14:41 +0200251 # Get text sigles
Akron323881c2020-05-20 17:15:42 +0200252 elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
Akron340a9cb2020-05-20 12:55:22 +0200253 $key = 'text';
254 $value = $line;
Akron26b59702020-05-19 12:14:41 +0200255 }
256
257 # Get doc sigles
Akron323881c2020-05-20 17:15:42 +0200258 elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
Akron340a9cb2020-05-20 12:55:22 +0200259 $key = 'doc';
260 $value = $1;
Akron26b59702020-05-19 12:14:41 +0200261 }
262
263 # Get corpus sigles
Akron340a9cb2020-05-20 12:55:22 +0200264 elsif ($line !~ m!(?:\/|\s)!) {
265 $key = 'corpus';
266 $value = $line;
Akron26b59702020-05-19 12:14:41 +0200267 }
268
Akron340a9cb2020-05-20 12:55:22 +0200269 # Not known
Akron26b59702020-05-19 12:14:41 +0200270 else {
Akron49c765f2020-05-20 16:41:22 +0200271 warn _shorten($line) . q! isn't a valid VC definition!;
Akron340a9cb2020-05-20 12:55:22 +0200272 next;
273 };
274
Akron49c765f2020-05-20 16:41:22 +0200275 # Add text field
Akron340a9cb2020-05-20 12:55:22 +0200276 if ($key eq 'text') {
Akron68746a12020-05-20 15:19:55 +0200277
278 # Convert C2 sigle to KorAP form
279 $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
Akron1c070452020-05-25 11:28:30 +0200280 ${$vc}->with_field(textSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200281 }
282
Akron49c765f2020-05-20 16:41:22 +0200283 # Add doc field
Akron340a9cb2020-05-20 12:55:22 +0200284 elsif ($key eq 'doc') {
Akron1c070452020-05-25 11:28:30 +0200285 ${$vc}->with_field(docSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200286 }
287
Akron49c765f2020-05-20 16:41:22 +0200288 # Add corpus field
Akron340a9cb2020-05-20 12:55:22 +0200289 elsif ($key eq 'corpus') {
Akron1c070452020-05-25 11:28:30 +0200290 ${$vc}->with_field(corpusSigle => $value);
291 }
292
293 # Add corpus field
294 elsif ($key eq 'cn') {
295 # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
296 if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
297 ${$vc}->with_field(corpusSigle => $1);
298 };
Akron323881c2020-05-20 17:15:42 +0200299 }
300
301 # Mark the vc as frozen
302 # This means that an extended VC area is expected
303 elsif ($key eq 'frozen') {
304 $frozen = 1;
305 }
306
307 # Start/End intended VC area
308 elsif ($key eq 'intended') {
309 if ($value eq 'start') {
310 $$vc = $vc_int;
311 }
312 elsif ($value ne 'end') {
313 warn 'Unknown intension value ' . $value;
314 };
315 }
316
317 # Start/End extended VC area
318 elsif ($key eq 'extended') {
319 if ($value eq 'start') {
320 $$vc = $vc_ext;
321 }
322 elsif ($value ne 'end') {
323 warn 'Unknown extension value ' . $value;
324 };
325 }
Akron3587f362020-05-20 17:50:38 +0200326
327 # Set VC name
328 elsif ($key eq 'name') {
329 # "Name des virt. Korpus, der angezeigt wird.
330 # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
331 # <add>, <sub>"
332
333 # No global name defined yet
Akron1c070452020-05-25 11:28:30 +0200334 if ($$vc && !$$vc->name) {
Akron3587f362020-05-20 17:50:38 +0200335 $vc_ext->name($value);
336 $vc_int->name($value);
337 next;
338 };
Akron1c070452020-05-25 11:28:30 +0200339
340 ${$vc} = KorAP::VirtualCorpus::Group->new;
341 ${$vc}->name($value);
342 }
343
344 # End VC def
345 elsif ($key eq 'end') {
346 $all_vcs{${$vc}->name} = $$vc;
347 # $vc = undef;
348 }
349
350 # Add VC definition
351 elsif ($key eq 'add') {
352 unless (defined $all_vcs{$value}) {
353 # warn 'VC ' . $value . ' not defined';
354 # exit(1);
355 next;
356 };
357
358 $$vc->with($all_vcs{$value});
Akron3587f362020-05-20 17:50:38 +0200359 }
360
361 # Unknown
362 else {
363 # warn $key . ' is an unknown field';
364 };
Akron26b59702020-05-19 12:14:41 +0200365};
366
Akron26b59702020-05-19 12:14:41 +0200367close($fh);
368
Akron1c070452020-05-25 11:28:30 +0200369# Stringify current (extended?) virtual corpus
Akron323881c2020-05-20 17:15:42 +0200370print $$vc->to_string;