blob: 3e31e50ec8068f59e7b36566497ce5bbe6cc5cc6 [file] [log] [blame]
Akron3f875be2020-05-11 14:57:19 +02001#!/usr/bin/env perl
Akron3587f362020-05-20 17:50:38 +02002use strict;
3use warnings;
Akron34a4f582020-05-27 12:18:57 +02004use lib 'lib';
5use KorAP::VirtualCorpus::Group;
Akron3f875be2020-05-11 14:57:19 +02006
Akron340a9cb2020-05-20 12:55:22 +02007# 2020-05-20
8# Preliminary support for C2 def-files.
Akron34a4f582020-05-27 12:18:57 +02009# 2020-05-29
10# Introduce optimizable object system.
Akron340a9cb2020-05-20 12:55:22 +020011
Akron26b59702020-05-19 12:14:41 +020012our @ARGV;
13
Akron3f875be2020-05-11 14:57:19 +020014unless (@ARGV) {
15 print <<'HELP';
16Convert a line-separated list of corpus sigles, doc sigles or
17text sigles into a virtual corpus query.
18
19 $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
Akron26b59702020-05-19 12:14:41 +020020 $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
Akron3f875be2020-05-11 14:57:19 +020021
22HELP
23exit 0;
24};
25
Akron340a9cb2020-05-20 12:55:22 +020026
Akron1c070452020-05-25 11:28:30 +020027# Shorten long strings for logging
Akron49c765f2020-05-20 16:41:22 +020028sub _shorten ($) {
Akron340a9cb2020-05-20 12:55:22 +020029 my $line = shift;
30 if (length($line) < 20) {
31 return $line;
32 }
33 else {
34 return substr($line,0,17) . '...';
35 };
36};
37
38
Akron3f875be2020-05-11 14:57:19 +020039my $fh;
Akron26b59702020-05-19 12:14:41 +020040if ($ARGV[0] eq '-') {
41 $fh = *STDIN;
42} elsif (!open($fh, '<' . $ARGV[0])) {
Akron3f875be2020-05-11 14:57:19 +020043 warn $ARGV[0] . " can't be opened";
Akron26b59702020-05-19 12:14:41 +020044 exit(0);
Akron3f875be2020-05-11 14:57:19 +020045};
46
Akron286b46e2020-05-25 17:07:48 +020047# Initial VC group
48my $vc;
Akron26b59702020-05-19 12:14:41 +020049
Akron323881c2020-05-20 17:15:42 +020050# Create an intensional and an extensional VC
Akron1c070452020-05-25 11:28:30 +020051my $vc_ext = KorAP::VirtualCorpus::Group->new;
52my $vc_int = KorAP::VirtualCorpus::Group->new;
Akron49c765f2020-05-20 16:41:22 +020053
Akron286b46e2020-05-25 17:07:48 +020054# Load ext initially
55$$vc = $vc_ext;
Akron323881c2020-05-20 17:15:42 +020056
Akron1c070452020-05-25 11:28:30 +020057# Collect all virtual corpora
58my %all_vcs;
59
Akron323881c2020-05-20 17:15:42 +020060my $frozen = 0;
Akron26b59702020-05-19 12:14:41 +020061
62# Iterate over the whole list
63while (!eof $fh) {
64 my $line = readline($fh);
65 chomp $line;
66
Akrone2645ec2020-05-20 12:37:25 +020067
68 # Skip empty lines
69 if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
70 # empty
71 next;
72 };
73
Akron340a9cb2020-05-20 12:55:22 +020074 my ($key, $value, $desc);
75
76 # Line-Type: <e>c</a>
77 if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
78 $key = $1;
79 $value = $2 // undef;
80 }
81
82 # Line-Type: <e>c
83 elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
84 $key = $1;
85 $value = $2;
86 }
87
Akron26b59702020-05-19 12:14:41 +020088 # Get text sigles
Akron323881c2020-05-20 17:15:42 +020089 elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
Akron340a9cb2020-05-20 12:55:22 +020090 $key = 'text';
91 $value = $line;
Akron26b59702020-05-19 12:14:41 +020092 }
93
94 # Get doc sigles
Akron323881c2020-05-20 17:15:42 +020095 elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
Akron340a9cb2020-05-20 12:55:22 +020096 $key = 'doc';
97 $value = $1;
Akron26b59702020-05-19 12:14:41 +020098 }
99
100 # Get corpus sigles
Akron340a9cb2020-05-20 12:55:22 +0200101 elsif ($line !~ m!(?:\/|\s)!) {
102 $key = 'corpus';
103 $value = $line;
Akron26b59702020-05-19 12:14:41 +0200104 }
105
Akron340a9cb2020-05-20 12:55:22 +0200106 # Not known
Akron26b59702020-05-19 12:14:41 +0200107 else {
Akron49c765f2020-05-20 16:41:22 +0200108 warn _shorten($line) . q! isn't a valid VC definition!;
Akron340a9cb2020-05-20 12:55:22 +0200109 next;
110 };
111
Akron49c765f2020-05-20 16:41:22 +0200112 # Add text field
Akron340a9cb2020-05-20 12:55:22 +0200113 if ($key eq 'text') {
Akron68746a12020-05-20 15:19:55 +0200114
115 # Convert C2 sigle to KorAP form
116 $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
Akron34a4f582020-05-27 12:18:57 +0200117 ${$vc}->union_field(textSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200118 }
119
Akron49c765f2020-05-20 16:41:22 +0200120 # Add doc field
Akron340a9cb2020-05-20 12:55:22 +0200121 elsif ($key eq 'doc') {
Akron34a4f582020-05-27 12:18:57 +0200122 ${$vc}->union_field(docSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200123 }
124
Akron49c765f2020-05-20 16:41:22 +0200125 # Add corpus field
Akron340a9cb2020-05-20 12:55:22 +0200126 elsif ($key eq 'corpus') {
Akron34a4f582020-05-27 12:18:57 +0200127 ${$vc}->union_field(corpusSigle => $value);
Akron1c070452020-05-25 11:28:30 +0200128 }
129
130 # Add corpus field
131 elsif ($key eq 'cn') {
132 # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
133 if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
Akron34a4f582020-05-27 12:18:57 +0200134 ${$vc}->union_field(corpusSigle => $1);
Akron1c070452020-05-25 11:28:30 +0200135 };
Akron323881c2020-05-20 17:15:42 +0200136 }
137
138 # Mark the vc as frozen
139 # This means that an extended VC area is expected
140 elsif ($key eq 'frozen') {
141 $frozen = 1;
142 }
143
144 # Start/End intended VC area
145 elsif ($key eq 'intended') {
146 if ($value eq 'start') {
147 $$vc = $vc_int;
148 }
149 elsif ($value ne 'end') {
150 warn 'Unknown intension value ' . $value;
151 };
152 }
153
154 # Start/End extended VC area
155 elsif ($key eq 'extended') {
156 if ($value eq 'start') {
157 $$vc = $vc_ext;
158 }
159 elsif ($value ne 'end') {
160 warn 'Unknown extension value ' . $value;
161 };
162 }
Akron3587f362020-05-20 17:50:38 +0200163
164 # Set VC name
165 elsif ($key eq 'name') {
166 # "Name des virt. Korpus, der angezeigt wird.
167 # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
168 # <add>, <sub>"
169
170 # No global name defined yet
Akron1c070452020-05-25 11:28:30 +0200171 if ($$vc && !$$vc->name) {
Akron3587f362020-05-20 17:50:38 +0200172 $vc_ext->name($value);
173 $vc_int->name($value);
174 next;
175 };
Akron1c070452020-05-25 11:28:30 +0200176
177 ${$vc} = KorAP::VirtualCorpus::Group->new;
178 ${$vc}->name($value);
179 }
180
181 # End VC def
182 elsif ($key eq 'end') {
183 $all_vcs{${$vc}->name} = $$vc;
184 # $vc = undef;
185 }
186
187 # Add VC definition
188 elsif ($key eq 'add') {
189 unless (defined $all_vcs{$value}) {
190 # warn 'VC ' . $value . ' not defined';
191 # exit(1);
192 next;
193 };
194
Akron34a4f582020-05-27 12:18:57 +0200195 $$vc->union($all_vcs{$value}->clone->to_koral);
Akron3587f362020-05-20 17:50:38 +0200196 }
197
Akron34a4f582020-05-27 12:18:57 +0200198 # AND definition
199 elsif ($key eq 'and') {
200 unless (defined $all_vcs{$value}) {
201 # warn 'VC ' . $value . ' not defined';
202 # exit(1);
203 next;
204 };
205
206 $$vc->joint($all_vcs{$value}->clone->to_koral);
207 }
208
209 # Source of the corpus
210 elsif ($key eq 'ql') {
211 # Quellenname, z.B. "Neue Zürcher Zeitung"
212 $$vc->union_field(corpusTitle => $value);
213 }
214
215 elsif ($key eq 'sub') {
216 # "Sub" is the difference - it is the "and not" operation.
217 warn $key . ' is not yet supported';
218 }
219
220 elsif ($key eq 'co') {
221 # Country, z.B. DE für Text in Deutschland erschienen
222 warn $key . ' is not yet supported';
223 }
224
225 elsif ($key eq 'tl') {
226 # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern]
227 warn $key . ' is not yet supported';
228 }
229
230 elsif ($key eq 'ts') {
231 # Textsorte, z.B. "Bericht"
232 warn $key . ' is not yet supported';
233 }
234
235 elsif ($key eq 'th') {
236 # Thema, z.B. "Sport - Fußball"
237 warn $key . ' is not yet supported';
238 }
239
240 elsif ($key eq 'red') {
241 # Reduktionsfaktor
242 # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert
243 # reduziert. Modus: feste Reduzierung, nicht variabel.
244 warn $key . ' is not yet supported';
245 }
246
247 elsif ($key eq 'thprob') {
248 # ThemaProbability
249 # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem
250 # Zuverläßigkeitswert ein Thema übernommen wird
251 }
252
253
Akron286b46e2020-05-25 17:07:48 +0200254 # Add reduction value as a comment
255 elsif ($key eq 'redabs') {
256 # "red. Anz. Texte
257 # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
258 $$vc->comment('redabs:' . $value);
Akron34a4f582020-05-27 12:18:57 +0200259 warn $key . ' is not yet supported';
260 }
261
262 # Add reduction value as a comment
263 elsif ($key eq 'date') {
264 # Supports two pattern schemes:
265 # m1=Year1/Month1 bis Year2/Month2
266 # Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12"
267
268 # Schema 1
269 if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
270 my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
271 if ($m1 < 10) {
272 $m1 = '0' . (0+$m1);
273 };
274 if ($m2 < 10) {
275 $m2 = '0' . (0+$m2);
276 };
277 $$vc->from($y1, $m1);
278 $$vc->to($y2, $m2);
279 }
280
281 # Scheme 2
282 elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
283 # m2=Year1-Year2 und Month1-Month2
284 # Datumsbereich Schema 2: z.B. "1990-2000 und 06-06"
285
286 warn 'Second date scheme not yet supported!'
287 }
288
289 else {
290 warn 'Unknown date scheme ' . $value;
291 };
Akron286b46e2020-05-25 17:07:48 +0200292 }
293
Akron3587f362020-05-20 17:50:38 +0200294 # Unknown
295 else {
Akron34a4f582020-05-27 12:18:57 +0200296 warn $key . ' is an unknown field';
Akron3587f362020-05-20 17:50:38 +0200297 };
Akron26b59702020-05-19 12:14:41 +0200298};
299
Akron26b59702020-05-19 12:14:41 +0200300close($fh);
301
Akron1c070452020-05-25 11:28:30 +0200302# Stringify current (extended?) virtual corpus
Akron323881c2020-05-20 17:15:42 +0200303print $$vc->to_string;