blob: 85986a979e6c438f957a67678be4e7f088722f05 [file] [log] [blame]
Akron3f875be2020-05-11 14:57:19 +02001#!/usr/bin/env perl
Akron3587f362020-05-20 17:50:38 +02002use strict;
3use warnings;
Akron34a4f582020-05-27 12:18:57 +02004use lib 'lib';
5use KorAP::VirtualCorpus::Group;
Akron3f875be2020-05-11 14:57:19 +02006
Akron340a9cb2020-05-20 12:55:22 +02007# 2020-05-20
8# Preliminary support for C2 def-files.
Akron34a4f582020-05-27 12:18:57 +02009# 2020-05-29
10# Introduce optimizable object system.
Akron340a9cb2020-05-20 12:55:22 +020011
Akrondd0aa3a2024-04-10 11:03:38 +020012our $VERSION = 0.1;
13
Akron26b59702020-05-19 12:14:41 +020014our @ARGV;
15
Akron3f875be2020-05-11 14:57:19 +020016unless (@ARGV) {
17 print <<'HELP';
18Convert a line-separated list of corpus sigles, doc sigles or
19text sigles into a virtual corpus query.
20
21 $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
Akron26b59702020-05-19 12:14:41 +020022 $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
Akron3f875be2020-05-11 14:57:19 +020023
24HELP
25exit 0;
26};
27
Akron340a9cb2020-05-20 12:55:22 +020028
Akron1c070452020-05-25 11:28:30 +020029# Shorten long strings for logging
Akron49c765f2020-05-20 16:41:22 +020030sub _shorten ($) {
Akron340a9cb2020-05-20 12:55:22 +020031 my $line = shift;
32 if (length($line) < 20) {
33 return $line;
34 }
35 else {
36 return substr($line,0,17) . '...';
37 };
38};
39
40
Akron3f875be2020-05-11 14:57:19 +020041my $fh;
Akron26b59702020-05-19 12:14:41 +020042if ($ARGV[0] eq '-') {
43 $fh = *STDIN;
44} elsif (!open($fh, '<' . $ARGV[0])) {
Akron3f875be2020-05-11 14:57:19 +020045 warn $ARGV[0] . " can't be opened";
Akron26b59702020-05-19 12:14:41 +020046 exit(0);
Akron3f875be2020-05-11 14:57:19 +020047};
48
Akron286b46e2020-05-25 17:07:48 +020049# Initial VC group
50my $vc;
Akron26b59702020-05-19 12:14:41 +020051
Akron323881c2020-05-20 17:15:42 +020052# Create an intensional and an extensional VC
Akron1c070452020-05-25 11:28:30 +020053my $vc_ext = KorAP::VirtualCorpus::Group->new;
54my $vc_int = KorAP::VirtualCorpus::Group->new;
Akron49c765f2020-05-20 16:41:22 +020055
Akron286b46e2020-05-25 17:07:48 +020056# Load ext initially
57$$vc = $vc_ext;
Akron323881c2020-05-20 17:15:42 +020058
Akron1c070452020-05-25 11:28:30 +020059# Collect all virtual corpora
60my %all_vcs;
61
Akron323881c2020-05-20 17:15:42 +020062my $frozen = 0;
Akron26b59702020-05-19 12:14:41 +020063
64# Iterate over the whole list
65while (!eof $fh) {
66 my $line = readline($fh);
67 chomp $line;
68
Akrone2645ec2020-05-20 12:37:25 +020069
70 # Skip empty lines
71 if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
72 # empty
73 next;
74 };
75
Akron340a9cb2020-05-20 12:55:22 +020076 my ($key, $value, $desc);
77
78 # Line-Type: <e>c</a>
79 if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
80 $key = $1;
81 $value = $2 // undef;
82 }
83
84 # Line-Type: <e>c
85 elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
86 $key = $1;
87 $value = $2;
88 }
89
Akron26b59702020-05-19 12:14:41 +020090 # Get text sigles
Akron323881c2020-05-20 17:15:42 +020091 elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
Akron340a9cb2020-05-20 12:55:22 +020092 $key = 'text';
93 $value = $line;
Akron26b59702020-05-19 12:14:41 +020094 }
95
96 # Get doc sigles
Akron323881c2020-05-20 17:15:42 +020097 elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
Akron340a9cb2020-05-20 12:55:22 +020098 $key = 'doc';
99 $value = $1;
Akron26b59702020-05-19 12:14:41 +0200100 }
101
102 # Get corpus sigles
Akron340a9cb2020-05-20 12:55:22 +0200103 elsif ($line !~ m!(?:\/|\s)!) {
104 $key = 'corpus';
105 $value = $line;
Akron26b59702020-05-19 12:14:41 +0200106 }
107
Akron340a9cb2020-05-20 12:55:22 +0200108 # Not known
Akron26b59702020-05-19 12:14:41 +0200109 else {
Akron49c765f2020-05-20 16:41:22 +0200110 warn _shorten($line) . q! isn't a valid VC definition!;
Akron340a9cb2020-05-20 12:55:22 +0200111 next;
112 };
113
Akron49c765f2020-05-20 16:41:22 +0200114 # Add text field
Akron340a9cb2020-05-20 12:55:22 +0200115 if ($key eq 'text') {
Akron68746a12020-05-20 15:19:55 +0200116
117 # Convert C2 sigle to KorAP form
118 $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
Akron34a4f582020-05-27 12:18:57 +0200119 ${$vc}->union_field(textSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200120 }
121
Akron49c765f2020-05-20 16:41:22 +0200122 # Add doc field
Akron340a9cb2020-05-20 12:55:22 +0200123 elsif ($key eq 'doc') {
Akron34a4f582020-05-27 12:18:57 +0200124 ${$vc}->union_field(docSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200125 }
126
Akron49c765f2020-05-20 16:41:22 +0200127 # Add corpus field
Akron340a9cb2020-05-20 12:55:22 +0200128 elsif ($key eq 'corpus') {
Akron34a4f582020-05-27 12:18:57 +0200129 ${$vc}->union_field(corpusSigle => $value);
Akron1c070452020-05-25 11:28:30 +0200130 }
131
132 # Add corpus field
133 elsif ($key eq 'cn') {
134 # Korpussigle, z.B. 'F97 Frankfurter Allgemeine 1997'
135 if ($value =~ m!^([^\/\s]+)(?:\s.+?)?$!) {
Akron34a4f582020-05-27 12:18:57 +0200136 ${$vc}->union_field(corpusSigle => $1);
Akron1c070452020-05-25 11:28:30 +0200137 };
Akron323881c2020-05-20 17:15:42 +0200138 }
139
140 # Mark the vc as frozen
141 # This means that an extended VC area is expected
142 elsif ($key eq 'frozen') {
143 $frozen = 1;
144 }
145
146 # Start/End intended VC area
147 elsif ($key eq 'intended') {
148 if ($value eq 'start') {
149 $$vc = $vc_int;
150 }
151 elsif ($value ne 'end') {
152 warn 'Unknown intension value ' . $value;
153 };
154 }
155
156 # Start/End extended VC area
157 elsif ($key eq 'extended') {
158 if ($value eq 'start') {
159 $$vc = $vc_ext;
160 }
161 elsif ($value ne 'end') {
162 warn 'Unknown extension value ' . $value;
163 };
164 }
Akron3587f362020-05-20 17:50:38 +0200165
166 # Set VC name
167 elsif ($key eq 'name') {
168 # "Name des virt. Korpus, der angezeigt wird.
169 # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
170 # <add>, <sub>"
171
172 # No global name defined yet
Akron1c070452020-05-25 11:28:30 +0200173 if ($$vc && !$$vc->name) {
Akron3587f362020-05-20 17:50:38 +0200174 $vc_ext->name($value);
175 $vc_int->name($value);
176 next;
177 };
Akron1c070452020-05-25 11:28:30 +0200178
179 ${$vc} = KorAP::VirtualCorpus::Group->new;
180 ${$vc}->name($value);
181 }
182
183 # End VC def
184 elsif ($key eq 'end') {
185 $all_vcs{${$vc}->name} = $$vc;
186 # $vc = undef;
187 }
188
189 # Add VC definition
190 elsif ($key eq 'add') {
191 unless (defined $all_vcs{$value}) {
192 # warn 'VC ' . $value . ' not defined';
193 # exit(1);
194 next;
195 };
196
Akron34a4f582020-05-27 12:18:57 +0200197 $$vc->union($all_vcs{$value}->clone->to_koral);
Akron3587f362020-05-20 17:50:38 +0200198 }
199
Akron34a4f582020-05-27 12:18:57 +0200200 # AND definition
201 elsif ($key eq 'and') {
202 unless (defined $all_vcs{$value}) {
203 # warn 'VC ' . $value . ' not defined';
204 # exit(1);
205 next;
206 };
207
208 $$vc->joint($all_vcs{$value}->clone->to_koral);
209 }
210
211 # Source of the corpus
212 elsif ($key eq 'ql') {
213 # Quellenname, z.B. "Neue Zürcher Zeitung"
214 $$vc->union_field(corpusTitle => $value);
215 }
216
217 elsif ($key eq 'sub') {
218 # "Sub" is the difference - it is the "and not" operation.
219 warn $key . ' is not yet supported';
220 }
221
222 elsif ($key eq 'co') {
223 # Country, z.B. DE für Text in Deutschland erschienen
224 warn $key . ' is not yet supported';
225 }
226
227 elsif ($key eq 'tl') {
228 # Textlength, Bereich von Texten der angegebenen Länge [in Anz. Wörtern]
229 warn $key . ' is not yet supported';
230 }
231
232 elsif ($key eq 'ts') {
233 # Textsorte, z.B. "Bericht"
234 warn $key . ' is not yet supported';
235 }
236
237 elsif ($key eq 'th') {
238 # Thema, z.B. "Sport - Fußball"
239 warn $key . ' is not yet supported';
240 }
241
242 elsif ($key eq 'red') {
243 # Reduktionsfaktor
244 # Wert zw. 1-99%: virt. Korpus wird auf diesen Wert
245 # reduziert. Modus: feste Reduzierung, nicht variabel.
246 warn $key . ' is not yet supported';
247 }
248
249 elsif ($key eq 'thprob') {
250 # ThemaProbability
251 # Wert, der für <th>Thema verwendet wird um zu bestimmen, ab welchem
252 # Zuverläßigkeitswert ein Thema übernommen wird
253 }
254
255
Akron286b46e2020-05-25 17:07:48 +0200256 # Add reduction value as a comment
257 elsif ($key eq 'redabs') {
258 # "red. Anz. Texte
259 # absoluter Wert der durch Reduktion zu erzielende Anzahl Texte"
260 $$vc->comment('redabs:' . $value);
Akron34a4f582020-05-27 12:18:57 +0200261 warn $key . ' is not yet supported';
262 }
263
264 # Add reduction value as a comment
265 elsif ($key eq 'date') {
266 # Supports two pattern schemes:
267 # m1=Year1/Month1 bis Year2/Month2
268 # Datumsbereich Schema 1: z.B. "2000/01 bis 2010/12"
269
270 # Schema 1
271 if ($value =~ m!^(?:m1\s*=\s*)?\s*(\d+)\/(\d+) bis (\d+)\/(\d+)\s*$!s) {
272 my ($y1, $m1, $y2, $m2) = ($1, $2, $3, $4);
273 if ($m1 < 10) {
274 $m1 = '0' . (0+$m1);
275 };
276 if ($m2 < 10) {
277 $m2 = '0' . (0+$m2);
278 };
279 $$vc->from($y1, $m1);
280 $$vc->to($y2, $m2);
281 }
282
283 # Scheme 2
284 elsif ($value =~ m!^\s*\d{4}-\d{4}\s+und\s+\d{1,2}-\d{1,2}\s*$!) {
285 # m2=Year1-Year2 und Month1-Month2
286 # Datumsbereich Schema 2: z.B. "1990-2000 und 06-06"
287
288 warn 'Second date scheme not yet supported!'
289 }
290
291 else {
292 warn 'Unknown date scheme ' . $value;
293 };
Akron286b46e2020-05-25 17:07:48 +0200294 }
295
Akron3587f362020-05-20 17:50:38 +0200296 # Unknown
297 else {
Akron34a4f582020-05-27 12:18:57 +0200298 warn $key . ' is an unknown field';
Akron3587f362020-05-20 17:50:38 +0200299 };
Akron26b59702020-05-19 12:14:41 +0200300};
301
Akron26b59702020-05-19 12:14:41 +0200302close($fh);
303
Akron1c070452020-05-25 11:28:30 +0200304# Stringify current (extended?) virtual corpus
Akron323881c2020-05-20 17:15:42 +0200305print $$vc->to_string;