blob: d68e22de8bdd4ccd1fd9d9615911f94f95cdda62 [file] [log] [blame]
Akron3f875be2020-05-11 14:57:19 +02001#!/usr/bin/env perl
Akron3587f362020-05-20 17:50:38 +02002package KorAP::VirtualCorpus;
3use strict;
4use warnings;
5
6# Get or set name of the VC
7sub name {
8 my $self = shift;
9 unless (@_) {
10 return $self->{name};
11 };
12 $self->{name} = shift;
13 return $self;
14};
15
16
17# Quote utility function
18sub quote {
19 shift;
20 my $str = shift;
21 $str =~ s/(["\\])/\\$1/g;
22 return qq{"$str"};
23};
24
25
26# Escaped quote utility function
27sub equote {
28 shift;
29 my $str = shift;
30 $str =~ s/(["\\])/\\$1/g;
31 $str =~ s/(["\\])/\\$1/g;
32 return '\\"' . $str . '\\"';
33};
34
35
36# Stringify globally
37sub to_string {
38 my $self = shift;
39 ## Create collection object
40 my $json = '{';
41 $json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
42 $json .= '"collection":{';
43 $json .= $self->_to_fragment;
44 return $json .= '}}';
45};
46
47
Akron49c765f2020-05-20 16:41:22 +020048package KorAP::VirtualCorpus::Group;
49use strict;
50use warnings;
Akron3587f362020-05-20 17:50:38 +020051use base 'KorAP::VirtualCorpus';
52
Akron49c765f2020-05-20 16:41:22 +020053
54# Construct a new VC group
55sub new {
56 my $class = shift;
57 bless {
58 op => shift,
59 fields => {}
60 }, $class;
61};
62
63
64# Add field information to group
65sub add_field {
66 my $self = shift;
67 my $field = shift;
68 push @{$self->{fields}->{$field}}, shift;
69};
70
71
Akron3587f362020-05-20 17:50:38 +020072# Stringify fragment
73sub _to_fragment {
Akron49c765f2020-05-20 16:41:22 +020074 my $self = shift;
Akron3587f362020-05-20 17:50:38 +020075 my $json = '';
Akron49c765f2020-05-20 16:41:22 +020076
77 unless (keys %{$self->{fields}}) {
78 return $json . '}}';
79 };
80
81 $json .= '"@type":"koral:docGroup",';
Akron3587f362020-05-20 17:50:38 +020082 $json .= '"comment":"Name: ' . $self->equote($self->name) . '",' if $self->name;
Akron49c765f2020-05-20 16:41:22 +020083 $json .= '"operation":"operation:' . $self->{op} . '",';
84 $json .= '"operands":[';
85
86 foreach my $field (sort keys %{$self->{fields}}) {
87 unless (@{$self->{fields}->{$field}}) {
88 next;
89 };
90 $json .= '{';
91 $json .= '"@type":"koral:doc",';
92 $json .= '"key":"' . $field . '",';
93 $json .= '"match":"match:eq",';
94 $json .= '"value":[';
95 $json .= join ',', map { '"' . $_ . '"' } @{$self->{fields}->{$field}};
96 $json .= ']';
97 $json .= '},';
98 };
99
100 # Remove the last comma
101 chop $json;
102
Akron3587f362020-05-20 17:50:38 +0200103 return $json . ']';
Akron49c765f2020-05-20 16:41:22 +0200104};
105
106
107package main;
Akron3f875be2020-05-11 14:57:19 +0200108use strict;
109use warnings;
110
Akron340a9cb2020-05-20 12:55:22 +0200111# 2020-05-20
112# Preliminary support for C2 def-files.
113
Akron26b59702020-05-19 12:14:41 +0200114our @ARGV;
115
Akron3f875be2020-05-11 14:57:19 +0200116unless (@ARGV) {
117 print <<'HELP';
118Convert a line-separated list of corpus sigles, doc sigles or
119text sigles into a virtual corpus query.
120
121 $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
Akron26b59702020-05-19 12:14:41 +0200122 $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz
Akron3f875be2020-05-11 14:57:19 +0200123
124HELP
125exit 0;
126};
127
Akron340a9cb2020-05-20 12:55:22 +0200128
Akron49c765f2020-05-20 16:41:22 +0200129sub _shorten ($) {
Akron340a9cb2020-05-20 12:55:22 +0200130 my $line = shift;
131 if (length($line) < 20) {
132 return $line;
133 }
134 else {
135 return substr($line,0,17) . '...';
136 };
137};
138
139
Akron3f875be2020-05-11 14:57:19 +0200140my $fh;
Akron26b59702020-05-19 12:14:41 +0200141if ($ARGV[0] eq '-') {
142 $fh = *STDIN;
143} elsif (!open($fh, '<' . $ARGV[0])) {
Akron3f875be2020-05-11 14:57:19 +0200144 warn $ARGV[0] . " can't be opened";
Akron26b59702020-05-19 12:14:41 +0200145 exit(0);
Akron3f875be2020-05-11 14:57:19 +0200146};
147
Akron26b59702020-05-19 12:14:41 +0200148
Akron323881c2020-05-20 17:15:42 +0200149# Create an intensional and an extensional VC
150my $vc_ext = KorAP::VirtualCorpus::Group->new('or');
151my $vc_int = KorAP::VirtualCorpus::Group->new('or');
Akron49c765f2020-05-20 16:41:22 +0200152
Akron323881c2020-05-20 17:15:42 +0200153# Initial VC group
154my $vc = \$vc_ext;
155
156my $frozen = 0;
Akron26b59702020-05-19 12:14:41 +0200157
158# Iterate over the whole list
159while (!eof $fh) {
160 my $line = readline($fh);
161 chomp $line;
162
Akrone2645ec2020-05-20 12:37:25 +0200163
164 # Skip empty lines
165 if (!$line || length($line) == 0 || $line =~ /^[\s\t\n]*$/) {
166 # empty
167 next;
168 };
169
Akron340a9cb2020-05-20 12:55:22 +0200170 my ($key, $value, $desc);
171
172 # Line-Type: <e>c</a>
173 if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
174 $key = $1;
175 $value = $2 // undef;
176 }
177
178 # Line-Type: <e>c
179 elsif($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
180 $key = $1;
181 $value = $2;
182 }
183
Akron26b59702020-05-19 12:14:41 +0200184 # Get text sigles
Akron323881c2020-05-20 17:15:42 +0200185 elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
Akron340a9cb2020-05-20 12:55:22 +0200186 $key = 'text';
187 $value = $line;
Akron26b59702020-05-19 12:14:41 +0200188 }
189
190 # Get doc sigles
Akron323881c2020-05-20 17:15:42 +0200191 elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
Akron340a9cb2020-05-20 12:55:22 +0200192 $key = 'doc';
193 $value = $1;
Akron26b59702020-05-19 12:14:41 +0200194 }
195
196 # Get corpus sigles
Akron340a9cb2020-05-20 12:55:22 +0200197 elsif ($line !~ m!(?:\/|\s)!) {
198 $key = 'corpus';
199 $value = $line;
Akron26b59702020-05-19 12:14:41 +0200200 }
201
Akron340a9cb2020-05-20 12:55:22 +0200202 # Not known
Akron26b59702020-05-19 12:14:41 +0200203 else {
Akron49c765f2020-05-20 16:41:22 +0200204 warn _shorten($line) . q! isn't a valid VC definition!;
Akron340a9cb2020-05-20 12:55:22 +0200205 next;
206 };
207
Akron49c765f2020-05-20 16:41:22 +0200208 # Add text field
Akron340a9cb2020-05-20 12:55:22 +0200209 if ($key eq 'text') {
Akron68746a12020-05-20 15:19:55 +0200210
211 # Convert C2 sigle to KorAP form
212 $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
Akron323881c2020-05-20 17:15:42 +0200213 ${$vc}->add_field(textSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200214 }
215
Akron49c765f2020-05-20 16:41:22 +0200216 # Add doc field
Akron340a9cb2020-05-20 12:55:22 +0200217 elsif ($key eq 'doc') {
Akron323881c2020-05-20 17:15:42 +0200218 ${$vc}->add_field(docSigle => $value);
Akron340a9cb2020-05-20 12:55:22 +0200219 }
220
Akron49c765f2020-05-20 16:41:22 +0200221 # Add corpus field
Akron340a9cb2020-05-20 12:55:22 +0200222 elsif ($key eq 'corpus') {
Akron323881c2020-05-20 17:15:42 +0200223 ${$vc}->add_field(corpusSigle => $value);
224 }
225
226 # Mark the vc as frozen
227 # This means that an extended VC area is expected
228 elsif ($key eq 'frozen') {
229 $frozen = 1;
230 }
231
232 # Start/End intended VC area
233 elsif ($key eq 'intended') {
234 if ($value eq 'start') {
235 $$vc = $vc_int;
236 }
237 elsif ($value ne 'end') {
238 warn 'Unknown intension value ' . $value;
239 };
240 }
241
242 # Start/End extended VC area
243 elsif ($key eq 'extended') {
244 if ($value eq 'start') {
245 $$vc = $vc_ext;
246 }
247 elsif ($value ne 'end') {
248 warn 'Unknown extension value ' . $value;
249 };
250 }
Akron3587f362020-05-20 17:50:38 +0200251
252 # Set VC name
253 elsif ($key eq 'name') {
254 # "Name des virt. Korpus, der angezeigt wird.
255 # Wird auch intern zur Korpusbildung referenziert, z.B. für <and>,
256 # <add>, <sub>"
257
258 # No global name defined yet
259 unless ($$vc->name) {
260 $vc_ext->name($value);
261 $vc_int->name($value);
262 next;
263 };
264 }
265
266 # Unknown
267 else {
268 # warn $key . ' is an unknown field';
269 };
Akron26b59702020-05-19 12:14:41 +0200270};
271
Akron26b59702020-05-19 12:14:41 +0200272close($fh);
273
Akron323881c2020-05-20 17:15:42 +0200274# Stringify current (extended) virtual corpus
275print $$vc->to_string;