blob: 95259ccd811e387cbee8a8dab9f17c9f1e50f82c [file] [log] [blame]
Akron3f875be2020-05-11 14:57:19 +02001#!/usr/bin/env perl
package KorAP::VirtualCorpus::Group;
use strict;
use warnings;

# Construct a new VC group.
#
# Arguments:
#   $op - the operation that joins the operands of the group
#         (inserted as "operation:$op" by to_string)
#
# Returns the new group object with an empty field map.
sub new {
  my ($class, $op) = @_;
  return bless {
    op     => $op,
    fields => {}
  }, $class;
};


# Add field information to group.
#
# Arguments:
#   $field - name of the field (e.g. 'textSigle')
#   $value - value to append to the list of values of that field
#
# Returns the number of values stored for the field afterwards.
sub add_field {
  my ($self, $field, $value) = @_;
  return push @{ $self->{fields}->{$field} }, $value;
};


# Quote a string as a JSON string literal.
# Backslashes, double quotes and control characters are escaped, so that
# arbitrary field values can't break the generated document.
sub _quote {
  my $string = shift;
  $string = '' unless defined $string;
  $string =~ s/(["\\])/\\$1/g;
  $string =~ s/([\x00-\x1f])/sprintf('\\u%04x', ord($1))/ge;
  return '"' . $string . '"';
};


# Stringify.
#
# Returns the group as a KoralQuery JSON string. Without any fields the
# collection is empty; otherwise it is a docGroup containing one
# "koral:doc" operand per field (sorted by field name), each listing all
# values of that field.
sub to_string {
  my $self = shift;

  ## Create collection object
  my $json = '{'
    . '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",'
    . '"collection":{';

  my $fields = $self->{fields};

  # Nothing collected - return an empty collection
  unless (keys %$fields) {
    return $json . '}}';
  };

  # Collect the operands first and join them afterwards, so no trailing
  # comma needs to be removed (and nothing breaks if no operand is emitted)
  my @operands;
  foreach my $field (sort keys %$fields) {
    my @values = @{ $fields->{$field} };

    # Ignore fields without values
    next unless @values;

    push @operands, '{'
      . '"@type":"koral:doc",'
      . '"key":' . _quote($field) . ','
      . '"match":"match:eq",'
      . '"value":[' . join(',', map { _quote($_) } @values) . ']'
      . '}';
  };

  return $json
    . '"@type":"koral:docGroup",'
    . '"operation":"operation:' . $self->{op} . '",'
    . '"operands":[' . join(',', @operands) . ']'
    . '}}';
};
60
61
62package main;
Akron3f875be2020-05-11 14:57:19 +020063use strict;
64use warnings;
65
Akron340a9cb2020-05-20 12:55:22 +020066# 2020-05-20
67# Preliminary support for C2 def-files.
68
our @ARGV;

# Without any argument there is nothing to convert:
# print the usage information and leave.
if (!@ARGV) {
  print <<'HELP';
Convert a line-separated list of corpus sigles, doc sigles or
text sigles into a virtual corpus query.

  $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
  $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz

HELP
  exit 0;
};
82
Akron340a9cb2020-05-20 12:55:22 +020083
# Abbreviate a line for use in warning messages.
# A line of fewer than 20 characters is returned unchanged; anything
# longer is reduced to its first 17 characters followed by '...'.
sub _shorten ($) {
  my ($text) = @_;
  return $text if length($text) < 20;
  return substr($text, 0, 17) . '...';
};
93
94
# Open the input: '-' reads from STDIN, everything else is taken
# as the name of a file to read.
my $fh;
if ($ARGV[0] eq '-') {
  $fh = \*STDIN;
}

# Use the three-argument form of open, so characters like '<', '>' or '|'
# in the file name are never interpreted as an open mode.
elsif (!open($fh, '<', $ARGV[0])) {
  warn $ARGV[0] . " can't be opened: $!";

  # Signal the failure to the calling shell
  exit(1);
};
102
Akron26b59702020-05-19 12:14:41 +0200103
# Create an intensional and an extensional VC
my $vc_ext = KorAP::VirtualCorpus::Group->new('or');
my $vc_int = KorAP::VirtualCorpus::Group->new('or');

# The VC group new fields are added to (initially the extensional one).
# This holds the group object itself - switching areas only changes which
# group is current and never overwrites one of the groups.
my $vc = $vc_ext;

# Set by a <frozen> line; only recorded, not evaluated any further
my $frozen = 0;

# Iterate over the whole list
while (defined(my $line = readline($fh))) {

  # Remove the line ending (Unix as well as Windows style)
  $line =~ s/[\r\n]+\z//;

  # Skip empty lines (a line consisting of "0" is not empty)
  next if $line =~ /^\s*$/;

  my ($key, $value);

  # Line-Type: <e>c</e>
  if ($line =~ /^\s*<([^>]+)>\s*([^<]*)\s*<\/\1>\s*$/) {
    ($key, $value) = ($1, $2);

    # The value pattern is greedy and may include trailing whitespace
    $value =~ s/\s+$//;
  }

  # Line-Type: <e>c
  elsif ($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
    ($key, $value) = ($1, $2);

    # The value pattern is greedy and may include trailing whitespace
    $value =~ s/\s+$//;
  }

  # Get text sigles
  elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
    $key = 'text';
    $value = $line;
  }

  # Get doc sigles (everything following the sigle is ignored)
  elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
    $key = 'doc';
    $value = $1;
  }

  # Get corpus sigles
  elsif ($line !~ m!(?:\/|\s)!) {
    $key = 'corpus';
    $value = $line;
  }

  # Not known
  else {
    warn _shorten($line) . q! isn't a valid VC definition!;
    next;
  };

  # Add text field
  if ($key eq 'text') {

    # Convert C2 sigle to KorAP form
    $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
    $vc->add_field(textSigle => $value);
  }

  # Add doc field
  elsif ($key eq 'doc') {
    $vc->add_field(docSigle => $value);
  }

  # Add corpus field
  elsif ($key eq 'corpus') {
    $vc->add_field(corpusSigle => $value);
  }

  # Mark the vc as frozen
  # This means that an extended VC area is expected
  elsif ($key eq 'frozen') {
    $frozen = 1;
  }

  # Start/End intended VC area
  elsif ($key eq 'intended') {
    if ($value eq 'start') {
      $vc = $vc_int;
    }
    elsif ($value ne 'end') {
      warn 'Unknown intension value ' . $value;
    };
  }

  # Start/End extended VC area
  elsif ($key eq 'extended') {
    if ($value eq 'start') {
      $vc = $vc_ext;
    }
    elsif ($value ne 'end') {
      warn 'Unknown extension value ' . $value;
    };
  }
};

close($fh);

# Stringify current (extended) virtual corpus
print $vc->to_string;