Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 1 | #!/usr/bin/env perl |
Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame^] | 2 | package KorAP::VirtualCorpus; |
| 3 | use strict; |
| 4 | use warnings; |
| 5 | |
# Accessor for the name of the virtual corpus.
# Called without an argument, the stored name is returned.
# Called with an argument, the name is stored and the object
# itself is returned.
sub name {
  my ($self, @args) = @_;

  # Getter
  return $self->{name} if !@args;

  # Setter
  $self->{name} = $args[0];
  return $self;
}
| 15 | |
| 16 | |
# Utility method: wrap a string in double quotes and put a backslash
# in front of every double quote and backslash it contains.
# The invocant is not used.
sub quote {
  my (undef, $text) = @_;
  $text =~ s/(["\\])/\\$1/g;
  return '"' . $text . '"';
}
| 24 | |
| 25 | |
# Utility method: produce a quoted string that can be embedded
# inside another double-quoted string. The surrounding quotes are
# escaped once, double quotes and backslashes in the content are
# escaped twice. The invocant is not used.
sub equote {
  my (undef, $text) = @_;

  # Escaping twice in a row puts three backslashes in front of each
  # double quote or backslash, so this is done in a single pass
  $text =~ s/(["\\])/\\\\\\$1/g;

  return join '', '\\"', $text, '\\"';
}
| 34 | |
| 35 | |
# Serialize the virtual corpus as a complete JSON document:
# the JSON-LD context followed by a "collection" object whose
# content is provided by the object's _to_fragment method.
sub to_string {
  my ($self) = @_;

  return join '',
    '{',
    '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",',
    '"collection":{',
    scalar($self->_to_fragment),
    '}}';
}
| 46 | |
| 47 | |
Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 48 | package KorAP::VirtualCorpus::Group; |
| 49 | use strict; |
| 50 | use warnings; |
Akron | 3587f36 | 2020-05-20 17:50:38 +0200 | [diff] [blame^] | 51 | use base 'KorAP::VirtualCorpus'; |
| 52 | |
Akron | 49c765f | 2020-05-20 16:41:22 +0200 | [diff] [blame] | 53 | |
# Constructor for a VC group.
# Expects the name of the operation that joins the operands of the
# group (e.g. 'or'). The map of fields starts out empty.
sub new {
  my ($class, $op) = @_;

  my %group = (
    op     => $op,
    fields => {},
  );

  return bless \%group, $class;
}
| 62 | |
| 63 | |
# Append a value to the list of values collected for a field.
# The list is created on first use. Returns the result of push,
# i.e. the number of values now stored for the field.
sub add_field {
  my ($self, $field, $value) = @_;

  my $values = $self->{fields}->{$field} //= [];
  return push @{$values}, $value;
}
| 70 | |
| 71 | |
# Stringify the group as the inner content of a JSON object, i.e.
# without surrounding braces - these are added by to_string.
#
# Returns an empty string if no field carries any value, so the
# caller ends up with an empty object. Otherwise returns the members
# of a koral:docGroup with one koral:doc operand per field (sorted
# by field name), each listing the values added for that field.
# If a name is set, it is added as a comment.
sub _to_fragment {
  my $self = shift;

  # Only fields that actually carry values become operands
  my @fields = grep { @{ $self->{fields}->{$_} } }
    sort keys %{ $self->{fields} };

  # Nothing to serialize. The closing braces are written by the
  # caller and must not be repeated here
  return '' unless @fields;

  my $json = '"@type":"koral:docGroup",';

  # The name is a quoted string embedded in the comment string,
  # therefore it is escaped by equote instead of quote
  my $name = $self->name;
  $json .= '"comment":"Name: ' . $self->equote($name) . '",' if $name;

  $json .= '"operation":"operation:' . $self->{op} . '",';

  # Keys and values are escaped, so that quotes or backslashes
  # can't break the JSON output
  my @operands = map {
    my $field = $_;
    my $values = join ',',
      map { $self->quote($_) } @{ $self->{fields}->{$field} };

    '{'
      . '"@type":"koral:doc",'
      . '"key":' . $self->quote($field) . ','
      . '"match":"match:eq",'
      . '"value":[' . $values . ']'
      . '}';
  } @fields;

  # Joining the operands avoids chopping off a trailing comma
  return $json . '"operands":[' . join(',', @operands) . ']';
}
| 105 | |
| 106 | |
| 107 | package main; |
Akron | 3f875be | 2020-05-11 14:57:19 +0200 | [diff] [blame] | 108 | use strict; |
| 109 | use warnings; |
| 110 | |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 111 | # 2020-05-20 |
| 112 | # Preliminary support for C2 def-files. |
| 113 | |
# Explicit declaration of the argument list of the script
our @ARGV;

# Without any argument there is nothing to convert:
# print the usage and leave successfully
if (!@ARGV) {
  print <<'HELP';
Convert a line-separated list of corpus sigles, doc sigles or
text sigles into a virtual corpus query.

$ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
$ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz

HELP
  exit 0;
}
| 127 | |
Akron | 340a9cb | 2020-05-20 12:55:22 +0200 | [diff] [blame] | 128 | |
# Abbreviate a line, e.g. for use in warnings: lines shorter than
# 20 characters are returned unchanged, all others are reduced to
# their first 17 characters followed by '...'.
sub _shorten ($) {
  my ($line) = @_;
  return length($line) < 20 ? $line : substr($line, 0, 17) . '...';
}
| 138 | |
| 139 | |
# Open the input: '-' reads from STDIN, everything else is taken
# as the name of the file to read.
my $fh;
if ($ARGV[0] eq '-') {
  $fh = *STDIN;
}

# The three-argument form of open guarantees that no character of
# the file name is interpreted as part of the open mode
elsif (!open($fh, '<', $ARGV[0])) {
  warn $ARGV[0] . " can't be opened: $!";

  # Signal the failure to the caller, so a pipeline doesn't
  # treat empty output as a successful conversion
  exit(1);
};
| 147 | |
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 148 | |
# Two virtual corpora are built side by side: an extensional
# and an intensional one, both as groups joined by 'or'
my ($vc_ext, $vc_int) = map { KorAP::VirtualCorpus::Group->new('or') } 1 .. 2;

# Flag for definitions marked as frozen
my $frozen = 0;

# Reference to the variable holding the initial VC group,
# which is the extensional one
my $vc = \$vc_ext;
Akron | 26b5970 | 2020-05-19 12:14:41 +0200 | [diff] [blame] | 157 | |
# Iterate over the whole list, line by line.
# Each line is classified as a tagged definition (<key>value</key>
# or <key>value), a text sigle, a doc sigle or a corpus sigle, and
# is then applied to the currently selected VC group.
LINE:
while (defined(my $line = readline($fh))) {
  chomp $line;

  # Skip empty and whitespace-only lines.
  # The check is done by pattern rather than by truth,
  # so a line consisting of "0" is not dropped
  next LINE if $line =~ /^\s*$/;

  my ($key, $value);

  # Line-Type: <e>c</e>
  # The value is matched non-greedily, so whitespace before
  # the closing tag is not part of the value
  if ($line =~ /^\s*<([^>]+)>\s*([^<]*?)\s*<\/\1>\s*$/) {
    $key = $1;
    $value = $2;
  }

  # Line-Type: <e>c
  # Trailing whitespace is not part of the value
  elsif ($line =~ /^\s*<([^>]+)>\s*([^<]+?)\s*$/) {
    $key = $1;
    $value = $2;
  }

  # Get text sigles
  elsif ($line =~ m!^(?:\w+\/){2}\w+$!) {
    $key = 'text';
    $value = $line;
  }

  # Get doc sigles (anything following the sigle
  # after whitespace is ignored)
  elsif ($line =~ m!^(\w+\/\w+?)(?:\s.+?)?$!) {
    $key = 'doc';
    $value = $1;
  }

  # Get corpus sigles
  elsif ($line !~ m!(?:\/|\s)!) {
    $key = 'corpus';
    $value = $line;
  }

  # Not known
  else {
    warn _shorten($line) . q! isn't a valid VC definition!;
    next LINE;
  };

  # Add text field
  if ($key eq 'text') {

    # Convert C2 sigle to KorAP form
    $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
    ${$vc}->add_field(textSigle => $value);
  }

  # Add doc field
  elsif ($key eq 'doc') {
    ${$vc}->add_field(docSigle => $value);
  }

  # Add corpus field
  elsif ($key eq 'corpus') {
    ${$vc}->add_field(corpusSigle => $value);
  }

  # Mark the vc as frozen
  # This means that an extended VC area is expected
  elsif ($key eq 'frozen') {
    $frozen = 1;
  }

  # Start/End intended VC area
  elsif ($key eq 'intended') {
    if ($value eq 'start') {

      # Re-point the reference to the intensional group.
      # Assigning through the reference instead would overwrite
      # the variable holding the extensional group
      $vc = \$vc_int;
    }
    elsif ($value ne 'end') {
      warn 'Unknown intension value ' . $value;
    };
  }

  # Start/End extended VC area
  elsif ($key eq 'extended') {
    if ($value eq 'start') {

      # Re-point the reference to the extensional group
      $vc = \$vc_ext;
    }
    elsif ($value ne 'end') {
      warn 'Unknown extension value ' . $value;
    };
  }

  # Set VC name
  elsif ($key eq 'name') {
    # Quoted description of the field (translated):
    # "Name of the virtual corpus that is displayed.
    # Is also referenced internally for building corpora,
    # e.g. for <and>, <add>, <sub>"

    # No global name defined yet - the first name is set for
    # both groups, later names are ignored
    unless (${$vc}->name) {
      $vc_ext->name($value);
      $vc_int->name($value);
    };
  }

  # Unknown
  else {
    # warn $key . ' is an unknown field';
  };
};
| 271 | |
# The input is completely consumed at this point
close $fh;

# Stringify the virtual corpus group the reference
# currently points to and write it to STDOUT
my $current_vc = ${$vc};
print $current_vc->to_string;