#!/usr/bin/env perl
use strict;
use warnings;
use JSON::PP;

# 2020-05-20
# Preliminary support for C2 def-files.


our @ARGV;

# Called without any argument there is nothing to convert,
# so the usage is printed and the script ends successfully.
if (@ARGV == 0) {
  my $usage = <<'HELP';
Convert a line-separated list of corpus sigles, doc sigles or
text sigles into a virtual corpus query.

 $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
 $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz

HELP
  print $usage;
  exit 0;
}


# Abbreviate a line for use in diagnostic messages.
#
# Takes a single string and returns it unchanged if it is at most
# 20 characters long. Longer strings are cut down to their first
# 17 characters followed by '...', so the result never exceeds
# 20 characters.
sub shorten ($) {
  my $line = shift;

  # A string of exactly 20 characters already fits - replacing its
  # tail with an ellipsis would hide three characters without
  # making the result any shorter.
  return $line if length($line) <= 20;

  return substr($line, 0, 17) . '...';
}


# Select the input: '-' reads the list from STDIN, every other
# argument is treated as the path of a file to read.
my $fh;
if ($ARGV[0] eq '-') {
  $fh = *STDIN;
}

# The three-argument form of open keeps characters like '>' or '|'
# in the file name from being interpreted as an open mode.
elsif (!open($fh, '<', $ARGV[0])) {

  # Report the reason given by the system and signal the failure
  # with a non-zero exit status, so callers can detect it.
  warn "$ARGV[0] can't be opened: $!";
  exit(1);
};


# Sigles collected from the input, grouped by the level they address
my %data = (
  corpus => [],
  doc    => [],
  text   => []
);

# Iterate over the whole list
while (defined(my $line = readline($fh))) {
  chomp $line;

  # Drop the carriage return left over from CRLF line endings,
  # as the anchored sigle patterns below would never match otherwise
  $line =~ s/\r\z//;

  # Skip empty lines - matching the pattern instead of testing for
  # truth keeps a line consisting of "0" from being dropped silently
  next if $line =~ /^\s*$/;

  my ($key, $value);

  # Line-Type: <e>c</e>
  # The value is matched non-greedily, so whitespace in front of
  # the closing tag does not become part of the value
  if ($line =~ /^\s*<([^>]+)>\s*([^<]*?)\s*<\/\1>\s*$/) {
    $key = $1;
    $value = $2;
  }

  # Line-Type: <e>c
  # Again non-greedy, so trailing whitespace is not part of the value
  elsif ($line =~ /^\s*<([^>]+)>\s*([^<]+?)\s*$/) {
    $key = $1;
    $value = $2;
  }

  # Get text sigles (three parts separated by slashes)
  elsif ($line =~ m!^(?:[^\/\s]+\/){2}[^\/\s]+$!) {
    $key = 'text';
    $value = $line;
  }

  # Get doc sigles (two parts separated by a slash) - everything
  # following the sigle after whitespace is ignored
  elsif ($line =~ m!^([^\/\s]+\/[^\/\s]+?)(?:\s.+?)?$!) {
    $key = 'doc';
    $value = $1;
  }

  # Get corpus sigles (neither slashes nor whitespace)
  elsif ($line !~ m!(?:\/|\s)!) {
    $key = 'corpus';
    $value = $line;
  }

  # Not known
  else {
    warn shorten($line) . q! isn't a valid sigle!;
    next;
  };

  # Only the three known sigle levels are collected,
  # tagged lines with any other key are ignored
  push @{$data{$key}}, $value if exists $data{$key};
};

# Create collection object

# The whole input is consumed at this point
close($fh);

# Encoder for the sigle values - allow_nonref is required by
# older versions of JSON::PP to serialize plain strings
my $encoder = JSON::PP->new->allow_nonref;

# Create one koral:doc operand per sigle level occurring in the list
my @operands;
foreach my $type (qw/corpus doc text/) {
  next unless @{$data{$type}};

  # Serialize each sigle as a JSON string, so values containing
  # quotation marks, backslashes or control characters can't
  # break the generated document
  my $values = join ',', map { $encoder->encode("$_") } @{$data{$type}};

  push @operands, '{' .
    '"@type":"koral:doc",' .
    '"key":"' . $type . 'Sigle",' .
    '"match":"match:eq",' .
    '"value":[' . $values . ']' .
    '}';
};

my $json = '{';
$json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
$json .= '"collection":{';

# Without any valid sigle the collection stays an empty object,
# otherwise all operands are joined in a disjunctive group
if (@operands) {
  $json .= '"@type":"koral:docGroup",';
  $json .= '"operation":"operation:or",';
  $json .= '"operands":[' . join(',', @operands) . ']';
};

$json .= '}}';

print $json;
151