blob: c3fcd30d44d93ebaa7878f1b96410ff45fe803a2 [file] [log] [blame]
Akron18e407a2020-05-11 14:57:19 +02001#!/usr/bin/env perl
2use strict;
3use warnings;
4
# 2020-05-20
# Preliminary support for C2 def-files.
7
8
our @ARGV;

# Called without any argument there is nothing to convert:
# show a short usage note and leave successfully.
if (!@ARGV) {
  my $usage = <<'HELP';
Convert a line-separated list of corpus sigles, doc sigles or
text sigles into a virtual corpus query.

 $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
 $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz

HELP

  print $usage;
  exit 0;
};
22
Akron1839cb12020-05-20 12:55:22 +020023
# Shorten a line for use in diagnostic messages.
#
# Takes a single string. Returns the string unchanged if it is at most
# 20 characters long, otherwise its first 17 characters followed by
# '...' (20 characters in total).
#
# The ($) prototype is kept so existing call sites parse as before.
sub shorten ($) {
  my $line = shift;

  # A line of exactly 20 characters already fits - truncating it would
  # give an equally long but less informative string.
  return $line if length($line) <= 20;

  return substr($line, 0, 17) . '...';
};
33
34
# Input handle for the list of sigles: STDIN if the first argument
# is '-', otherwise the file named by the first argument.
my $fh;
if ($ARGV[0] eq '-') {
  $fh = *STDIN;
}

# Use the three-argument form of open, so characters in the file name
# (like a leading '>' or a trailing '|') are never taken as a mode.
elsif (!open($fh, '<', $ARGV[0])) {

  # Report the reason and signal the failure with a non-zero status
  warn $ARGV[0] . " can't be opened: $!";
  exit(1);
};
42
Akron1e6f4d42020-05-19 12:14:41 +020043
# Sigles collected from the list, grouped by their level
my %data = (
  corpus => [],
  doc    => [],
  text   => []
);

# Iterate over the whole list
while (defined(my $line = readline($fh))) {

  # Remove the line ending together with any trailing whitespace, so
  # lists with CRLF line endings or stray blanks are accepted as well
  $line =~ s/\s+\z//;

  # Skip empty lines. Compare against the empty string instead of
  # testing for truth - the latter would also drop a sigle like "0".
  next if $line eq '';

  my ($key, $value);

  # Line-Type: <e>c</e>
  # The value is matched non-greedily, so whitespace in front of the
  # closing tag doesn't become part of the value.
  if ($line =~ /^\s*<([^>]+)>\s*([^<]*?)\s*<\/\1>\s*$/) {
    $key   = $1;
    $value = $2;
  }

  # Line-Type: <e>c
  elsif ($line =~ /^\s*<([^>]+)>\s*([^<]+)\s*$/) {
    $key   = $1;
    $value = $2;
  }

  # Get text sigles
  elsif ($line =~ m!^(?:[^\/\s]+\/){2}[^\/\s]+$!) {
    $key   = 'text';
    $value = $line;
  }

  # Get doc sigles (anything following the first whitespace is ignored)
  elsif ($line =~ m!^([^\/\s]+\/[^\/\s]+?)(?:\s.+?)?$!) {
    $key   = 'doc';
    $value = $1;
  }

  # Get corpus sigles
  elsif ($line !~ m!(?:\/|\s)!) {
    $key   = 'corpus';
    $value = $line;
  }

  # Not known
  else {
    warn shorten($line) . q! isn't a valid sigle!;
    next;
  };

  # Convert C2 sigle to KorAP form, e.g. A/B.C becomes A/B/C
  if ($key eq 'text') {
    $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1\/$2!;
  };

  # Only corpus, doc and text values are collected -
  # lines tagged with any other element are ignored
  push @{$data{$key}}, $value if exists $data{$key};
};
115
# The list is read completely, the handle is no longer needed
close($fh);

# Quote a string as a JSON string literal.
# The sigles are taken verbatim from the input, so quotes, backslashes
# and control characters have to be escaped to keep the output valid.
my $json_string = sub {
  my $str = shift;
  $str =~ s/(["\\])/\\$1/g;
  $str =~ s/([\x00-\x1f])/sprintf('\\u%04x', ord($1))/ge;
  return '"' . $str . '"';
};

# Create one koral:doc operand per sigle level that has values
my @operands;
foreach my $type (qw/corpus doc text/) {
  next unless @{$data{$type}};

  push @operands,
    '{' .
    '"@type":"koral:doc",' .
    '"key":"' . $type . 'Sigle",' .
    '"match":"match:eq",' .
    '"value":[' . join(',', map { $json_string->($_) } @{$data{$type}}) . ']' .
    '}';
};

# Create collection object
my $json = '{';
$json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
$json .= '"collection":{';

# Without any sigles the collection stays empty,
# otherwise all operands are joined by a disjunction
if (@operands) {
  $json .= '"@type":"koral:docGroup",';
  $json .= '"operation":"operation:or",';
  $json .= '"operands":[' . join(',', @operands) . ']';
};

$json .= '}}';

print $json;
154