#!/usr/bin/env perl
package KorAP::VirtualCorpus::Group;
use strict;
use warnings;

# Minimal builder for a KoralQuery virtual corpus definition.
#
# A group holds one operation name and, per field name, the list of
# values added for that field. to_string() serializes the group as a
# JSON string.

# Short escape sequences for characters that must not appear
# unescaped inside a JSON string.
my %JSON_ESCAPE = (
  '"'  => '\\"',
  '\\' => '\\\\',
  "\b" => '\\b',
  "\f" => '\\f',
  "\n" => '\\n',
  "\r" => '\\r',
  "\t" => '\\t',
);


# Serialize a scalar as a double-quoted JSON string.
# Quotes, backslashes and control characters are escaped, all other
# characters are passed through unchanged. An undefined value is
# serialized as the empty string.
sub _quote {
  my $str = shift // '';
  $str =~ s{(["\\\x00-\x1f])}{ $JSON_ESCAPE{$1} // sprintf('\\u%04x', ord $1) }ge;
  return '"' . $str . '"';
}


# Construct a new VC group.
#
# Parameters:
#   $op - operation name; it is emitted as "operation:$op"
#         by to_string().
#
# Returns the new, empty group object.
sub new {
  my ($class, $op) = @_;
  return bless {
    op     => $op,
    fields => {}
  }, $class;
}


# Add field information to group.
#
# Parameters:
#   $field - name of the field, used as "key" in the serialization
#   $value - a single value to append to the list of that field
#
# Returns the result of the underlying push (the number of values
# now stored for the field).
sub add_field {
  my ($self, $field, $value) = @_;
  return push @{ $self->{fields}->{$field} }, $value;
}


# Stringify.
#
# Returns the group as a JSON string. Without any field the
# "collection" object is empty. Otherwise it is a "koral:docGroup"
# with one "koral:doc" operand per field (in sorted field order),
# each carrying all values of the field as a list.
sub to_string {
  my $self = shift;

  ## Create collection object
  my $json = '{'
    . '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",'
    . '"collection":{';

  my $fields = $self->{fields};

  # Nothing was added - return an empty collection
  return $json . '}}' unless keys %$fields;

  my @operands;
  foreach my $field (sort keys %$fields) {
    my $values = $fields->{$field};

    # Fields without any values do not create an operand
    next unless @$values;

    push @operands, '{'
      . '"@type":"koral:doc",'
      . '"key":' . _quote($field) . ','
      . '"match":"match:eq",'
      . '"value":[' . join(',', map { _quote($_) } @$values) . ']'
      . '}';
  }

  # Joining the operands (instead of appending commas and chopping the
  # last one) keeps the output well-formed even if no operand was created.
  $json .= '"@type":"koral:docGroup",'
    . '"operation":' . _quote('operation:' . $self->{op}) . ','
    . '"operands":[' . join(',', @operands) . ']';

  return $json . '}}';
}


package main;
use strict;
use warnings;

# 2020-05-20
# Preliminary support for C2 def-files.

our @ARGV;

# Without any argument, print a short usage description to
# standard output and leave successfully.
unless (@ARGV) {
  print <<'HELP';
Convert a line-separated list of corpus sigles, doc sigles or
text sigles into a virtual corpus query.

  $ perl list2vc.pl my_vc.txt | gzip -vc > my_vc.jsonld.gz
  $ cat my_vc.txt | perl list2vc.pl - | gzip -vc > my_vc.jsonld.gz

HELP
  exit 0;
};


# Shorten a line for use in diagnostic messages.
#
# Parameters:
#   $line - the string to shorten (undef is treated as empty string)
#
# Returns the string unchanged if it has at most 20 characters,
# otherwise its first 17 characters followed by '...', so the
# result never exceeds 20 characters.
sub _shorten {
  my $line = shift // '';

  # A string of exactly 20 characters already fits - truncating it
  # to 17 characters plus ellipsis would only lose information.
  return $line if length($line) <= 20;

  return substr($line, 0, 17) . '...';
}


Akron18e407a2020-05-11 14:57:19 +020095my $fh;
Akron1e6f4d42020-05-19 12:14:41 +020096if ($ARGV[0] eq '-') {
97 $fh = *STDIN;
98} elsif (!open($fh, '<' . $ARGV[0])) {
Akron18e407a2020-05-11 14:57:19 +020099 warn $ARGV[0] . " can't be opened";
Akron1e6f4d42020-05-19 12:14:41 +0200100 exit(0);
Akron18e407a2020-05-11 14:57:19 +0200101};
102
Akron1e6f4d42020-05-19 12:14:41 +0200103
# Initial VC group
my $vc = KorAP::VirtualCorpus::Group->new('or');


# Iterate over the whole list
LINE:
while (defined(my $line = readline($fh))) {
  chomp $line;

  # Skip empty and whitespace-only lines.
  # A plain truth test is not used here, as it would
  # also skip a line that consists of "0" only.
  next LINE if $line =~ /^\s*$/;

  my ($key, $value);

  # Line-Type: <e>c</e>
  # The value is matched non-greedily, so surrounding whitespace
  # is not part of the captured value.
  if ($line =~ m!^\s*<([^>]+)>\s*([^<]*?)\s*</\1>\s*$!) {
    $key = $1;
    $value = $2;
  }

  # Line-Type: <e>c
  elsif ($line =~ m!^\s*<([^>]+)>\s*([^<]+?)\s*$!) {
    $key = $1;
    $value = $2;
  }

  # Get text sigles
  elsif ($line =~ m!^(?:[^\/\s]+\/){2}[^\/\s]+$!) {
    $key = 'text';
    $value = $line;
  }

  # Get doc sigles (anything following after whitespace is ignored)
  elsif ($line =~ m!^([^\/\s]+\/[^\/\s]+?)(?:\s.+?)?$!) {
    $key = 'doc';
    $value = $1;
  }

  # Get corpus sigles
  elsif ($line !~ m!(?:\/|\s)!) {
    $key = 'corpus';
    $value = $line;
  }

  # Not known
  else {
    warn _shorten($line) . q! isn't a valid VC definition!;
    next LINE;
  };

  # Add text field
  if ($key eq 'text') {

    # Convert C2 sigle to KorAP form
    # (the first dot after the first slash becomes a slash)
    $value =~ s!^([^/]+?/[^\.]+?)\.(.+?)$!$1/$2!;
    $vc->add_field(textSigle => $value);
  }

  # Add doc field
  elsif ($key eq 'doc') {
    $vc->add_field(docSigle => $value);
  }

  # Add corpus field
  elsif ($key eq 'corpus') {
    $vc->add_field(corpusSigle => $value);
  };

  # Lines tagged with any other key are ignored
};

close($fh);

# Stringify virtual corpus
print $vc->to_string;