Blame - tools/list2vc.pl - KorAP/Kustvakt

blob: cc027463fce3b870c0aa8e6bf344f597e5b914cc [file] [log] [blame]

Akron	18e407a	2020-05-11 14:57:19 +0200	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
				4
Akron	1e6f4d4	2020-05-19 12:14:41 +0200	[diff] [blame^]	5	our @ARGV;
				6
# Abbreviate a string for use in diagnostics.
#
# Input shorter than 20 characters is handed back unchanged; anything
# longer is reduced to its first 17 characters followed by '...'.
sub shorten ($) {
  my ($text) = @_;

  return length($text) >= 20
    ? substr($text, 0, 17) . '...'
    : $text;
}
				16
				17
				18	unless (@ARGV) {
				19	print <<'HELP';
				20	Convert a line-separated list of corpus sigles, doc sigles or
				21	text sigles into a virtual corpus query.
				22
				23	$ perl list2vc.pl my_vc.txt \| gzip -vc > my_vc.jsonld.gz
Akron	1e6f4d4	2020-05-19 12:14:41 +0200	[diff] [blame^]	24	$ cat my_vc.txt \| perl list2vc.pl - \| gzip -vc > my_vc.jsonld.gz
Akron	18e407a	2020-05-11 14:57:19 +0200	[diff] [blame]	25
				26	HELP
				27	exit 0;
				28	};
				29
				30	my $fh;
# Bind $fh to the list that should be read: '-' selects standard
# input, every other argument is taken as the path of a file.
if ($ARGV[0] eq '-') {
  $fh = *STDIN;
}

# The three-argument form of open keeps characters like '>' or '|'
# in the file name from being interpreted as an open mode or a command.
elsif (!open($fh, '<', $ARGV[0])) {
  warn $ARGV[0] . " can't be opened: $!";

  # An unreadable list is a failure - signal it to the calling shell,
  # so pipelines and scripts can notice that no query was produced.
  exit(1);
}
				37
# Sigles found in the list, grouped by their level
my %data = (
  corpus => [],
  doc    => [],
  text   => []
);

# Iterate over the whole list, one sigle per line
while (defined(my $line = readline($fh))) {
  chomp $line;

  # Lists with Windows line endings keep a carriage return after
  # chomp, which would otherwise become part of the sigle
  $line =~ s/\r\z//;

  # Blank lines carry no sigle - as they contain no slash, they
  # would otherwise be collected as empty corpus sigles
  next unless $line =~ /\S/;

  # Get text sigles (three parts separated by slashes)
  if ($line =~ m{\A(?:[^/]+/){2}[^/]+\z}) {
    push @{$data{text}}, $line;
  }

  # Get doc sigles (two parts separated by a slash)
  elsif ($line =~ m{\A[^/]+/[^/]+\z}) {
    push @{$data{doc}}, $line;
  }

  # Get corpus sigles (no slash at all)
  elsif ($line !~ m{/}) {
    push @{$data{corpus}}, $line;
  }

  # Everything else has empty parts or too many parts
  else {
    warn shorten($line) . q! isn't a valid sigle!;
  }
}
				69
				70	# Create collection object
				71	my $json = '{';
				72	$json .= '"@context":"http://korap.ids-mannheim.de/ns/KoralQuery/v0.3/context.jsonld",';
				73	$json .= '"collection":{';
				74
				75	unless (@{$data{corpus}} \|\| @{$data{doc}} \|\| @{$data{text}}) {
				76	$json .= '}}';
				77	close($fh);
				78	print $json;
				79	exit(0);
				80	};
				81
				82	$json .= '"@type":"koral:docGroup",';
				83	$json .= '"operation":"operation:or",';
				84	$json .= '"operands":[';
				85
				86	foreach my $type (qw/corpus doc text/) {
				87	unless (@{$data{$type}}) {
				88	next;
				89	};
				90	$json .= '{';
				91	$json .= '"@type":"koral:doc",';
				92	$json .= '"key":"' . $type . 'Sigle",';
				93	$json .= '"match":"match:eq",';
				94	$json .= '"value":[';
				95	$json .= join ',', map { '"' . $_ . '"' } @{$data{$type}};
				96	$json .= ']';
				97	$json .= '},';
				98	};
				99
				100	# Remove the last comma
				101	chop $json;
				102
				103	$json .= ']}}';
				104
				105	close($fh);
				106
				107	print $json;
				108