Add deflist generation
Change-Id: I28282a23e0fee65191db6f45d87a02664a19acdf
diff --git a/Makefile.PL b/Makefile.PL
index 8fa8fe0..7445d58 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -14,9 +14,11 @@
'Test::More' => 0,
'Test::Output' => 0,
'File::Basename' => 0,
+ 'File::Temp' => 0,
'Data::Dumper' => 0,
'File::Spec::Functions' => 0,
'Mojolicious' => 0,
+ 'IO::Uncompress::Bunzip2' => 0,
},
MIN_PERL_VERSION => '5.016',
EXE_FILES => ['script/cosmasvc2koralquery'],
diff --git a/lib/KorAP/Def.pm b/lib/KorAP/Def.pm
index 5a4591d..b71dce8 100644
--- a/lib/KorAP/Def.pm
+++ b/lib/KorAP/Def.pm
@@ -1,5 +1,7 @@
package KorAP::Def;
use KorAP::VirtualCorpus::Group;
+use IO::Uncompress::Bunzip2 qw($Bunzip2Error);
+use IO::File;
use strict;
use warnings;
@@ -12,17 +14,37 @@
if (ref $file && ref $file eq 'GLOB') {
$self->{file} = '';
$self->{fh} = $file;
+ return bless $self, $class;
}
- elsif ($file =~ /\.def\.bz2$/) {
+ if (!-f $file) {
+ if (-f $file . '.def') {
+ $file .= '.def';
+ }
+
+ elsif (-f $file . '.txt') {
+ $file .= '.txt';
+ }
+
+ elsif (-f $file . '.def.bz2') {
+ $file .= '.def.bz2';
+ }
+
+ else {
+ warn 'Unable to load def file from ' . $file;
+ return;
+ };
+ };
+
+ if ($file =~ /\.def\.bz2$/) {
$self->{file} = $file;
- # $self->{fh} = $file;
- # TODO
+ $self->{fh} = IO::Uncompress::Bunzip2->new($file)
+ or die "bunzip2 failed: $Bunzip2Error\n";;
}
elsif (-f $file) {
$self->{file} = $file;
-
+
# Open def file
if (!open($self->{fh}, '<' . $file)) {
warn $ARGV[0] . " can't be opened";
@@ -51,9 +73,11 @@
my $frozen = 0;
+ my $fh = $self->{fh};
+
# Iterate over the whole list
- while (!eof($self->{fh})) {
- my $line = readline($self->{fh});
+ while (!eof($fh)) {
+ my $line = <$fh>;
chomp $line;
# Skip empty lines
@@ -288,7 +312,7 @@
};
$self->{vc} = $vc;
- close($self->{fh});
+ close($fh);
};
sub to_string {
diff --git a/lib/KorAP/DefList.pm b/lib/KorAP/DefList.pm
index 09fd1cb..076b97d 100644
--- a/lib/KorAP/DefList.pm
+++ b/lib/KorAP/DefList.pm
@@ -2,34 +2,32 @@
use KorAP::Def;
use KorAP::VirtualCorpus::Group;
use KorAP::VirtualCorpus::Doc;
+use File::Spec::Functions qw!catfile!;
use strict;
use warnings;
# Constructor
sub new {
my $class = shift;
- my $file = shift;
+ my %self = @_;
- warn 'xx';
+ $self{output} //= '.';
+
+ my $file = $self{file};
if (ref $file && ref $file eq 'GLOB') {
- return bless {
- file => '',
- fh => $file
- }, $class;
- };
-
- my $self = {
- file => $file
+ $self{file} = '';
+ $self{fh} = $file;
+ return bless \%self, $class;
};
# Open def file
- if (!open($self->{fh}, '<' . $file)) {
- warn $ARGV[0] . " can't be opened";
+ if (!open($self{fh}, '<' . $file)) {
+ warn $file . " can't be opened";
exit(0);
};
- return bless $self, $class;
+ return bless \%self, $class;
};
@@ -68,7 +66,25 @@
sub copy_vc {
my $self = shift;
my $file = shift;
- print "Convert ", $file, " from def file\n";
+
+ print "Convert ", $file, " from def-file\n";
+
+ # Open file from copy directory
+ my $def = KorAP::Def->new(catfile($self->{copy}, $file));
+
+ # Parse
+ return unless $def;
+ $def->parse or return;
+
+ # Output
+ my $out = catfile($self->{output}, $file . '.json');
+ if (open(my $koral, '>' . $out)) {
+ print $koral $def->to_string;
+ close($koral);
+ return;
+ };
+
+ warn 'Unable to write file ' . $out;
};
sub regex_to_vc {
@@ -76,7 +92,18 @@
my ($name, $desc, $regex) = @_;
print "Convert ", $name, " from regex\n";
- return from_regex($regex, $name, $desc);
+
+ my $vc = from_regex($regex, $name, $desc);
+
+ # Output
+ my $out = catfile($self->{output}, $name . '.json');
+ if (open(my $koral, '>' . $out)) {
+ print $koral $vc->to_string;
+ close($koral);
+ return;
+ };
+
+ warn 'Unable to write file ' . $out;
};
sub from_regex {
diff --git a/script/cosmasvc2koralquery b/script/cosmasvc2koralquery
index a95605f..2e20568 100755
--- a/script/cosmasvc2koralquery
+++ b/script/cosmasvc2koralquery
@@ -19,10 +19,12 @@
my $cmd = shift @ARGV;
-my $input;
+my ($input, $output, $copysrc);
GetOptions (
- "input|i=s" => \$input
+ "input|i=s" => \$input,
+ "output|o=s" => \$output,
+ "copy-src|c=s" => \$copysrc,
)
or die("Error in command line arguments\n");
@@ -34,7 +36,18 @@
$ perl cosmasvc2koralquery def my_vc.txt | gzip -vc > my_vc.jsonld.gz
$ cat my_vc.txt | perl cosmasvc2koralquery def - | gzip -vc > my_vc.jsonld.gz
-Commands: def, list
+Command: def
+
+ Convert a def file or a list of sigles to a KoralQuery VC.
+
+ Takes the list or def from STDIN and exports to STDOUT.
+
+Command: list
+
+ Convert a list with copy or regex instructions to KoralQuery VCs.
+
+ --output: The output directory
+ --copy-src: The directory for def files to copy
HELP
exit 1;
@@ -43,7 +56,11 @@
# Process a list
if ($cmd eq 'list') {
- KorAP::DefList->new($input || $ARGV[0])->parse;
+ KorAP::DefList->new(
+ file => ($input || $ARGV[0]),
+ copy => ($copysrc || '.'),
+ output => ($output || '.')
+ )->parse;
exit(0);
};
diff --git a/t/data/list-example.ls b/t/data/list-example.ls
index 6878eef..4426046 100644
--- a/t/data/list-example.ls
+++ b/t/data/list-example.ls
@@ -14,7 +14,7 @@
thm-lit Belletristik/Trivialliteratur: Thomas-Mann-Korpus THM/(AMB|AMD|AME|AMF|AMH|AMJ|AMK|AML|AMN|AMZ)
wxx11 Wikipedia Artikel und Diskussionen W[PD]D11/
zca Zeit Campus (Feb.-Apr.;Jun.;Aug.;Okt.-Dez.; Dez. 2013 n.v.) ZCA[0-9][0-9]/
-corp-w-gesamt.2023-i.16.03.23 @COPY@
-corp-a @COPY@
+list2 @COPY@
+list5 @COPY@
misc-lit Belletristik/Trivialliteratur (öffentlich) (GOE/(AGD|AGM|AGN|AGV|AGW))|(MK1/(LBC|LBT|LFH|LGB|LJA|LMB|LSO|MHE|TJM|TPM))|(MK2/TRI)
diff --git a/t/data/list5.def.bz2 b/t/data/list5.def.bz2
new file mode 100644
index 0000000..d5215b8
--- /dev/null
+++ b/t/data/list5.def.bz2
Binary files differ
diff --git a/t/list2vc-deflist.t b/t/list2vc-deflist.t
index 41c2176..89577fb 100644
--- a/t/list2vc-deflist.t
+++ b/t/list2vc-deflist.t
@@ -5,24 +5,60 @@
use File::Basename;
use File::Spec::Functions;
use Data::Dumper;
+use File::Temp qw/tempdir/;
use Test::Output;
use Mojo::JSON 'decode_json';
my $script = catfile(dirname(__FILE__), '..', 'script', 'cosmasvc2koralquery');
+my $copysrc = catdir(dirname(__FILE__), 'data');
my $list1 = catfile(dirname(__FILE__), 'data', 'list-example.ls');
+my $output = tempdir( CLEANUP => 1 );
+
+my @call = ($script, 'list', $list1, '--copy-src', $copysrc, '--output', $output);
# Check STDOUT
-stdout_like(
+stderr_like(
sub {
- system($script, 'list', $list1);
+ system(@call);
},
- qr!Convert!,
+ qr!redabs is not yet supported!,
"check stdout"
);
# Check JSON
-# my $protocol = join('', `$script list $list1`);
+my $protocol = join('', `@call`);
+
+like($protocol, qr!bih from regex!);
+ok(-f catfile($output, 'bih.json'));
+like($protocol, qr!bio from regex!);
+ok(-f catfile($output, 'bio.json'));
+like($protocol, qr!bio-pub from regex!);
+ok(-f catfile($output, 'bio-pub.json'));
+like($protocol, qr!l from regex!);
+ok(-f catfile($output, 'l.json'));
+like($protocol, qr!dpa from regex!);
+ok(-f catfile($output, 'dpa.json'));
+like($protocol, qr!fsp from regex!);
+ok(-f catfile($output, 'fsp.json'));
+like($protocol, qr!fsp-pub from regex!);
+ok(-f catfile($output, 'fsp-pub.json'));
+like($protocol, qr!kjl from regex!);
+ok(-f catfile($output, 'kjl.json'));
+like($protocol, qr!thm-lit from regex!);
+ok(-f catfile($output, 'thm-lit.json'));
+like($protocol, qr!wxx11 from regex!);
+ok(-f catfile($output, 'wxx11.json'));
+like($protocol, qr!zca from regex!);
+ok(-f catfile($output, 'zca.json'));
+like($protocol, qr!list2 from def-file!);
+ok(-f catfile($output, 'list2.json'));
+like($protocol, qr!list5 from def-file!);
+ok(-f catfile($output, 'list5.json'));
+like($protocol, qr!misc-lit from regex!);
+ok(-f catfile($output, 'misc-lit.json'));
done_testing;
__END__
+
+