Add batch file processing module (KorAP::XML::Batch::File) with tests
Change-Id: If63eb7e835c283a4a09e807aa4e285d84755c214
diff --git a/Makefile.PL b/Makefile.PL
index 8e659d4..b4bdacb 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -21,6 +21,7 @@
'Array::IntSpan' => 2.003,
'List::MoreUtils' => 0.33,
'Parallel::ForkManager' => 1.17,
+ 'IO::Compress::Gzip' => 2.069,
'IO::Dir::Recursive' => 0.03,
'File::Temp' => 0,
'Directory::Iterator' => 0,
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
new file mode 100644
index 0000000..de34e4f
--- /dev/null
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -0,0 +1,87 @@
+package KorAP::XML::Batch::File;
+use KorAP::XML::Krill;
+use Mojo::Log;
+use IO::Compress::Gzip;
+use IO::File;
+use strict;
+use warnings;
+
+sub new {
+ my $class = shift;
+ my %param = @_;
+
+ bless {
+ cache => $param{cache} // undef,
+ meta_type => $param{meta_type} // 'I5',
+ overwrite => $param{overwrite},
+ foundry => $param{foundry} // 'Base',
+ layer => $param{layer} // 'Tokens',
+ anno => $param{anno} // [[]],
+ log => $param{log} // Mojo::Log->new,
+ primary => $param{primary},
+ pretty => $param{pretty},
+ gzip => $param{gzip} // 0
+ }, $class;
+};
+
+
+sub process {
+ my $self = shift;
+ my $input = shift;
+ my $output = shift;
+
+ # Create and parse new document
+ $input =~ s{([^/])$}{$1/};
+ my $doc = KorAP::XML::Krill->new(
+ path => $input,
+ meta_type => $self->{meta_type},
+ cache => $self->{cache}
+ );
+
+ # Parse document
+ unless ($doc->parse) {
+ $self->{log}->warn(($output // $input) . " can't be processed - no document data");
+ return;
+ };
+
+ # Get tokenization
+ my $tokens = KorAP::XML::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => $self->{foundry},
+ layer => $self->{layer},
+ name => 'tokens'
+ );
+
+ # Unable to process base tokenization
+ unless ($tokens->parse) {
+ $self->{log}->error(($output // $input) . " can't be processed - no base tokenization");
+ return;
+ };
+
+ foreach (@{$self->{anno}}) {
+ $tokens->add(@$_);
+ };
+
+ my $file;
+ my $print_text = ($self->{pretty} ? $tokens->to_pretty_json($self->{primary}) : $tokens->to_json($self->{primary}));
+ if ($output) {
+ if ($self->{gzip}) {
+ $file = IO::Compress::Gzip->new($output, Minimal => 1);
+ }
+ else {
+ $file = IO::File->new($output, "w");
+ };
+
+ $file->print($print_text);
+ $file->close;
+ }
+
+ else {
+ print $print_text . "\n";
+ };
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/XML/ProcessFile.pm b/lib/KorAP/XML/ProcessFile.pm
deleted file mode 100644
index b3adeab..0000000
--- a/lib/KorAP/XML/ProcessFile.pm
+++ /dev/null
@@ -1,137 +0,0 @@
-package KorAP::XML::ProcessFile;
-use KorAP::XML::Krill;
-use Log::Log4perl;
-use strict;
-use warnings;
-
-sub new {
- my $class = shift;
- my %param = @_;
-
- my @layers;
- push(@layers, ['Base', 'Sentences']);
- push(@layers, ['Base', 'Paragraphs']);
-
- # Connexor
- push(@layers, ['Connexor', 'Morpho']);
- push(@layers, ['Connexor', 'Syntax']);
- push(@layers, ['Connexor', 'Phrase']);
- push(@layers, ['Connexor', 'Sentences']);
-
- # CoreNLP
- push(@layers, ['CoreNLP', 'NamedEntities']);
- push(@layers, ['CoreNLP', 'Sentences']);
- push(@layers, ['CoreNLP', 'Morpho']);
- push(@layers, ['CoreNLP', 'Constituency']);
-
- # DeReKo
- push(@layers, ['DeReKo', 'Structure']);
-
- # Glemm
- push(@layers, ['Glemm', 'Morpho']);
-
- # Malt
- push(@layers, ['Malt', 'Dependency']);
-
- # MDParser
- push(@layers, ['MDParser', 'Dependency']);
-
- # Mate
- push(@layers, ['Mate', 'Morpho']);
- push(@layers, ['Mate', 'Dependency']);
-
- # OpenNLP
- push(@layers, ['OpenNLP', 'Morpho']);
- push(@layers, ['OpenNLP', 'Sentences']);
-
- # Schreibgebrauch
- push(@layers, ['Sgbr', 'Lemma']);
- push(@layers, ['Sgbr', 'Morpho']);
-
- # TreeTagger
- push(@layers, ['TreeTagger', 'Morpho']);
- push(@layers, ['TreeTagger', 'Sentences']);
-
- # XIP
- push(@layers, ['XIP', 'Morpho']);
- push(@layers, ['XIP', 'Constituency']);
- push(@layers, ['XIP', 'Sentences']);
- push(@layers, ['XIP', 'Dependency']);
-
- my @anno;
- my $skip = $param{skip};
-
- # Check for complete skipping
- if ($skip->{'#all'}) {
- foreach (@$param{anno}) {
- push @anno, [split('#', $_)];
- }
- }
-
- # Iterate over all layers
- else {
- # Add to index file - respect skipping
- foreach my $info (@layers) {
-
- # Skip if Foundry or Foundry#Layer should be skipped
- unless ($skip->{lc($info->[0])} || $skip->{lc($info->[0]) . '#' . lc($info->[1])}) {
- push @anno, $info;
- };
- };
- };
-
- bless {
- cache => $param{cache} // undef,
- meta => $param{meta} // 'I5',
- outpu => $param{output},
- overwrite => $param{overwrite},
- foundry => $param{foundry} // 'Base',
- layer => $param{layer} // 'Tokens',
- anno => \@anno,
- log => $param{log} // Log::Log4perl->get_logger('main')
- }, $class;
-};
-
-
-sub process {
- my $self = shift;
- my $input = shift;
- my $output = shift;
-
- # Create and parse new document
- $input =~ s{([^/])$}{$1/};
- my $doc = KorAP::XML::Krill->new(
- path => $input,
- meta_type => $self->{meta},
- cache => $self->{cache}
- );
-
- # Parse document
- unless ($doc->parse) {
- $log->warn($output . " can't be processed - no document data");
- return;
- };
-
- # Get tokenization
- my $tokens = KorAP::XML::Tokenizer->new(
- path => $doc->path,
- doc => $doc,
- foundry => $self->{foundry},
- layer => $self->{layer},
- name => 'tokens'
- );
-
- # Unable to process base tokenization
- unless ($tokens->parse) {
- $log->error($output . " can't be processed - no base tokenization");
- return;
- };
-
- foreach (@{$self->{anno}}) {
- $tokens->add(@$_);
- };
-
-# Go on here with my $file; my $print_text
-};
-
-1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 53f0765..af75538 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -14,6 +14,7 @@
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
+use KorAP::XML::Batch::File;
use Parallel::ForkManager;
# TODO: use Parallel::Loops
# TODO: make output files
@@ -367,7 +368,6 @@
# Iterate over all given sigles and extract
foreach (@sigle) {
print "$_ ";
-# print '' . ($archive->extract('./'. $_, $output) ? '' : 'not ');
print '' . ($archive->extract('./' . $_, $output) ? '' : 'not ');
print "extracted.\n";
};
diff --git a/t/batch_file.t b/t/batch_file.t
new file mode 100644
index 0000000..aa7993d
--- /dev/null
+++ b/t/batch_file.t
@@ -0,0 +1,41 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+use File::Temp qw/ :POSIX /;
+use Mojo::Util qw/slurp/;
+use Mojo::JSON qw/decode_json/;
+
+use_ok('KorAP::XML::Batch::File');
+
+ok(my $bf = KorAP::XML::Batch::File->new(
+ overwrite => 1,
+ foundry => 'OpenNLP',
+ layer => 'Tokens'
+), 'Construct new batch file object');
+
+# gzip => 1,
+
+my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
+
+my $output = tmpnam();
+ok($bf->process($path => $output), 'Process file');
+
+ok(-f $output, 'File exists');
+
+ok(my $file = slurp $output, 'Slurp data');
+
+ok(my $json = decode_json $file, 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, '', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
+
+done_testing;
+__END__