Add batch file processing module (KorAP::XML::Batch::File) with tests
Change-Id: If63eb7e835c283a4a09e807aa4e285d84755c214
diff --git a/Makefile.PL b/Makefile.PL
index 8e659d4..b4bdacb 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -21,6 +21,7 @@
'Array::IntSpan' => 2.003,
'List::MoreUtils' => 0.33,
'Parallel::ForkManager' => 1.17,
+ 'IO::Compress::Gzip' => 2.069,
'IO::Dir::Recursive' => 0.03,
'File::Temp' => 0,
'Directory::Iterator' => 0,
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
new file mode 100644
index 0000000..de34e4f
--- /dev/null
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -0,0 +1,87 @@
+package KorAP::XML::Batch::File;
+use KorAP::XML::Krill;
+use Mojo::Log;
+use IO::Compress::Gzip;
+use IO::File;
+use strict;
+use warnings;
+
+sub new {
+ my $class = shift;
+ my %param = @_;
+
+ bless {
+ cache => $param{cache} // undef,
+ meta_type => $param{meta_type} // 'I5',
+ overwrite => $param{overwrite},
+ foundry => $param{foundry} // 'Base',
+ layer => $param{layer} // 'Tokens',
+ anno => $param{anno} // [[]],
+ log => $param{log} // Mojo::Log->new,
+ primary => $param{primary},
+ pretty => $param{pretty},
+ gzip => $param{gzip} // 0
+ }, $class;
+};
+
+
+sub process {
+ my $self = shift;
+ my $input = shift;
+ my $output = shift;
+
+ # Create and parse new document
+ $input =~ s{([^/])$}{$1/};
+ my $doc = KorAP::XML::Krill->new(
+ path => $input,
+ meta_type => $self->{meta_type},
+ cache => $self->{cache}
+ );
+
+ # Parse document
+ unless ($doc->parse) {
+ $self->{log}->warn(($output // $input) . " can't be processed - no document data");
+ return;
+ };
+
+ # Get tokenization
+ my $tokens = KorAP::XML::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => $self->{foundry},
+ layer => $self->{layer},
+ name => 'tokens'
+ );
+
+ # Unable to process base tokenization
+ unless ($tokens->parse) {
+ $self->{log}->error(($output // $input) . " can't be processed - no base tokenization");
+ return;
+ };
+
+ foreach (@{$self->{anno}}) {
+ $tokens->add(@$_);
+ };
+
+ my $file;
+ my $print_text = ($self->{pretty} ? $tokens->to_pretty_json($self->{primary}) : $tokens->to_json($self->{primary}));
+ if ($output) {
+ if ($self->{gzip}) {
+ $file = IO::Compress::Gzip->new($output, Minimal => 1);
+ }
+ else {
+ $file = IO::File->new($output, "w");
+ };
+
+ $file->print($print_text);
+ $file->close;
+ }
+
+ else {
+ print $print_text . "\n";
+ };
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/XML/ProcessFile.pm b/lib/KorAP/XML/ProcessFile.pm
deleted file mode 100644
index b3adeab..0000000
--- a/lib/KorAP/XML/ProcessFile.pm
+++ /dev/null
@@ -1,137 +0,0 @@
-package KorAP::XML::ProcessFile;
-use KorAP::XML::Krill;
-use Log::Log4perl;
-use strict;
-use warnings;
-
-sub new {
- my $class = shift;
- my %param = @_;
-
- my @layers;
- push(@layers, ['Base', 'Sentences']);
- push(@layers, ['Base', 'Paragraphs']);
-
- # Connexor
- push(@layers, ['Connexor', 'Morpho']);
- push(@layers, ['Connexor', 'Syntax']);
- push(@layers, ['Connexor', 'Phrase']);
- push(@layers, ['Connexor', 'Sentences']);
-
- # CoreNLP
- push(@layers, ['CoreNLP', 'NamedEntities']);
- push(@layers, ['CoreNLP', 'Sentences']);
- push(@layers, ['CoreNLP', 'Morpho']);
- push(@layers, ['CoreNLP', 'Constituency']);
-
- # DeReKo
- push(@layers, ['DeReKo', 'Structure']);
-
- # Glemm
- push(@layers, ['Glemm', 'Morpho']);
-
- # Malt
- push(@layers, ['Malt', 'Dependency']);
-
- # MDParser
- push(@layers, ['MDParser', 'Dependency']);
-
- # Mate
- push(@layers, ['Mate', 'Morpho']);
- push(@layers, ['Mate', 'Dependency']);
-
- # OpenNLP
- push(@layers, ['OpenNLP', 'Morpho']);
- push(@layers, ['OpenNLP', 'Sentences']);
-
- # Schreibgebrauch
- push(@layers, ['Sgbr', 'Lemma']);
- push(@layers, ['Sgbr', 'Morpho']);
-
- # TreeTagger
- push(@layers, ['TreeTagger', 'Morpho']);
- push(@layers, ['TreeTagger', 'Sentences']);
-
- # XIP
- push(@layers, ['XIP', 'Morpho']);
- push(@layers, ['XIP', 'Constituency']);
- push(@layers, ['XIP', 'Sentences']);
- push(@layers, ['XIP', 'Dependency']);
-
- my @anno;
- my $skip = $param{skip};
-
- # Check for complete skipping
- if ($skip->{'#all'}) {
- foreach (@$param{anno}) {
- push @anno, [split('#', $_)];
- }
- }
-
- # Iterate over all layers
- else {
- # Add to index file - respect skipping
- foreach my $info (@layers) {
-
- # Skip if Foundry or Foundry#Layer should be skipped
- unless ($skip->{lc($info->[0])} || $skip->{lc($info->[0]) . '#' . lc($info->[1])}) {
- push @anno, $info;
- };
- };
- };
-
- bless {
- cache => $param{cache} // undef,
- meta => $param{meta} // 'I5',
- outpu => $param{output},
- overwrite => $param{overwrite},
- foundry => $param{foundry} // 'Base',
- layer => $param{layer} // 'Tokens',
- anno => \@anno,
- log => $param{log} // Log::Log4perl->get_logger('main')
- }, $class;
-};
-
-
-sub process {
- my $self = shift;
- my $input = shift;
- my $output = shift;
-
- # Create and parse new document
- $input =~ s{([^/])$}{$1/};
- my $doc = KorAP::XML::Krill->new(
- path => $input,
- meta_type => $self->{meta},
- cache => $self->{cache}
- );
-
- # Parse document
- unless ($doc->parse) {
- $log->warn($output . " can't be processed - no document data");
- return;
- };
-
- # Get tokenization
- my $tokens = KorAP::XML::Tokenizer->new(
- path => $doc->path,
- doc => $doc,
- foundry => $self->{foundry},
- layer => $self->{layer},
- name => 'tokens'
- );
-
- # Unable to process base tokenization
- unless ($tokens->parse) {
- $log->error($output . " can't be processed - no base tokenization");
- return;
- };
-
- foreach (@{$self->{anno}}) {
- $tokens->add(@$_);
- };
-
-# Go on here with my $file; my $print_text
-};
-
-1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 53f0765..af75538 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -14,6 +14,7 @@
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
+use KorAP::XML::Batch::File;
use Parallel::ForkManager;
# TODO: use Parallel::Loops
# TODO: make output files
@@ -367,7 +368,6 @@
# Iterate over all given sigles and extract
foreach (@sigle) {
print "$_ ";
-# print '' . ($archive->extract('./'. $_, $output) ? '' : 'not ');
print '' . ($archive->extract('./' . $_, $output) ? '' : 'not ');
print "extracted.\n";
};
diff --git a/t/batch_file.t b/t/batch_file.t
new file mode 100644
index 0000000..aa7993d
--- /dev/null
+++ b/t/batch_file.t
@@ -0,0 +1,41 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+use File::Temp qw/ :POSIX /;
+use Mojo::Util qw/slurp/;
+use Mojo::JSON qw/decode_json/;
+
+use_ok('KorAP::XML::Batch::File');
+
+ok(my $bf = KorAP::XML::Batch::File->new(
+ overwrite => 1,
+ foundry => 'OpenNLP',
+ layer => 'Tokens'
+), 'Construct new batch file object');
+
+# gzip => 1,
+
+my $path = catdir(dirname(__FILE__), 'annotation', 'corpus', 'doc', '0001');
+
+my $output = tmpnam();
+ok($bf->process($path => $output), 'Process file');
+
+ok(-f $output, 'File exists');
+
+ok(my $file = slurp $output, 'Slurp data');
+
+ok(my $json = decode_json $file, 'decode json');
+
+is($json->{textType}, 'Zeitung: Tageszeitung', 'text type');
+is($json->{title}, 'Beispiel Text', 'Title');
+is($json->{data}->{tokenSource}, 'opennlp#tokens', 'Token source');
+is($json->{data}->{foundries}, '', 'Foundries');
+like($json->{data}->{text}, qr/^Zum letzten kulturellen/, 'Text');
+is($json->{data}->{stream}->[0]->[0], '-:tokens$<i>18', 'Tokens');
+is($json->{data}->{stream}->[0]->[1], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Data');
+
+done_testing;
+__END__