Added archive support to korapxml2krill_dir

Change-Id: Ib62934a08628db3667891a1562acbf0149c17482
diff --git a/Changes b/Changes
index 7350491..8a28f26 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.08 2016-02-14
+        - Added support for archive streaming.
+	- Improved scripts.
+
 0.07 2016-02-13
         - Improved support for Schreibgebrauch meta data
 	  (IDS flavour).
diff --git a/MANIFEST b/MANIFEST
index a63f2e9..fc4299e 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,5 +1,6 @@
 lib/KorAP/XML/Krill.pm
 lib/KorAP/XML/Log.pm
+lib/KorAP/XML/Archive.pm
 lib/KorAP/XML/Tokenizer.pm
 lib/KorAP/XML/Tokenizer/Match.pm
 lib/KorAP/XML/Tokenizer/Range.pm
@@ -40,6 +41,7 @@
 lib/KorAP/XML/Index/XIP/Dependency.pm
 lib/KorAP/XML/Index/XIP/Morpho.pm
 lib/KorAP/XML/Index/XIP/Sentences.pm
+t/archive.t
 t/meta.t
 t/primary.t
 t/range.t
@@ -84,6 +86,7 @@
 t/sgbr/meta_ids.t
 t/sgbr/pos.t
 t/sgbr/token.t
+t/corpus/archive.zip
 t/corpus/BZK/header.xml
 t/corpus/GOE/header.xml
 t/corpus/VDI/header.xml
diff --git a/Makefile.PL b/Makefile.PL
index 1b99119..7bfcb5e 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -21,6 +21,7 @@
     'Array::IntSpan' => 2.003,
     'List::MoreUtils' => 0.33,
     'IO::Dir::Recursive' => 0.03,
+    'File::Temp'     => 0,
     'Directory::Iterator' => 0,
     'Benchmark'      => 0,
     'Carp'           => 0,
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
new file mode 100644
index 0000000..d8dbc27
--- /dev/null
+++ b/lib/KorAP/XML/Archive.pm
@@ -0,0 +1,148 @@
+package KorAP::XML::Archive;
+use Carp qw/carp/;
+use File::Spec::Functions qw(rel2abs);
+use strict;
+use warnings;
+
+# Convert new archive helper
+sub new {
+  my $class = shift;
+  my $file = shift or return;
+  bless \$file, $class;
+};
+
+
+# Check if unzip is installed
+sub test_unzip {
+  return 1 if grep { -x "$_/unzip"} split /:/, $ENV{PATH};
+  return;
+};
+
+# Check the compressed archive
+sub test {
+  my $self = shift;
+  my $file = $$self;
+  my $out = `unzip -t $file`;
+  if ($out =~ /no errors/i) {
+    return 1;
+  };
+  return 0;
+};
+
+
+# List all text paths contained in the file
+sub list_texts {
+  my $self = shift;
+  my $file = $$self;
+  my %texts;
+  foreach (`unzip -l $file *.xml`) {
+    if ($_ =~ m![\t\s]
+		((?:\./)?
+		  [^\t\s/\.]+?/ # Corpus
+		  [^\t\s/]+?/   # Document
+		  [^\t\s/]+?    # Text
+		)/(?:[^/]+?)\.xml$!x) {
+      $texts{$1} = 1;
+    };
+  };
+
+  return sort {$a cmp $b} keys %texts;
+};
+
+
+# Split a text path to prefix, corpus, document, text
+sub split_path {
+  my $self = shift;
+  my $text_path = shift;
+
+  unless ($text_path) {
+    carp('No text path given');
+    return 0;
+  };
+
+  # Check for '.' prefix in text
+  my $prefix = '';
+  if ($text_path =~ s!^\./!!) {
+    $prefix = '.';
+  };
+
+  # Unix form
+  if ($text_path =~ m!^([^/]+?)/([^/]+?)/([^/]+?)$!) {
+    return ($prefix, $1, $2, $3);
+  }
+
+  # Windows form
+  elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)\\([^\\]+?)$!) {
+    return ($prefix, $1, $2, $3);
+  };
+
+  # Text has not the expected pattern
+  carp $text_path . ' is not a well-formed text path in ' . $$self;
+  return;
+};
+
+
+# Get the archives path
+sub path {
+  return rel2abs(${$_[0]});
+};
+
+
+# Extract files to a directory
+sub extract {
+  my $self = shift;
+  my $text_path = shift;
+  my $target_dir = shift;
+
+  my @cmd = (
+    'unzip',           # Use unzip program
+    '-qo',             # quietly overwrite all existing files
+    '-d', $target_dir # Extract into target directory
+  );
+
+  push(@cmd, $$self); # Extract from zip
+
+  my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
+
+  # Add some interesting files for extraction
+  # Can't use catfile(), as this removes the '.' prefix
+  push(@cmd, join('/', $prefix, $corpus, 'header.xml'));
+  push(@cmd, join('/', $prefix, $corpus, $doc, 'header.xml'));
+  push(@cmd, join('/', $prefix, $corpus, $doc, $text, '*'));
+
+  # Run system call
+  system(@cmd);
+
+  # Check for return code
+  if ($? != 0) {
+    carp("System call '" . join(' ', @cmd) . "' errors " . $?);
+    return;
+  };
+
+  # Fine
+  return 1;
+};
+
+
+1;
+
+__END__
+
+=POD
+
+C<KorAP::XML::Archive> expects the unzip tool to be installed.
+
+
+=head1 new
+
+=head1 test
+
+=head1 list_texts
+
+Returns all texts found in the zip file
+
+=head1 extract
+
+  $archive->extract('./GOE/AGU/0004', '~/temp');
+
+Extract all files for the named text to a certain directory.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 404c2b2..decae35 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -17,7 +17,7 @@
 #       Due to the kind of processing, processed metadata may be stored in
 #       a multiprocess cache instead.
 
-our $VERSION = '0.07';
+our $VERSION = '0.08';
 
 our @ATTR = qw/text_sigle
 	       doc_sigle
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 5ec0805..9e2d1e8 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -10,8 +10,6 @@
 use KorAP::XML::Krill;
 use KorAP::XML::Tokenizer;
 
-our $VERSION = 0.04;
-
 # Merges foundry data to create indexer friendly documents
 # ndiewald, 2014/10/29
 
@@ -21,7 +19,14 @@
 #
 # 2016/02/12
 # - fixed foundry skipping
+#
+# 2016/02/14
+# - Added version information
 
+sub printversion {
+  print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
+  exit(1);
+};
 
 sub printhelp {
   print <<'EOHELP';
@@ -54,8 +59,9 @@
                                   (expects a defined output file)
   --log|-l                        The Log4perl log level, defaults to ERROR.
   --help|-h                       Print this document (optional)
+  --version|-v                    Print version information
 
-diewald@ids-mannheim.de, 2016/02/12
+diewald@ids-mannheim.de, 2016/02/14
 
 EOHELP
   exit(defined $_[0] ? $_[0] : 0);
@@ -76,7 +82,8 @@
   'allow|a=s'   => \@allow,
   'primary|p!'  => \$primary,
   'pretty|y'    => \$pretty,
-  'help|h'      => sub { printhelp }
+  'help|h'      => sub { printhelp },
+  'version|v'   => sub { printversion }
 );
 
 printhelp(1) if !$input || ($gzip && !$output);
diff --git a/script/korapxml2krill_dir b/script/korapxml2krill_dir
index 7b048cf..5b09566 100644
--- a/script/korapxml2krill_dir
+++ b/script/korapxml2krill_dir
@@ -1,9 +1,14 @@
 #!/usr/bin/env perl
 use strict;
 use warnings;
+use lib 'lib';
 use FindBin;
+use File::Temp qw/tempdir/;
+use File::Spec::Functions qw/catfile catdir/;
 use Getopt::Long;
 use Directory::Iterator;
+use KorAP::XML::Krill;
+use KorAP::XML::Archive;
 
 my $local = $FindBin::Bin;
 
@@ -16,6 +21,14 @@
 #
 # 2016/02/12
 # - Support overwrite
+#
+# 2016/02/14
+# - Added version information
+
+sub printversion {
+  print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
+  exit(1);
+};
 
 sub printhelp {
   print <<'EOHELP';
@@ -26,7 +39,7 @@
 Call:
 korapxml2krill_dir -z --input <directory> --output <directory>
 
-  --input|-i <directory>          Directory of documents to index
+  --input|-i <directory|file>     Directory or archive file of documents to index
   --output|-o <directory>         Name of output folder
   --overwrite|-w                  Overwrite files that already exist
   --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
@@ -48,8 +61,9 @@
                                   (expects a defined output file)
   --log|-l                        The Log4perl log level, defaults to ERROR.
   --help|-h                       Print this document (optional)
+  --version|-v                    Print version information
 
-diewald@ids-mannheim.de, 2016/02/12
+diewald@ids-mannheim.de, 2016/02/14
 
 EOHELP
 
@@ -70,12 +84,14 @@
   'allow|a=s'   => \@allow,
   'primary|p!'  => \$primary,
   'pretty|y'    => \$pretty,
-  'help|h'      => sub { printhelp }
+  'help|h'      => sub { printhelp },
+  'version|v'   => sub { printversion }
 );
 
 printhelp(1) if !$input || !$output;
 
 
+# write file
 sub write_file {
   my $anno = shift;
   my $file = $anno;
@@ -98,23 +114,71 @@
   print "\n";
 };
 
-
-my $it = Directory::Iterator->new($input);
-my @dirs;
-my $dir;
-while (1) {
-
+# Input is a directory
+if (-d $input) {
+  my $it = Directory::Iterator->new($input);
+  my @dirs;
+  my $dir;
+  while (1) {
     if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
-	push @dirs, $dir;
-	$it->prune;
+      push @dirs, $dir;
+      $it->prune;
     };
-  last unless $it->next;
-};
+    last unless $it->next;
+  };
 
-my $count = scalar @dirs;
-for (my $i = 0; $i < $count; $i++) {
-  print 'Convert [' . ($i + 1) . "/$count] ";
-  write_file($dirs[$i]);
+  my $count = scalar @dirs;
+  for (my $i = 0; $i < $count; $i++) {
+    print 'Convert [' . ($i + 1) . "/$count] ";
+    write_file($dirs[$i]);
+  };
+}
+
+# Input is a file
+elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
+  unless ($archive->test_unzip) {
+    print "Unzip is not installed or incompatible.\n\n";
+    exit(1);
+  };
+
+  unless ($archive->test) {
+    print "Zip archive not compatible.\n\n";
+    exit(1);
+  };
+
+  my @dirs = $archive->list_texts;
+  my $count = scalar @dirs;
+  for (my $i = 0; $i < $count; $i++) {
+    print 'Convert [' . ($i + 1) . "/$count] ";
+
+    # Split path information
+    my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
+
+    # Create temporary file
+    my $temp = tempdir(CLEANUP => 1);
+
+    # Extract from archive
+    if ($archive->extract($dirs[$i], $temp)) {
+
+      # Create corpus directory
+      $input = catdir($temp, $corpus);
+
+      # Temporary directory
+      my $dir = catdir($input, $doc, $text);
+
+      # Write file
+      write_file($dir);
+    }
+    else {
+      print "Unable to extract " . $dirs[$i] . "\n";
+    };
+
+    $temp = 0;
+  };
+}
+
+else {
+  print "Input is neither a directory nor an archive.\n\n";
 };
 
 
diff --git a/t/archive.t b/t/archive.t
new file mode 100644
index 0000000..4a729b6
--- /dev/null
+++ b/t/archive.t
@@ -0,0 +1,47 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catfile catdir/;
+use File::Temp qw/tempdir/;
+
+use_ok('KorAP::XML::Archive');
+
+my $file = catfile(dirname(__FILE__), 'corpus','archive.zip');
+my $archive = KorAP::XML::Archive->new($file);
+
+unless ($archive->test_unzip) {
+  plan skip_all => 'unzip not found';
+};
+
+ok($archive->test, 'Test archive');
+like($archive->path, qr/archive\.zip$/, 'Archive path');
+
+my @list = $archive->list_texts;
+is(scalar @list, 10, 'Found all tests');
+is($list[0], './TEST/BSP/1', 'First document');
+is($list[-1], './TEST/BSP/9', 'First document');
+
+my @path = $archive->split_path('./TEST/BSP/9');
+is($path[0],'.', 'Prefix');
+is($path[1],'TEST', 'Prefix');
+is($path[2],'BSP', 'Prefix');
+is($path[3],'9', 'Prefix');
+
+my $dir = tempdir(CLEANUP => 1);
+
+{
+  local $SIG{__WARN__} = sub {};
+  ok($archive->extract('./TEST/BSP/8', $dir), 'Wrong path');
+};
+
+ok(-d catdir($dir, 'TEST'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'TEST', 'header.xml'), 'Test corpus header exists');
+ok(-d catdir($dir, 'TEST', 'BSP'), 'Test doc directory exists');
+ok(-f catdir($dir, 'TEST', 'BSP', 'header.xml'), 'Test doc header exists');
+
+
+done_testing;
+
+__END__
diff --git a/t/corpus/archive.zip b/t/corpus/archive.zip
new file mode 100644
index 0000000..b88d147
--- /dev/null
+++ b/t/corpus/archive.zip
Binary files differ