Added archive support to korapxml2krill_dir

Change-Id: Ib62934a08628db3667891a1562acbf0149c17482
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 5ec0805..9e2d1e8 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -10,8 +10,6 @@
 use KorAP::XML::Krill;
 use KorAP::XML::Tokenizer;
 
-our $VERSION = 0.04;
-
 # Merges foundry data to create indexer friendly documents
 # ndiewald, 2014/10/29
 
@@ -21,7 +19,14 @@
 #
 # 2016/02/12
 # - fixed foundry skipping
+#
+# 2016/02/14
+# - Added version information
 
+sub printversion { # Print the KorAP::XML::Krill version and terminate the script
+  print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
+  exit(0); # a requested version report is a success, like printhelp without argument
+};
 
 sub printhelp {
   print <<'EOHELP';
@@ -54,8 +59,9 @@
                                   (expects a defined output file)
   --log|-l                        The Log4perl log level, defaults to ERROR.
   --help|-h                       Print this document (optional)
+  --version|-v                    Print version information
 
-diewald@ids-mannheim.de, 2016/02/12
+diewald@ids-mannheim.de, 2016/02/14
 
 EOHELP
   exit(defined $_[0] ? $_[0] : 0);
@@ -76,7 +82,8 @@
   'allow|a=s'   => \@allow,
   'primary|p!'  => \$primary,
   'pretty|y'    => \$pretty,
-  'help|h'      => sub { printhelp }
+  'help|h'      => sub { printhelp },
+  'version|v'   => sub { printversion }
 );
 
 printhelp(1) if !$input || ($gzip && !$output);
diff --git a/script/korapxml2krill_dir b/script/korapxml2krill_dir
index 7b048cf..5b09566 100644
--- a/script/korapxml2krill_dir
+++ b/script/korapxml2krill_dir
@@ -1,9 +1,14 @@
 #!/usr/bin/env perl
 use strict;
 use warnings;
+use lib 'lib';
 use FindBin;
+use File::Temp qw/tempdir/;
+use File::Spec::Functions qw/catfile catdir/;
 use Getopt::Long;
 use Directory::Iterator;
+use KorAP::XML::Krill;
+use KorAP::XML::Archive;
 
 my $local = $FindBin::Bin;
 
@@ -16,6 +21,14 @@
 #
 # 2016/02/12
 # - Support overwrite
+#
+# 2016/02/14
+# - Added version information
+
+sub printversion { # Print the KorAP::XML::Krill version and terminate the script
+  print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
+  exit(0); # a requested version report is not an error, so signal success
+};
 
 sub printhelp {
   print <<'EOHELP';
@@ -26,7 +39,7 @@
 Call:
 korapxml2krill_dir -z --input <directory> --output <directory>
 
-  --input|-i <directory>          Directory of documents to index
+  --input|-i <directory|file>     Directory or archive file of documents to index
   --output|-o <directory>         Name of output folder
   --overwrite|-w                  Overwrite files that already exist
   --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
@@ -48,8 +61,9 @@
                                   (expects a defined output file)
   --log|-l                        The Log4perl log level, defaults to ERROR.
   --help|-h                       Print this document (optional)
+  --version|-v                    Print version information
 
-diewald@ids-mannheim.de, 2016/02/12
+diewald@ids-mannheim.de, 2016/02/14
 
 EOHELP
 
@@ -70,12 +84,14 @@
   'allow|a=s'   => \@allow,
   'primary|p!'  => \$primary,
   'pretty|y'    => \$pretty,
-  'help|h'      => sub { printhelp }
+  'help|h'      => sub { printhelp },
+  'version|v'   => sub { printversion }
 );
 
 printhelp(1) if !$input || !$output;
 
 
+# write file
 sub write_file {
   my $anno = shift;
   my $file = $anno;
@@ -98,23 +114,71 @@
   print "\n";
 };
 
-
-my $it = Directory::Iterator->new($input);
-my @dirs;
-my $dir;
-while (1) {
-
+# Input is a directory: gather every folder holding a data.xml, then pass each to write_file
+if (-d $input) {
+  my $it = Directory::Iterator->new($input);
+  my @dirs;
+  my $dir;
+  while (1) {
     if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
-	push @dirs, $dir;
-	$it->prune;
+      push @dirs, $dir; # $dir already had its trailing /data.xml stripped above
+      $it->prune; # NOTE(review): presumably skips the rest of this folder - confirm in Directory::Iterator docs
     };
-  last unless $it->next;
-};
+    last unless $it->next; # leave the loop once next() returns false
+  };
 
-my $count = scalar @dirs;
-for (my $i = 0; $i < $count; $i++) {
-  print 'Convert [' . ($i + 1) . "/$count] ";
-  write_file($dirs[$i]);
+  my $count = scalar @dirs;
+  for (my $i = 0; $i < $count; $i++) {
+    print 'Convert [' . ($i + 1) . "/$count] "; # progress prefix, counted from 1
+    write_file($dirs[$i]);
+  };
+}
+
+# Input is a file: treat it as a zip archive and convert every text it lists
+elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
+  unless ($archive->test_unzip) {
+    print "Unzip is not installed or incompatible.\n\n";
+    exit(1);
+  };
+
+  unless ($archive->test) {
+    print "Zip archive not compatible.\n\n";
+    exit(1);
+  };
+
+  my @dirs = $archive->list_texts;
+  my $count = scalar @dirs;
+  for (my $i = 0; $i < $count; $i++) {
+    print 'Convert [' . ($i + 1) . "/$count] ";
+
+    # Split path information
+    my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
+
+    # Create a temporary directory object, removed as soon as it is released
+    my $temp = File::Temp->newdir;
+
+    # Extract from archive
+    if ($archive->extract($dirs[$i], $temp->dirname)) {
+
+      # Point the global input at the extracted corpus (presumably read by write_file - confirm)
+      $input = catdir($temp->dirname, $corpus);
+
+      # Directory of the extracted text
+      my $dir = catdir($input, $doc, $text);
+
+      # Write file
+      write_file($dir);
+    }
+    else {
+      print "Unable to extract " . $dirs[$i] . "\n";
+    };
+
+    undef $temp; # release the object now so this text's files are deleted before the next one
+  };
+}
+
+else {
+  print "Input is neither a directory nor an archive.\n\n"; exit(1); # fail like the other error paths
 };