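Support root directory in Zipper and fix its normalization

Move the root directory handling from tei2korapxml into
KorAP::XML::TEI::Zipper, which now prefixes every stream name itself.
The normalization now strips all leading slashes, so that a root
directory of "./" no longer yields a file name starting with "/".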

Change-Id: I7f00a347fb4616cb8ae7df63374633c21d7ab0ce
diff --git a/lib/KorAP/XML/TEI/Zipper.pm b/lib/KorAP/XML/TEI/Zipper.pm
index c6d5143..c479239 100644
--- a/lib/KorAP/XML/TEI/Zipper.pm
+++ b/lib/KorAP/XML/TEI/Zipper.pm
@@ -19,8 +19,20 @@
 # Output parameter, that may be a file or a file handle.
 # Defaults to stdout.
 sub new {
-  my ($class, $out) = @_;
-  bless [$out // '-'], $class;
+  my ($class, $root_dir, $out) = @_;
+
+  if ($root_dir) {
+
+    # base dir must always end with a slash
+    $root_dir .= '/';
+
+    # remove leading /
+    # (only relative paths allowed in IO::Compress::Zip)
+    # and redundant ./
+    $root_dir =~ s/^\.?\/+//;
+  };
+
+  bless [$out // '-', undef, $root_dir // ''], $class;
 };
 
 
@@ -36,7 +48,7 @@
       TextFlag => 1,
       Method => $_COMPRESSION_METHOD,
       Append => 0,
-      Name => "$file"
+      Name => $self->[2] . $file
     ) or die $log->fatal("Zipping $file failed: $ZipError");
   }
 
@@ -47,7 +59,7 @@
       TextFlag => 1,
       Method => $_COMPRESSION_METHOD,
       Append => 1,
-      Name => "$file"
+      Name => $self->[2] . $file
     ) or die $log->fatal("Zipping $file failed: $ZipError");
   };
 
diff --git a/script/tei2korapxml b/script/tei2korapxml
index df4150a..ce84a4b 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -139,7 +139,7 @@
 
 
 # Initialize zipper
-my $zipper = KorAP::XML::TEI::Zipper->new;
+my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
 my $input_fh;                                        # input file handle (default: stdin)
 
 my $buf_in;                                          # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
@@ -194,9 +194,6 @@
 
 $fval = $fval2 = 0;
 
-$_root_dir .= '/'; # base dir must always end with a slash
-$_root_dir =~ s/^\.?\///; # remove leading / (only relative paths allowed in IO::Compress::Zip) and redundant ./
-
 # Normalize regex for header parsing
 for ($_CORP_HEADER_BEG,
      $_DOC_HEADER_BEG,
@@ -325,7 +322,7 @@
 
           # Tokenize and output
           $ext_tok->tokenize($data)->to_zip(
-            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+            $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
             $text_id_esc
           );
         };
@@ -334,12 +331,12 @@
 
           # Tokenize and output
           $cons_tok->tokenize($data)->to_zip(
-            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
+            $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
             $text_id_esc
           );
 
           $aggr_tok->tokenize($data)->to_zip(
-            $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
+            $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
             $text_id_esc
           );
 
@@ -354,10 +351,10 @@
         # corresponding indices in $_tokens_file)
 
         if ($_DEBUG) {
-          $log->debug("Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file");
+          $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
         };
 
-        $zipper->new_stream("$_root_dir$dir/$_data_file")
+        $zipper->new_stream("$dir/$_data_file")
           ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
 
         # ~ write structures ~
@@ -369,7 +366,7 @@
 
         if ($_TOKENS_PROC && !$tokens->empty) {
           $tokens->to_zip(
-            $zipper->new_stream("$_root_dir$dir/$_tokens_dir/${_tokens_file}"),
+            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
             $text_id_esc,
             $_INLINE_ANNOT
           );
@@ -467,7 +464,7 @@
       if ($header) {
 
         # Write header to zip
-        my $file = $_root_dir . $header->dir . '/' . $_header_file;
+        my $file = $header->dir . '/' . $_header_file;
 
         $log->debug("Writing file $file") if $_DEBUG;
 
@@ -940,7 +937,7 @@
 
   $output = encode( "UTF-8", $output ); # convert text string to binary string
 
-  $zipper->new_stream("$_root_dir$dir/$_structure_dir/$_structure_file")
+  $zipper->new_stream("$dir/$_structure_dir/$_structure_file")
     ->print($output);
 
   #print STDERR "$0: write_structures(): DONE\n";
diff --git a/t/tokenization.t b/t/tokenization.t
index 9fbf4d1..8a1aba6 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -132,7 +132,7 @@
 subtest 'Test Zipper' => sub {
   # Test Zipper
   my ($fh, $outzip) = korap_tempfile('tokenize_zipper');
-  my $zip = KorAP::XML::TEI::Zipper->new($outzip);
+  my $zip = KorAP::XML::TEI::Zipper->new('', $outzip);
   $fh->close;
 
   my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
diff --git a/t/zipper.t b/t/zipper.t
index 86aa52a..1911620 100644
--- a/t/zipper.t
+++ b/t/zipper.t
@@ -13,37 +13,89 @@
 
 require_ok('KorAP::XML::TEI::Zipper');
 
-my $data;
-my ($fh, $outzip) = korap_tempfile('zipper');
+subtest 'Create Zipper' => sub {
+  my $data;
+  my ($fh, $outzip) = korap_tempfile('zipper');
 
-my $zip = KorAP::XML::TEI::Zipper->new($outzip);
-$fh->close;
+  my $zip = KorAP::XML::TEI::Zipper->new('', $outzip);
+  $fh->close;
 
-ok($zip, 'Zipper initialized');
+  ok($zip, 'Zipper initialized');
 
-ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
-ok($zip->new_stream('data/file2.txt')->print('world'), 'Write to appended stream');
+  ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+  ok($zip->new_stream('data/file2.txt')->print('world'), 'Write to appended stream');
 
-$zip->close;
+  $zip->close;
 
-ok(-e $outzip, 'Zip exists');
+  ok(-e $outzip, 'Zip exists');
 
-# Uncompress GOE/header.xml from zip file
-my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt');
+  my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt');
 
-$data .= $unzip->getline while !$unzip->eof;
-ok($unzip->close, 'Closed');
+  $data .= $unzip->getline while !$unzip->eof;
+  ok($unzip->close, 'Closed');
 
-is($data, 'hello', 'Data correct');
+  is($data, 'hello', 'Data correct');
+
+  $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file2.txt');
+
+  $data = '';
+  $data .= $unzip->getline while !$unzip->eof;
+  ok($unzip->close, 'Closed');
+
+  is($data, 'world', 'Data correct');
+};
 
 
-# Uncompress data/file2.txt from zip file
-$unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file2.txt');
+subtest 'Create Zipper with root dir "."' => sub {
+  my $data;
+  my ($fh, $outzip) = korap_tempfile('zipper');
 
-$data = '';
-$data .= $unzip->getline while !$unzip->eof;
-ok($unzip->close, 'Closed');
+  my $zip = KorAP::XML::TEI::Zipper->new('.', $outzip);
+  $fh->close;
 
-is($data, 'world', 'Data correct');
+  ok($zip, 'Zipper initialized');
+
+  ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+  $zip->close;
+  ok(-e $outzip, 'Zip exists');
+
+  ok(IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt'), 'File exists');
+};
+
+
+subtest 'Create Zipper with root dir "subdir"' => sub {
+  my $data;
+  my ($fh, $outzip) = korap_tempfile('zipper');
+
+  my $zip = KorAP::XML::TEI::Zipper->new('subdir', $outzip);
+  $fh->close;
+
+  ok($zip, 'Zipper initialized');
+
+  ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+  $zip->close;
+  ok(-e $outzip, 'Zip exists');
+
+  ok(IO::Uncompress::Unzip->new($outzip, Name => 'subdir/data/file1.txt'), 'File exists');
+  ok(!IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt'), 'File exists not');
+};
+
+subtest 'Create Zipper with root dir "./"' => sub {
+  my $data;
+  my ($fh, $outzip) = korap_tempfile('zipper');
+
+  my $zip = KorAP::XML::TEI::Zipper->new('./', $outzip);
+  $fh->close;
+
+  ok($zip, 'Zipper initialized');
+
+  ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+  $zip->close;
+  ok(-e $outzip, 'Zip exists');
+
+  # Root dir "./" is normalized away, so the file is stored unprefixed
+  ok(IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt'), 'File exists');
+};
+
 
 done_testing;