Create Zip-Factory for simpler handling of Zip streams

Change-Id: I66fb1e980437f9b931d71b8bc9fde54bda2aee6f
diff --git a/lib/KorAP/XML/TEI/Zipper.pm b/lib/KorAP/XML/TEI/Zipper.pm
new file mode 100644
index 0000000..dd0c698
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Zipper.pm
@@ -0,0 +1,64 @@
+package KorAP::XML::TEI::Zipper;
+use strict;
+use warnings;
+use IO::Compress::Zip qw($ZipError :constants);
+
+# man IO::Compress::Zip
+# At present four compression methods are supported by IO::Compress::Zip, namely
+# Store (no compression at all), Deflate, Bzip2 and LZMA.
+# Note that to create Bzip2 content, the module "IO::Compress::Bzip2" must be installed.
+# Note that to create LZMA content, the module "IO::Compress::Lzma" must be installed.
+
+# The symbols ZIP_CM_STORE, ZIP_CM_DEFLATE, ZIP_CM_BZIP2 and
+# ZIP_CM_LZMA are used to select the compression method.
+our $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;
+
+
+# Constructor. Takes an optional output target, which may be
+# a file name or a file handle; when it is omitted (undef),
+# the target is '-', i.e. stdout.
+sub new {
+  my ($class, $target) = @_;
+  return bless [ defined $target ? $target : '-' ], $class;
+};
+
+
+# Start a new member named $file in the zip output and return the
+# IO::Compress::Zip handle to print its content to. The first call
+# creates the handle on the configured target; each later call ends
+# the current member via newStream() and begins the next one.
+# Dies with "zip failed" and $ZipError when either step fails.
+sub new_stream {
+  my ($zipper, $member) = @_;
+
+  # Options common to the first and to every following member
+  my @member_opts = (
+    Zip64    => 1,
+    TextFlag => 1,
+    Method   => $_COMPRESSION_METHOD,
+    Name     => "$member",
+  );
+
+  # A handle is already open: finish its member and start the next
+  if ($zipper->[1]) {
+    $zipper->[1]->newStream(@member_opts, Append => 1)
+      or die "ERROR ('$member'): zip failed: $ZipError\n";
+  }
+
+  # Nothing written so far: create the handle on the output target
+  else {
+    $zipper->[1] = IO::Compress::Zip->new($zipper->[0], @member_opts, Append => 0)
+      or die "ERROR ('$member'): zip failed: $ZipError\n";
+  };
+  return $zipper->[1];
+};
+
+
+# Close the open stream, if any (no-op before the first new_stream), and reset to the bare target
+sub close {
+  $_[0]->[1]->close if $_[0]->[1];    # slot 1 is unset until new_stream ran
+  @{$_[0]} = ($_[0]->[0]);            # keep only slot 0, the output target
+};
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 8fffecd..6136d47 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -41,7 +41,6 @@
 
 use XML::CompactTree::XS;
 use XML::LibXML::Reader;
-use IO::Compress::Zip qw(zip $ZipError :constants);
 use IPC::Open2 qw(open2);
 
 use FindBin;
@@ -51,6 +50,7 @@
 
 use KorAP::XML::TEI;
 use KorAP::XML::TEI::Tokenization;
+use KorAP::XML::TEI::Zipper;
 
 our $VERSION = '0.01';
 
@@ -114,13 +114,6 @@
   my ( @tok_tokens_con, @tok_tokens_agg, $txt, $offset );
 my $_base_tokenization_dir     = "base"; # name of directory for storing files of dummy tokenization (only used in func. select_tokenization)
 
-# man IO::Compress::Zip
-# At present three compression methods are supported by IO::Compress::Zip, namely
-# Store (no compression at all), Deflate, Bzip2 and LZMA.
-# Note that to create Bzip2 content, the module "IO::Compress::Bzip2" must be installed.
-# Note that to create LZMA content, the module "IO::Compress::Lzma" must be installed.
-my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE; # The symbols, ZIP_CM_STORE, ZIP_CM_DEFLATE, ZIP_CM_BZIP2 and ZIP_CM_LZMA are used to select the compression method.
-
 my $_DEBUG           = 0;                            # set to 1 for minimal more debug output (no need to be parametrized)
 my $_XCT_LN          = 0;                            # only for debugging: include line numbers in elements of $tree_data
                                                      #  (see also manpage of XML::CompactTree::XS)
@@ -155,9 +148,8 @@
 # ~~~ variables ~~~
 #
 
-my $zip;                                             # IO::Compress::Zip object
-my $zip_outh;                                        # handle for zip file output (stdout)
-my $first_write;                                     # needed to decide wether to call '$zip->newStream' (for appending to zip file)
+# Initialize zipper
+my $zipper = KorAP::XML::TEI::Zipper->new;
 my $input_fh;                                        # input file handle (default: stdin)
 
 my $buf_in;                                          # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
@@ -287,9 +279,7 @@
 
   $input_fh = *STDIN;  # input file handle (default: stdin)
 
-  $zip_outh = *STDOUT; # output file handle (default: stdout)
-
-  $data_fl = 0; $first_write = 1;
+  $data_fl = 0;
 
   $buf_in = $data = $dir = $dir_doc = $dir_crp = "";
   $header_txt = $header_doc = $header_crp = "";
@@ -376,25 +366,11 @@
       
         print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
 
-        if ( $first_write ){
-
-          $first_write = 0;
-
-          # 1st time: create instance
-          $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 0, Name => "$_root_dir$dir/$_data_file"
-            or die "ERROR ('$_root_dir$dir/$_data_file'): zip failed: $ZipError\n"
-
-        } else {
-
-          # closes the current compressed data stream and starts a new one.
-          $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_data_file" )
-            or die "ERROR ('$_root_dir$dir/$_data_file'): zip failed: $ZipError\n"
-        }
 
         $data =~ s/(&|<|>)/$ent{$1}/g;
 
-        $zip->print( "$data_prfx1$text_id_esc$data_prfx2$data$data_sfx" );
-
+        $zipper->new_stream("$_root_dir$dir/$_data_file")
+          ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
 
         # ~ write structures ~
 
@@ -488,23 +464,10 @@
 
         print STDERR "DEBUG ($0): Writing file $_root_dir$dir/$_header_file\n" if $_DEBUG;
 
-        if ( $first_write ){
-
-          $first_write = 0;
-
-          $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
-                 Append => 0, Name => "$_root_dir$dir/$_header_file"
-            or die "ERROR ('$_root_dir$dir/$_header_file'): zip failed: $ZipError\n"
-
-        } else {
-
-          $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_header_file" )
-            or die "ERROR ('$_root_dir$dir/$_header_file'): zip failed: $ZipError\n"
-        }
-
         $header_txt = encode_utf8( $header_txt );
 
-        $zip->print( "$header_prfx$header_txt" );
+        $zipper->new_stream("$_root_dir$dir/$_header_file")
+          ->print("$header_prfx$header_txt");
 
         $header_txt = "";
       }
@@ -547,7 +510,6 @@
 
       # ~ end of document header ~
 
-
       #print STDERR "end of doc header\n";
 
       # write it to header.xml
@@ -569,23 +531,10 @@
 
         print STDERR "DEBUG ($0): Writing file $_root_dir$dir_doc/$_header_file\n" if $_DEBUG;
 
-        if ( $first_write ){
-
-          $first_write = 0;
-
-          $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 0,
-            Name => "$_root_dir$dir_doc/$_header_file"
-              or die "ERROR ('$_root_dir$dir_doc/$_header_file'): zip failed: $ZipError\n"
-
-        } else {
-
-          $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir_doc/$_header_file" )
-            or die "ERROR ('$_root_dir$dir_doc/$_header_file'): zip failed: $ZipError\n"
-        }
-
         $header_doc = encode_utf8( $header_doc );
 
-        $zip->print( "$header_prfx$header_doc" );
+        $zipper->new_stream("$_root_dir$dir_doc/$_header_file")
+          ->print("$header_prfx$header_doc");
 
         $header_doc = $dir_doc = "";
       }
@@ -682,23 +631,10 @@
 
         print STDERR "DEBUG ($0): Writing file $_root_dir$dir_crp/$_header_file\n" if $_DEBUG;
 
-        if ( $first_write ){
-
-          $first_write = 0;
-
-          $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
-            Append => 0, Name => "$_root_dir$dir_crp/$_header_file"
-               or die "ERROR ('$_root_dir$dir_crp/$_header_file'): zip failed: $ZipError\n";
-
-        } else {
-
-          $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir_crp/$_header_file" )
-            or die "ERROR ('$_root_dir$dir_crp/$_header_file'): zip failed: $ZipError\n"
-        }
-
         $header_crp = encode_utf8( $header_crp );
 
-        $zip->print( "$header_prfx$header_crp" );
+        $zipper->new_stream("$_root_dir$dir_crp/$_header_file")
+          ->print("$header_prfx$header_crp");
 
         $header_crp = $dir_crp = "";
       }
@@ -745,7 +681,7 @@
 
   } #end: while
 
-  $zip->close();
+  $zipper->close;
 
   ## DEPRECATED (only IDS-intern)
   if( $_GEN_TOK_BAS ){
@@ -1145,7 +1081,7 @@
         }
       }
     }else{
-      $zip->close();
+      $zipper->close;
       die "ERROR ($0): cannot retrieve token bounds from external tokenizer for text '$text_id' => Aborting ...\n";
     }
   ## 
@@ -1162,9 +1098,6 @@
 
   my ( $fname, $textid_esc, $bounds ) = @_;
 
-  $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => $fname)
-    or die "ERROR ('$fname'): zip failed: $ZipError\n";
-
   $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
     ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\"$text_id_esc\" xmlns=\"http://ids-mannheim.de/ns/KorAP\""
     ." version=\"KorAP-0.4\">\n  <spanList>\n";
@@ -1180,7 +1113,7 @@
 
   $output .= "  </spanList>\n</layer>";
 
-  $zip->print ( "$output" );
+  $zipper->new_stream($fname)->print($output);
 
 } # end: sub write_tokenization
 
@@ -1197,9 +1130,6 @@
     return;
   }
 
-  $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_structure_dir/$_structure_file" )
-      or die "ERROR ('$_root_dir$dir/$_structure_dir/$_structure_file'): zip failed: $ZipError\n";
-
   $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
            ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
            .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n";
@@ -1262,7 +1192,8 @@
 
   $output = encode_utf8( $output );
 
-  $zip->print( "$output" );
+  $zipper->new_stream("$_root_dir$dir/$_structure_dir/$_structure_file")
+    ->print($output);
 
   #print STDERR "$0: write_structures(): DONE\n";
 
@@ -1281,9 +1212,6 @@
     return;
   }
 
-  $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_tokens_dir/$_tokens_file" )
-      or die "ERROR ('$_root_dir$dir/$_tokens_dir/$_tokens_file'): zip failed: $ZipError\n";
-
   $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
            ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
            .decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n  <spanList>\n";
@@ -1361,7 +1289,8 @@
 
   $output = encode_utf8( $output );
 
-  $zip->print( "$output" );
+  $zipper->new_stream("$_root_dir$dir/$_tokens_dir/$_tokens_file")
+    ->print($output);
 
   #print STDERR "$0: write_tokens(): DONE\n";
 
diff --git a/t/script.t b/t/script.t
index da45633..3ac91d1 100644
--- a/t/script.t
+++ b/t/script.t
@@ -38,6 +38,8 @@
   'Processing'
 );
 
+ok(-e $outzip, "File $outzip exists");
+
 # Uncompress GOE/header.xml from zip file
 my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml');
 
diff --git a/t/zipper.t b/t/zipper.t
new file mode 100644
index 0000000..c9eabf0
--- /dev/null
+++ b/t/zipper.t
@@ -0,0 +1,48 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catfile/;
+use File::Temp ':POSIX';
+use IO::Uncompress::Unzip;
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+require_ok('KorAP::XML::TEI::Zipper');
+
+my $data = '';
+my $outzip = tmpnam();
+
+my $zip = KorAP::XML::TEI::Zipper->new($outzip);
+ok($zip, 'Zipper initialized');
+
+ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+ok($zip->new_stream('data/file2.txt')->print('world'), 'Write to appended stream');
+
+$zip->close;
+
+ok(-e $outzip, 'Zip exists');
+
+# Uncompress data/file1.txt from zip file; stop with the unzip error if it is missing
+my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt')
+  or die "Cannot open data/file1.txt in $outzip: $IO::Uncompress::Unzip::UnzipError\n";
+
+$data .= $unzip->getline while !$unzip->eof;
+ok($unzip->close, 'Closed');
+is($data, 'hello', 'Data correct');
+
+# Uncompress data/file2.txt from zip file; stop with the unzip error if it is missing
+$unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file2.txt')
+  or die "Cannot open data/file2.txt in $outzip: $IO::Uncompress::Unzip::UnzipError\n";
+
+$data = '';
+$data .= $unzip->getline while !$unzip->eof;
+ok($unzip->close, 'Closed');
+is($data, 'world', 'Data correct');
+
+# tmpnam() only hands out a name, so the zip written under it must be removed here
+unlink $outzip;
+done_testing;