Create Zip-Factory for simpler handling of Zip streams
Change-Id: I66fb1e980437f9b931d71b8bc9fde54bda2aee6f
diff --git a/lib/KorAP/XML/TEI/Zipper.pm b/lib/KorAP/XML/TEI/Zipper.pm
new file mode 100644
index 0000000..dd0c698
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Zipper.pm
@@ -0,0 +1,64 @@
+package KorAP::XML::TEI::Zipper;
+use strict;
+use warnings;
+use IO::Compress::Zip qw($ZipError :constants);
+
+# man IO::Compress::Zip
+# At present four compression methods are supported by IO::Compress::Zip, namely
+# Store (no compression at all), Deflate, Bzip2 and LZMA.
+# Note that to create Bzip2 content, the module "IO::Compress::Bzip2" must be installed.
+# Note that to create LZMA content, the module "IO::Compress::Lzma" must be installed.
+
+# The symbols ZIP_CM_STORE, ZIP_CM_DEFLATE, ZIP_CM_BZIP2 and
+# ZIP_CM_LZMA are used to select the compression method.
+our $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;
+
+
+# Create a zipper. The optional argument is the output target (a file
+# or a file handle); without it '-' is stored, which means stdout.
+sub new {
+  my $class  = shift;
+  my $target = shift() // '-';
+  return bless [$target], $class;
+};
+
+
+# Return a stream for the zip member $file: the first call creates the
+# IO::Compress::Zip object, later calls close the current member and
+# start the next one on the same object.
+sub new_stream {
+  my ($self, $file) = @_;
+
+  # Options shared by the first and all following members
+  my @opts = (
+    Zip64    => 1,
+    TextFlag => 1,
+    Method   => $_COMPRESSION_METHOD,
+  );
+
+  # A stream is open already: finish it and begin the next member
+  if ($self->[1]) {
+    $self->[1]->newStream(
+      @opts, Append => 1, Name => "$file"
+    ) or die "ERROR ('$file'): zip failed: $ZipError\n";
+  }
+
+  # First member: create the zip object on the output target
+  else {
+    $self->[1] = IO::Compress::Zip->new(
+      $self->[0], @opts, Append => 0, Name => "$file"
+    ) or die "ERROR ('$file'): zip failed: $ZipError\n";
+  };
+
+  return $self->[1];
+};
+
+
+# Close the current stream (if one was opened) and reset the zipper
+sub close {
+  $_[0]->[1]->close if $_[0]->[1]; # nothing to close before the first new_stream
+  @{$_[0]} = ($_[0]->[0]);         # keep only the output target
+};
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 8fffecd..6136d47 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -41,7 +41,6 @@
use XML::CompactTree::XS;
use XML::LibXML::Reader;
-use IO::Compress::Zip qw(zip $ZipError :constants);
use IPC::Open2 qw(open2);
use FindBin;
@@ -51,6 +50,7 @@
use KorAP::XML::TEI;
use KorAP::XML::TEI::Tokenization;
+use KorAP::XML::TEI::Zipper;
our $VERSION = '0.01';
@@ -114,13 +114,6 @@
my ( @tok_tokens_con, @tok_tokens_agg, $txt, $offset );
my $_base_tokenization_dir = "base"; # name of directory for storing files of dummy tokenization (only used in func. select_tokenization)
-# man IO::Compress::Zip
-# At present three compression methods are supported by IO::Compress::Zip, namely
-# Store (no compression at all), Deflate, Bzip2 and LZMA.
-# Note that to create Bzip2 content, the module "IO::Compress::Bzip2" must be installed.
-# Note that to create LZMA content, the module "IO::Compress::Lzma" must be installed.
-my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE; # The symbols, ZIP_CM_STORE, ZIP_CM_DEFLATE, ZIP_CM_BZIP2 and ZIP_CM_LZMA are used to select the compression method.
-
my $_DEBUG = 0; # set to 1 for minimal more debug output (no need to be parametrized)
my $_XCT_LN = 0; # only for debugging: include line numbers in elements of $tree_data
# (see also manpage of XML::CompactTree::XS)
@@ -155,9 +148,8 @@
# ~~~ variables ~~~
#
-my $zip; # IO::Compress::Zip object
-my $zip_outh; # handle for zip file output (stdout)
-my $first_write; # needed to decide wether to call '$zip->newStream' (for appending to zip file)
+# Initialize zipper
+my $zipper = KorAP::XML::TEI::Zipper->new;
my $input_fh; # input file handle (default: stdin)
my $buf_in; # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
@@ -287,9 +279,7 @@
$input_fh = *STDIN; # input file handle (default: stdin)
- $zip_outh = *STDOUT; # output file handle (default: stdout)
-
- $data_fl = 0; $first_write = 1;
+ $data_fl = 0;
$buf_in = $data = $dir = $dir_doc = $dir_crp = "";
$header_txt = $header_doc = $header_crp = "";
@@ -376,25 +366,11 @@
print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
- if ( $first_write ){
-
- $first_write = 0;
-
- # 1st time: create instance
- $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 0, Name => "$_root_dir$dir/$_data_file"
- or die "ERROR ('$_root_dir$dir/$_data_file'): zip failed: $ZipError\n"
-
- } else {
-
- # closes the current compressed data stream and starts a new one.
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_data_file" )
- or die "ERROR ('$_root_dir$dir/$_data_file'): zip failed: $ZipError\n"
- }
$data =~ s/(&|<|>)/$ent{$1}/g;
- $zip->print( "$data_prfx1$text_id_esc$data_prfx2$data$data_sfx" );
-
+ $zipper->new_stream("$_root_dir$dir/$_data_file")
+ ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
# ~ write structures ~
@@ -488,23 +464,10 @@
print STDERR "DEBUG ($0): Writing file $_root_dir$dir/$_header_file\n" if $_DEBUG;
- if ( $first_write ){
-
- $first_write = 0;
-
- $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
- Append => 0, Name => "$_root_dir$dir/$_header_file"
- or die "ERROR ('$_root_dir$dir/$_header_file'): zip failed: $ZipError\n"
-
- } else {
-
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_header_file" )
- or die "ERROR ('$_root_dir$dir/$_header_file'): zip failed: $ZipError\n"
- }
-
$header_txt = encode_utf8( $header_txt );
- $zip->print( "$header_prfx$header_txt" );
+ $zipper->new_stream("$_root_dir$dir/$_header_file")
+ ->print("$header_prfx$header_txt");
$header_txt = "";
}
@@ -547,7 +510,6 @@
# ~ end of document header ~
-
#print STDERR "end of doc header\n";
# write it to header.xml
@@ -569,23 +531,10 @@
print STDERR "DEBUG ($0): Writing file $_root_dir$dir_doc/$_header_file\n" if $_DEBUG;
- if ( $first_write ){
-
- $first_write = 0;
-
- $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 0,
- Name => "$_root_dir$dir_doc/$_header_file"
- or die "ERROR ('$_root_dir$dir_doc/$_header_file'): zip failed: $ZipError\n"
-
- } else {
-
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir_doc/$_header_file" )
- or die "ERROR ('$_root_dir$dir_doc/$_header_file'): zip failed: $ZipError\n"
- }
-
$header_doc = encode_utf8( $header_doc );
- $zip->print( "$header_prfx$header_doc" );
+ $zipper->new_stream("$_root_dir$dir_doc/$_header_file")
+ ->print("$header_prfx$header_doc");
$header_doc = $dir_doc = "";
}
@@ -682,23 +631,10 @@
print STDERR "DEBUG ($0): Writing file $_root_dir$dir_crp/$_header_file\n" if $_DEBUG;
- if ( $first_write ){
-
- $first_write = 0;
-
- $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
- Append => 0, Name => "$_root_dir$dir_crp/$_header_file"
- or die "ERROR ('$_root_dir$dir_crp/$_header_file'): zip failed: $ZipError\n";
-
- } else {
-
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir_crp/$_header_file" )
- or die "ERROR ('$_root_dir$dir_crp/$_header_file'): zip failed: $ZipError\n"
- }
-
$header_crp = encode_utf8( $header_crp );
- $zip->print( "$header_prfx$header_crp" );
+ $zipper->new_stream("$_root_dir$dir_crp/$_header_file")
+ ->print("$header_prfx$header_crp");
$header_crp = $dir_crp = "";
}
@@ -745,7 +681,7 @@
} #end: while
- $zip->close();
+ $zipper->close;
## DEPRECATED (only IDS-intern)
if( $_GEN_TOK_BAS ){
@@ -1145,7 +1081,7 @@
}
}
}else{
- $zip->close();
+ $zipper->close;
die "ERROR ($0): cannot retrieve token bounds from external tokenizer for text '$text_id' => Aborting ...\n";
}
##
@@ -1162,9 +1098,6 @@
my ( $fname, $textid_esc, $bounds ) = @_;
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => $fname)
- or die "ERROR ('$fname'): zip failed: $ZipError\n";
-
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\"$text_id_esc\" xmlns=\"http://ids-mannheim.de/ns/KorAP\""
." version=\"KorAP-0.4\">\n <spanList>\n";
@@ -1180,7 +1113,7 @@
$output .= " </spanList>\n</layer>";
- $zip->print ( "$output" );
+ $zipper->new_stream($fname)->print($output);
} # end: sub write_tokenization
@@ -1197,9 +1130,6 @@
return;
}
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_structure_dir/$_structure_file" )
- or die "ERROR ('$_root_dir$dir/$_structure_dir/$_structure_file'): zip failed: $ZipError\n";
-
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
.decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n";
@@ -1262,7 +1192,8 @@
$output = encode_utf8( $output );
- $zip->print( "$output" );
+ $zipper->new_stream("$_root_dir$dir/$_structure_dir/$_structure_file")
+ ->print($output);
#print STDERR "$0: write_structures(): DONE\n";
@@ -1281,9 +1212,6 @@
return;
}
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_tokens_dir/$_tokens_file" )
- or die "ERROR ('$_root_dir$dir/$_tokens_dir/$_tokens_file'): zip failed: $ZipError\n";
-
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
.decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n";
@@ -1361,7 +1289,8 @@
$output = encode_utf8( $output );
- $zip->print( "$output" );
+ $zipper->new_stream("$_root_dir$dir/$_tokens_dir/$_tokens_file")
+ ->print($output);
#print STDERR "$0: write_tokens(): DONE\n";
diff --git a/t/script.t b/t/script.t
index da45633..3ac91d1 100644
--- a/t/script.t
+++ b/t/script.t
@@ -38,6 +38,8 @@
'Processing'
);
+ok(-e $outzip, "File $outzip exists");
+
# Uncompress GOE/header.xml from zip file
my $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/header.xml');
diff --git a/t/zipper.t b/t/zipper.t
new file mode 100644
index 0000000..c9eabf0
--- /dev/null
+++ b/t/zipper.t
@@ -0,0 +1,48 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catfile/;
+use File::Temp ':POSIX';
+use IO::Uncompress::Unzip;
+
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../lib";
+};
+
+require_ok('KorAP::XML::TEI::Zipper');
+
+my $data;
+my $outzip = tmpnam();
+
+my $zip = KorAP::XML::TEI::Zipper->new($outzip);
+
+ok($zip, 'Zipper initialized');
+
+ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+ok($zip->new_stream('data/file2.txt')->print('world'), 'Write to appended stream');
+
+$zip->close;
+
+ok(-e $outzip, 'Zip exists');
+
+# Uncompress data/file1.txt from zip file
+my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt');
+
+$data .= $unzip->getline while !$unzip->eof;
+ok($unzip->close, 'Closed first member');
+
+is($data, 'hello', 'Data of first member correct');
+
+# Uncompress data/file2.txt from zip file
+$unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file2.txt');
+
+$data = '';
+$data .= $unzip->getline while !$unzip->eof;
+ok($unzip->close, 'Closed second member');
+
+is($data, 'world', 'Data of second member correct');
+
+unlink $outzip; # tmpnam() only returned a name; remove the zip written to it
+done_testing;