Create a zip factory (KorAP::XML::TEI::Zipper) for simpler handling of zip streams
Change-Id: I66fb1e980437f9b931d71b8bc9fde54bda2aee6f
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 8fffecd..6136d47 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -41,7 +41,6 @@
use XML::CompactTree::XS;
use XML::LibXML::Reader;
-use IO::Compress::Zip qw(zip $ZipError :constants);
use IPC::Open2 qw(open2);
use FindBin;
@@ -51,6 +50,7 @@
use KorAP::XML::TEI;
use KorAP::XML::TEI::Tokenization;
+use KorAP::XML::TEI::Zipper;
our $VERSION = '0.01';
@@ -114,13 +114,6 @@
my ( @tok_tokens_con, @tok_tokens_agg, $txt, $offset );
my $_base_tokenization_dir = "base"; # name of directory for storing files of dummy tokenization (only used in func. select_tokenization)
-# man IO::Compress::Zip
-# At present three compression methods are supported by IO::Compress::Zip, namely
-# Store (no compression at all), Deflate, Bzip2 and LZMA.
-# Note that to create Bzip2 content, the module "IO::Compress::Bzip2" must be installed.
-# Note that to create LZMA content, the module "IO::Compress::Lzma" must be installed.
-my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE; # The symbols, ZIP_CM_STORE, ZIP_CM_DEFLATE, ZIP_CM_BZIP2 and ZIP_CM_LZMA are used to select the compression method.
-
my $_DEBUG = 0; # set to 1 for minimal more debug output (no need to be parametrized)
my $_XCT_LN = 0; # only for debugging: include line numbers in elements of $tree_data
# (see also manpage of XML::CompactTree::XS)
@@ -155,9 +148,8 @@
# ~~~ variables ~~~
#
-my $zip; # IO::Compress::Zip object
-my $zip_outh; # handle for zip file output (stdout)
-my $first_write; # needed to decide wether to call '$zip->newStream' (for appending to zip file)
+# Initialize zipper
+my $zipper = KorAP::XML::TEI::Zipper->new;
my $input_fh; # input file handle (default: stdin)
my $buf_in; # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
@@ -287,9 +279,7 @@
$input_fh = *STDIN; # input file handle (default: stdin)
- $zip_outh = *STDOUT; # output file handle (default: stdout)
-
- $data_fl = 0; $first_write = 1;
+ $data_fl = 0;
$buf_in = $data = $dir = $dir_doc = $dir_crp = "";
$header_txt = $header_doc = $header_crp = "";
@@ -376,25 +366,11 @@
print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
- if ( $first_write ){
-
- $first_write = 0;
-
- # 1st time: create instance
- $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 0, Name => "$_root_dir$dir/$_data_file"
- or die "ERROR ('$_root_dir$dir/$_data_file'): zip failed: $ZipError\n"
-
- } else {
-
- # closes the current compressed data stream and starts a new one.
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_data_file" )
- or die "ERROR ('$_root_dir$dir/$_data_file'): zip failed: $ZipError\n"
- }
$data =~ s/(&|<|>)/$ent{$1}/g;
- $zip->print( "$data_prfx1$text_id_esc$data_prfx2$data$data_sfx" );
-
+ $zipper->new_stream("$_root_dir$dir/$_data_file")
+ ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
# ~ write structures ~
@@ -488,23 +464,10 @@
print STDERR "DEBUG ($0): Writing file $_root_dir$dir/$_header_file\n" if $_DEBUG;
- if ( $first_write ){
-
- $first_write = 0;
-
- $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
- Append => 0, Name => "$_root_dir$dir/$_header_file"
- or die "ERROR ('$_root_dir$dir/$_header_file'): zip failed: $ZipError\n"
-
- } else {
-
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_header_file" )
- or die "ERROR ('$_root_dir$dir/$_header_file'): zip failed: $ZipError\n"
- }
-
$header_txt = encode_utf8( $header_txt );
- $zip->print( "$header_prfx$header_txt" );
+ $zipper->new_stream("$_root_dir$dir/$_header_file")
+ ->print("$header_prfx$header_txt");
$header_txt = "";
}
@@ -547,7 +510,6 @@
# ~ end of document header ~
-
#print STDERR "end of doc header\n";
# write it to header.xml
@@ -569,23 +531,10 @@
print STDERR "DEBUG ($0): Writing file $_root_dir$dir_doc/$_header_file\n" if $_DEBUG;
- if ( $first_write ){
-
- $first_write = 0;
-
- $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 0,
- Name => "$_root_dir$dir_doc/$_header_file"
- or die "ERROR ('$_root_dir$dir_doc/$_header_file'): zip failed: $ZipError\n"
-
- } else {
-
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir_doc/$_header_file" )
- or die "ERROR ('$_root_dir$dir_doc/$_header_file'): zip failed: $ZipError\n"
- }
-
$header_doc = encode_utf8( $header_doc );
- $zip->print( "$header_prfx$header_doc" );
+ $zipper->new_stream("$_root_dir$dir_doc/$_header_file")
+ ->print("$header_prfx$header_doc");
$header_doc = $dir_doc = "";
}
@@ -682,23 +631,10 @@
print STDERR "DEBUG ($0): Writing file $_root_dir$dir_crp/$_header_file\n" if $_DEBUG;
- if ( $first_write ){
-
- $first_write = 0;
-
- $zip = new IO::Compress::Zip $zip_outh, Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
- Append => 0, Name => "$_root_dir$dir_crp/$_header_file"
- or die "ERROR ('$_root_dir$dir_crp/$_header_file'): zip failed: $ZipError\n";
-
- } else {
-
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir_crp/$_header_file" )
- or die "ERROR ('$_root_dir$dir_crp/$_header_file'): zip failed: $ZipError\n"
- }
-
$header_crp = encode_utf8( $header_crp );
- $zip->print( "$header_prfx$header_crp" );
+ $zipper->new_stream("$_root_dir$dir_crp/$_header_file")
+ ->print("$header_prfx$header_crp");
$header_crp = $dir_crp = "";
}
@@ -745,7 +681,7 @@
} #end: while
- $zip->close();
+ $zipper->close;
## DEPRECATED (only IDS-intern)
if( $_GEN_TOK_BAS ){
@@ -1145,7 +1081,7 @@
}
}
}else{
- $zip->close();
+ $zipper->close;
die "ERROR ($0): cannot retrieve token bounds from external tokenizer for text '$text_id' => Aborting ...\n";
}
##
@@ -1162,9 +1098,6 @@
my ( $fname, $textid_esc, $bounds ) = @_;
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => $fname)
- or die "ERROR ('$fname'): zip failed: $ZipError\n";
-
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\"$text_id_esc\" xmlns=\"http://ids-mannheim.de/ns/KorAP\""
." version=\"KorAP-0.4\">\n <spanList>\n";
@@ -1180,7 +1113,7 @@
$output .= " </spanList>\n</layer>";
- $zip->print ( "$output" );
+ $zipper->new_stream($fname)->print($output);
} # end: sub write_tokenization
@@ -1197,9 +1130,6 @@
return;
}
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_structure_dir/$_structure_file" )
- or die "ERROR ('$_root_dir$dir/$_structure_dir/$_structure_file'): zip failed: $ZipError\n";
-
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
.decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n";
@@ -1262,7 +1192,8 @@
$output = encode_utf8( $output );
- $zip->print( "$output" );
+ $zipper->new_stream("$_root_dir$dir/$_structure_dir/$_structure_file")
+ ->print($output);
#print STDERR "$0: write_structures(): DONE\n";
@@ -1281,9 +1212,6 @@
return;
}
- $zip->newStream( Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, Append => 1, Name => "$_root_dir$dir/$_tokens_dir/$_tokens_file" )
- or die "ERROR ('$_root_dir$dir/$_tokens_dir/$_tokens_file'): zip failed: $ZipError\n";
-
$output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\""
.decode_utf8($text_id_esc)."\" xmlns=\"http://ids-mannheim.de/ns/KorAP\" version=\"KorAP-0.4\">\n <spanList>\n";
@@ -1361,7 +1289,8 @@
$output = encode_utf8( $output );
- $zip->print( "$output" );
+ $zipper->new_stream("$_root_dir$dir/$_tokens_dir/$_tokens_file")
+ ->print($output);
#print STDERR "$0: write_tokens(): DONE\n";