Improve Zipper to support a root directory and fix root directory normalization (strip all leading slashes, e.g. for "./")
Change-Id: I7f00a347fb4616cb8ae7df63374633c21d7ab0ce
diff --git a/lib/KorAP/XML/TEI/Zipper.pm b/lib/KorAP/XML/TEI/Zipper.pm
index c6d5143..c479239 100644
--- a/lib/KorAP/XML/TEI/Zipper.pm
+++ b/lib/KorAP/XML/TEI/Zipper.pm
@@ -19,8 +19,20 @@
# Output parameter, that may be a file or a file handle.
# Defaults to stdout.
sub new {
- my ($class, $out) = @_;
- bless [$out // '-'], $class;
+ my ($class, $root_dir, $out) = @_;
+
+ if ($root_dir) {
+
+ # base dir must always end with a slash
+ $root_dir .= '/';
+
+ # remove leading /
+ # (only relative paths allowed in IO::Compress::Zip)
+ # and redundant ./
+ $root_dir =~ s/^\.?\/+//;
+ };
+
+ bless [$out // '-', undef, $root_dir // ''], $class;
};
@@ -36,7 +48,7 @@
TextFlag => 1,
Method => $_COMPRESSION_METHOD,
Append => 0,
- Name => "$file"
+ Name => $self->[2] . $file
) or die $log->fatal("Zipping $file failed: $ZipError");
}
@@ -47,7 +59,7 @@
TextFlag => 1,
Method => $_COMPRESSION_METHOD,
Append => 1,
- Name => "$file"
+ Name => $self->[2] . $file
) or die $log->fatal("Zipping $file failed: $ZipError");
};
diff --git a/script/tei2korapxml b/script/tei2korapxml
index df4150a..ce84a4b 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -139,7 +139,7 @@
# Initialize zipper
-my $zipper = KorAP::XML::TEI::Zipper->new;
+my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
my $input_fh; # input file handle (default: stdin)
my $buf_in; # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
@@ -194,9 +194,6 @@
$fval = $fval2 = 0;
-$_root_dir .= '/'; # base dir must always end with a slash
-$_root_dir =~ s/^\.?\///; # remove leading / (only relative paths allowed in IO::Compress::Zip) and redundant ./
-
# Normalize regex for header parsing
for ($_CORP_HEADER_BEG,
$_DOC_HEADER_BEG,
@@ -325,7 +322,7 @@
# Tokenize and output
$ext_tok->tokenize($data)->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+ $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
$text_id_esc
);
};
@@ -334,12 +331,12 @@
# Tokenize and output
$cons_tok->tokenize($data)->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
+ $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
$text_id_esc
);
$aggr_tok->tokenize($data)->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
+ $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
$text_id_esc
);
@@ -354,10 +351,10 @@
# corresponding indices in $_tokens_file)
if ($_DEBUG) {
- $log->debug("Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file");
+ $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
};
- $zipper->new_stream("$_root_dir$dir/$_data_file")
+ $zipper->new_stream("$dir/$_data_file")
->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
# ~ write structures ~
@@ -369,7 +366,7 @@
if ($_TOKENS_PROC && !$tokens->empty) {
$tokens->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_tokens_dir/${_tokens_file}"),
+ $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
$text_id_esc,
$_INLINE_ANNOT
);
@@ -467,7 +464,7 @@
if ($header) {
# Write header to zip
- my $file = $_root_dir . $header->dir . '/' . $_header_file;
+ my $file = $header->dir . '/' . $_header_file;
$log->debug("Writing file $file") if $_DEBUG;
@@ -940,7 +937,7 @@
$output = encode( "UTF-8", $output ); # convert text string to binary string
- $zipper->new_stream("$_root_dir$dir/$_structure_dir/$_structure_file")
+ $zipper->new_stream("$dir/$_structure_dir/$_structure_file")
->print($output);
#print STDERR "$0: write_structures(): DONE\n";
diff --git a/t/tokenization.t b/t/tokenization.t
index 9fbf4d1..8a1aba6 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -132,7 +132,7 @@
subtest 'Test Zipper' => sub {
# Test Zipper
my ($fh, $outzip) = korap_tempfile('tokenize_zipper');
- my $zip = KorAP::XML::TEI::Zipper->new($outzip);
+ my $zip = KorAP::XML::TEI::Zipper->new('', $outzip);
$fh->close;
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
diff --git a/t/zipper.t b/t/zipper.t
index 86aa52a..1911620 100644
--- a/t/zipper.t
+++ b/t/zipper.t
@@ -13,37 +13,89 @@
require_ok('KorAP::XML::TEI::Zipper');
-my $data;
-my ($fh, $outzip) = korap_tempfile('zipper');
+subtest 'Create Zipper' => sub {
+ my $data;
+ my ($fh, $outzip) = korap_tempfile('zipper');
-my $zip = KorAP::XML::TEI::Zipper->new($outzip);
-$fh->close;
+ my $zip = KorAP::XML::TEI::Zipper->new('', $outzip);
+ $fh->close;
-ok($zip, 'Zipper initialized');
+ ok($zip, 'Zipper initialized');
-ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
-ok($zip->new_stream('data/file2.txt')->print('world'), 'Write to appended stream');
+ ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+ ok($zip->new_stream('data/file2.txt')->print('world'), 'Write to appended stream');
-$zip->close;
+ $zip->close;
-ok(-e $outzip, 'Zip exists');
+ ok(-e $outzip, 'Zip exists');
-# Uncompress GOE/header.xml from zip file
-my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt');
+ my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt');
-$data .= $unzip->getline while !$unzip->eof;
-ok($unzip->close, 'Closed');
+ $data .= $unzip->getline while !$unzip->eof;
+ ok($unzip->close, 'Closed');
-is($data, 'hello', 'Data correct');
+ is($data, 'hello', 'Data correct');
+
+ $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file2.txt');
+
+ $data = '';
+ $data .= $unzip->getline while !$unzip->eof;
+ ok($unzip->close, 'Closed');
+
+ is($data, 'world', 'Data correct');
+};
-# Uncompress data/file2.txt from zip file
-$unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file2.txt');
+subtest 'Create Zipper with root dir "."' => sub {
+ my $data;
+ my ($fh, $outzip) = korap_tempfile('zipper');
-$data = '';
-$data .= $unzip->getline while !$unzip->eof;
-ok($unzip->close, 'Closed');
+ my $zip = KorAP::XML::TEI::Zipper->new('.', $outzip);
+ $fh->close;
-is($data, 'world', 'Data correct');
+ ok($zip, 'Zipper initialized');
+
+ ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+ $zip->close;
+ ok(-e $outzip, 'Zip exists');
+
+ ok(IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt'), 'File exists');
+};
+
+
+subtest 'Create Zipper with root dir "subdir"' => sub {
+ my $data;
+ my ($fh, $outzip) = korap_tempfile('zipper');
+
+ my $zip = KorAP::XML::TEI::Zipper->new('subdir', $outzip);
+ $fh->close;
+
+ ok($zip, 'Zipper initialized');
+
+ ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+ $zip->close;
+ ok(-e $outzip, 'Zip exists');
+
+ ok(IO::Uncompress::Unzip->new($outzip, Name => 'subdir/data/file1.txt'), 'File exists');
+ ok(!IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt'), 'File exists not');
+};
+
+subtest 'Create Zipper with root dir "./"' => sub {
+ my $data;
+ my ($fh, $outzip) = korap_tempfile('zipper');
+
+ my $zip = KorAP::XML::TEI::Zipper->new('./', $outzip);
+ $fh->close;
+
+ ok($zip, 'Zipper initialized');
+
+ ok($zip->new_stream('data/file1.txt')->print('hello'), 'Write to initial stream');
+ $zip->close;
+ ok(-e $outzip, 'Zip exists');
+
+ # Check that data/file1.txt is stored without a root directory prefix
+ ok(IO::Uncompress::Unzip->new($outzip, Name => 'data/file1.txt'), 'File exists');
+};
+
done_testing;