Improve handling of broken data (empty input, unopened zip stream)
Change-Id: I39afa2ad58ce39db3ce171876a0c3a5c540271df
diff --git a/Changes b/Changes
index d122f20..f30ae16 100644
--- a/Changes
+++ b/Changes
@@ -9,6 +9,7 @@
- Introduce --skip-inline-token-annotations parameter
- Deprecate KORAPXMLTEI_INLINE environment variable
in favor of --skip-inline-token-annotations
+ - Improve script handling of broken data
1.00 2021-02-18 Release
- -s option added that uses sentence boundaries
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 493fce5..aa8cd0f 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -115,11 +115,13 @@
CHECK:
+ return '' unless $html;
+
$html =~ s/<!--.*?-->//g; # remove all comments in actual line
# Remove comment spanning over several lines
# No closing comment found
- if ( index($html, '-->') == -1) {
+ if (index($html, '-->') == -1) {
# Opening comment found
$i = index($html, '<!--');
diff --git a/lib/KorAP/XML/TEI/Zipper.pm b/lib/KorAP/XML/TEI/Zipper.pm
index c479239..3f96370 100644
--- a/lib/KorAP/XML/TEI/Zipper.pm
+++ b/lib/KorAP/XML/TEI/Zipper.pm
@@ -3,6 +3,7 @@
use warnings;
use Log::Any qw($log);
use IO::Compress::Zip qw($ZipError :constants);
+use Scalar::Util 'blessed';
# man IO::Compress::Zip
# At present three compression methods are supported by IO::Compress::Zip, namely
@@ -69,6 +70,10 @@
# Close stream and reset zipper
sub close {
+ unless (blessed $_[0]->[1]) {
+ $log->fatal("No opened zip file to close");
+ return;
+ };
$_[0]->[1]->close;
@{$_[0]} = ($_[0]->[0]);
};
diff --git a/t/script.t b/t/script.t
index 614ba1d..bd3f28d 100644
--- a/t/script.t
+++ b/t/script.t
@@ -735,4 +735,18 @@
};
+subtest 'Broken data testing' => sub {
+ my $file = catfile($f, 'data', 'wikipedia.txt');
+
+ my $t = test_tei2korapxml(
+ tmp => 'script_ginkgo',
+ file => $file,
+ param => '-ti',
+ env => 'KORAPXMLTEI_DEBUG=1'
+ )->stderr_like(qr!No opened zip file to close!)
+ ->stderr_like(qr!Debugging is activated!);
+};
+
+
+
done_testing;