Improve error handling: expose tokenizer failure reasons via an error attribute
Change-Id: If55488859011ac541412d1132d1f279e50178605
diff --git a/lib/KorAP/XML/Batch/File.pm b/lib/KorAP/XML/Batch/File.pm
index 33ef565..2dd5171 100644
--- a/lib/KorAP/XML/Batch/File.pm
+++ b/lib/KorAP/XML/Batch/File.pm
@@ -62,7 +62,7 @@
# Unable to process base tokenization
unless ($tokens->parse) {
- $self->{log}->error(($output // $input) . " can't be processed - no working base tokenization");
+ $self->{log}->error(($output // $input) . " can't be processed - " . $tokens->error);
return;
};
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index a8ebb8c..92c8d1f 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -36,6 +36,8 @@
has layer => 'Tokens';
has non_word_tokens => 0;
+has 'error';
+
has log => sub {
if (Log::Log4perl->initialized()) {
state $log = Log::Log4perl->get_logger(__PACKAGE__);
@@ -60,7 +62,8 @@
my $path = $self->path . lc($self->foundry) . '/' . $layer_file;
unless (-e $path) {
- $self->log->warn('Unable to load base tokenization: ' . $path);
+ $self->error('Unable to load base tokenization: ' . $path);
+ $self->log->warn($self->error);
return;
};
@@ -93,8 +96,10 @@
)->{layer}->{spanList};
} catch {
- $self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : ''));
+ $self->error('Token error in ' . $path . ($_ ? ': ' . $_ : ''));
+ $self->log->warn($self->error);
+ # Keep the failure flag: the "return if $error" guard after this block depends on it
$error = 1;
};
return if $error;
@@ -124,9 +127,8 @@
# Token is undefined
unless (defined $token) {
+ $self->error("Tokenization with failing offsets in $path");
$self->log->warn("Unable to find substring [$from-$to] in $path");
- $self->log->error("Tokenization with failing offsets in $path");
- # next;
return;
};
@@ -192,7 +194,10 @@
$have++;
};
- return if $have == 0;
+ if ($have == 0) {
+ $self->error('No tokens found in ' . $path);
+ return;
+ };
# Add token count
$mtts->add_meta('tokens', '<i>' . $have);
diff --git a/t/script/archive.t b/t/script/archive.t
index 598c8e5..93edc8d 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -170,9 +170,9 @@
# Test without compression
{
local $SIG{__WARN__} = sub {};
- my $out = stderr_from(sub { system($call); });
+ my $out = combined_from(sub { system($call); });
- like($out, qr!no working base tokenization!s, $call);
+ like($out, qr!No tokens found!s, $call);
};
my $input_quotes = catfile($f, '..', 'corpus', 'archive_quotes.zip');