Warn on invalid certainty values
Change-Id: Ia6ec333036d25422b39cb72e7911da0c6571b95a
diff --git a/Changes b/Changes
index dc172e7..0e8520f 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,8 @@
0.43 2022-01-17
- Fix temporary extract handling when defined
in a config file.
+ - Warn on non-numeric certainty values in TreeTagger
+ and fall back to a certainty of 1.
0.42 2022-01-11
- Replaced Log4perl with Log::Any.
diff --git a/Makefile.PL b/Makefile.PL
index 6032c0d..ff1de01 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -45,6 +45,7 @@
'Archive::Tar' => 2.24,
'Clone' => 0.45,
'List::Util' => 1.45,
+ 'Scalar::Util' => 1.60,
},
MIN_PERL_VERSION => '5.016',
test => {
diff --git a/lib/KorAP/XML/Annotation/TreeTagger/Morpho.pm b/lib/KorAP/XML/Annotation/TreeTagger/Morpho.pm
index 2e65abc..81fc525 100644
--- a/lib/KorAP/XML/Annotation/TreeTagger/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/TreeTagger/Morpho.pm
@@ -1,6 +1,7 @@
package KorAP::XML::Annotation::TreeTagger::Morpho;
use KorAP::XML::Annotation::Base;
use POSIX 'floor';
+use Scalar::Util 'looks_like_number';
sub parse {
my $self = shift;
@@ -28,7 +29,14 @@
my $certainty = 0;
foreach (@$content) {
if ($_->{-name} eq 'certainty') {
- $certainty = $_->{'#text'};
+
+ if (looks_like_number($_->{'#text'})) {
+ $certainty = $_->{'#text'};
+ }
+ else {
+ $certainty = 1;
+ $$self->log->warn('"' . $_->{'#text'} . '" is not a valid certainty value');
+ }
}
else {
push @val, $_
diff --git a/t/annotation/corpus/doc/0001/tree_tagger/morpho.xml b/t/annotation/corpus/doc/0001/tree_tagger/morpho.xml
index 983ef05..c2c71bc 100644
--- a/t/annotation/corpus/doc/0001/tree_tagger/morpho.xml
+++ b/t/annotation/corpus/doc/0001/tree_tagger/morpho.xml
@@ -180,7 +180,7 @@
<f name="lex">
<fs>
<f name="lemma">Betrieb</f>
- <f name="certainty">1.000000</f>
+ <f name="certainty">'hurra'</f>
<f name="ctag">NN</f>
</fs>
</f>
diff --git a/t/annotation/tt_morpho.t b/t/annotation/tt_morpho.t
index 616f08c..705f7d0 100644
--- a/t/annotation/tt_morpho.t
+++ b/t/annotation/tt_morpho.t
@@ -28,6 +28,8 @@
is($data->{stream}->[10]->[3], 'tt/l:ein$<b>129<b>253', 'POS');
is($data->{stream}->[10]->[4], 'tt/p:PTKVZ$<b>129<b>253', 'POS');
+is($data->{stream}->[13]->[3], 'tt/l:Betrieb', 'POS');
+
is($data->{stream}->[-1]->[3], 'tt/l:werden', 'POS');
is($data->{stream}->[-1]->[4], 'tt/p:VAFIN', 'POS');