Fixed primary data handling for data with white space at the beginning and at the end
Change-Id: Ib831e8ab819e7b8ca5767b2c2f61d0afc00deba9
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index ddc859d..3c750a0 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -1,7 +1,7 @@
package KorAP::XML::Krill;
use Mojo::Base -base;
use Mojo::ByteStream 'b';
-use Mojo::Util qw/encode/;
+use Mojo::Util qw/encode html_unescape/;
use Mojo::File;
use Scalar::Util qw/weaken/;
use XML::Fast;
@@ -104,8 +104,14 @@
return;
};
- # Get primary data
- my $pd = $rt->{text};
# Get primary data (was "my $pd = $rt->{text};" before)
+ # Unfortunately xml2hash removes spaces at the start and at
+ # the end of a text node, making it impossible to deal with cmc data.
+ $file =~ $ENC_RE;
+ $file = $file->decode($2 // 'UTF-8');
+ my $start = index($file, '<text>') + 6;
+ my $end = index($file, '</text>');
+ my $pd = html_unescape substr($file, $start, $end - $start);
unless ($pd) {
$self->log->warn($unable . ': No primary data found');
diff --git a/t/annotation/mdp_dependency.t b/t/annotation/mdp_dependency.t
index 478bbc1..1506d8b 100644
--- a/t/annotation/mdp_dependency.t
+++ b/t/annotation/mdp_dependency.t
@@ -71,6 +71,8 @@
is($stream->[0]->[0], '-:tokens$<i>3555', 'Token count');
+is($stream->[-1]->[-1], 's:978-3-89487-607-4', 'Last token');
+
# Term-to-term
is($stream->[0]->[1], '<:mdp/d:NMOD$<b>32<i>5', 'Term-to-Term');
is($stream->[5]->[0], '>:mdp/d:NMOD$<b>32<i>0', 'Term-to-Term');