Fixed primary data handling for data with leading or trailing whitespace

Change-Id: Ib831e8ab819e7b8ca5767b2c2f61d0afc00deba9
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index ddc859d..3c750a0 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -1,7 +1,7 @@
 package KorAP::XML::Krill;
 use Mojo::Base -base;
 use Mojo::ByteStream 'b';
-use Mojo::Util qw/encode/;
+use Mojo::Util qw/encode html_unescape/;
 use Mojo::File;
 use Scalar::Util qw/weaken/;
 use XML::Fast;
@@ -104,8 +104,14 @@
     return;
   };
 
-  # Get primary data
-  my $pd = $rt->{text};
+  # Get primary data (was "my $pd = $rt->{text};" before).
+  # Unfortunately, xml2hash removes whitespace at the start and at
+  # the end of a text node, making it impossible to deal with cmc data.
+  $file =~ $ENC_RE;
+  $file = $file->decode($2 // 'UTF-8');
+  my $start = index($file, '<text>') + 6;
+  my $end = index($file, '</text>');
+  my $pd = html_unescape substr($file, $start, $end - $start);
 
   unless ($pd) {
     $self->log->warn($unable . ': No primary data found');
diff --git a/t/annotation/mdp_dependency.t b/t/annotation/mdp_dependency.t
index 478bbc1..1506d8b 100644
--- a/t/annotation/mdp_dependency.t
+++ b/t/annotation/mdp_dependency.t
@@ -71,6 +71,8 @@
 
 is($stream->[0]->[0], '-:tokens$<i>3555', 'Token count');
 
+is($stream->[-1]->[-1], 's:978-3-89487-607-4', 'Last token');
+
 # Term-to-term
 is($stream->[0]->[1], '<:mdp/d:NMOD$<b>32<i>5', 'Term-to-Term');
 is($stream->[5]->[0], '>:mdp/d:NMOD$<b>32<i>0', 'Term-to-Term');