Improved DRuKoLa meta data handling

Change-Id: I50a79baa595429abeb8b56b11b1942f6e2ed8374
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 29b1291..398c045 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -69,12 +69,14 @@
   else {
     # Load file
     $file = b(Mojo::File->new($data_xml)->slurp);
+
     try {
       local $SIG{__WARN__} = sub {
         $error = 1;
       };
 
       $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
+
     } catch  {
       $self->log->warn($unable);
       $error = 1;
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
index 0dd5330..34d99e5 100644
--- a/lib/KorAP/XML/Meta/I5.pm
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -63,7 +63,8 @@
     my $translator;
     if ($editor && $editor->attr('role') && $editor->attr('role') eq 'translator') {
       # Translator is only supported on the text level currently
-      $self->{translator} = _squish $editor->all_text;
+      $translator = _squish $editor->all_text;
+      $self->{translator} = $translator if $translator;
       $editor = undef;
     }
     else {
@@ -112,7 +113,7 @@
     # Corpus title not yet given
     unless ($self->{corpus_title}) {
       if ($title = $dom->at('fileDesc > titleStmt > c\.title')) {
-        $title = $title->all_text;
+        $title = _squish($title->all_text);
 
         if ($title) {
           $self->{corpus_title} = _remove_prefix($title, $self->corpus_sigle);
@@ -125,7 +126,7 @@
   elsif ($type eq 'doc') {
     unless ($self->{doc_title}) {
       if ($title = $dom->at('fileDesc > titleStmt > d\.title')) {
-        $title = $title->all_text;
+        $title = _squish($title->all_text);
 
         if ($title) {
           $self->{doc_title} = _remove_prefix($title, $self->doc_sigle);
@@ -138,7 +139,7 @@
   elsif ($type eq 'text') {
     unless ($self->{title}) {
       if ($title = $dom->at('fileDesc > titleStmt > t\.title')) {
-        $title = $title->all_text;
+        $title = _squish($title->all_text);
         if ($title) {
           $self->{title} = _remove_prefix($title, $self->text_sigle);
         };
@@ -242,14 +243,14 @@
 
     $temp->find("catRef")->each(
       sub {
-        my ($ign, @ttopic) = split('\.', $_->attr('target'));
+        my ($ign, @ttopic) = grep { $_ } map { _squish($_) } split('\.', $_->attr('target'));
         push(@topic, @ttopic);
       }
     );
     $self->{text_class} = [@topic] if @topic > 0;
 
     my $kws = $self->{keywords};
-    my @keywords = $temp->find("h\.keywords > keyTerm")->each;
+    my @keywords = $temp->find("h\.keywords > keyTerm")->map(sub {_squish($_) })->grep(sub { $_ })->each;
     push(@$kws, @keywords) if @keywords > 0;
   };
 
diff --git a/t/corpus/BBU2/Blog/83701_a_82376/data.xml b/t/corpus/BBU2/Blog/83701_a_82376/data.xml
new file mode 100644
index 0000000..630b2dc
--- /dev/null
+++ b/t/corpus/BBU2/Blog/83701_a_82376/data.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<raw_text docid="BBU_BLOG.83701_a_82376" xmlns="http://ids-mannheim.de/ns/KorAP">
+  <metadata file="metadata.xml" />
+  <text>.</text>
+</raw_text>
diff --git a/t/corpus/BBU2/Blog/83701_a_82376/header.xml b/t/corpus/BBU2/Blog/83701_a_82376/header.xml
new file mode 100644
index 0000000..94f5be2
--- /dev/null
+++ b/t/corpus/BBU2/Blog/83701_a_82376/header.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="text" pattern="text" status="new" version="1.1" TEIform="teiHeader">
+        <fileDesc>
+          <titleStmt>
+            <textSigle>Corola-Bucurenci/Blog.83701_a_82376</textSigle>
+            <t.title>Confesiunile unui misogin</t.title>
+          </titleStmt>
+          <publicationStmt>
+            <distributor/>
+            <pubAddress/>
+            <availability region="world">QAO-NC</availability>
+            <pubDate/>
+          </publicationStmt>
+          <sourceDesc>
+            <biblStruct>
+              <monogr>
+                <h.author>Dragoș Bucurenci</h.author>
+                <editor role="translator">-</editor>
+                <imprint>
+                  <pubDate type="year">2013</pubDate>
+                  <pubDate type="month">11</pubDate>
+                  <pubDate type="day">18</pubDate>
+                  <pubPlace>URL:http://www.bucurenci.ro</pubPlace>
+                </imprint>
+              </monogr>
+            </biblStruct>
+          </sourceDesc>
+        </fileDesc>
+         <profileDesc>
+             <textDesc>
+                 <textType>-.-</textType>
+             <textClass>
+                 <catRef target="-.-" scheme="topic"/>
+             </textClass>
+         </profileDesc>
+      </idsHeader>
\ No newline at end of file
diff --git a/t/corpus/BBU2/Blog/header.xml b/t/corpus/BBU2/Blog/header.xml
new file mode 100644
index 0000000..a6afb53
--- /dev/null
+++ b/t/corpus/BBU2/Blog/header.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
+      <fileDesc>
+        <titleStmt>
+          <dokumentSigle>Corola-Bucurenci/Blog</dokumentSigle>
+          <d.title>-</d.title>
+        </titleStmt>
+        <publicationStmt>
+          <distributor/>
+          <pubAddress/>
+          <availability region="world">[...]</availability>
+          <pubDate/>
+        </publicationStmt>
+        <sourceDesc>
+          <biblStruct>
+            <monogr>
+              <h.title type="main"/>
+              <edition>
+                <further/>
+                <kind/>
+                <appearance/>
+              </edition>
+              <imprint/>
+            </monogr>
+          </biblStruct>
+        </sourceDesc>
+      </fileDesc>
+    </idsHeader>
\ No newline at end of file
diff --git a/t/corpus/BBU2/header.xml b/t/corpus/BBU2/header.xml
new file mode 100644
index 0000000..7901a2e
--- /dev/null
+++ b/t/corpus/BBU2/header.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="corpus" status="new" version="1.1" TEIform="teiHeader">
+    <fileDesc>
+      <titleStmt>
+        <korpusSigle>Corola-Bucurenci</korpusSigle>
+        <c.title>Corola-Bucurenci</c.title>
+      </titleStmt>
+    </fileDesc>
+    <profileDesc>
+      <langUsage>
+        <language id="ro" usage="100">Romanian</language>
+      </langUsage>
+      <!-- 
+           <textDesc>
+           This element is suppressed in p5!
+           <channel mode="w">written</channel>
+           </textDesc>
+      -->
+    </profileDesc>
+  </idsHeader>
\ No newline at end of file
diff --git a/t/real/drukola.t b/t/real/drukola.t
index 4656f18..1eb4587 100644
--- a/t/real/drukola.t
+++ b/t/real/drukola.t
@@ -106,5 +106,23 @@
 like($token, qr!drukola/m:gender:feminine!, 'data');
 like($token, qr!drukola/p:NOUN!, 'data');
 
+
+# Regression fixture with placeholder metadata values ("-" and "-.-").
+# BBU2/Blog/83701_a_82376
+$path = catdir(dirname(__FILE__), '../corpus/BBU2/Blog/83701_a_82376');
+
+# The placeholder title, translator and topic in the fixture headers
+# must not end up as metadata fields.
+ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+
+$meta = $doc->meta;
+
+ok(!exists $meta->{doc_title}, 'No doc title');
+ok(!exists $meta->{translator}, 'No translator');
+
+ok(!exists $meta->{text_class}, 'No text class');
+
+
 done_testing;
 __END__