Fixed bug in tokenizer to handle tokenizations without any word tokens

Change-Id: I4d9d5ffaefc45dc2220c17273dee70e05080137e
diff --git a/MANIFEST b/MANIFEST
index af56715..826961f 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -107,6 +107,7 @@
 t/corpus/GOE/header.xml
 t/corpus/VDI/header.xml
 t/corpus/WDD/header.xml
+t/corpus/WDD15/header.xml
 t/corpus/REI/header.xml
 t/corpus/artificial/data.xml
 t/corpus/artificial/header.xml
@@ -147,6 +148,7 @@
 t/corpus/REI/RBR/header.xml
 t/corpus/VDI/JAN/header.xml
 t/corpus/WDD/G27/header.xml
+t/corpus/WDD15/A79/header.xml
 t/corpus/WPD/00001/data.xml
 t/corpus/WPD/00001/header.xml
 t/corpus/WPD/00001/metadata.xml
@@ -425,6 +427,9 @@
 t/corpus/WDD/G27/38989/data.xml
 t/corpus/WDD/G27/38989/header.xml
 t/corpus/WDD/G27/38989/text.txt
+t/corpus/WDD15/A79/83946/header.xml
+t/corpus/WDD15/A79/83946/data.xml
+t/corpus/WDD15/A79/83946/opennlp/tokens.xml
 t/corpus/WPD/00001/base/metadata.xml
 t/corpus/WPD/00001/base/paragraph.xml
 t/corpus/WPD/00001/base/sentences.xml
diff --git a/lib/KorAP/XML/Index/MultiTermTokenStream.pm b/lib/KorAP/XML/Index/MultiTermTokenStream.pm
index f480be1..7824ae3 100644
--- a/lib/KorAP/XML/Index/MultiTermTokenStream.pm
+++ b/lib/KorAP/XML/Index/MultiTermTokenStream.pm
@@ -60,7 +60,8 @@
 
 sub add_meta {
   my $self = shift;
-  my $mt = $self->pos(0)->add('-:' . shift);
+  my $pos_0 = $self->pos(0) or return;
+  my $mt = $pos_0->add('-:' . shift);
   $mt->payload(shift);
   $mt->store_offsets(0);
 };
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 27fcd24..f5c2f8f 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -177,6 +177,8 @@
     $have++;
   };
 
+  return if $have == 0;
+
   # Add token count
   $mtts->add_meta('tokens', '<i>' . $have);
 
diff --git a/t/corpus/WDD15/A79/83946/data.xml b/t/corpus/WDD15/A79/83946/data.xml
new file mode 100644
index 0000000..01b3072
--- /dev/null
+++ b/t/corpus/WDD15/A79/83946/data.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<raw_text docid="WDD15_A79.83946" xmlns="http://ids-mannheim.de/ns/KorAP">
+  <metadata file="metadata.xml" />
+  <text>.</text>
+</raw_text>
\ No newline at end of file
diff --git a/t/corpus/WDD15/A79/83946/header.xml b/t/corpus/WDD15/A79/83946/header.xml
new file mode 100644
index 0000000..4b953a9
--- /dev/null
+++ b/t/corpus/WDD15/A79/83946/header.xml
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="text" pattern="text" status="new" version="1.0" TEIform="teiHeader">
+    <fileDesc>
+     <titleStmt>
+      <textSigle>WDD15/A79.83946</textSigle>
+      <t.title assemblage="external">WDD15/A79.83946: Diskussion:Arteria interossea communis, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Arteria_interossea_communis: Wikipedia, 2015</t.title>
+     </titleStmt>
+     <editionStmt version="0"></editionStmt>
+     <publicationStmt>
+      <distributor></distributor>
+      <pubAddress></pubAddress>
+      <availability region="world" Default="n" status="unknown">CC-BY-SA</availability>
+      <pubDate></pubDate>
+     </publicationStmt>
+     <sourceDesc Default="n">
+      <biblStruct Default="n" status="draft">
+       <analytic>
+        <h.title type="main">Diskussion:Arteria interossea communis</h.title>
+        <h.title type="sub"></h.title>
+        <h.title type="abbr" level="m"></h.title>
+        <h.title type="abbr" level="a"></h.title>
+        <h.author>188.22.195.109, u.a.</h.author>
+        <editor></editor>
+        <imprint></imprint>
+        <biblScope type="subsume"></biblScope>
+        <biblScope type="pp"></biblScope>
+        <biblNote n="1"></biblNote>
+       </analytic>
+       <monogr>
+        <h.title type="main"></h.title>
+        <editor>wikipedia.org</editor>
+        <edition>
+         <further> Dump file "dewiki-20150501-pages-meta-current.xml" retrieved from http://dumps.wikimedia.org </further>
+         <kind></kind>
+         <appearance></appearance>
+        </edition>
+        <imprint>
+         <pubDate type="year">2015</pubDate>
+         <pubDate type="month">05</pubDate>
+         <pubDate type="day">01</pubDate>
+        </imprint>
+        <biblScope type="vol"></biblScope>
+        <biblScope type="volume-title"></biblScope>
+       </monogr>
+      </biblStruct>
+      <reference type="complete" assemblage="non-automatic">WDD15/A79.83946: Diskussion:Arteria interossea communis, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Arteria_interossea_communis: Wikipedia, 2015</reference>
+      <reference type="short" assemblage="regular">WDD15/A79.83946 Wikipedia; Diskussion:Arteria interossea communis, (Letzte Änderung 24.11.2013 ) 1.5.2015</reference>
+     </sourceDesc>
+    </fileDesc>
+    <encodingDesc>
+     <samplingDecl Default="n"></samplingDecl>
+     <editorialDecl Default="n">
+      <pagination type="no"></pagination>
+     </editorialDecl>
+    </encodingDesc>
+    <profileDesc>
+     <creation>
+      <creatDate>2013.11.24</creatDate>
+      <creatRef>(Letzte Änderung 24.11.2013)</creatRef>
+      <creatRefShort>(Letzte Änderung 24.11.2013)</creatRefShort>
+     </creation>
+     <textDesc Default="n">
+      <textTypeArt>Diskussion</textTypeArt>
+      <textDomain></textDomain>
+     </textDesc>
+    </profileDesc>
+   </idsHeader>
\ No newline at end of file
diff --git a/t/corpus/WDD15/A79/83946/opennlp/tokens.xml b/t/corpus/WDD15/A79/83946/opennlp/tokens.xml
new file mode 100644
index 0000000..78fd76b
--- /dev/null
+++ b/t/corpus/WDD15/A79/83946/opennlp/tokens.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="WDD15_A79.83946" VERSION="KorAP-0.4">
+   <spanList>
+      <span id="s_0" from="0" to="1"/>
+   </spanList>
+</layer>
diff --git a/t/corpus/WDD15/A79/header.xml b/t/corpus/WDD15/A79/header.xml
new file mode 100644
index 0000000..aab1ec6
--- /dev/null
+++ b/t/corpus/WDD15/A79/header.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="document" pattern="text" status="new" version="1.0" TEIform="teiHeader">
+   <fileDesc>
+    <titleStmt>
+     <dokumentSigle>WDD15/A79</dokumentSigle>
+     <d.title>Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe A, Teil 79</d.title>
+    </titleStmt>
+    <publicationStmt>
+     <distributor/>
+     <pubAddress/>
+     <availability region="world">CC-BY-SA</availability>
+     <pubDate/>
+    </publicationStmt>
+    <sourceDesc>
+     <biblStruct Default="n">
+      <monogr>
+       <h.title type="main"></h.title>
+       <imprint/>
+      </monogr>
+     </biblStruct>
+    </sourceDesc>
+   </fileDesc>
+  </idsHeader>
\ No newline at end of file
diff --git a/t/corpus/WDD15/header.xml b/t/corpus/WDD15/header.xml
new file mode 100644
index 0000000..039033c
--- /dev/null
+++ b/t/corpus/WDD15/header.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="corpus" pattern="allesaußerZtg/Zschr" status="new" version="1.0" TEIform="teiHeader">
+  <fileDesc>
+   <titleStmt>
+    <korpusSigle>WDD15</korpusSigle>
+    <c.title>Wikipedia.de 2015 Diskussionen zu Artikeln</c.title>
+   </titleStmt>
+   <editionStmt version="1.0"></editionStmt>
+   <publicationStmt>
+    <distributor>Institut für Deutsche Sprache</distributor>
+    <pubAddress>Postfach 10 16 21, D-68016 Mannheim</pubAddress>
+    <telephone>+49 (0)621 1581 0</telephone>
+    <eAddress type="www">http://www.ids-mannheim.de</eAddress>
+    <eAddress type="www">http://www.ids-mannheim.de/kl/projekte/korpora/</eAddress>
+    <eAddress type="email">dereko@ids-mannheim.de</eAddress>
+    <availability status="restricted">This document, the IDS-Wikipedia.de-Corpus, is part of the Archive of General Reference Corpora at the IDS. It is published under the Creative Commons Attribution-ShareAlike License. See http://creativecommons.org/licenses/by-sa/3.0/legalcode for details. See http://www.ids-mannheim.de/kl/projekte/korpora/releases.html on how to refer to this document.</availability>
+    <pubDate type="year">2015</pubDate>
+   </publicationStmt>
+   <sourceDesc>
+    <biblStruct Default="n">
+     <monogr>
+      <h.title type="main">Wikipedia</h.title>
+      <h.author/>
+      <editor>wikipedia.org</editor>
+      <edition>
+       <further>Dump file "dewiki-20150501-pages-meta-current.xml" retrieved from http://dumps.wikimedia.org</further>
+       <kind/>
+       <appearance/>
+      </edition>
+      <imprint>
+       <publisher>Wikipedia</publisher>
+       <pubPlace>URL:http://de.wikipedia.org</pubPlace>
+      </imprint>
+     </monogr>
+    </biblStruct>
+   </sourceDesc>
+  </fileDesc>
+  <encodingDesc>
+   <editorialDecl>
+    <conformance>This document conforms to I5 (see http://jtei.revues.org/508)</conformance>
+    <transduction>This document has been generated via a two-stage conversion by Eliza Margaretha. In the first stage, wikitext from a Wikidump is converted into WikiXML by the WikiXMLConverter tool and in the second stage, WikiXML is converted into I5 by the WikiI5Converter tool. The converters are available at http://corpora.ids-mannheim.de/pub/tools/. Reference: Eliza Margaretha and Harald Lüngen (2014): Building Linguistic Corpora from Wikipedia Articles and Discussions. In: Journal of Language Technology and Computational Linguistics (JLCL) 29 (2). Special Issue on Building and Annotating Corpora of Computer-mediated Communication: Issues and Challenges at the Interface between Computational and Corpus Linguistics, edited by Michael Beißwenger, Nelleke Oostdijk, Angelika Storrer and Henk van den Heuvel. URL:http://www.jlcl.org/2014_Heft2/Heft2-2014.pdf</transduction>
+   </editorialDecl>
+  </encodingDesc>
+  <profileDesc>
+   <langUsage>
+    <language id="de" usage="100">Deutsch</language>
+   </langUsage>
+   <textDesc>
+    <textType>Diskussionen zu Enzyklopädie-Artikeln</textType>
+    <textTypeRef/>
+   </textDesc>
+  </profileDesc>
+  <revisionDesc>
+   <listChange>
+    <change when="2015-09-10" who="#EM">initial public release</change>
+   </listChange>
+  </revisionDesc>
+ </idsHeader>
\ No newline at end of file
diff --git a/t/real/wdd.t b/t/real/wdd.t
index eb5db64..952682d 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -265,5 +265,35 @@
 
 # diag "No test for xip dependency";
 
+$path = catdir(dirname(__FILE__), '../corpus/WDD15/A79/83946');
+
+ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+
+is($doc->text_sigle, 'WDD15/A79/83946', 'Correct text sigle');
+is($doc->doc_sigle, 'WDD15/A79', 'Correct document sigle');
+is($doc->corpus_sigle, 'WDD15', 'Correct corpus sigle');
+
+# Get tokenization
+$tokens = KorAP::XML::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => $token_base_foundry,
+  layer => $token_base_layer,
+  name => 'tokens'
+);
+ok($tokens, 'Token Object is fine');
+
+# Initialize log4perl object
+Log::Log4perl->init({
+  'log4perl.rootLogger' => 'DEBUG, STDERR',
+  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
+  'log4perl.appender.STDERR.layout' => 'PatternLayout',
+  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
+});
+
+ok(!$tokens->parse, 'Token parsing is fine');
+
+
 done_testing;
 __END__
diff --git a/t/real/wpd.t b/t/real/wpd.t
index 9ed0237..a343790 100644
--- a/t/real/wpd.t
+++ b/t/real/wpd.t
@@ -107,15 +107,6 @@
 is($tokens->foundry, 'Base', 'Foundry');
 is($tokens->layer, 'tokens_aggr', 'Layer');
 
-
-# Initialize log4perl object
-Log::Log4perl->init({
-  'log4perl.rootLogger' => 'DEBUG, STDERR',
-  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
-  'log4perl.appender.STDERR.layout' => 'PatternLayout',
-  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
-});
-
 ok($tokens->add('CoreNLP', 'Constituency'), 'Add Structure');
 
 $output = $tokens->to_data;
diff --git a/t/script/archive.t b/t/script/archive.t
index aee0860..a0b4dd6 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -128,6 +128,24 @@
 };
 
 ok(-d $output, 'Ouput directory exists');
+
+
+$input = catfile($f, '..', 'corpus', 'WDD15', 'A79', '83946');
+$call = join(
+  ' ',
+  'perl', $script,
+  '--input' => $input
+);
+
+# Test without compression: expect the 'no base tokenization' message on STDERR
+{
+  local $SIG{__WARN__} = sub {};
+  my $out = stderr_from(sub { system($call); });
+
+  like($out, qr!no base tokenization!s, $call);
+};
+
+
 unlink($output);
 
 done_testing;