Fixed bug in tokenizer to handle tokenizations that contain no word tokens
Change-Id: I4d9d5ffaefc45dc2220c17273dee70e05080137e
diff --git a/MANIFEST b/MANIFEST
index af56715..826961f 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -107,6 +107,7 @@
t/corpus/GOE/header.xml
t/corpus/VDI/header.xml
t/corpus/WDD/header.xml
+t/corpus/WDD15/header.xml
t/corpus/REI/header.xml
t/corpus/artificial/data.xml
t/corpus/artificial/header.xml
@@ -147,6 +148,7 @@
t/corpus/REI/RBR/header.xml
t/corpus/VDI/JAN/header.xml
t/corpus/WDD/G27/header.xml
+t/corpus/WDD15/A79/header.xml
t/corpus/WPD/00001/data.xml
t/corpus/WPD/00001/header.xml
t/corpus/WPD/00001/metadata.xml
@@ -425,6 +427,9 @@
t/corpus/WDD/G27/38989/data.xml
t/corpus/WDD/G27/38989/header.xml
t/corpus/WDD/G27/38989/text.txt
+t/corpus/WDD15/A79/83946/header.xml
+t/corpus/WDD15/A79/83946/data.xml
+t/corpus/WDD15/A79/83946/opennlp/tokens.xml
t/corpus/WPD/00001/base/metadata.xml
t/corpus/WPD/00001/base/paragraph.xml
t/corpus/WPD/00001/base/sentences.xml
diff --git a/lib/KorAP/XML/Index/MultiTermTokenStream.pm b/lib/KorAP/XML/Index/MultiTermTokenStream.pm
index f480be1..7824ae3 100644
--- a/lib/KorAP/XML/Index/MultiTermTokenStream.pm
+++ b/lib/KorAP/XML/Index/MultiTermTokenStream.pm
@@ -60,7 +60,8 @@
sub add_meta {
my $self = shift;
- my $mt = $self->pos(0)->add('-:' . shift);
+ my $pos_0 = $self->pos(0) or return;
+ my $mt = $pos_0->add('-:' . shift);
$mt->payload(shift);
$mt->store_offsets(0);
};
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 27fcd24..f5c2f8f 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -177,6 +177,8 @@
$have++;
};
+ return if $have == 0;
+
# Add token count
$mtts->add_meta('tokens', '<i>' . $have);
diff --git a/t/corpus/WDD15/A79/83946/data.xml b/t/corpus/WDD15/A79/83946/data.xml
new file mode 100644
index 0000000..01b3072
--- /dev/null
+++ b/t/corpus/WDD15/A79/83946/data.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<raw_text docid="WDD15_A79.83946" xmlns="http://ids-mannheim.de/ns/KorAP">
+ <metadata file="metadata.xml" />
+ <text>.</text>
+</raw_text>
\ No newline at end of file
diff --git a/t/corpus/WDD15/A79/83946/header.xml b/t/corpus/WDD15/A79/83946/header.xml
new file mode 100644
index 0000000..4b953a9
--- /dev/null
+++ b/t/corpus/WDD15/A79/83946/header.xml
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="text" pattern="text" status="new" version="1.0" TEIform="teiHeader">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>WDD15/A79.83946</textSigle>
+ <t.title assemblage="external">WDD15/A79.83946: Diskussion:Arteria interossea communis, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Arteria_interossea_communis: Wikipedia, 2015</t.title>
+ </titleStmt>
+ <editionStmt version="0"></editionStmt>
+ <publicationStmt>
+ <distributor></distributor>
+ <pubAddress></pubAddress>
+ <availability region="world" Default="n" status="unknown">CC-BY-SA</availability>
+ <pubDate></pubDate>
+ </publicationStmt>
+ <sourceDesc Default="n">
+ <biblStruct Default="n" status="draft">
+ <analytic>
+ <h.title type="main">Diskussion:Arteria interossea communis</h.title>
+ <h.title type="sub"></h.title>
+ <h.title type="abbr" level="m"></h.title>
+ <h.title type="abbr" level="a"></h.title>
+ <h.author>188.22.195.109, u.a.</h.author>
+ <editor></editor>
+ <imprint></imprint>
+ <biblScope type="subsume"></biblScope>
+ <biblScope type="pp"></biblScope>
+ <biblNote n="1"></biblNote>
+ </analytic>
+ <monogr>
+ <h.title type="main"></h.title>
+ <editor>wikipedia.org</editor>
+ <edition>
+ <further> Dump file "dewiki-20150501-pages-meta-current.xml" retrieved from http://dumps.wikimedia.org </further>
+ <kind></kind>
+ <appearance></appearance>
+ </edition>
+ <imprint>
+ <pubDate type="year">2015</pubDate>
+ <pubDate type="month">05</pubDate>
+ <pubDate type="day">01</pubDate>
+ </imprint>
+ <biblScope type="vol"></biblScope>
+ <biblScope type="volume-title"></biblScope>
+ </monogr>
+ </biblStruct>
+ <reference type="complete" assemblage="non-automatic">WDD15/A79.83946: Diskussion:Arteria interossea communis, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Arteria_interossea_communis: Wikipedia, 2015</reference>
+ <reference type="short" assemblage="regular">WDD15/A79.83946 Wikipedia; Diskussion:Arteria interossea communis, (Letzte Änderung 24.11.2013 ) 1.5.2015</reference>
+ </sourceDesc>
+ </fileDesc>
+ <encodingDesc>
+ <samplingDecl Default="n"></samplingDecl>
+ <editorialDecl Default="n">
+ <pagination type="no"></pagination>
+ </editorialDecl>
+ </encodingDesc>
+ <profileDesc>
+ <creation>
+ <creatDate>2013.11.24</creatDate>
+ <creatRef>(Letzte Änderung 24.11.2013)</creatRef>
+ <creatRefShort>(Letzte Änderung 24.11.2013)</creatRefShort>
+ </creation>
+ <textDesc Default="n">
+ <textTypeArt>Diskussion</textTypeArt>
+ <textDomain></textDomain>
+ </textDesc>
+ </profileDesc>
+ </idsHeader>
\ No newline at end of file
diff --git a/t/corpus/WDD15/A79/83946/opennlp/tokens.xml b/t/corpus/WDD15/A79/83946/opennlp/tokens.xml
new file mode 100644
index 0000000..78fd76b
--- /dev/null
+++ b/t/corpus/WDD15/A79/83946/opennlp/tokens.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<layer xmlns="http://ids-mannheim.de/ns/KorAP" docid="WDD15_A79.83946" VERSION="KorAP-0.4">
+ <spanList>
+ <span id="s_0" from="0" to="1"/>
+ </spanList>
+</layer>
diff --git a/t/corpus/WDD15/A79/header.xml b/t/corpus/WDD15/A79/header.xml
new file mode 100644
index 0000000..aab1ec6
--- /dev/null
+++ b/t/corpus/WDD15/A79/header.xml
@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="document" pattern="text" status="new" version="1.0" TEIform="teiHeader">
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>WDD15/A79</dokumentSigle>
+ <d.title>Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe A, Teil 79</d.title>
+ </titleStmt>
+ <publicationStmt>
+ <distributor/>
+ <pubAddress/>
+ <availability region="world">CC-BY-SA</availability>
+ <pubDate/>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct Default="n">
+ <monogr>
+ <h.title type="main"></h.title>
+ <imprint/>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ </idsHeader>
\ No newline at end of file
diff --git a/t/corpus/WDD15/header.xml b/t/corpus/WDD15/header.xml
new file mode 100644
index 0000000..039033c
--- /dev/null
+++ b/t/corpus/WDD15/header.xml
@@ -0,0 +1,60 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="corpus" pattern="allesaußerZtg/Zschr" status="new" version="1.0" TEIform="teiHeader">
+ <fileDesc>
+ <titleStmt>
+ <korpusSigle>WDD15</korpusSigle>
+ <c.title>Wikipedia.de 2015 Diskussionen zu Artikeln</c.title>
+ </titleStmt>
+ <editionStmt version="1.0"></editionStmt>
+ <publicationStmt>
+ <distributor>Institut für Deutsche Sprache</distributor>
+ <pubAddress>Postfach 10 16 21, D-68016 Mannheim</pubAddress>
+ <telephone>+49 (0)621 1581 0</telephone>
+ <eAddress type="www">http://www.ids-mannheim.de</eAddress>
+ <eAddress type="www">http://www.ids-mannheim.de/kl/projekte/korpora/</eAddress>
+ <eAddress type="email">dereko@ids-mannheim.de</eAddress>
+ <availability status="restricted">This document, the IDS-Wikipedia.de-Corpus, is part of the Archive of General Reference Corpora at the IDS. It is published under the Creative Commons Attribution-ShareAlike License. See http://creativecommons.org/licenses/by-sa/3.0/legalcode for details. See http://www.ids-mannheim.de/kl/projekte/korpora/releases.html on how to refer to this document.</availability>
+ <pubDate type="year">2015</pubDate>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct Default="n">
+ <monogr>
+ <h.title type="main">Wikipedia</h.title>
+ <h.author/>
+ <editor>wikipedia.org</editor>
+ <edition>
+ <further>Dump file "dewiki-20150501-pages-meta-current.xml" retrieved from http://dumps.wikimedia.org</further>
+ <kind/>
+ <appearance/>
+ </edition>
+ <imprint>
+ <publisher>Wikipedia</publisher>
+ <pubPlace>URL:http://de.wikipedia.org</pubPlace>
+ </imprint>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ <encodingDesc>
+ <editorialDecl>
+ <conformance>This document conforms to I5 (see http://jtei.revues.org/508)</conformance>
+ <transduction>This document has been generated via a two-stage conversion by Eliza Margaretha. In the first stage, wikitext from a Wikidump is converted into WikiXML by the WikiXMLConverter tool and in the second stage, WikiXML is converted into I5 by the WikiI5Converter tool. The converters are available at http://corpora.ids-mannheim.de/pub/tools/. Reference: Eliza Margaretha and Harald Lüngen (2014): Building Linguistic Corporafrom Wikipedia Articles and Discussions. In: Journal of LanguageTechnology and Computational Linguistics (JLCL) 29 (2). Special Issue onBuilding and Annotating Corpora of Computer-mediated Communication:Issues and Challenges at the Interface between Computational and CorpusLinguistics, edited by Michael Beißwenger, Nelleke Oostdijk, AngelikaStorrer and Henk van den Heuvel. URL:http://www.jlcl.org/2014_Heft2/Heft2-2014.pdf</transduction>
+ </editorialDecl>
+ </encodingDesc>
+ <profileDesc>
+ <langUsage>
+ <language id="de" usage="100">Deutsch</language>
+ </langUsage>
+ <textDesc>
+ <textType>Diskussionen zu Enzyklopädie-Artikeln</textType>
+ <textTypeRef/>
+ </textDesc>
+ </profileDesc>
+ <revisionDesc>
+ <listChange>
+ <change when="2015-09-10" who="#EM">initial public release</change>
+ </listChange>
+ </revisionDesc>
+ </idsHeader>
\ No newline at end of file
diff --git a/t/real/wdd.t b/t/real/wdd.t
index eb5db64..952682d 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -265,5 +265,35 @@
# diag "No test for xip dependency";
+$path = catdir(dirname(__FILE__), '../corpus/WDD15/A79/83946');
+
+ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+
+is($doc->text_sigle, 'WDD15/A79/83946', 'Correct text sigle');
+is($doc->doc_sigle, 'WDD15/A79', 'Correct document sigle');
+is($doc->corpus_sigle, 'WDD15', 'Correct corpus sigle');
+
+# Get tokenization
+$tokens = KorAP::XML::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => $token_base_foundry,
+ layer => $token_base_layer,
+ name => 'tokens'
+);
+ok($tokens, 'Token Object is fine');
+
+# Initialize log4perl object
+Log::Log4perl->init({
+ 'log4perl.rootLogger' => 'DEBUG, STDERR',
+ 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
+ 'log4perl.appender.STDERR.layout' => 'PatternLayout',
+ 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
+});
+
+ok(!$tokens->parse, 'Token parsing is fine');
+
+
done_testing;
__END__
diff --git a/t/real/wpd.t b/t/real/wpd.t
index 9ed0237..a343790 100644
--- a/t/real/wpd.t
+++ b/t/real/wpd.t
@@ -107,15 +107,6 @@
is($tokens->foundry, 'Base', 'Foundry');
is($tokens->layer, 'tokens_aggr', 'Layer');
-
-# Initialize log4perl object
-Log::Log4perl->init({
- 'log4perl.rootLogger' => 'DEBUG, STDERR',
- 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
- 'log4perl.appender.STDERR.layout' => 'PatternLayout',
- 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
-});
-
ok($tokens->add('CoreNLP', 'Constituency'), 'Add Structure');
$output = $tokens->to_data;
diff --git a/t/script/archive.t b/t/script/archive.t
index aee0860..a0b4dd6 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -128,6 +128,24 @@
};
ok(-d $output, 'Ouput directory exists');
+
+
+$input = catfile($f, '..', 'corpus', 'WDD15', 'A79', '83946');
+$call = join(
+ ' ',
+ 'perl', $script,
+ '--input' => $input
+);
+
+# Test without compression: a text without word tokens must report "no base tokenization"
+{
+ local $SIG{__WARN__} = sub {};
+ my $out = stderr_from(sub { system($call); });
+
+ like($out, qr!no base tokenization!s, $call);
+};
+
+
unlink($output);
done_testing;