Introduce support for Gingko
Change-Id: I6d93ebd402b94141ec1d8847605e243b22397eb0
diff --git a/Changes b/Changes
index cd886d7..a3fc7ea 100644
--- a/Changes
+++ b/Changes
@@ -5,6 +5,7 @@
- Define resources in Makefile.
- Add GitHub action for CI.
- Remove MANIFEST file from repo.
+ - Introduce Gingko support.
0.41 2020-08-10
- Added support for RWK annotations.
diff --git a/Readme.pod b/Readme.pod
index 1c68abf..89abd5f 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -370,6 +370,9 @@
DRuKoLa
#Morpho
+ Gingko
+ #Morpho
+
Glemm
#Morpho
diff --git a/lib/KorAP/XML/Annotation/Gingko/Morpho.pm b/lib/KorAP/XML/Annotation/Gingko/Morpho.pm
new file mode 100644
index 0000000..6897a3c
--- /dev/null
+++ b/lib/KorAP/XML/Annotation/Gingko/Morpho.pm
@@ -0,0 +1,44 @@
+package KorAP::XML::Annotation::Gingko::Morpho;
+use KorAP::XML::Annotation::Base;
+
+sub parse {
+  my $self = shift;
+
+  # Hand add_tokendata a callback for foundry 'gingko', layer 'morpho'.
+  # Returns 1 on success; returns empty when add_tokendata yields false.
+  $$self->add_tokendata(
+    foundry => 'gingko',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->get_pos);
+
+      my $content = $token->get_hash->{fs}->{f};
+
+      my $found;
+      foreach my $f (@{$content->{fs}->{f}}) {
+        my $name = $f->{-name};
+
+        # pos tag
+        if (($name eq 'pos') &&
+              ($found = $f->{'#text'})) {
+          # Prefix must be 'gingko' to match the foundry and layer_info
+          $mtt->add_by_term('gingko/p:' . $found);
+        }
+
+        # lemma tag ('<unknown>' lemmas are not indexed)
+        elsif (($name eq 'lemma')
+                 && ($found = $f->{'#text'})
+                 && $found ne '<unknown>') {
+          $mtt->add_by_term('gingko/l:' . $found);
+        };
+      };
+    }) or return;
+  return 1;
+};
+
+sub layer_info {
+  return [qw(gingko/l=tokens gingko/p=tokens)]; # l: lemma terms, p: pos terms
+}
+
+1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index e3eb0cb..2376a5e 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -151,9 +151,12 @@
# - Added support for Redewiedergabe-Korpus structure
# annotations, based on sentence and paragraph milestones
# - Added support for Redewiedergabe-Korpus morphology
+#
+# 2021/10/11
+# - Introduced support for Gingko
# ----------------------------------------------------------
-our $LAST_CHANGE = '2021/02/08';
+our $LAST_CHANGE = '2021/10/11';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -411,6 +414,10 @@
push(@layers,
['DRuKoLa', 'Morpho']);
+# Gingko
+push(@layers,
+ ['Gingko', 'Morpho']);
+
# Glemm
push(@layers,
['Glemm', 'Morpho']);
@@ -1371,6 +1378,9 @@
Glemm
#Morpho
+ Gingko
+ #Morpho
+
HNC
#Morpho
diff --git a/t/real/corpus/Gingko/ATZ07/JAN/00001/data.xml b/t/real/corpus/Gingko/ATZ07/JAN/00001/data.xml
new file mode 100644
index 0000000..d549681
--- /dev/null
+++ b/t/real/corpus/Gingko/ATZ07/JAN/00001/data.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<raw_text docid="ATZ07_JAN.00001"
+ xmlns="http://ids-mannheim.de/ns/KorAP">
+ <metadata file="metadata.xml" />
+ <text>Ein neues Energiemanagement-Konzept für das elektrische Bordnetz Energiemanagement-Systeme für heutige Kraftfahrzeuge sollen ohne Komfort einbußen die Fahrzeugstartfähigkeit sicherstellen und durch einen möglichst optimalen Betrieb der Batterie vorzeitige Batterieausfälle vermeiden.</text>
+</raw_text>
\ No newline at end of file
diff --git a/t/real/corpus/Gingko/ATZ07/JAN/00001/gingko/morpho.xml b/t/real/corpus/Gingko/ATZ07/JAN/00001/gingko/morpho.xml
new file mode 100644
index 0000000..5df0854
--- /dev/null
+++ b/t/real/corpus/Gingko/ATZ07/JAN/00001/gingko/morpho.xml
@@ -0,0 +1,311 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="ATZ07_JAN.00001"
+ xmlns="http://ids-mannheim.de/ns/KorAP"
+ version="KorAP-0.4">
+ <spanList>
+ <span id="s0" from="0" to="3" l="5">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ART</f>
+ <f name="lemma">eine</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s1" from="4" to="9" l="5">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADJA</f>
+ <f name="lemma">neu</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s2" from="10" to="35" l="5">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3" from="36" to="39" l="5">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">APPR</f>
+ <f name="lemma">für</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s4" from="40" to="43" l="5">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ART</f>
+ <f name="lemma">die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s5" from="44" to="55" l="5">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADJA</f>
+ <f name="lemma">elektrisch</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s6" from="56" to="64" l="5">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Bordnetz</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s7" from="65" to="90" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s8" from="91" to="94" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">APPR</f>
+ <f name="lemma">für</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s9" from="95" to="102" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADJA</f>
+ <f name="lemma">heutig</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s10" from="103" to="117" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Kraftfahrzeug</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s11" from="118" to="124" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">VMFIN</f>
+ <f name="lemma">sollen</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s12" from="125" to="129" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">APPR</f>
+ <f name="lemma">ohne</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s13" from="130" to="137" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Komfort</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s14" from="138" to="146" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">VVFIN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s15" from="147" to="150" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ART</f>
+ <f name="lemma">die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s16" from="151" to="173" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s17" from="174" to="187" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">VVINF</f>
+ <f name="lemma">sicherstellen</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s18" from="188" to="191" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">KON</f>
+ <f name="lemma">und</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s19" from="192" to="197" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">APPR</f>
+ <f name="lemma">durch</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s20" from="198" to="203" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ART</f>
+ <f name="lemma">eine</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s21" from="204" to="213" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADV</f>
+ <f name="lemma">möglichst</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s22" from="214" to="223" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADJA</f>
+ <f name="lemma">optimal</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s23" from="224" to="231" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Betrieb</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s24" from="232" to="235" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ART</f>
+ <f name="lemma">die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s25" from="236" to="244" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">Batterie</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s26" from="245" to="255" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">ADJA</f>
+ <f name="lemma">vorzeitig</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s27" from="256" to="272" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">NN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s28" from="273" to="282" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">VVINF</f>
+ <f name="lemma">vermeiden</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s29" from="282" to="283" l="7">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="pos">$.</f>
+ <f name="lemma">.</f>
+ <f name="join">left</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ </spanList>
+</layer>
\ No newline at end of file
diff --git a/t/real/corpus/Gingko/ATZ07/JAN/00001/header.xml b/t/real/corpus/Gingko/ATZ07/JAN/00001/header.xml
new file mode 100644
index 0000000..e71c153
--- /dev/null
+++ b/t/real/corpus/Gingko/ATZ07/JAN/00001/header.xml
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN"
+ "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader TEIform="teiHeader" pattern="text" status="new" type="text" version="1.0">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>ATZ07/JAN.00001</textSigle>
+ <t.title assemblage="external">ATZ07/JAN.00001 ATZ - Automobiltechnische Zeitschrift, Januar 2007, Nr.109, S. 10-15; Ein neues Energiemanagement-Konzept für das elektrische Bordnetz</t.title>
+ </titleStmt>
+ <publicationStmt>
+ <distributor/>
+ <pubAddress/>
+ <availability region="world">QAO-NC</availability>
+ <pubDate type="year">2021</pubDate>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <analytic>
+ <h.title type="main">Ein neues Energiemanagement-Konzept für das elektrische Bordnetz</h.title>
+ <h.title type="sub"/>
+ <h.author>Theuerkauf, Heinz; Schmidt, Matthias</h.author>
+ <imprint/>
+ <biblScope type="pp">S. 10-15</biblScope>
+ <biblNote n="DOI">10.1007/BF03221854</biblNote>
+ </analytic>
+ <monogr>
+ <h.title type="main">ATZ - Automobiltechnische Zeitschrift</h.title>
+ <h.title type="short">ATZ</h.title>
+ <imprint>
+ <publisher>Springer Fachmedien GmbH</publisher>
+ <pubPlace>Wiesbaden</pubPlace>
+ <pubDate type="year">2007</pubDate>
+ </imprint>
+ <biblScope type="issue">1</biblScope>
+ <biblScope type="vol">109</biblScope>
+ </monogr>
+ </biblStruct>
+ <reference type="complete" assemblage="external">ATZ07/JAN.00001 ATZ - Automobiltechnische Zeitschrift, Januar 2007, Nr.109, S. 10-15 - Theuerkauf, H.; Schmidt, M.: Ein neues Energiemanagement-Konzept für das elektrische Bordnetz</reference>
+ <reference type="short" assemblage="external">ATZ07/JAN.00001 ATZ, 2007, Nr.109</reference>
+ <reference type="super" assemblage="external">ATZ07/JAN ATZ - Automobiltechnische Zeitschrift, Wiesbaden: Springer Fachmedien GmbH; 2007</reference>
+ </sourceDesc>
+ </fileDesc>
+ <encodingDesc>
+ <editorialDecl>
+ <pagination type="no"/>
+ <transduction n="1">gingko-XML by Leipzig University</transduction>
+ <transduction n="2">Sentence splitting using NLTK by Leipzig
+ University</transduction>
+ <transduction n="3">Tokenisation, Lemmatisation, POS-annotation using TreeTagger
+ with STTS by Leipzig University</transduction>
+ <transduction n="4">XSL Conversion to I5 by IDS</transduction>
+ <correction n="lemma">no</correction>
+ </editorialDecl>
+ <tagsDecl>
+ <tagUsage gi="w" occurs="2191">used to mark a single token</tagUsage>
+ </tagsDecl>
+ </encodingDesc>
+ <profileDesc>
+ <textClass>
+ <catRef n="1" target="topic.wissenschaft.populaerwissenschaft" scheme="topic"/>
+ </textClass>
+ <textDesc>
+ <textType>Zeitschrift: Fachzeitschrift</textType>
+ <textTypeRef>Fachzeitschrift</textTypeRef>
+ <textTypeArt>Fachartikel</textTypeArt>
+ </textDesc>
+ <creation>
+ <creatDate>2007.01.</creatDate>
+ <creatRef>Januar 2007</creatRef>
+ <creatRefShort>Januar 2007</creatRefShort>
+ </creation>
+ <textClass>
+ <catRef n="0.6" target="topic.technik-industrie.kfz" scheme="topic"/>
+ <classCode scheme="gingkoGenre.top">wissenschaftlich</classCode>
+ <classCode scheme="gingkoGenre.sub">wissenschaftlich</classCode>
+ </textClass>
+ </profileDesc>
+ </idsHeader>
\ No newline at end of file
diff --git a/t/real/corpus/Gingko/ATZ07/JAN/00001/struct/structure.xml b/t/real/corpus/Gingko/ATZ07/JAN/00001/struct/structure.xml
new file mode 100644
index 0000000..b2c5757
--- /dev/null
+++ b/t/real/corpus/Gingko/ATZ07/JAN/00001/struct/structure.xml
@@ -0,0 +1,408 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="ATZ07_JAN.00001"
+ xmlns="http://ids-mannheim.de/ns/KorAP"
+ version="KorAP-0.4">
+ <spanList>
+ <span id="s0" from="0" to="283" l="1">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">text</f>
+ </fs>
+ </span>
+ <span id="s1" from="0" to="283" l="2">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">body</f>
+ </fs>
+ </span>
+ <span id="s2" from="0" to="283" l="3">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">div</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="type">article</f>
+ <f name="n">0</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s3" from="0" to="64" l="4">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">head</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="type"></f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s4" from="0" to="3" l="5">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ART</f>
+ <f name="lemma">eine</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s5" from="4" to="9" l="5">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ADJA</f>
+ <f name="lemma">neu</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s6" from="10" to="35" l="5">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">NN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s7" from="36" to="39" l="5">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">APPR</f>
+ <f name="lemma">für</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s8" from="40" to="43" l="5">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ART</f>
+ <f name="lemma">die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s9" from="44" to="55" l="5">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ADJA</f>
+ <f name="lemma">elektrisch</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s10" from="56" to="64" l="5">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">NN</f>
+ <f name="lemma">Bordnetz</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s11" from="64" to="64" l="4">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">head</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="type"></f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s12" from="64" to="283" l="4">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">div</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="type">sec0</f>
+ <f name="n">1</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s13" from="64" to="283" l="5">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">p</f>
+ </fs>
+ </span>
+ <span id="s14" from="64" to="283" l="6">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">s</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="type">s</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s15" from="65" to="90" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">NN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s16" from="91" to="94" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">APPR</f>
+ <f name="lemma">für</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s17" from="95" to="102" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ADJA</f>
+ <f name="lemma">heutig</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s18" from="103" to="117" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">NN</f>
+ <f name="lemma">Kraftfahrzeug</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s19" from="118" to="124" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">VMFIN</f>
+ <f name="lemma">sollen</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s20" from="125" to="129" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">APPR</f>
+ <f name="lemma">ohne</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s21" from="130" to="137" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">NN</f>
+ <f name="lemma">Komfort</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s22" from="138" to="146" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">VVFIN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s23" from="147" to="150" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ART</f>
+ <f name="lemma">die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s24" from="151" to="173" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">NN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s25" from="174" to="187" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">VVINF</f>
+ <f name="lemma">sicherstellen</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s26" from="188" to="191" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">KON</f>
+ <f name="lemma">und</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s27" from="192" to="197" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">APPR</f>
+ <f name="lemma">durch</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s28" from="198" to="203" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ART</f>
+ <f name="lemma">eine</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s29" from="204" to="213" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ADV</f>
+ <f name="lemma">möglichst</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s30" from="214" to="223" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ADJA</f>
+ <f name="lemma">optimal</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s31" from="224" to="231" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">NN</f>
+ <f name="lemma">Betrieb</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s32" from="232" to="235" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ART</f>
+ <f name="lemma">die</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s33" from="236" to="244" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">NN</f>
+ <f name="lemma">Batterie</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s34" from="245" to="255" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">ADJA</f>
+ <f name="lemma">vorzeitig</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s35" from="256" to="272" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">NN</f>
+ <f name="lemma">&lt;unknown&gt;</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s36" from="273" to="282" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">VVINF</f>
+ <f name="lemma">vermeiden</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ <span id="s37" from="282" to="283" l="7">
+ <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="name">w</f>
+ <f name="attr">
+ <fs type="attr">
+ <f name="pos">$.</f>
+ <f name="lemma">.</f>
+ <f name="join">left</f>
+ </fs>
+ </f>
+ </fs>
+ </span>
+ </spanList>
+</layer>
\ No newline at end of file
diff --git a/t/real/corpus/Gingko/ATZ07/JAN/header.xml b/t/real/corpus/Gingko/ATZ07/JAN/header.xml
new file mode 100644
index 0000000..609b353
--- /dev/null
+++ b/t/real/corpus/Gingko/ATZ07/JAN/header.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN"
+ "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader TEIform="teiHeader" pattern="text" type="document" version="1.1">
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>ATZ07/JAN</dokumentSigle>
+ <d.title>Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus: ATZ -
+ Automobiltechnische Zeitschrift, Januar 2007</d.title>
+ <editor>
+ <orgName type="project" from="2017" to="2021">Muster in der Sprache der Ingenieurwissenschaften</orgName>
+ <persName>Prof. Dr. Christian Fandrych, University of Leipzig</persName>
+ </editor>
+ </titleStmt>
+ <publicationStmt>
+ <distributor> Institut für Deutsche Sprache </distributor>
+ <pubAddress> Postfach 10 16 21, D-68016 Mannheim </pubAddress>
+ <telephone> +49 (0)621 1581 0 </telephone>
+ <availability region="ids">QAO-NC</availability>
+ <pubDate>2021</pubDate>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <monogr>
+ <h.title type="main">Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus</h.title>
+ <editor>Prof. Dr. Christian Fandrych, Leipzig University</editor>
+ <imprint>
+ <publisher>Herder-Institut, Leipzig University</publisher>
+ <pubPlace>Leipzig</pubPlace>
+ <pubDate type="year">2021</pubDate>
+ </imprint>
+ <biblNote>https://www.philol.uni-leipzig.de/herder-institut/forschung/projekte/laufende-projekte/gingko/</biblNote>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ </idsHeader>
\ No newline at end of file
diff --git a/t/real/corpus/Gingko/ATZ07/header.xml b/t/real/corpus/Gingko/ATZ07/header.xml
new file mode 100644
index 0000000..b8f8e5c
--- /dev/null
+++ b/t/real/corpus/Gingko/ATZ07/header.xml
@@ -0,0 +1,280 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng"
+ type="application/xml"
+ schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN"
+ "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader TEIform="teiHeader" pattern="Ztg/Zschr" status="new" type="corpus" version="1.1">
+ <fileDesc>
+ <titleStmt>
+ <korpusSigle>ATZ07</korpusSigle>
+ <c.title>Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus: ATZ - Automobiltechnische Zeitschrift, 2007</c.title>
+ <editor>
+ <orgName type="project" from="2017" to="2021">Muster in der Sprache der
+ Ingenieurwissenschaften</orgName>
+ <persName>Prof. Dr. Christian Fandrych, University of Leipzig</persName>
+ </editor>
+ <respStmt>
+ <persName from="2017" to="2019">Unbekannt</persName>
+ </respStmt>
+ </titleStmt>
+ <publicationStmt>
+ <distributor> Institut für Deutsche Sprache </distributor>
+ <pubAddress> Postfach 10 16 21, D-68016 Mannheim </pubAddress>
+ <telephone> +49 (0)621 1581 0 </telephone>
+ <availability region="ids">QAO-NC</availability>
+ <pubDate>2021</pubDate>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <monogr>
+ <h.title type="main">Gingko - Geschriebenes Ingenieurwissenschaftliches
+ Korpus</h.title>
+ <editor>Christian Fandrych</editor>
+ <imprint>
+ <publisher>Herder-Institut der Universität Leipzig</publisher>
+ <pubPlace>Leipzig</pubPlace>
+ <pubDate type="year">2021</pubDate>
+ </imprint>
+ <biblNote n="url">https://www.philol.uni-leipzig.de/herder-institut/forschung/projekte/laufende-projekte/gingko/</biblNote>
+ <biblNote n="collection">Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus</biblNote>
+ <biblNote n="collectionShort">Gingko</biblNote>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ <encodingDesc>
+ <projectDesc>
+ <p>Project "Muster in der Sprache der Ingenieurwissenschaften"</p>
+ <p>Universität Greifswald, Institut für Deutsche Philologie (2017-2019)</p>
+ <p>Universität Leipzig, Herder-Institut (2020-2021)</p>
+ <p>Third-party funding by Deutsche Forschungsgemeinschaft (DFG), AOBJ: 692723</p>
+ <p>Project head 2017-2019 Jun.-Prof. Dr. Antje Heine</p>
+ <p>Project head 2020-2021 Prof. Dr. Christian Fandrych</p>
+ <p xml:lang="de">Das Projektkorpus Gingko (Geschriebenes ingenieurwissenschaftliches
+ Korpus) besteht aus 2498 wissenschaftlichen Artikeln der Zeitschriften
+ Automobiltechnische Zeitschrift (ATZ) und Motortechnische Zeitschrift (MTZ) der
+ Jahrgänge 2007-2016 und umfasst insgesamt 4.667.656 Tokens. Es ist im Rahmen des
+ Forschungsprojektes „Muster in der Sprache der Ingenieurwissenschaften“ entstanden.
+ Das Projekt hat das Ziel, Muster in der Sprache der Ingenieurwissenschaften (am
+ Beispiel der Automobiltechnik) systematisch zu erfassen und zu beschreiben.</p>
+ <p>Project Website: <ref type="url" target="https://www.philol.uni-leipzig.de/herder-institut/forschung/projekte/laufende-projekte/gingko/">Gingko website</ref></p>
+ <p>Publication: Schirrmeister, L., Rummel, M., Heine, A., Suppus, N. &amp; Mendoza
+ Sánchez, B. (2021). Gingko – ein Korpus der ingenieurwissenschaftlichen Sprache.
+ <ref target="https://www.dafdigital.de/">Deutsch als Fremdsprache</ref> 58.</p>
+ </projectDesc>
+ <editorialDecl>
+ <transduction n="1">gingko-XML by Leipzig University</transduction>
+ <transduction n="2">Sentence splitting using NLTK by Leipzig University</transduction>
+ <transduction n="3">Tokenisation, Lemmatisation, POS-annotation using TreeTagger with
+ STTS by Leipzig University</transduction>
+ <transduction n="4">XSL Conversion to I5 by IDS</transduction>
+ </editorialDecl>
+ <classDecl>
+ <taxonomy id="topic">
+ <h.bibl>Thementaxonomie (siehe
+ http://www.ids-mannheim.de/kl/projekte/methoden/te.html)</h.bibl>
+ <category id="topic.fiktion">
+ <catDesc>Fiktion</catDesc>
+ <category id="topic.fiktion.vermischtes">
+ <catDesc>Fiktion:Vermischtes</catDesc>
+ </category>
+ </category>
+ <category id="topic.freizeit-unterhaltung">
+ <catDesc>Freizeit_Unterhaltung</catDesc>
+ <category id="topic.freizeit-unterhaltung.reisen">
+ <catDesc>Freizeit_Unterhaltung:Reisen</catDesc>
+ </category>
+ <category id="topic.freizeit-unterhaltung.rundfunk">
+ <catDesc>Freizeit_Unterhaltung:Rundfunk</catDesc>
+ </category>
+ <category id="topic.freizeit-unterhaltung.vereine-veranstaltungen">
+ <catDesc>Freizeit_Unterhaltung:Vereine_Veranstaltungen</catDesc>
+ </category>
+ </category>
+ <category id="topic.gesundheit-ernaehrung">
+ <catDesc>Gesundheit_Ernaehrung</catDesc>
+ <category id="topic.gesundheit-ernaehrung.ernaehrung">
+ <catDesc>Gesundheit_Ernaehrung:Ernaehrung</catDesc>
+ </category>
+ <category id="topic.gesundheit-ernaehrung.gesundheit">
+ <catDesc>Gesundheit_Ernaehrung:Gesundheit</catDesc>
+ </category>
+ </category>
+ <category id="topic.kultur">
+ <catDesc>Kultur</catDesc>
+ <category id="topic.kultur.bildende-kunst">
+ <catDesc>Kultur:Bildende Kunst</catDesc>
+ </category>
+ <category id="topic.kultur.darstellende-kunst">
+ <catDesc>Kultur:Darstellende Kunst</catDesc>
+ </category>
+ <category id="topic.kultur.film">
+ <catDesc>Kultur:Film</catDesc>
+ </category>
+ <category id="topic.kultur.literatur">
+ <catDesc>Kultur:Literatur</catDesc>
+ </category>
+ <category id="topic.kultur.mode">
+ <catDesc>Kultur:Mode</catDesc>
+ </category>
+ <category id="topic.kultur.musik">
+ <catDesc>Kultur:Musik</catDesc>
+ </category>
+ </category>
+ <category id="topic.natur-umwelt">
+ <catDesc>Natur_Umwelt</catDesc>
+ <category id="topic.natur-umwelt.garten">
+ <catDesc>Natur_Umwelt:Garten</catDesc>
+ </category>
+ <category id="topic.natur-umwelt.tiere">
+ <catDesc>Natur_Umwelt:Tiere</catDesc>
+ </category>
+ <category id="topic.natur-umwelt.wetter-klima">
+ <catDesc>Natur_Umwelt:Wetter_Klima</catDesc>
+ </category>
+ </category>
+ <category id="topic.politik">
+ <catDesc>Politik</catDesc>
+ <category id="topic.politik.ausland">
+ <catDesc>Politik:Ausland</catDesc>
+ </category>
+ <category id="topic.politik.inland">
+ <catDesc>Politik:Inland</catDesc>
+ </category>
+ <category id="topic.politik.kommunalpolitik">
+ <catDesc>Politik:Kommunalpolitik</catDesc>
+ </category>
+ </category>
+ <category id="topic.rest">
+ <catDesc>Rest</catDesc>
+ <category id="topic.rest.boersenkurse">
+ <catDesc>Rest:boersenkurse</catDesc>
+ </category>
+ <category id="topic.rest.geburt-tod-heirat">
+ <catDesc>Rest:geburt_tod_heirat</catDesc>
+ </category>
+ <category id="topic.rest.impressum">
+ <catDesc>Rest:impressum</catDesc>
+ </category>
+ <category id="topic.rest.inhaltsverzeichnisse">
+ <catDesc>Rest:inhaltsverzeichnisse</catDesc>
+ </category>
+ <category id="topic.rest.ligatabellen">
+ <catDesc>Rest:ligatabellen</catDesc>
+ </category>
+ <category id="topic.rest.tabellen">
+ <catDesc>Rest:tabellen</catDesc>
+ </category>
+ <category id="topic.rest.veranstaltungshinweise">
+ <catDesc>Rest:veranstaltungshinweise</catDesc>
+ </category>
+ </category>
+ <category id="topic.sport">
+ <catDesc>Sport</catDesc>
+ <category id="topic.sport.ballsport">
+ <catDesc>Sport:Ballsport</catDesc>
+ </category>
+ <category id="topic.sport.fussball">
+ <catDesc>Sport:Fussball</catDesc>
+ </category>
+ <category id="topic.sport.motorsport">
+ <catDesc>Sport:Motorsport</catDesc>
+ </category>
+ <category id="topic.sport.radsport">
+ <catDesc>Sport:Radsport</catDesc>
+ </category>
+ <category id="topic.sport.tennis">
+ <catDesc>Sport:Tennis</catDesc>
+ </category>
+ <category id="topic.sport.vermischtes">
+ <catDesc>Sport:Vermischtes</catDesc>
+ </category>
+ <category id="topic.sport.wintersport">
+ <catDesc>Sport:Wintersport</catDesc>
+ </category>
+ </category>
+ <category id="topic.staat-gesellschaft">
+ <catDesc>Staat_Gesellschaft</catDesc>
+ <category id="topic.staat-gesellschaft.arbeit-und-beruf">
+ <catDesc>Staat_Gesellschaft:Arbeit_und_Beruf</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.bildung">
+ <catDesc>Staat_Gesellschaft:Bildung</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.biographien-interviews">
+ <catDesc>Staat_Gesellschaft:Biographien_Interviews</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.drittes-reich-rechtsextremismus">
+ <catDesc>Staat_Gesellschaft:Drittes_Reich_Rechtsextremismus</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.familie-geschlecht">
+ <catDesc>Staat_Gesellschaft:Familie_Geschlecht</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.kirche">
+ <catDesc>Staat_Gesellschaft:Kirche</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.recht">
+ <catDesc>Staat_Gesellschaft:Recht</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.tod">
+ <catDesc>Staat_Gesellschaft:Tod</catDesc>
+ </category>
+ <category id="topic.staat-gesellschaft.verbrechen">
+ <catDesc>Staat_Gesellschaft:Verbrechen</catDesc>
+ </category>
+ </category>
+ <category id="topic.technik-industrie">
+ <catDesc>Technik_Industrie</catDesc>
+ <category id="topic.technik-industrie.edv-elektronik">
+ <catDesc>Technik_Industrie:EDV_Elektronik</catDesc>
+ </category>
+ <category id="topic.technik-industrie.kfz">
+ <catDesc>Technik_Industrie:Kfz</catDesc>
+ </category>
+ <category id="topic.technik-industrie.transport-verkehr">
+ <catDesc>Technik_Industrie:Transport_Verkehr</catDesc>
+ </category>
+ <category id="topic.technik-industrie.umweltschutz">
+ <catDesc>Technik_Industrie:Umweltschutz</catDesc>
+ </category>
+ <category id="topic.technik-industrie.unfaelle">
+ <catDesc>Technik_Industrie:Unfaelle</catDesc>
+ </category>
+ </category>
+ <category id="topic.wirtschaft-finanzen">
+ <catDesc>Wirtschaft_Finanzen</catDesc>
+ <category id="topic.wirtschaft-finanzen.banken">
+ <catDesc>Wirtschaft_Finanzen:Banken</catDesc>
+ </category>
+ <category id="topic.wirtschaft-finanzen.bilanzen">
+ <catDesc>Wirtschaft_Finanzen:Bilanzen</catDesc>
+ </category>
+ <category id="topic.wirtschaft-finanzen.oeffentliche-finanzen">
+ <catDesc>Wirtschaft_Finanzen:Oeffentliche_Finanzen</catDesc>
+ </category>
+ <category id="topic.wirtschaft-finanzen.sozialprodukt">
+ <catDesc>Wirtschaft_Finanzen:Sozialprodukt</catDesc>
+ </category>
+ <category id="topic.wirtschaft-finanzen.waehrung">
+ <catDesc>Wirtschaft_Finanzen:Waehrung</catDesc>
+ </category>
+ </category>
+ <category id="topic.wissenschaft">
+ <catDesc>Wissenschaft</catDesc>
+ <category id="topic.wissenschaft.populaerwissenschaft">
+ <catDesc>Wissenschaft:Populaerwissenschaft</catDesc>
+ </category>
+ </category>
+ <category id="topic.unklassifizierbar">
+ <catDesc>Text ist thematisch nicht klassifizierbar.</catDesc>
+ </category>
+ </taxonomy>
+ </classDecl>
+ </encodingDesc>
+ <profileDesc>
+ <langUsage>
+ <language id="de" usage="100">Deutsch</language>
+ </langUsage>
+ </profileDesc>
+ </idsHeader>
\ No newline at end of file
diff --git a/t/real/gingko.t b/t/real/gingko.t
new file mode 100644
index 0000000..5edd877
--- /dev/null
+++ b/t/real/gingko.t
@@ -0,0 +1,111 @@
+# Real-data test for Gingko support: loads the sample text ATZ07/JAN/00001,
+# checks its metadata and the terms added by the Gingko morpho annotation.
+use strict;
+use warnings;
+use utf8;
+use lib 'lib', '../lib';
+
+use Test::More;
+use Data::Dumper;
+use JSON::XS;
+use Benchmark qw/:hireswallclock/;
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+if ($ENV{SKIP_REAL}) {
+  plan skip_all => 'Skip real tests';
+};
+
+use_ok('KorAP::XML::Krill');
+
+# Sample text: ATZ07/JAN/00001
+my $path = catdir(dirname(__FILE__), 'corpus', 'Gingko', 'ATZ07', 'JAN', '00001');
+
+ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+
+is($doc->text_sigle, 'ATZ07/JAN/00001', 'Correct text sigle');
+is($doc->doc_sigle, 'ATZ07/JAN', 'Correct document sigle');
+is($doc->corpus_sigle, 'ATZ07', 'Correct corpus sigle');
+
+my $meta = $doc->meta;
+is($meta->{T_title}, 'Ein neues Energiemanagement-Konzept für das elektrische Bordnetz', 'Title');
+is($meta->{S_pub_place}, 'Wiesbaden', 'PubPlace');
+is($meta->{D_pub_date}, '20070000', 'Publication Date');
+ok(!$meta->{T_sub_title}, 'No SubTitle');
+is($meta->{T_author}, 'Theuerkauf, Heinz; Schmidt, Matthias', 'Author');
+
+is($meta->{A_publisher}, 'Springer Fachmedien GmbH', 'Publisher');
+ok(!$meta->{A_editor}, 'No Editor');
+ok(!$meta->{A_translator}, 'No Translator');
+is($meta->{S_text_type}, 'Zeitschrift: Fachzeitschrift', 'Correct Text Type');
+is($meta->{S_text_type_art}, 'Fachartikel', 'Correct Text Type Art');
+is($meta->{S_text_type_ref}, 'Fachzeitschrift', 'Correct Text Type Ref');
+ok(!$meta->{S_text_column}, 'No Text Column');
+ok(!$meta->{S_text_domain}, 'No Text Domain');
+ok(!$meta->{D_creation_date}, 'No Creation Date');
+
+ok(!$meta->{pages}, 'No Pages');
+ok(!$meta->{A_file_edition_statement}, 'No File Ed Statement');
+ok(!$meta->{A_bibl_edition_statement}, 'No Bibl Ed Statement');
+is($meta->{A_reference}, 'ATZ - Automobiltechnische Zeitschrift, Januar 2007, Nr.109, S. 10-15 - Theuerkauf, H.; Schmidt, M.: Ein neues Energiemanagement-Konzept für das elektrische Bordnetz', 'Reference');
+is($meta->{S_language}, 'de', 'Language');
+
+is($meta->{T_corpus_title}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Correct Corpus title');
+ok(!$meta->{T_corpus_sub_title}, 'No Corpus Sub title');
+ok(!$meta->{T_corpus_author}, 'No Corpus author');
+is($meta->{A_corpus_editor}, 'Christian Fandrych', 'Correct Corpus editor');
+
+is($meta->{T_doc_title}, 'Gingko - Geschriebenes Ingenieurwissenschaftliches Korpus', 'Correct Doc title');
+ok(!$meta->{T_doc_sub_title}, 'No Doc Sub title');
+ok(!$meta->{T_doc_author}, 'No Doc author');
+is($meta->{A_doc_editor}, 'Prof. Dr. Christian Fandrych, Leipzig University', 'Correct Doc editor');
+
+# Tokenization
+use_ok('KorAP::XML::Tokenizer');
+
+# The Gingko morpho layer serves as the token base
+my ($token_base_foundry, $token_base_layer) = (qw/Gingko Morpho/);
+
+# Get tokenization
+my $tokens = KorAP::XML::Tokenizer->new(
+  path    => $doc->path,
+  doc     => $doc,
+  foundry => $token_base_foundry,
+  layer   => $token_base_layer,
+  name    => 'tokens'
+);
+ok($tokens, 'Token Object is fine');
+ok($tokens->parse, 'Token parsing is fine');
+
+# The serialization of the bare token stream has to be decodable JSON
+ok(decode_json( $tokens->to_json ), 'Tokens serialize to valid JSON');
+
+## Base
+ok($tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'), 'Add DeReKo structure');
+ok($tokens->add('Gingko', 'Morpho'), 'Add Gingko');
+
+my $output = $tokens->to_data;
+
+is($output->{data}->{foundries}, 'dereko dereko/structure dereko/structure/base_sentences_paragraphs gingko gingko/morpho', 'Foundries');
+
+is($output->{data}->{layerInfos}, 'dereko/s=spans gingko/l=tokens gingko/p=tokens', 'layerInfos');
+
+# NOTE(review): the POS terms below are spelled "ginkgo/p:" because that is the
+# prefix KorAP::XML::Annotation::Gingko::Morpho emits, while layerInfos and the
+# lemma terms use "gingko". Presumably a typo - fix module and patterns together.
+
+# Token without a lemma term (lemma is presumably '<unknown>' - verify in fixture)
+my $token = join('||', @{$output->{data}->{stream}->[7]});
+unlike($token, qr!gingko/l!, 'No lemma term');
+like($token, qr!ginkgo/p:NN!, 'POS term NN');
+
+# Token with both POS and lemma terms
+$token = join('||', @{$output->{data}->{stream}->[9]});
+like($token, qr!i:heutige!, 'Term i:heutige');
+like($token, qr!ginkgo/p:ADJA!, 'POS term ADJA');
+like($token, qr!gingko/l:heutig!, 'Lemma term heutig');
+
+done_testing;
+__END__
+