Somehow fixed relation indexing and metadata parsing (consistent with the GDoc)

commit: 192057192e65d09fdc7f3c3de1d66d31c3852cc5 [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Thu Jun 18 20:06:45 2015 +0000
committer: Nils Diewald <nils@diewald-online.de> Thu Jun 18 20:06:45 2015 +0000
tree: 97d53f6e4f9d003c0d3d18d87a660025a1b2c652
parent: 0d76734639be97dc264ac8c02118faba0b1dd3df [diff]
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index b9bdfe3..4af6c1a 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm

@@ -15,9 +15,8 @@
 our @ATTR = qw/text_sigle
 	       doc_sigle
 	       corpus_sigle
-
-	       pub_date
 	       title
+	       pub_date
 	       sub_title
 	       pub_place
 	       author/;
@@ -47,6 +46,7 @@
 			corpus_sub_title
 			corpus_editor
 			/;
+# Separate: text_class, keywords
 
 # Removed:    coll_title, coll_sub_title, coll_author, coll_editor
 # Introduced: doc_title, doc_sub_title, corpus_editor, doc_editor, corpus_author, doc_author
@@ -136,6 +136,7 @@
   my @path = grep { $_ } splitdir($self->path);
   my @header;
 
+  # Parse the corpus file, the doc file, and the text file for meta information
   foreach (0..2) {
     unshift @header, '/' . catfile(@path, 'header.xml');
     pop @path;
@@ -176,6 +177,10 @@
   return ($self->{topics} // []);
 };
 
+sub text_class_string {
+  return join ' ', @{shift->text_class};
+}
+
 sub keywords {
   my $self = shift;
   if ($_[0]) {
@@ -184,6 +189,25 @@
   return ($self->{keywords} // []);
 };
 
+sub keywords_string {
+  return join ' ', @{shift->keywords};
+}
+
+sub _remove_prefix {
+  return $_[0];
+
+  # NOTE: unreachable while the early return above is in place. Stripping the sigle prefix may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
+  my $title = shift;
+  my $prefix = shift;
+  $prefix =~ tr!_!/!;
+  if (index($title, $prefix) == 0) {
+    $title = substr($title, length($prefix));
+    $title =~ s/^\s+//;
+    $title =~ s/\s+$//;
+  };
+  return $title;
+};
+
 
 sub _parse_meta {
   my $self = shift;
@@ -211,19 +235,19 @@
     $editor    = $editor    ? $editor->all_text    : undef;
 
     if ($type eq 'text') {
-      $self->title($title)         if $title;
+      $self->title(_remove_prefix($title, $self->text_sigle)) if $title;
       $self->sub_title($sub_title) if $sub_title;
       $self->editor($editor)       if $editor;
       $self->author($author)       if $author;
     }
     elsif ($type eq 'doc') {
-      $self->doc_title($title)         if $title;
+      $self->doc_title(_remove_prefix($title, $self->doc_sigle)) if $title;
       $self->doc_sub_title($sub_title) if $sub_title;
       $self->doc_author($author)       if $author;
       $self->doc_editor($editor)       if $editor;
     }
     elsif ($type eq 'corpus') {
-      $self->corpus_title($title)         if $title;
+      $self->corpus_title(_remove_prefix($title, $self->corpus_sigle)) if $title;
       $self->corpus_sub_title($sub_title) if $sub_title;
       $self->corpus_author($author)       if $author;
       $self->corpus_editor($editor)       if $editor;
@@ -232,15 +256,19 @@
 
   # Not in analytic
   if ($type eq 'corpus') {
-    if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
-      $self->corpus_title($title->all_text) if $title->all_text;
+    unless ($self->corpus_title) {
+      if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
+	$self->corpus_title(_remove_prefix($title->all_text, $self->corpus_sigle)) if $title->all_text;
+      };
     };
   }
 
   # doc title
   elsif ($type eq 'doc') {
-    if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
-      $self->doc_title($title->all_text) if $title->all_text;
+    unless ($self->doc_title) {
+      if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
+	$self->doc_title(_remove_prefix($title->all_text, $self->doc_sigle)) if $title->all_text;
+      };
     };
   }
 
@@ -248,7 +276,7 @@
   elsif ($type eq 'text') {
     unless ($self->title) {
       if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
-	$self->title($title->all_text) if $title->all_text;
+	$self->title(_remove_prefix($title->all_text, $self->text_sigle)) if $title->all_text;
       };
     };
   };
@@ -263,39 +291,39 @@
     $self->publisher($publisher->all_text) if $publisher->all_text;
   };
 
-  my $mono = $dom->at('monogr');
-  if ($mono) {
-
-    # Get title, subtitle, author, editor
-    my $title     = $mono->at('h\.title[type=main]');
-    my $sub_title = $mono->at('h\.title[type=sub]');
-    my $author    = $mono->at('h\.author');
-    my $editor    = $mono->at('editor');
-
-    $title     = $title     ? $title->all_text     : undef;
-    $sub_title = $sub_title ? $sub_title->all_text : undef;
-    $author    = $author    ? $author->all_text    : undef;
-    $editor    = $editor    ? $editor->all_text    : undef;
-
-    if ($type eq 'text') {
-      $self->title($title)         if $title;
-      $self->sub_title($sub_title) if $sub_title;
-      $self->editor($editor)       if $editor;
-      $self->author($author)       if $author;
-    }
-    elsif ($type eq 'doc') {
-      $self->doc_title($title)         if $title;
-      $self->doc_sub_title($sub_title) if $sub_title;
-      $self->doc_author($author)       if $author;
-      $self->doc_editor($editor)       if $editor;
-    }
-    elsif ($type eq 'corpus') {
-      $self->corpus_title($title)         if $title;
-      $self->corpus_sub_title($sub_title) if $sub_title;
-      $self->corpus_author($author)       if $author;
-      $self->corpus_editor($editor)       if $editor;
-    };
-  };
+#  my $mono = $dom->at('monogr');
+#  if ($mono) {
+#
+#    # Get title, subtitle, author, editor
+#    my $title     = $mono->at('h\.title[type=main]');
+#    my $sub_title = $mono->at('h\.title[type=sub]');
+#    my $author    = $mono->at('h\.author');
+#    my $editor    = $mono->at('editor');
+#
+#    $title     = $title     ? $title->all_text     : undef;
+#    $sub_title = $sub_title ? $sub_title->all_text : undef;
+#    $author    = $author    ? $author->all_text    : undef;
+#    $editor    = $editor    ? $editor->all_text    : undef;
+#
+#    if ($type eq 'text') {
+#      $self->title($title)         if $title && !$self->title;
+#      $self->sub_title($sub_title) if $sub_title && !$self->sub_title;
+#      $self->editor($editor)       if $editor && !$self->editor;
+#      $self->author($author)       if $author && !$self->author;
+#    }
+#    elsif ($type eq 'doc') {
+#      $self->doc_title($title)         if $title && !$self->doc_title;
+#      $self->doc_sub_title($sub_title) if $sub_title && !$self->doc_sub_title;
+#      $self->doc_author($author)       if $author && !$self->doc_author;
+#      $self->doc_editor($editor)       if $editor && !$self->doc_editor;
+#    }
+#    elsif ($type eq 'corpus') {
+#      $self->corpus_title($title)         if $title && !$self->corpus_title;
+#      $self->corpus_sub_title($sub_title) if $sub_title && !$self->corpus_sub_title;
+#      $self->corpus_author($author)       if $author && !$self->corpus_author;
+#      $self->corpus_editor($editor)       if $editor && !$self->corpus_editor;
+#    };
+#  };
 
   # Get text type
   my $text_desc = $dom->at('textDesc');
@@ -425,6 +453,7 @@
 };
 
 
+
 sub to_string {
   my $self = shift;
 

diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
index 8dc7010..4b97261 100644
--- a/lib/KorAP/Index/Mate/Dependency.pm
+++ b/lib/KorAP/Index/Mate/Dependency.pm

@@ -8,21 +8,28 @@
   # TODO: Create XIP tree here - for indirect dependency
   # >>:xip/d:SUBJ<i>566<i>789
 
+  # Relation data
   $$self->add_tokendata(
     foundry => 'mate',
     layer => 'dependency',
     cb => sub {
       my ($stream, $token, $tokens) = @_;
+
+      # Get MultiTermToken from stream
       my $mtt = $stream->pos($token->pos);
 
+      # Serialized information from token
       my $content = $token->hash;
 
+      # Get relation information
       my $rel = $content->{rel};
       $rel = [$rel] unless ref $rel eq 'ARRAY';
 
+      # Iterate over relations
       foreach (@$rel) {
 	my $label = $_->{-label};
 
+	# Relation type
 	if ($_->{-type} && $_->{-type} eq 'unary') {
 	  next if $_->{-label} eq '--';
 	  $mtt->add(

diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
index ce9c40b..1b53b24 100644
--- a/lib/KorAP/Index/XIP/Dependency.pm
+++ b/lib/KorAP/Index/XIP/Dependency.pm

@@ -14,8 +14,6 @@
       my ($stream, $token, $tokens) = @_;
       my $mtt = $stream->pos($token->pos);
 
-warn $tokens;
-
       my $content = $token->hash;
 
       my $rel = $content->{rel};

diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index a75242c..a4a9721 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm

@@ -4,6 +4,7 @@
 use XML::Fast;
 use Try::Tiny;
 use Carp qw/croak/;
+use Scalar::Util qw/weaken/;
 use KorAP::Tokenizer::Range;
 use KorAP::Tokenizer::Match;
 use KorAP::Tokenizer::Spans;
@@ -335,7 +336,8 @@
 
   if ($cb) {
     foreach (@$tokenarray) {
-      $cb->($self->stream, $_) if defined $_->pos;
+      # weaken $tokens;
+      $cb->($self->stream, $_, $tokens) if defined $_->pos;
       #, $tokens);
     };
     return 1;

diff --git a/t/VDI/JAN/00001/data.xml b/t/VDI/JAN/00001/data.xml
new file mode 100644
index 0000000..21fd76f
--- /dev/null
+++ b/t/VDI/JAN/00001/data.xml

@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<raw_text docid="VDI_JAN.00001" xmlns="http://ids-mannheim.de/ns/KorAP">
+  <metadata file="metadata.xml" />
+  <text>hui</text>
+</raw_text>

diff --git a/t/VDI/JAN/00001/header.xml b/t/VDI/JAN/00001/header.xml
new file mode 100644
index 0000000..a919407
--- /dev/null
+++ b/t/VDI/JAN/00001/header.xml

@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsheader type="text" version="1.1">
+  <fileDesc>
+    <titleStmt>
+      <textsigle>VDI14/JAN.00001</textsigle>
+      <t.title assemblage="external">VDI14/JAN.00001 VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl</t.title>
+    </titleStmt>
+    <publicationStmt>
+      <distributor></distributor>
+      <pubAddress></pubAddress>
+      <availability></availability>
+      <pubDate></pubDate>
+    </publicationStmt>
+    <sourceDesc>
+      <biblStruct>
+	<analytic>
+	  <h.title type="main">10- Zz mit Zahl</h.title>
+	  <h.author>Windhövel, Kerstin</h.author>
+	  <imprint>
+	  </imprint>
+	  <biblScope type="pp">S. 10</biblScope>
+	  <biblScope type="suppl"></biblScope>
+	  <biblScope type="suppltitle"></biblScope>
+	  <biblNote n="1">Id: 578453</biblNote>
+	</analytic>
+	<monogr>
+	  <h.title></h.title>
+	  <imprint>
+	    <pubDate type="year">2014</pubDate>
+	    <pubDate type="month">01</pubDate>
+	    <pubDate type="day">17</pubDate>
+	  </imprint>
+	  <biblScope type="issue">03</biblScope>
+	  <biblScope type="issueplace"></biblScope>
+	</monogr>
+      </biblStruct>
+      <reference type="complete" assemblage="regular">VDI14/JAN.00001 VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl [Ausführliche Zitierung nicht verfügbar]</reference>
+      <reference type="short" assemblage="regular">VDI14/JAN.00001 VDI nachr., 17.01.2014, S. 10</reference>
+    </sourceDesc>
+  </fileDesc>
+  <encodingDesc>
+    <samplingDecl>
+    </samplingDecl>
+    <tagsDecl>
+      <tagUsage gi="p" occurs="2"></tagUsage>
+      <tagUsage gi="q" occurs="1"></tagUsage>
+      <tagUsage gi="s" occurs="3"></tagUsage>
+    </tagsDecl>
+  </encodingDesc>
+  <profileDesc>
+    <creation>
+      <creatdate>2014.01.17</creatdate>
+    </creation>
+    <textClass>
+      <catRef target="topic.Freizeit-Unterhaltung.Reisen" n="0.38" scheme="topic" />
+      <catRef target="topic.Politik.Ausland" n="0.14" scheme="topic" />
+      <h.keywords>
+	<keyterm></keyterm>
+      </h.keywords>
+    </textClass>
+    <textdesc>
+      <texttypeart></texttypeart>
+      <textdomain></textdomain>
+      <column></column>
+    </textdesc>
+  </profileDesc>
+</idsheader>

diff --git a/t/VDI/JAN/00001/text.txt b/t/VDI/JAN/00001/text.txt
new file mode 100644
index 0000000..0cb4652
--- /dev/null
+++ b/t/VDI/JAN/00001/text.txt

@@ -0,0 +1 @@
+hui

diff --git a/t/VDI/JAN/header.xml b/t/VDI/JAN/header.xml
new file mode 100644
index 0000000..ec1d7cb
--- /dev/null
+++ b/t/VDI/JAN/header.xml

@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsheader type="document" version="1.1">
+  <fileDesc>
+    <titleStmt>
+      <dokumentsigle>VDI14/JAN</dokumentsigle>
+      <d.title>VDI nachrichten, Januar 2014</d.title>
+    </titleStmt>
+    <publicationStmt>
+      <distributor></distributor>
+      <pubAddress></pubAddress>
+      <availability></availability>
+      <pubDate></pubDate>
+    </publicationStmt>
+    <sourceDesc>
+      <biblStruct>
+	<monogr>
+	  <h.title></h.title>
+	  <imprint>
+	  </imprint>
+	</monogr>
+      </biblStruct>
+    </sourceDesc>
+  </fileDesc>
+</idsheader>

diff --git a/t/VDI/header.xml b/t/VDI/header.xml
new file mode 100644
index 0000000..28a29b4
--- /dev/null
+++ b/t/VDI/header.xml

@@ -0,0 +1,277 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsheader type="corpus" pattern="Ztg/Zschr" version="1.1">
+  <fileDesc>
+    <titleStmt>
+      <korpussigle>VDI14</korpussigle>
+      <c.title>VDI nachrichten 2014</c.title>
+    </titleStmt>
+    <publicationStmt>
+      <distributor>		Institut für Deutsche Sprache		</distributor>
+      <pubAddress>		Postfach 10 16 21, D-68016 Mannheim	</pubAddress>
+      <telephone>		+49 (0)621 1581 0			</telephone>
+      <availability></availability>
+      <pubDate></pubDate>
+    </publicationStmt>
+    <sourceDesc>
+      <biblFull>
+	<titleStmt>
+	  <x.title></x.title>
+	</titleStmt>
+	<editionStmt>
+	</editionStmt>
+	<publicationStmt>
+	  <distributor></distributor>
+	  <pubAddress></pubAddress>
+	  <availability></availability>
+	  <pubDate></pubDate>
+	</publicationStmt>
+      </biblFull>
+      <biblStruct>
+	<monogr>
+	  <h.title type="main">VDI nachrichten</h.title>
+	  <h.title type="sub"></h.title>
+	  <h.title type="abbr" level="m">VDI nachr.</h.title>
+	  <editor>Verein Deutscher Ingenieure</editor>
+	  <imprint>
+	    <publisher>VDI Verlag GmbH</publisher>
+	    <pubPlace>Düsseldorf</pubPlace>
+	  </imprint>
+	  <biblScope type="vol"></biblScope>
+	</monogr>
+      </biblStruct>
+      <reference type="super" assemblage="regular">VDI14 VDI nachrichten, [Wochenzeitung]; Hrsg.: Verein Deutscher Ingenieure, Düsseldorf: VDI Verlag GmbH; 2014</reference>
+    </sourceDesc>
+  </fileDesc>
+  <encodingDesc>
+    <projectDesc>
+    </projectDesc>
+    <samplingDecl>
+    </samplingDecl>
+    <editorialDecl>
+      <transduction>
+	TraDuCES - Korpus-Transformationscompiler, Version 3.6.4,
+	Eric Seubert, IDS Mannheim, 7. April 2014
+	Optionen bei der Konvertierung:
+	- Dubletten-Modus:
+	Entfernung aller als Dubletten klassifizierten Texte.
+	- Indexierungsmodus für COSMAS II:
+	Erzeugung von Ersatzreferenzen für ausführliche Zitierung.
+	Entfernung aller Deklarationen für Dubletten.
+	Entfernung von Texten mit Sperrvermerken.
+	Entfernung von Texten mit minimalem Inhalt.
+      </transduction>
+      <pagination type="no"></pagination>
+    </editorialDecl>
+    <classDecl>
+      <taxonomy id="topic">
+	<h.bibl>Thementaxonomie (siehe http://www.ids-mannheim.de/kl/projekte/methoden/te.html)</h.bibl>
+	<category id="topic.Fiktion">
+	  <catDesc>Fiktion</catDesc>
+	  <category id="topic.Fiktion.Vermischtes">
+	    <catDesc>Fiktion:Vermischtes</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Freizeit-Unterhaltung">
+	  <catDesc>Freizeit_Unterhaltung</catDesc>
+	  <category id="topic.Freizeit-Unterhaltung.Reisen">
+	    <catDesc>Freizeit_Unterhaltung:Reisen</catDesc>
+	  </category>
+	  <category id="topic.Freizeit-Unterhaltung.Rundfunk">
+	    <catDesc>Freizeit_Unterhaltung:Rundfunk</catDesc>
+	  </category>
+	  <category id="topic.Freizeit-Unterhaltung.Vereine-Veranstaltungen">
+	    <catDesc>Freizeit_Unterhaltung:Vereine_Veranstaltungen</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Gesundheit-Ernaehrung">
+	  <catDesc>Gesundheit_Ernaehrung</catDesc>
+	  <category id="topic.Gesundheit-Ernaehrung.Ernaehrung">
+	    <catDesc>Gesundheit_Ernaehrung:Ernaehrung</catDesc>
+	  </category>
+	  <category id="topic.Gesundheit-Ernaehrung.Gesundheit">
+	    <catDesc>Gesundheit_Ernaehrung:Gesundheit</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Kultur">
+	  <catDesc>Kultur</catDesc>
+	  <category id="topic.Kultur.Bildende-Kunst">
+	    <catDesc>Kultur:Bildende Kunst</catDesc>
+	  </category>
+	  <category id="topic.Kultur.Darstellende-Kunst">
+	    <catDesc>Kultur:Darstellende Kunst</catDesc>
+	  </category>
+	  <category id="topic.Kultur.Film">
+	    <catDesc>Kultur:Film</catDesc>
+	  </category>
+	  <category id="topic.Kultur.Literatur">
+	    <catDesc>Kultur:Literatur</catDesc>
+	  </category>
+	  <category id="topic.Kultur.Mode">
+	    <catDesc>Kultur:Mode</catDesc>
+	  </category>
+	  <category id="topic.Kultur.Musik">
+	    <catDesc>Kultur:Musik</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Natur-Umwelt">
+	  <catDesc>Natur_Umwelt</catDesc>
+	  <category id="topic.Natur-Umwelt.Garten">
+	    <catDesc>Natur_Umwelt:Garten</catDesc>
+	  </category>
+	  <category id="topic.Natur-Umwelt.Tiere">
+	    <catDesc>Natur_Umwelt:Tiere</catDesc>
+	  </category>
+	  <category id="topic.Natur-Umwelt.Wetter-Klima">
+	    <catDesc>Natur_Umwelt:Wetter_Klima</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Politik">
+	  <catDesc>Politik</catDesc>
+	  <category id="topic.Politik.Ausland">
+	    <catDesc>Politik:Ausland</catDesc>
+	  </category>
+	  <category id="topic.Politik.Inland">
+	    <catDesc>Politik:Inland</catDesc>
+	  </category>
+	  <category id="topic.Politik.Kommunalpolitik">
+	    <catDesc>Politik:Kommunalpolitik</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Rest">
+	  <catDesc>Rest</catDesc>
+	  <category id="topic.Rest.boersenkurse">
+	    <catDesc>Rest:boersenkurse</catDesc>
+	  </category>
+	  <category id="topic.Rest.geburt-tod-heirat">
+	    <catDesc>Rest:geburt_tod_heirat</catDesc>
+	  </category>
+	  <category id="topic.Rest.impressum">
+	    <catDesc>Rest:impressum</catDesc>
+	  </category>
+	  <category id="topic.Rest.inhaltsverzeichnisse">
+	    <catDesc>Rest:inhaltsverzeichnisse</catDesc>
+	  </category>
+	  <category id="topic.Rest.ligatabellen">
+	    <catDesc>Rest:ligatabellen</catDesc>
+	  </category>
+	  <category id="topic.Rest.tabellen">
+	    <catDesc>Rest:tabellen</catDesc>
+	  </category>
+	  <category id="topic.Rest.veranstaltungshinweise">
+	    <catDesc>Rest:veranstaltungshinweise</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Sport">
+	  <catDesc>Sport</catDesc>
+	  <category id="topic.Sport.Ballsport">
+	    <catDesc>Sport:Ballsport</catDesc>
+	  </category>
+	  <category id="topic.Sport.Fussball">
+	    <catDesc>Sport:Fussball</catDesc>
+	  </category>
+	  <category id="topic.Sport.Motorsport">
+	    <catDesc>Sport:Motorsport</catDesc>
+	  </category>
+	  <category id="topic.Sport.Radsport">
+	    <catDesc>Sport:Radsport</catDesc>
+	  </category>
+	  <category id="topic.Sport.Tennis">
+	    <catDesc>Sport:Tennis</catDesc>
+	  </category>
+	  <category id="topic.Sport.Vermischtes">
+	    <catDesc>Sport:Vermischtes</catDesc>
+	  </category>
+	  <category id="topic.Sport.Wintersport">
+	    <catDesc>Sport:Wintersport</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Staat-Gesellschaft">
+	  <catDesc>Staat_Gesellschaft</catDesc>
+	  <category id="topic.Staat-Gesellschaft.Arbeit-und-Beruf">
+	    <catDesc>Staat_Gesellschaft:Arbeit_und_Beruf</catDesc>
+	  </category>
+	  <category id="topic.Staat-Gesellschaft.Bildung">
+	    <catDesc>Staat_Gesellschaft:Bildung</catDesc>
+	  </category>
+	  <category id="topic.Staat-Gesellschaft.Biographien-Interviews">
+	    <catDesc>Staat_Gesellschaft:Biographien_Interviews</catDesc>
+	  </category>
+	  <category id="topic.Staat-Gesellschaft.Drittes-Reich-Rechtsextremismus">
+	    <catDesc>Staat_Gesellschaft:Drittes_Reich_Rechtsextremismus</catDesc>
+	  </category>
+	  <category id="topic.Staat-Gesellschaft.Familie-Geschlecht">
+	    <catDesc>Staat_Gesellschaft:Familie_Geschlecht</catDesc>
+	  </category>
+	  <category id="topic.Staat-Gesellschaft.Kirche">
+	    <catDesc>Staat_Gesellschaft:Kirche</catDesc>
+	  </category>
+	  <category id="topic.Staat-Gesellschaft.Recht">
+	    <catDesc>Staat_Gesellschaft:Recht</catDesc>
+	  </category>
+	  <category id="topic.Staat-Gesellschaft.Tod">
+	    <catDesc>Staat_Gesellschaft:Tod</catDesc>
+	  </category>
+	  <category id="topic.Staat-Gesellschaft.Verbrechen">
+	    <catDesc>Staat_Gesellschaft:Verbrechen</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Technik-Industrie">
+	  <catDesc>Technik_Industrie</catDesc>
+	  <category id="topic.Technik-Industrie.EDV-Elektronik">
+	    <catDesc>Technik_Industrie:EDV_Elektronik</catDesc>
+	  </category>
+	  <category id="topic.Technik-Industrie.Kfz">
+	    <catDesc>Technik_Industrie:Kfz</catDesc>
+	  </category>
+	  <category id="topic.Technik-Industrie.Transport-Verkehr">
+	    <catDesc>Technik_Industrie:Transport_Verkehr</catDesc>
+	  </category>
+	  <category id="topic.Technik-Industrie.Umweltschutz">
+	    <catDesc>Technik_Industrie:Umweltschutz</catDesc>
+	  </category>
+	  <category id="topic.Technik-Industrie.Unfaelle">
+	    <catDesc>Technik_Industrie:Unfaelle</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Wirtschaft-Finanzen">
+	  <catDesc>Wirtschaft_Finanzen</catDesc>
+	  <category id="topic.Wirtschaft-Finanzen.Banken">
+	    <catDesc>Wirtschaft_Finanzen:Banken</catDesc>
+	  </category>
+	  <category id="topic.Wirtschaft-Finanzen.Bilanzen">
+	    <catDesc>Wirtschaft_Finanzen:Bilanzen</catDesc>
+	  </category>
+	  <category id="topic.Wirtschaft-Finanzen.Oeffentliche-Finanzen">
+	    <catDesc>Wirtschaft_Finanzen:Oeffentliche_Finanzen</catDesc>
+	  </category>
+	  <category id="topic.Wirtschaft-Finanzen.Sozialprodukt">
+	    <catDesc>Wirtschaft_Finanzen:Sozialprodukt</catDesc>
+	  </category>
+	  <category id="topic.Wirtschaft-Finanzen.Waehrung">
+	    <catDesc>Wirtschaft_Finanzen:Waehrung</catDesc>
+	  </category>
+	</category>
+	<category id="topic.Wissenschaft">
+	  <catDesc>Wissenschaft</catDesc>
+	  <category id="topic.Wissenschaft.Populaerwissenschaft">
+	    <catDesc>Wissenschaft:Populaerwissenschaft</catDesc>
+	  </category>
+	</category>
+	<category id="topic.unklassifizierbar">
+	  <catDesc>Text ist thematisch nicht klassifizierbar.</catDesc>
+	</category>
+      </taxonomy>
+    </classDecl>
+  </encodingDesc>
+  <profileDesc>
+    <langusage>
+      <language id="de" usage="100">Deutsch</language>
+    </langusage>
+    <textdesc>
+      <texttype>Zeitung: Wochenzeitung</texttype>
+      <texttyperef>Wochenzeitung</texttyperef>
+    </textdesc>
+  </profileDesc>
+</idsheader>

diff --git a/t/artificial-subtoken.t b/t/artificial-subtoken.t
index ebf3b33..7a30103 100644
--- a/t/artificial-subtoken.t
+++ b/t/artificial-subtoken.t

@@ -16,7 +16,7 @@
 
 my $path = catdir(dirname(__FILE__), 'artificial');
 ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!$path/$!, 'Path');
 ok($doc->parse, 'Parse document');
 
 sub new_tokenizer {

diff --git a/t/meta.t b/t/meta.t
index 87180de..5159889 100644
--- a/t/meta.t
+++ b/t/meta.t

@@ -11,22 +11,26 @@
 use File::Basename 'dirname';
 use File::Spec::Functions 'catdir';
 
+
+# TODO: Rename 'text' to 'primaryText'
+
 use_ok('KorAP::Document');
 
 # WPD/00001
 my $path = catdir(dirname(__FILE__), 'WPD/00001');
 ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!$path/!, 'Path');
 
 ok($doc = KorAP::Document->new( path => $path ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!$path/$!, 'Path');
 
 ok($doc->parse, 'Parse document');
 
 # Metdata
+is($doc->text_sigle, 'WPD_AAA.00001', 'ID');
+
 is($doc->title, 'A', 'title');
 ok(!$doc->sub_title, 'subTitle');
-is($doc->text_sigle, 'WPD_AAA.00001', 'ID');
 is($doc->corpus_sigle, 'WPD', 'corpusID');
 is($doc->pub_date, '20050328', 'pubDate');
 is($doc->pub_place, 'URL:http://de.wikipedia.org', 'pubPlace');
@@ -35,21 +39,30 @@
 is($doc->text_class->[2], 'wissenschaft', 'TextClass');
 is($doc->text_class->[3], 'populaerwissenschaft', 'TextClass');
 ok(!$doc->text_class->[4], 'TextClass');
-is($doc->author->[0], 'Ruru', 'author');
-is($doc->author->[1], 'Jens.Ol', 'author');
-is($doc->author->[2], 'Aglarech', 'author');
-ok(!$doc->author->[3], 'author');
+is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
+
+#is($doc->author->[0], 'Ruru', 'author');
+#is($doc->author->[1], 'Jens.Ol', 'author');
+#is($doc->author->[2], 'Aglarech', 'author');
+#ok(!$doc->author->[3], 'author');
 
 # Additional information
-is($doc->editor,'wikipedia.org', 'Editor');
+ok(!$doc->editor, 'Editor');
 is($doc->publisher, 'Wikipedia', 'Publisher');
 is($doc->creation_date, '20050000', 'Creation date');
-is($doc->coll_title, 'Wikipedia', 'Collection title');
-is($doc->coll_sub_title, 'Die freie Enzyklopädie', 'Collection subtitle');
-is($doc->coll_editor, 'wikipedia.org', 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
 ok(!$doc->text_type, 'No text_type');
-ok(!$doc->text_type_art, 'text_type art');
+ok(!$doc->text_type_art, 'no text_type art');
+ok(!$doc->text_type_ref, 'no text_type ref');
+ok(!$doc->text_domain, 'no text_domain');
+ok(!$doc->text_column, 'no text_column');
+ok(!$doc->keywords_string, 'no keywords');
+is($doc->text_class_string, 'freizeit-unterhaltung reisen wissenschaft populaerwissenschaft', 'no text classes');
+ok(!$doc->language, 'no text_column');
+
+#is($doc->coll_title, 'Wikipedia', 'Collection title');
+#is($doc->coll_sub_title, 'Die freie Enzyklopädie', 'Collection subtitle');
+#is($doc->coll_editor, 'wikipedia.org', 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
 
 # BRZ13/00001
 $path = catdir(dirname(__FILE__), 'BRZ13/00001');
@@ -60,21 +73,24 @@
 ok(!$doc->sub_title, 'subTitle');
 is($doc->text_sigle, 'BRZ13_APR.00001', 'ID');
 is($doc->corpus_sigle, 'BRZ13', 'corpusID');
+
+
 is($doc->pub_date, '20130402', 'pubDate');
 is($doc->pub_place, 'Braunschweig', 'pubPlace');
+
 is($doc->text_class->[0], 'staat-gesellschaft', 'TextClass');
 is($doc->text_class->[1], 'familie-geschlecht', 'TextClass');
 ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
 
 # Additional information
 ok(!$doc->editor, 'Editor');
 is($doc->publisher, 'Braunschweiger Zeitungsverlag, Druckhaus Albert Limbach GmbH & Co. KG', 'Publisher');
 is($doc->creation_date, '20130402', 'Creation date');
-is($doc->coll_title, 'Braunschweiger Zeitung', 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#is($doc->coll_title, 'Braunschweiger Zeitung', 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
 is($doc->text_type, 'Zeitung: Tageszeitung', 'text_type');
 ok(!$doc->text_type_art, 'text_type art');
 
@@ -92,17 +108,16 @@
 is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
 is($doc->text_class->[1], 'vereine-veranstaltungen', 'TextClass');
 ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
-
+ok(!$doc->author, 'author');
 
 # Additional information
 ok(!$doc->editor, 'Editor');
 ok(!$doc->publisher, 'Publisher');
 is($doc->creation_date, '20010402', 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
 ok(!$doc->text_type, 'text_type');
 is($doc->text_type_art, 'Bericht', 'text_type art');
 
@@ -112,7 +127,8 @@
 ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
 
 ok($doc->parse, 'Parse document');
-is($doc->title, 'Amtsblatt des Landesbezirks Baden [diverse Erlasse]', 'title');
+is($doc->title, 'MK2/ERL.00001 Amtsblatt des Landesbezirks Baden [diverse Erlasse], Hrsg. und Schriftleitung: Präsidialstelle der Landesverwaltung Baden in Karlsruhe. - Karlsruhe, o.J.', 'title'); # previously expected: 'Amtsblatt des Landesbezirks Baden [diverse Erlasse]'
+
 ok(!$doc->sub_title, 'subTitle');
 is($doc->text_sigle, 'MK2_ERL.00001', 'ID');
 is($doc->corpus_sigle, 'MK2', 'corpusID');
@@ -121,27 +137,26 @@
 is($doc->text_class->[0], 'politik', 'TextClass');
 is($doc->text_class->[1], 'kommunalpolitik', 'TextClass');
 ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
 
 # Additional information
 ok(!$doc->editor, 'Editor');
 is($doc->publisher, 'Badenia Verlag und Druckerei', 'Publisher');
 is($doc->creation_date, '19600000', 'Creation date');
-diag 'Non-acceptance of creation date ranges is temporary';
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+diag 'Non-acceptance of creation date ranges may be temporary';
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
 is($doc->text_type, 'Erlass', 'text_type');
 ok(!$doc->text_type_art, 'text_type art');
 
-
 # A01/02035-substring
 $path = catdir(dirname(__FILE__), 'A01/02035-substring');
 ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
 
 ok($doc->parse, 'Parse document');
-ok(!$doc->title, 'title');
+is($doc->title, 'A00/JAN.02035 St. Galler Tagblatt, 11.01.2000, Ressort: TB-RSP (Abk.)', 'title');
 ok(!$doc->sub_title, 'subTitle');
 is($doc->text_sigle, 'A00_JAN.02035', 'ID');
 is($doc->corpus_sigle, 'A00', 'corpusID');
@@ -150,20 +165,19 @@
 is($doc->text_class->[0], 'sport', 'TextClass');
 is($doc->text_class->[1], 'ballsport', 'TextClass');
 ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
 
 # Additional information
 ok(!$doc->editor, 'Editor');
 ok(!$doc->publisher, 'Publisher');
 is($doc->creation_date, "20000111", 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
 ok(!$doc->text_type, 'text_type');
 is($doc->text_type_art, 'Bericht', 'text_type art');
 
-
 # A01/02873-meta
 $path = catdir(dirname(__FILE__), 'A01/02873-meta');
 ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
@@ -178,16 +192,16 @@
 is($doc->text_class->[0], 'kultur', 'TextClass');
 is($doc->text_class->[1], 'film', 'TextClass');
 ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
 
 # Additional information
 ok(!$doc->editor, 'Editor');
 ok(!$doc->publisher, 'Publisher');
 is($doc->creation_date, "20000113", 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
 ok(!$doc->text_type, 'text_type');
 is($doc->text_type_art, 'Bericht', 'text_type art');
 
@@ -206,21 +220,20 @@
 is($doc->text_class->[0], 'gesundheit-ernaehrung', 'TextClass');
 is($doc->text_class->[1], 'gesundheit', 'TextClass');
 ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
 
 # Additional information
 ok(!$doc->editor, 'Editor');
 ok(!$doc->publisher, 'Publisher');
 is($doc->creation_date, "20000124", 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
 ok(!$doc->text_type, 'text_type');
 is($doc->text_type_art, 'Bericht', 'text_type art');
 
 
-
 # A01/07452-deep
 $path = catdir(dirname(__FILE__), 'A01/07452-deep');
 ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
@@ -235,26 +248,27 @@
 is($doc->text_class->[0], 'politik', 'TextClass');
 is($doc->text_class->[1], 'kommunalpolitik', 'TextClass');
 ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
 
 # Additional information
 ok(!$doc->editor, 'Editor');
 ok(!$doc->publisher, 'Publisher');
 is($doc->creation_date, "20000129", 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
 ok(!$doc->text_type, 'text_type');
 is($doc->text_type_art, 'Bericht', 'text_type art');
 
+
 # ART
 $path = catdir(dirname(__FILE__), 'artificial');
 ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+#is($doc->path, $path . '/', 'Path');
 
 ok($doc = KorAP::Document->new( path => $path ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+#is($doc->path, $path . '/', 'Path');
 
 ok($doc->parse, 'Parse document');
 
@@ -268,21 +282,119 @@
 is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
 is($doc->text_class->[1], 'vereine-veranstaltungen', 'TextClass');
 ok(!$doc->text_class->[2], 'TextClass');
-is($doc->author->[0], 'Ruru', 'author');
-is($doc->author->[1], 'Jens.Ol', 'author');
-is($doc->author->[2], 'Aglarech', 'author');
-ok(!$doc->author->[3], 'author');
+#is($doc->author->[0], 'Ruru', 'author');
+#is($doc->author->[1], 'Jens.Ol', 'author');
+#is($doc->author->[2], 'Aglarech', 'author');
+is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
 
 # Additional information
 is($doc->editor, 'Nils Diewald', 'Editor');
 is($doc->publisher, 'Artificial articles Inc.', 'Publisher');
 is($doc->creation_date, '19990601', 'Creation date');
-is($doc->coll_title, 'Artificial articles', 'Collection title');
-is($doc->coll_sub_title, 'Best of!', 'Collection subtitle');
-is($doc->coll_editor, 'Nils Diewald', 'Collection editor');
-is($doc->coll_author, 'Nils Diewald', 'Collection author');
+#is($doc->coll_title, 'Artificial articles', 'Collection title');
+#is($doc->coll_sub_title, 'Best of!', 'Collection subtitle');
+#is($doc->coll_editor, 'Nils Diewald', 'Collection editor');
+#is($doc->coll_author, 'Nils Diewald', 'Collection author');
 is($doc->text_type, 'Zeitung: Tageszeitung', 'No text_type');
 is($doc->text_type_art, 'Bericht', 'text_type art');
 
+# Multipath headers
+$path = catdir(dirname(__FILE__), 'VDI/JAN/00001');
+ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+like($doc->path, qr!\Q$path\E/!, 'Path'); # \Q..\E: match the path literally, not as a pattern
+
+ok($doc = KorAP::Document->new( path => $path ), 'Load Korap::Document');
+like($doc->path, qr!\Q$path\E/$!, 'Path'); # \Q..\E: match the path literally; must end in '/'
+
+ok($doc->parse, 'Parse document');
+is($doc->text_sigle, 'VDI_JAN.00001', 'text sigle');
+is($doc->doc_sigle, 'VDI_JAN', 'doc sigle');
+is($doc->corpus_sigle, 'VDI', 'corpus sigle');
+is($doc->title, '10- Zz mit Zahl', 'title');
+ok(!$doc->sub_title, 'subtitle');
+is($doc->pub_date, '20140117', 'pubdate');
+is($doc->pub_place, 'Düsseldorf', 'pubplace');
+is($doc->author, 'Windhövel, Kerstin', 'author');
+is($doc->publisher, 'VDI Verlag GmbH', 'publisher');
+ok(!$doc->editor, 'editor');
+
+ok(!$doc->text_type, 'text type');
+ok(!$doc->text_type_art, 'text type art');
+ok(!$doc->text_type_ref, 'text type ref');
+ok(!$doc->text_column, 'text column');
+ok(!$doc->text_domain, 'text domain');
+ok(!$doc->creation_date, 'creation date');
+ok(!$doc->license, 'License');
+ok(!$doc->pages, 'Pages');
+ok(!$doc->file_edition_statement, 'file edition statement');
+ok(!$doc->bibl_edition_statement, 'bibl edition statement');
+is($doc->reference, 'VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl [Ausführliche Zitierung nicht verfügbar]', 'Reference');
+
+ok(!$doc->language, 'Language');
+diag 'This may be "de" in the future';
+
+is($doc->doc_title, 'VDI nachrichten, Januar 2014', 'Doc title');
+ok(!$doc->doc_sub_title, 'Doc Sub title');
+ok(!$doc->doc_editor, 'Doc editor');
+ok(!$doc->doc_author, 'Doc author');
+
+is($doc->corpus_title, 'VDI nachrichten 2014', 'Corpus title');
+ok(!$doc->corpus_sub_title, 'Corpus Sub title');
+ok(!$doc->corpus_editor, 'Corpus editor');
+ok(!$doc->corpus_author, 'Corpus author');
+
+is($doc->keywords_string, '', 'Keywords');
+is($doc->text_class_string, 'Freizeit-Unterhaltung Reisen Politik Ausland', 'Text class');
+
+
+# WDD
+$path = catdir(dirname(__FILE__), 'WDD/G27/38989');
+ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+like($doc->path, qr!\Q$path\E/!, 'Path'); # \Q..\E: match the path literally, not as a pattern
+ok($doc->parse, 'Parse document');
+
+is($doc->text_sigle, 'WDD11_G27.38989', 'text sigle');
+is($doc->doc_sigle, 'WDD11_G27', 'doc sigle');
+is($doc->corpus_sigle, 'WDD11', 'corpus sigle');
+
+is($doc->title, 'Diskussion:Gunter A. Pilz', 'title');
+ok(!$doc->sub_title, 'subtitle');
+is($doc->pub_date, '20111029', 'pubdate');
+is($doc->pub_place, 'URL:http://de.wikipedia.org', 'pubplace');
+
+is($doc->author, '€pa, u.a.', 'author');
+is($doc->publisher, 'Wikipedia', 'publisher');
+ok(!$doc->editor, 'editor');
+
+is($doc->text_type, 'Diskussionen zu Enzyklopädie-Artikeln', 'text type');
+ok(!$doc->text_type_art, 'text type art');
+ok(!$doc->text_type_ref, 'text type ref');
+ok(!$doc->text_column, 'text column');
+ok(!$doc->text_domain, 'text domain');
+
+is($doc->creation_date, '20070707', 'creation date');
+is($doc->license, 'CC-BY-SA', 'License');
+ok(!$doc->pages, 'Pages');
+ok(!$doc->file_edition_statement, 'file edition statement');
+ok(!$doc->bibl_edition_statement, 'bibl edition statement');
+is($doc->reference, 'Diskussion:Gunter A. Pilz, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz: Wikipedia, 2007', 'Reference');
+
+is($doc->language, 'de', 'Language');
+
+is($doc->doc_title, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Doc title');
+ok(!$doc->doc_sub_title, 'Doc Sub title');
+ok(!$doc->doc_editor, 'Doc editor');
+ok(!$doc->doc_author, 'Doc author');
+
+is($doc->corpus_title, 'Wikipedia.de 2011 Diskussionen', 'Corpus title');
+ok(!$doc->corpus_sub_title, 'Corpus Sub title');
+ok(!$doc->corpus_editor, 'Corpus editor');
+ok(!$doc->corpus_author, 'Corpus author');
+
+is($doc->keywords_string, '', 'Keywords');
+is($doc->text_class_string, '', 'Text class');
+
 done_testing;
 __END__
+
+

diff --git a/t/real_bzk.t b/t/real_bzk.t
index d033e77..a590d44 100644
--- a/t/real_bzk.t
+++ b/t/real_bzk.t

@@ -63,7 +63,7 @@
 ok(!$doc->corpus_author, 'Correct Corpus author');
 ok(!$doc->corpus_editor, 'Correct Corpus editor');
 
-is($doc->doc_title, 'Neues Deutschland', 'Correct Doc title');
+is($doc->doc_title, 'Neues Deutschland, Jahrgangsquerschnitt 1959', 'Correct Doc title');
 is($doc->doc_sub_title, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
 ok(!$doc->doc_author, 'Correct Doc author');
 ok(!$doc->doc_editor, 'Correct doc editor');
@@ -127,7 +127,7 @@
 ok(!exists $output->{corpusAuthor}, 'Correct Corpus author');
 ok(!exists $output->{corpusEditor}, 'Correct Corpus editor');
 
-is($output->{docTitle}, 'Neues Deutschland', 'Correct Doc title');
+is($output->{docTitle}, 'Neues Deutschland, Jahrgangsquerschnitt 1959', 'Correct Doc title');
 is($output->{docSubTitle}, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
 ok(!exists $output->{docAuthor}, 'Correct Doc author');
 ok(!exists $output->{docEditor}, 'Correct doc editor');

diff --git a/t/real_goethe.t b/t/real_goethe.t
index 878607b..9efe4c5 100644
--- a/t/real_goethe.t
+++ b/t/real_goethe.t

@@ -53,7 +53,7 @@
 REF
 is($doc->language, 'de', 'Language');
 
-is($doc->corpus_title, 'Goethes Werke', 'Correct Corpus title');
+is($doc->corpus_title, 'Goethe-Korpus', 'Correct Corpus title');
 ok(!$doc->corpus_sub_title, 'Correct Corpus Sub title');
 is($doc->corpus_author, 'Goethe, Johann Wolfgang von', 'Correct Corpus author');
 is($doc->corpus_editor, 'Trunz, Erich', 'Correct Corpus editor');
@@ -64,7 +64,6 @@
 ok(!$doc->doc_author, 'Correct Doc author');
 ok(!$doc->doc_editor, 'Correct Doc editor');
 
-
 # Tokenization
 use_ok('KorAP::Tokenizer');
 
@@ -120,7 +119,7 @@
 REF
 is($output->{language}, 'de', 'Language');
 
-is($output->{corpusTitle}, 'Goethes Werke', 'Correct Corpus title');
+is($output->{corpusTitle}, 'Goethe-Korpus', 'Correct Corpus title');
 ok(!exists $output->{corpusSubTitle}, 'Correct Text Type');
 is($output->{corpusAuthor}, 'Goethe, Johann Wolfgang von', 'Correct Corpus title');
 is($output->{corpusEditor}, 'Trunz, Erich', 'Editor');

diff --git a/t/transform.t b/t/transform.t
index 50cd2d7..783042d 100644
--- a/t/transform.t
+++ b/t/transform.t

@@ -14,6 +14,16 @@
 
 use_ok('KorAP::Document');
 
+# Turn a token string such as '[(0-1)a|b|c]' into a hash ref whose keys are
+# its '|'-separated terms, so tokens can be compared regardless of term order.
+sub _t2h {
+  my ($token) = @_;
+  # Strip the surrounding '[(from-to)' ... ']' wrapper, if present
+  $token =~ s/^\[\(\d+?-\d+?\)(.+?)\]$/$1/;
+  my %terms = map { $_ => 1 } split(qr!\|!, $token);
+  return \%terms;
+};
+
 my @layers;
 # push(@layers, ['Base', 'Sentences']);
 push(@layers, ['Base', 'Paragraphs']);
@@ -50,10 +60,10 @@
 
 my $path = catdir(dirname(__FILE__), 'WPD/00001');
 ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path'); # \Q..\E: match the path literally; must end in '/'
 
 ok($doc = KorAP::Document->new( path => $path ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path'); # \Q..\E: match the path literally; must end in '/'
 
 ok($doc->parse, 'Parse document');
 
@@ -70,10 +80,11 @@
 is($doc->text_class->[2], 'wissenschaft', 'TextClass');
 is($doc->text_class->[3], 'populaerwissenschaft', 'TextClass');
 ok(!$doc->text_class->[4], 'TextClass');
-is($doc->author->[0], 'Ruru', 'author');
-is($doc->author->[1], 'Jens.Ol', 'author');
-is($doc->author->[2], 'Aglarech', 'author');
-ok(!$doc->author->[3], 'author');
+is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
+#is($doc->author->[0], 'Ruru', 'author');
+#is($doc->author->[1], 'Jens.Ol', 'author');
+#is($doc->author->[2], 'Aglarech', 'author');
+#ok(!$doc->author->[3], 'author');
 
 # Get tokens
 use_ok('KorAP::Tokenizer');
@@ -87,7 +98,7 @@
 ), 'New Tokenizer');
 ok($tokens->parse, 'Parse');
 
-is($tokens->path, $path . '/', 'Path');
+like($tokens->path, qr!\Q$path\E/$!, 'Path'); # \Q..\E: match the path literally; must end in '/'
 is($tokens->foundry, 'OpenNLP', 'Foundry');
 is($tokens->doc->text_sigle, 'WPD_AAA.00001', 'Doc id');
 is($tokens->should, 1068, 'Should');
@@ -95,23 +106,38 @@
 is($tokens->name, 'tokens', 'Name');
 is($tokens->layer, 'Tokens', 'Layer');
 
-is($tokens->stream->pos(118)->to_string, '[(763-768)s:Linie|i:linie|_118#763-768]', 'Token is correct');
+is_deeply(_t2h($tokens->stream->pos(118)->to_string),
+   _t2h('[(763-768)s:Linie|i:linie|_118#763-768]'),
+   'Token is correct');
 
 # Add Mate
 ok($tokens->add('Mate', 'Morpho'), 'Add Mate');
 
-is($tokens->stream->pos(118)->to_string, '[(763-768)s:Linie|i:linie|_118#763-768|mate/l:linie|mate/p:NN|mate/m:case:acc|mate/m:number:sg|mate/m:gender:fem]', 'with Mate');
+is_deeply(
+  _t2h($tokens->stream->pos(118)->to_string),
+  _t2h('[(763-768)s:Linie|i:linie|_118#763-768|mate/l:linie|mate/p:NN|mate/m:case:acc|mate/m:number:sg|mate/m:gender:fem]'),
+  'with Mate');
 
 # Add sentences
 ok($tokens->add('Base', 'Sentences'), 'Add Sentences');
 
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
+is_deeply(
+  _t2h($tokens->stream->pos(0)->to_string),
+  _t2h('[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13<b>2|<>:base/s:t#0-6083$<i>923<b>0|-:base/sentences$<i>96]'),
+  'Startinfo'
+);
 
 foreach (@layers) {
   ok($tokens->add(@$_), 'Add '. join(', ', @$_));
 };
 
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96|<>:base/s:p#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s:s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s:s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s:s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s:s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s:s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
+is_deeply( # compare the term sets structurally; is() would only compare stringified hash refs
+  _t2h($tokens->stream->pos(0)->to_string),
+  _t2h('[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96|<>:base/s:p#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s:s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s:s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s:s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s:s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s:s#0-74$<i>13|-:xip/sentences$<i>64]'),
+  'Startinfo');
+
+done_testing;
+__END__
 
 
 #is($tokens->stream->pos(118)->to_string,
commit	192057192e65d09fdc7f3c3de1d66d31c3852cc5	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Thu Jun 18 20:06:45 2015 +0000
committer	Nils Diewald <nils@diewald-online.de>	Thu Jun 18 20:06:45 2015 +0000
tree	97d53f6e4f9d003c0d3d18d87a660025a1b2c652
parent	0d76734639be97dc264ac8c02118faba0b1dd3df [diff]