Somehow fixed relation indexing and metadata parsing (consistent with the GDoc)
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index b9bdfe3..4af6c1a 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -15,9 +15,8 @@
our @ATTR = qw/text_sigle
doc_sigle
corpus_sigle
-
- pub_date
title
+ pub_date
sub_title
pub_place
author/;
@@ -47,6 +46,7 @@
corpus_sub_title
corpus_editor
/;
+# Separate: text_class, keywords
# Removed: coll_title, coll_sub_title, coll_author, coll_editor
# Introduced: doc_title, doc_sub_title, corpus_editor, doc_editor, corpus_author, doc_author
@@ -136,6 +136,7 @@
my @path = grep { $_ } splitdir($self->path);
my @header;
+ # Parse the corpus file, the doc file, and the text file for meta information
foreach (0..2) {
unshift @header, '/' . catfile(@path, 'header.xml');
pop @path;
@@ -176,6 +177,10 @@
return ($self->{topics} // []);
};
+sub text_class_string { # Text classes of the document joined into one space-separated string
+ return join ' ', @{shift->text_class}; # shift is the invocant; its text_class value is dereferenced as an array
+}
+
sub keywords {
my $self = shift;
if ($_[0]) {
@@ -184,6 +189,25 @@
return ($self->{keywords} // []);
};
+sub keywords_string { # Keywords of the document joined into one space-separated string
+ return join ' ', @{shift->keywords}; # shift is the invocant; its keywords value is dereferenced as an array
+}
+
+sub _remove_prefix { # Called as _remove_prefix($title, $sigle); currently hands the title back untouched
+ return $_[0]; # Early return: everything below is unreachable, so no prefix is stripped
+
+ # Presumably switched off because stripping may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
+ my $title = shift;
+ my $prefix = shift;
+ $prefix =~ tr!_!/!; # Turn every '_' in the prefix into '/' before comparing it with the title
+ if (index($title, $prefix) == 0) { # Only cut when the title starts with the prefix
+ $title = substr($title, length($prefix));
+ $title =~ s/^\s+//; # Trim whitespace left around the remaining title
+ $title =~ s/\s+$//;
+ };
+ return $title;
+};
+
sub _parse_meta {
my $self = shift;
@@ -211,19 +235,19 @@
$editor = $editor ? $editor->all_text : undef;
if ($type eq 'text') {
- $self->title($title) if $title;
+ $self->title(_remove_prefix($title, $self->text_sigle)) if $title;
$self->sub_title($sub_title) if $sub_title;
$self->editor($editor) if $editor;
$self->author($author) if $author;
}
elsif ($type eq 'doc') {
- $self->doc_title($title) if $title;
+ $self->doc_title(_remove_prefix($title, $self->doc_sigle)) if $title;
$self->doc_sub_title($sub_title) if $sub_title;
$self->doc_author($author) if $author;
$self->doc_editor($editor) if $editor;
}
elsif ($type eq 'corpus') {
- $self->corpus_title($title) if $title;
+ $self->corpus_title(_remove_prefix($title, $self->corpus_sigle)) if $title;
$self->corpus_sub_title($sub_title) if $sub_title;
$self->corpus_author($author) if $author;
$self->corpus_editor($editor) if $editor;
@@ -232,15 +256,19 @@
# Not in analytic
if ($type eq 'corpus') {
- if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
- $self->corpus_title($title->all_text) if $title->all_text;
+ unless ($self->corpus_title) {
+ if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
+ $self->corpus_title(_remove_prefix($title->all_text, $self->corpus_sigle)) if $title->all_text;
+ };
};
}
# doc title
elsif ($type eq 'doc') {
- if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
- $self->doc_title($title->all_text) if $title->all_text;
+ unless ($self->doc_title) {
+ if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
+ $self->doc_title(_remove_prefix($title->all_text, $self->doc_sigle)) if $title->all_text;
+ };
};
}
@@ -248,7 +276,7 @@
elsif ($type eq 'text') {
unless ($self->title) {
if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
- $self->title($title->all_text) if $title->all_text;
+ $self->title(_remove_prefix($title->all_text, $self->text_sigle)) if $title->all_text;
};
};
};
@@ -263,39 +291,39 @@
$self->publisher($publisher->all_text) if $publisher->all_text;
};
- my $mono = $dom->at('monogr');
- if ($mono) {
-
- # Get title, subtitle, author, editor
- my $title = $mono->at('h\.title[type=main]');
- my $sub_title = $mono->at('h\.title[type=sub]');
- my $author = $mono->at('h\.author');
- my $editor = $mono->at('editor');
-
- $title = $title ? $title->all_text : undef;
- $sub_title = $sub_title ? $sub_title->all_text : undef;
- $author = $author ? $author->all_text : undef;
- $editor = $editor ? $editor->all_text : undef;
-
- if ($type eq 'text') {
- $self->title($title) if $title;
- $self->sub_title($sub_title) if $sub_title;
- $self->editor($editor) if $editor;
- $self->author($author) if $author;
- }
- elsif ($type eq 'doc') {
- $self->doc_title($title) if $title;
- $self->doc_sub_title($sub_title) if $sub_title;
- $self->doc_author($author) if $author;
- $self->doc_editor($editor) if $editor;
- }
- elsif ($type eq 'corpus') {
- $self->corpus_title($title) if $title;
- $self->corpus_sub_title($sub_title) if $sub_title;
- $self->corpus_author($author) if $author;
- $self->corpus_editor($editor) if $editor;
- };
- };
+# my $mono = $dom->at('monogr');
+# if ($mono) {
+#
+# # Get title, subtitle, author, editor
+# my $title = $mono->at('h\.title[type=main]');
+# my $sub_title = $mono->at('h\.title[type=sub]');
+# my $author = $mono->at('h\.author');
+# my $editor = $mono->at('editor');
+#
+# $title = $title ? $title->all_text : undef;
+# $sub_title = $sub_title ? $sub_title->all_text : undef;
+# $author = $author ? $author->all_text : undef;
+# $editor = $editor ? $editor->all_text : undef;
+#
+# if ($type eq 'text') {
+# $self->title($title) if $title && !$self->title;
+# $self->sub_title($sub_title) if $sub_title && !$self->sub_title;
+# $self->editor($editor) if $editor && !$self->editor;
+# $self->author($author) if $author && !$self->author;
+# }
+# elsif ($type eq 'doc') {
+# $self->doc_title($title) if $title && !$self->doc_title;
+# $self->doc_sub_title($sub_title) if $sub_title && !$self->doc_sub_title;
+# $self->doc_author($author) if $author && !$self->doc_author;
+# $self->doc_editor($editor) if $editor && !$self->doc_editor;
+# }
+# elsif ($type eq 'corpus') {
+# $self->corpus_title($title) if $title && !$self->corpus_title;
+# $self->corpus_sub_title($sub_title) if $sub_title && !$self->corpus_sub_title;
+# $self->corpus_author($author) if $author && !$self->corpus_author;
+# $self->corpus_editor($editor) if $editor && !$self->corpus_editor;
+# };
+# };
# Get text type
my $text_desc = $dom->at('textDesc');
@@ -425,6 +453,7 @@
};
+
sub to_string {
my $self = shift;
diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
index 8dc7010..4b97261 100644
--- a/lib/KorAP/Index/Mate/Dependency.pm
+++ b/lib/KorAP/Index/Mate/Dependency.pm
@@ -8,21 +8,28 @@
# TODO: Create XIP tree here - for indirect dependency
# >>:xip/d:SUBJ<i>566<i>789
+ # Relation data
$$self->add_tokendata(
foundry => 'mate',
layer => 'dependency',
cb => sub {
my ($stream, $token, $tokens) = @_;
+
+ # Get MultiTermToken from stream
my $mtt = $stream->pos($token->pos);
+ # Serialized information from token
my $content = $token->hash;
+ # Get relation information
my $rel = $content->{rel};
$rel = [$rel] unless ref $rel eq 'ARRAY';
+ # Iterate over relations
foreach (@$rel) {
my $label = $_->{-label};
+ # Relation type
if ($_->{-type} && $_->{-type} eq 'unary') {
next if $_->{-label} eq '--';
$mtt->add(
diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
index ce9c40b..1b53b24 100644
--- a/lib/KorAP/Index/XIP/Dependency.pm
+++ b/lib/KorAP/Index/XIP/Dependency.pm
@@ -14,8 +14,6 @@
my ($stream, $token, $tokens) = @_;
my $mtt = $stream->pos($token->pos);
-warn $tokens;
-
my $content = $token->hash;
my $rel = $content->{rel};
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index a75242c..a4a9721 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -4,6 +4,7 @@
use XML::Fast;
use Try::Tiny;
use Carp qw/croak/;
+use Scalar::Util qw/weaken/;
use KorAP::Tokenizer::Range;
use KorAP::Tokenizer::Match;
use KorAP::Tokenizer::Spans;
@@ -335,7 +336,8 @@
if ($cb) {
foreach (@$tokenarray) {
- $cb->($self->stream, $_) if defined $_->pos;
+ # weaken $tokens;
+ $cb->($self->stream, $_, $tokens) if defined $_->pos;
#, $tokens);
};
return 1;
diff --git a/t/VDI/JAN/00001/data.xml b/t/VDI/JAN/00001/data.xml
new file mode 100644
index 0000000..21fd76f
--- /dev/null
+++ b/t/VDI/JAN/00001/data.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<raw_text docid="VDI_JAN.00001" xmlns="http://ids-mannheim.de/ns/KorAP">
+ <metadata file="metadata.xml" />
+ <text>hui</text>
+</raw_text>
diff --git a/t/VDI/JAN/00001/header.xml b/t/VDI/JAN/00001/header.xml
new file mode 100644
index 0000000..a919407
--- /dev/null
+++ b/t/VDI/JAN/00001/header.xml
@@ -0,0 +1,69 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsheader type="text" version="1.1">
+ <fileDesc>
+ <titleStmt>
+ <textsigle>VDI14/JAN.00001</textsigle>
+ <t.title assemblage="external">VDI14/JAN.00001 VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl</t.title>
+ </titleStmt>
+ <publicationStmt>
+ <distributor></distributor>
+ <pubAddress></pubAddress>
+ <availability></availability>
+ <pubDate></pubDate>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <analytic>
+ <h.title type="main">10- Zz mit Zahl</h.title>
+ <h.author>Windhövel, Kerstin</h.author>
+ <imprint>
+ </imprint>
+ <biblScope type="pp">S. 10</biblScope>
+ <biblScope type="suppl"></biblScope>
+ <biblScope type="suppltitle"></biblScope>
+ <biblNote n="1">Id: 578453</biblNote>
+ </analytic>
+ <monogr>
+ <h.title></h.title>
+ <imprint>
+ <pubDate type="year">2014</pubDate>
+ <pubDate type="month">01</pubDate>
+ <pubDate type="day">17</pubDate>
+ </imprint>
+ <biblScope type="issue">03</biblScope>
+ <biblScope type="issueplace"></biblScope>
+ </monogr>
+ </biblStruct>
+ <reference type="complete" assemblage="regular">VDI14/JAN.00001 VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl [Ausführliche Zitierung nicht verfügbar]</reference>
+ <reference type="short" assemblage="regular">VDI14/JAN.00001 VDI nachr., 17.01.2014, S. 10</reference>
+ </sourceDesc>
+ </fileDesc>
+ <encodingDesc>
+ <samplingDecl>
+ </samplingDecl>
+ <tagsDecl>
+ <tagUsage gi="p" occurs="2"></tagUsage>
+ <tagUsage gi="q" occurs="1"></tagUsage>
+ <tagUsage gi="s" occurs="3"></tagUsage>
+ </tagsDecl>
+ </encodingDesc>
+ <profileDesc>
+ <creation>
+ <creatdate>2014.01.17</creatdate>
+ </creation>
+ <textClass>
+ <catRef target="topic.Freizeit-Unterhaltung.Reisen" n="0.38" scheme="topic" />
+ <catRef target="topic.Politik.Ausland" n="0.14" scheme="topic" />
+ <h.keywords>
+ <keyterm></keyterm>
+ </h.keywords>
+ </textClass>
+ <textdesc>
+ <texttypeart></texttypeart>
+ <textdomain></textdomain>
+ <column></column>
+ </textdesc>
+ </profileDesc>
+</idsheader>
diff --git a/t/VDI/JAN/00001/text.txt b/t/VDI/JAN/00001/text.txt
new file mode 100644
index 0000000..0cb4652
--- /dev/null
+++ b/t/VDI/JAN/00001/text.txt
@@ -0,0 +1 @@
+hui
diff --git a/t/VDI/JAN/header.xml b/t/VDI/JAN/header.xml
new file mode 100644
index 0000000..ec1d7cb
--- /dev/null
+++ b/t/VDI/JAN/header.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsheader type="document" version="1.1">
+ <fileDesc>
+ <titleStmt>
+ <dokumentsigle>VDI14/JAN</dokumentsigle>
+ <d.title>VDI nachrichten, Januar 2014</d.title>
+ </titleStmt>
+ <publicationStmt>
+ <distributor></distributor>
+ <pubAddress></pubAddress>
+ <availability></availability>
+ <pubDate></pubDate>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <monogr>
+ <h.title></h.title>
+ <imprint>
+ </imprint>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+</idsheader>
diff --git a/t/VDI/header.xml b/t/VDI/header.xml
new file mode 100644
index 0000000..28a29b4
--- /dev/null
+++ b/t/VDI/header.xml
@@ -0,0 +1,277 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsheader type="corpus" pattern="Ztg/Zschr" version="1.1">
+ <fileDesc>
+ <titleStmt>
+ <korpussigle>VDI14</korpussigle>
+ <c.title>VDI nachrichten 2014</c.title>
+ </titleStmt>
+ <publicationStmt>
+ <distributor> Institut für Deutsche Sprache </distributor>
+ <pubAddress> Postfach 10 16 21, D-68016 Mannheim </pubAddress>
+ <telephone> +49 (0)621 1581 0 </telephone>
+ <availability></availability>
+ <pubDate></pubDate>
+ </publicationStmt>
+ <sourceDesc>
+ <biblFull>
+ <titleStmt>
+ <x.title></x.title>
+ </titleStmt>
+ <editionStmt>
+ </editionStmt>
+ <publicationStmt>
+ <distributor></distributor>
+ <pubAddress></pubAddress>
+ <availability></availability>
+ <pubDate></pubDate>
+ </publicationStmt>
+ </biblFull>
+ <biblStruct>
+ <monogr>
+ <h.title type="main">VDI nachrichten</h.title>
+ <h.title type="sub"></h.title>
+ <h.title type="abbr" level="m">VDI nachr.</h.title>
+ <editor>Verein Deutscher Ingenieure</editor>
+ <imprint>
+ <publisher>VDI Verlag GmbH</publisher>
+ <pubPlace>Düsseldorf</pubPlace>
+ </imprint>
+ <biblScope type="vol"></biblScope>
+ </monogr>
+ </biblStruct>
+ <reference type="super" assemblage="regular">VDI14 VDI nachrichten, [Wochenzeitung]; Hrsg.: Verein Deutscher Ingenieure, Düsseldorf: VDI Verlag GmbH; 2014</reference>
+ </sourceDesc>
+ </fileDesc>
+ <encodingDesc>
+ <projectDesc>
+ </projectDesc>
+ <samplingDecl>
+ </samplingDecl>
+ <editorialDecl>
+ <transduction>
+ TraDuCES - Korpus-Transformationscompiler, Version 3.6.4,
+ Eric Seubert, IDS Mannheim, 7. April 2014
+ Optionen bei der Konvertierung:
+ - Dubletten-Modus:
+ Entfernung aller als Dubletten klassifizierten Texte.
+ - Indexierungsmodus für COSMAS II:
+ Erzeugung von Ersatzreferenzen für ausführliche Zitierung.
+ Entfernung aller Deklarationen für Dubletten.
+ Entfernung von Texten mit Sperrvermerken.
+ Entfernung von Texten mit minimalem Inhalt.
+ </transduction>
+ <pagination type="no"></pagination>
+ </editorialDecl>
+ <classDecl>
+ <taxonomy id="topic">
+ <h.bibl>Thementaxonomie (siehe http://www.ids-mannheim.de/kl/projekte/methoden/te.html)</h.bibl>
+ <category id="topic.Fiktion">
+ <catDesc>Fiktion</catDesc>
+ <category id="topic.Fiktion.Vermischtes">
+ <catDesc>Fiktion:Vermischtes</catDesc>
+ </category>
+ </category>
+ <category id="topic.Freizeit-Unterhaltung">
+ <catDesc>Freizeit_Unterhaltung</catDesc>
+ <category id="topic.Freizeit-Unterhaltung.Reisen">
+ <catDesc>Freizeit_Unterhaltung:Reisen</catDesc>
+ </category>
+ <category id="topic.Freizeit-Unterhaltung.Rundfunk">
+ <catDesc>Freizeit_Unterhaltung:Rundfunk</catDesc>
+ </category>
+ <category id="topic.Freizeit-Unterhaltung.Vereine-Veranstaltungen">
+ <catDesc>Freizeit_Unterhaltung:Vereine_Veranstaltungen</catDesc>
+ </category>
+ </category>
+ <category id="topic.Gesundheit-Ernaehrung">
+ <catDesc>Gesundheit_Ernaehrung</catDesc>
+ <category id="topic.Gesundheit-Ernaehrung.Ernaehrung">
+ <catDesc>Gesundheit_Ernaehrung:Ernaehrung</catDesc>
+ </category>
+ <category id="topic.Gesundheit-Ernaehrung.Gesundheit">
+ <catDesc>Gesundheit_Ernaehrung:Gesundheit</catDesc>
+ </category>
+ </category>
+ <category id="topic.Kultur">
+ <catDesc>Kultur</catDesc>
+ <category id="topic.Kultur.Bildende-Kunst">
+ <catDesc>Kultur:Bildende Kunst</catDesc>
+ </category>
+ <category id="topic.Kultur.Darstellende-Kunst">
+ <catDesc>Kultur:Darstellende Kunst</catDesc>
+ </category>
+ <category id="topic.Kultur.Film">
+ <catDesc>Kultur:Film</catDesc>
+ </category>
+ <category id="topic.Kultur.Literatur">
+ <catDesc>Kultur:Literatur</catDesc>
+ </category>
+ <category id="topic.Kultur.Mode">
+ <catDesc>Kultur:Mode</catDesc>
+ </category>
+ <category id="topic.Kultur.Musik">
+ <catDesc>Kultur:Musik</catDesc>
+ </category>
+ </category>
+ <category id="topic.Natur-Umwelt">
+ <catDesc>Natur_Umwelt</catDesc>
+ <category id="topic.Natur-Umwelt.Garten">
+ <catDesc>Natur_Umwelt:Garten</catDesc>
+ </category>
+ <category id="topic.Natur-Umwelt.Tiere">
+ <catDesc>Natur_Umwelt:Tiere</catDesc>
+ </category>
+ <category id="topic.Natur-Umwelt.Wetter-Klima">
+ <catDesc>Natur_Umwelt:Wetter_Klima</catDesc>
+ </category>
+ </category>
+ <category id="topic.Politik">
+ <catDesc>Politik</catDesc>
+ <category id="topic.Politik.Ausland">
+ <catDesc>Politik:Ausland</catDesc>
+ </category>
+ <category id="topic.Politik.Inland">
+ <catDesc>Politik:Inland</catDesc>
+ </category>
+ <category id="topic.Politik.Kommunalpolitik">
+ <catDesc>Politik:Kommunalpolitik</catDesc>
+ </category>
+ </category>
+ <category id="topic.Rest">
+ <catDesc>Rest</catDesc>
+ <category id="topic.Rest.boersenkurse">
+ <catDesc>Rest:boersenkurse</catDesc>
+ </category>
+ <category id="topic.Rest.geburt-tod-heirat">
+ <catDesc>Rest:geburt_tod_heirat</catDesc>
+ </category>
+ <category id="topic.Rest.impressum">
+ <catDesc>Rest:impressum</catDesc>
+ </category>
+ <category id="topic.Rest.inhaltsverzeichnisse">
+ <catDesc>Rest:inhaltsverzeichnisse</catDesc>
+ </category>
+ <category id="topic.Rest.ligatabellen">
+ <catDesc>Rest:ligatabellen</catDesc>
+ </category>
+ <category id="topic.Rest.tabellen">
+ <catDesc>Rest:tabellen</catDesc>
+ </category>
+ <category id="topic.Rest.veranstaltungshinweise">
+ <catDesc>Rest:veranstaltungshinweise</catDesc>
+ </category>
+ </category>
+ <category id="topic.Sport">
+ <catDesc>Sport</catDesc>
+ <category id="topic.Sport.Ballsport">
+ <catDesc>Sport:Ballsport</catDesc>
+ </category>
+ <category id="topic.Sport.Fussball">
+ <catDesc>Sport:Fussball</catDesc>
+ </category>
+ <category id="topic.Sport.Motorsport">
+ <catDesc>Sport:Motorsport</catDesc>
+ </category>
+ <category id="topic.Sport.Radsport">
+ <catDesc>Sport:Radsport</catDesc>
+ </category>
+ <category id="topic.Sport.Tennis">
+ <catDesc>Sport:Tennis</catDesc>
+ </category>
+ <category id="topic.Sport.Vermischtes">
+ <catDesc>Sport:Vermischtes</catDesc>
+ </category>
+ <category id="topic.Sport.Wintersport">
+ <catDesc>Sport:Wintersport</catDesc>
+ </category>
+ </category>
+ <category id="topic.Staat-Gesellschaft">
+ <catDesc>Staat_Gesellschaft</catDesc>
+ <category id="topic.Staat-Gesellschaft.Arbeit-und-Beruf">
+ <catDesc>Staat_Gesellschaft:Arbeit_und_Beruf</catDesc>
+ </category>
+ <category id="topic.Staat-Gesellschaft.Bildung">
+ <catDesc>Staat_Gesellschaft:Bildung</catDesc>
+ </category>
+ <category id="topic.Staat-Gesellschaft.Biographien-Interviews">
+ <catDesc>Staat_Gesellschaft:Biographien_Interviews</catDesc>
+ </category>
+ <category id="topic.Staat-Gesellschaft.Drittes-Reich-Rechtsextremismus">
+ <catDesc>Staat_Gesellschaft:Drittes_Reich_Rechtsextremismus</catDesc>
+ </category>
+ <category id="topic.Staat-Gesellschaft.Familie-Geschlecht">
+ <catDesc>Staat_Gesellschaft:Familie_Geschlecht</catDesc>
+ </category>
+ <category id="topic.Staat-Gesellschaft.Kirche">
+ <catDesc>Staat_Gesellschaft:Kirche</catDesc>
+ </category>
+ <category id="topic.Staat-Gesellschaft.Recht">
+ <catDesc>Staat_Gesellschaft:Recht</catDesc>
+ </category>
+ <category id="topic.Staat-Gesellschaft.Tod">
+ <catDesc>Staat_Gesellschaft:Tod</catDesc>
+ </category>
+ <category id="topic.Staat-Gesellschaft.Verbrechen">
+ <catDesc>Staat_Gesellschaft:Verbrechen</catDesc>
+ </category>
+ </category>
+ <category id="topic.Technik-Industrie">
+ <catDesc>Technik_Industrie</catDesc>
+ <category id="topic.Technik-Industrie.EDV-Elektronik">
+ <catDesc>Technik_Industrie:EDV_Elektronik</catDesc>
+ </category>
+ <category id="topic.Technik-Industrie.Kfz">
+ <catDesc>Technik_Industrie:Kfz</catDesc>
+ </category>
+ <category id="topic.Technik-Industrie.Transport-Verkehr">
+ <catDesc>Technik_Industrie:Transport_Verkehr</catDesc>
+ </category>
+ <category id="topic.Technik-Industrie.Umweltschutz">
+ <catDesc>Technik_Industrie:Umweltschutz</catDesc>
+ </category>
+ <category id="topic.Technik-Industrie.Unfaelle">
+ <catDesc>Technik_Industrie:Unfaelle</catDesc>
+ </category>
+ </category>
+ <category id="topic.Wirtschaft-Finanzen">
+ <catDesc>Wirtschaft_Finanzen</catDesc>
+ <category id="topic.Wirtschaft-Finanzen.Banken">
+ <catDesc>Wirtschaft_Finanzen:Banken</catDesc>
+ </category>
+ <category id="topic.Wirtschaft-Finanzen.Bilanzen">
+ <catDesc>Wirtschaft_Finanzen:Bilanzen</catDesc>
+ </category>
+ <category id="topic.Wirtschaft-Finanzen.Oeffentliche-Finanzen">
+ <catDesc>Wirtschaft_Finanzen:Oeffentliche_Finanzen</catDesc>
+ </category>
+ <category id="topic.Wirtschaft-Finanzen.Sozialprodukt">
+ <catDesc>Wirtschaft_Finanzen:Sozialprodukt</catDesc>
+ </category>
+ <category id="topic.Wirtschaft-Finanzen.Waehrung">
+ <catDesc>Wirtschaft_Finanzen:Waehrung</catDesc>
+ </category>
+ </category>
+ <category id="topic.Wissenschaft">
+ <catDesc>Wissenschaft</catDesc>
+ <category id="topic.Wissenschaft.Populaerwissenschaft">
+ <catDesc>Wissenschaft:Populaerwissenschaft</catDesc>
+ </category>
+ </category>
+ <category id="topic.unklassifizierbar">
+ <catDesc>Text ist thematisch nicht klassifizierbar.</catDesc>
+ </category>
+ </taxonomy>
+ </classDecl>
+ </encodingDesc>
+ <profileDesc>
+ <langusage>
+ <language id="de" usage="100">Deutsch</language>
+ </langusage>
+ <textdesc>
+ <texttype>Zeitung: Wochenzeitung</texttype>
+ <texttyperef>Wochenzeitung</texttyperef>
+ </textdesc>
+ </profileDesc>
+</idsheader>
diff --git a/t/artificial-subtoken.t b/t/artificial-subtoken.t
index ebf3b33..7a30103 100644
--- a/t/artificial-subtoken.t
+++ b/t/artificial-subtoken.t
@@ -16,7 +16,7 @@
my $path = catdir(dirname(__FILE__), 'artificial');
ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path'); # \Q..\E: match the directory literally, not as a pattern
ok($doc->parse, 'Parse document');
sub new_tokenizer {
diff --git a/t/meta.t b/t/meta.t
index 87180de..5159889 100644
--- a/t/meta.t
+++ b/t/meta.t
@@ -11,22 +11,26 @@
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
+
+# TODO: Make 'text' -> 'primaryText'
+
use_ok('KorAP::Document');
# WPD/00001
my $path = catdir(dirname(__FILE__), 'WPD/00001');
ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path'); # \Q..\E: match the directory literally; anchored like the check below
ok($doc = KorAP::Document->new( path => $path ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path'); # Path must end in the given directory plus a trailing slash
ok($doc->parse, 'Parse document');
# Metdata
+is($doc->text_sigle, 'WPD_AAA.00001', 'ID');
+
is($doc->title, 'A', 'title');
ok(!$doc->sub_title, 'subTitle');
-is($doc->text_sigle, 'WPD_AAA.00001', 'ID');
is($doc->corpus_sigle, 'WPD', 'corpusID');
is($doc->pub_date, '20050328', 'pubDate');
is($doc->pub_place, 'URL:http://de.wikipedia.org', 'pubPlace');
@@ -35,21 +39,30 @@
is($doc->text_class->[2], 'wissenschaft', 'TextClass');
is($doc->text_class->[3], 'populaerwissenschaft', 'TextClass');
ok(!$doc->text_class->[4], 'TextClass');
-is($doc->author->[0], 'Ruru', 'author');
-is($doc->author->[1], 'Jens.Ol', 'author');
-is($doc->author->[2], 'Aglarech', 'author');
-ok(!$doc->author->[3], 'author');
+is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
+
+#is($doc->author->[0], 'Ruru', 'author');
+#is($doc->author->[1], 'Jens.Ol', 'author');
+#is($doc->author->[2], 'Aglarech', 'author');
+#ok(!$doc->author->[3], 'author');
# Additional information
-is($doc->editor,'wikipedia.org', 'Editor');
+ok(!$doc->editor, 'Editor');
is($doc->publisher, 'Wikipedia', 'Publisher');
is($doc->creation_date, '20050000', 'Creation date');
-is($doc->coll_title, 'Wikipedia', 'Collection title');
-is($doc->coll_sub_title, 'Die freie Enzyklopädie', 'Collection subtitle');
-is($doc->coll_editor, 'wikipedia.org', 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
ok(!$doc->text_type, 'No text_type');
-ok(!$doc->text_type_art, 'text_type art');
+ok(!$doc->text_type_art, 'no text_type art');
+ok(!$doc->text_type_ref, 'no text_type ref');
+ok(!$doc->text_domain, 'no text_domain');
+ok(!$doc->text_column, 'no text_column');
+ok(!$doc->keywords_string, 'no keywords');
+is($doc->text_class_string, 'freizeit-unterhaltung reisen wissenschaft populaerwissenschaft', 'text classes');
+ok(!$doc->language, 'no language');
+
+#is($doc->coll_title, 'Wikipedia', 'Collection title');
+#is($doc->coll_sub_title, 'Die freie Enzyklopädie', 'Collection subtitle');
+#is($doc->coll_editor, 'wikipedia.org', 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
# BRZ13/00001
$path = catdir(dirname(__FILE__), 'BRZ13/00001');
@@ -60,21 +73,24 @@
ok(!$doc->sub_title, 'subTitle');
is($doc->text_sigle, 'BRZ13_APR.00001', 'ID');
is($doc->corpus_sigle, 'BRZ13', 'corpusID');
+
+
is($doc->pub_date, '20130402', 'pubDate');
is($doc->pub_place, 'Braunschweig', 'pubPlace');
+
is($doc->text_class->[0], 'staat-gesellschaft', 'TextClass');
is($doc->text_class->[1], 'familie-geschlecht', 'TextClass');
ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
# Additional information
ok(!$doc->editor, 'Editor');
is($doc->publisher, 'Braunschweiger Zeitungsverlag, Druckhaus Albert Limbach GmbH & Co. KG', 'Publisher');
is($doc->creation_date, '20130402', 'Creation date');
-is($doc->coll_title, 'Braunschweiger Zeitung', 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#is($doc->coll_title, 'Braunschweiger Zeitung', 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
is($doc->text_type, 'Zeitung: Tageszeitung', 'text_type');
ok(!$doc->text_type_art, 'text_type art');
@@ -92,17 +108,16 @@
is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
is($doc->text_class->[1], 'vereine-veranstaltungen', 'TextClass');
ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
-
+ok(!$doc->author, 'author');
# Additional information
ok(!$doc->editor, 'Editor');
ok(!$doc->publisher, 'Publisher');
is($doc->creation_date, '20010402', 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
ok(!$doc->text_type, 'text_type');
is($doc->text_type_art, 'Bericht', 'text_type art');
@@ -112,7 +127,8 @@
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-is($doc->title, 'Amtsblatt des Landesbezirks Baden [diverse Erlasse]', 'title');
+is($doc->title, 'MK2/ERL.00001 Amtsblatt des Landesbezirks Baden [diverse Erlasse], Hrsg. und Schriftleitung: Präsidialstelle der Landesverwaltung Baden in Karlsruhe. - Karlsruhe, o.J.', 'title'); # Amtsblatt des Landesbezirks Baden [diverse Erlasse]
+
ok(!$doc->sub_title, 'subTitle');
is($doc->text_sigle, 'MK2_ERL.00001', 'ID');
is($doc->corpus_sigle, 'MK2', 'corpusID');
@@ -121,27 +137,26 @@
is($doc->text_class->[0], 'politik', 'TextClass');
is($doc->text_class->[1], 'kommunalpolitik', 'TextClass');
ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
# Additional information
ok(!$doc->editor, 'Editor');
is($doc->publisher, 'Badenia Verlag und Druckerei', 'Publisher');
is($doc->creation_date, '19600000', 'Creation date');
-diag 'Non-acceptance of creation date ranges is temporary';
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+diag 'Non-acceptance of creation date ranges may be temporary';
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
is($doc->text_type, 'Erlass', 'text_type');
ok(!$doc->text_type_art, 'text_type art');
-
# A01/02035-substring
$path = catdir(dirname(__FILE__), 'A01/02035-substring');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-ok(!$doc->title, 'title');
+is($doc->title, 'A00/JAN.02035 St. Galler Tagblatt, 11.01.2000, Ressort: TB-RSP (Abk.)', 'title');
ok(!$doc->sub_title, 'subTitle');
is($doc->text_sigle, 'A00_JAN.02035', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
@@ -150,20 +165,19 @@
is($doc->text_class->[0], 'sport', 'TextClass');
is($doc->text_class->[1], 'ballsport', 'TextClass');
ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
# Additional information
ok(!$doc->editor, 'Editor');
ok(!$doc->publisher, 'Publisher');
is($doc->creation_date, "20000111", 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
ok(!$doc->text_type, 'text_type');
is($doc->text_type_art, 'Bericht', 'text_type art');
-
# A01/02873-meta
$path = catdir(dirname(__FILE__), 'A01/02873-meta');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
@@ -178,16 +192,16 @@
is($doc->text_class->[0], 'kultur', 'TextClass');
is($doc->text_class->[1], 'film', 'TextClass');
ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
# Additional information
ok(!$doc->editor, 'Editor');
ok(!$doc->publisher, 'Publisher');
is($doc->creation_date, "20000113", 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
ok(!$doc->text_type, 'text_type');
is($doc->text_type_art, 'Bericht', 'text_type art');
@@ -206,21 +220,20 @@
is($doc->text_class->[0], 'gesundheit-ernaehrung', 'TextClass');
is($doc->text_class->[1], 'gesundheit', 'TextClass');
ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
# Additional information
ok(!$doc->editor, 'Editor');
ok(!$doc->publisher, 'Publisher');
is($doc->creation_date, "20000124", 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
ok(!$doc->text_type, 'text_type');
is($doc->text_type_art, 'Bericht', 'text_type art');
-
# A01/07452-deep
$path = catdir(dirname(__FILE__), 'A01/07452-deep');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
@@ -235,26 +248,27 @@
is($doc->text_class->[0], 'politik', 'TextClass');
is($doc->text_class->[1], 'kommunalpolitik', 'TextClass');
ok(!$doc->text_class->[2], 'TextClass');
-ok(!$doc->author->[0], 'author');
+ok(!$doc->author, 'author');
# Additional information
ok(!$doc->editor, 'Editor');
ok(!$doc->publisher, 'Publisher');
is($doc->creation_date, "20000129", 'Creation date');
-ok(!$doc->coll_title, 'Collection title');
-ok(!$doc->coll_sub_title, 'Collection subtitle');
-ok(!$doc->coll_editor, 'Collection editor');
-ok(!$doc->coll_author, 'Collection author');
+#ok(!$doc->coll_title, 'Collection title');
+#ok(!$doc->coll_sub_title, 'Collection subtitle');
+#ok(!$doc->coll_editor, 'Collection editor');
+#ok(!$doc->coll_author, 'Collection author');
ok(!$doc->text_type, 'text_type');
is($doc->text_type_art, 'Bericht', 'text_type art');
+
# ART
$path = catdir(dirname(__FILE__), 'artificial');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+#is($doc->path, $path . '/', 'Path');
ok($doc = KorAP::Document->new( path => $path ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+#is($doc->path, $path . '/', 'Path');
ok($doc->parse, 'Parse document');
@@ -268,21 +282,119 @@
is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
is($doc->text_class->[1], 'vereine-veranstaltungen', 'TextClass');
ok(!$doc->text_class->[2], 'TextClass');
-is($doc->author->[0], 'Ruru', 'author');
-is($doc->author->[1], 'Jens.Ol', 'author');
-is($doc->author->[2], 'Aglarech', 'author');
-ok(!$doc->author->[3], 'author');
+#is($doc->author->[0], 'Ruru', 'author');
+#is($doc->author->[1], 'Jens.Ol', 'author');
+#is($doc->author->[2], 'Aglarech', 'author');
+is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
# Additional information
is($doc->editor, 'Nils Diewald', 'Editor');
is($doc->publisher, 'Artificial articles Inc.', 'Publisher');
is($doc->creation_date, '19990601', 'Creation date');
-is($doc->coll_title, 'Artificial articles', 'Collection title');
-is($doc->coll_sub_title, 'Best of!', 'Collection subtitle');
-is($doc->coll_editor, 'Nils Diewald', 'Collection editor');
-is($doc->coll_author, 'Nils Diewald', 'Collection author');
+#is($doc->coll_title, 'Artificial articles', 'Collection title');
+#is($doc->coll_sub_title, 'Best of!', 'Collection subtitle');
+#is($doc->coll_editor, 'Nils Diewald', 'Collection editor');
+#is($doc->coll_author, 'Nils Diewald', 'Collection author');
is($doc->text_type, 'Zeitung: Tageszeitung', 'No text_type');
is($doc->text_type_art, 'Bericht', 'text_type art');
+# Multipath headers
+$path = catdir(dirname(__FILE__), 'VDI/JAN/00001');
+ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+like($doc->path, qr!\Q$path\E/!, 'Path'); # \Q..\E: match the directory name literally, not as a pattern
+
+ok($doc = KorAP::Document->new( path => $path ), 'Load Korap::Document');
+like($doc->path, qr!\Q$path\E/$!, 'Path'); # path must end in the directory plus a trailing slash
+
+ok($doc->parse, 'Parse document');
+is($doc->text_sigle, 'VDI_JAN.00001', 'text sigle');
+is($doc->doc_sigle, 'VDI_JAN', 'doc sigle');
+is($doc->corpus_sigle, 'VDI', 'corpus sigle');
+is($doc->title, '10- Zz mit Zahl', 'title');
+ok(!$doc->sub_title, 'subtitle');
+is($doc->pub_date, '20140117', 'pubdate');
+is($doc->pub_place, 'Düsseldorf', 'pubplace');
+is($doc->author, 'Windhövel, Kerstin', 'author');
+is($doc->publisher, 'VDI Verlag GmbH', 'publisher');
+ok(!$doc->editor, 'editor');
+
+ok(!$doc->text_type, 'text type');
+ok(!$doc->text_type_art, 'text type art');
+ok(!$doc->text_type_ref, 'text type ref');
+ok(!$doc->text_column, 'text column');
+ok(!$doc->text_domain, 'text domain');
+ok(!$doc->creation_date, 'creation date');
+ok(!$doc->license, 'License');
+ok(!$doc->pages, 'Pages');
+ok(!$doc->file_edition_statement, 'file edition statement');
+ok(!$doc->bibl_edition_statement, 'bibl edition statement');
+is($doc->reference, 'VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl [Ausführliche Zitierung nicht verfügbar]', 'Reference');
+
+ok(!$doc->language, 'Language');
+diag 'This may be "de" in the future';
+
+is($doc->doc_title, 'VDI nachrichten, Januar 2014', 'Doc title');
+ok(!$doc->doc_sub_title, 'Doc Sub title');
+ok(!$doc->doc_editor, 'Doc editor');
+ok(!$doc->doc_author, 'Doc author');
+
+is($doc->corpus_title, 'VDI nachrichten 2014', 'Corpus title');
+ok(!$doc->corpus_sub_title, 'Corpus Sub title');
+ok(!$doc->corpus_editor, 'Corpus editor');
+ok(!$doc->corpus_author, 'Corpus author');
+
+is($doc->keywords_string, '', 'Keywords');
+is($doc->text_class_string, 'Freizeit-Unterhaltung Reisen Politik Ausland', 'Text class');
+
+
+# WDD
+$path = catdir(dirname(__FILE__), 'WDD/G27/38989');
+ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+like($doc->path, qr!\Q$path\E/!, 'Path'); # \Q..\E: match the directory name literally, not as a pattern
+ok($doc->parse, 'Parse document');
+
+is($doc->text_sigle, 'WDD11_G27.38989', 'text sigle');
+is($doc->doc_sigle, 'WDD11_G27', 'doc sigle');
+is($doc->corpus_sigle, 'WDD11', 'corpus sigle');
+
+is($doc->title, 'Diskussion:Gunter A. Pilz', 'title');
+ok(!$doc->sub_title, 'subtitle');
+is($doc->pub_date, '20111029', 'pubdate');
+is($doc->pub_place, 'URL:http://de.wikipedia.org', 'pubplace');
+
+is($doc->author, '€pa, u.a.', 'author');
+is($doc->publisher, 'Wikipedia', 'publisher');
+ok(!$doc->editor, 'editor');
+
+is($doc->text_type, 'Diskussionen zu Enzyklopädie-Artikeln', 'text type');
+ok(!$doc->text_type_art, 'text type art');
+ok(!$doc->text_type_ref, 'text type ref');
+ok(!$doc->text_column, 'text column');
+ok(!$doc->text_domain, 'text domain');
+
+is($doc->creation_date, '20070707', 'creation date');
+is($doc->license, 'CC-BY-SA', 'License');
+ok(!$doc->pages, 'Pages');
+ok(!$doc->file_edition_statement, 'file edition statement');
+ok(!$doc->bibl_edition_statement, 'bibl edition statement');
+is($doc->reference, 'Diskussion:Gunter A. Pilz, In: Wikipedia - URL:http://de.wikipedia.org/wiki/Diskussion:Gunter_A._Pilz: Wikipedia, 2007', 'Reference');
+
+is($doc->language, 'de', 'Language');
+
+is($doc->doc_title, 'Wikipedia, Diskussionen zu Artikeln mit Anfangsbuchstabe G, Teil 27', 'Doc title');
+ok(!$doc->doc_sub_title, 'Doc Sub title');
+ok(!$doc->doc_editor, 'Doc editor');
+ok(!$doc->doc_author, 'Doc author');
+
+is($doc->corpus_title, 'Wikipedia.de 2011 Diskussionen', 'Corpus title');
+ok(!$doc->corpus_sub_title, 'Corpus Sub title');
+ok(!$doc->corpus_editor, 'Corpus editor');
+ok(!$doc->corpus_author, 'Corpus author');
+
+is($doc->keywords_string, '', 'Keywords');
+is($doc->text_class_string, '', 'Text class');
+
done_testing;
__END__
+
+
diff --git a/t/real_bzk.t b/t/real_bzk.t
index d033e77..a590d44 100644
--- a/t/real_bzk.t
+++ b/t/real_bzk.t
@@ -63,7 +63,7 @@
ok(!$doc->corpus_author, 'Correct Corpus author');
ok(!$doc->corpus_editor, 'Correct Corpus editor');
-is($doc->doc_title, 'Neues Deutschland', 'Correct Doc title');
+is($doc->doc_title, 'Neues Deutschland, Jahrgangsquerschnitt 1959', 'Correct Doc title');
is($doc->doc_sub_title, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
ok(!$doc->doc_author, 'Correct Doc author');
ok(!$doc->doc_editor, 'Correct doc editor');
@@ -127,7 +127,7 @@
ok(!exists $output->{corpusAuthor}, 'Correct Corpus author');
ok(!exists $output->{corpusEditor}, 'Correct Corpus editor');
-is($output->{docTitle}, 'Neues Deutschland', 'Correct Doc title');
+is($output->{docTitle}, 'Neues Deutschland, Jahrgangsquerschnitt 1959', 'Correct Doc title');
is($output->{docSubTitle}, 'Organ des Zentralkomitees der Sozialistischen Einheitspartei Deutschlands', 'Correct Doc sub title');
ok(!exists $output->{docAuthor}, 'Correct Doc author');
ok(!exists $output->{docEditor}, 'Correct doc editor');
diff --git a/t/real_goethe.t b/t/real_goethe.t
index 878607b..9efe4c5 100644
--- a/t/real_goethe.t
+++ b/t/real_goethe.t
@@ -53,7 +53,7 @@
REF
is($doc->language, 'de', 'Language');
-is($doc->corpus_title, 'Goethes Werke', 'Correct Corpus title');
+is($doc->corpus_title, 'Goethe-Korpus', 'Correct Corpus title');
ok(!$doc->corpus_sub_title, 'Correct Corpus Sub title');
is($doc->corpus_author, 'Goethe, Johann Wolfgang von', 'Correct Corpus author');
is($doc->corpus_editor, 'Trunz, Erich', 'Correct Corpus editor');
@@ -64,7 +64,6 @@
ok(!$doc->doc_author, 'Correct Doc author');
ok(!$doc->doc_editor, 'Correct Doc editor');
-
# Tokenization
use_ok('KorAP::Tokenizer');
@@ -120,7 +119,7 @@
REF
is($output->{language}, 'de', 'Language');
-is($output->{corpusTitle}, 'Goethes Werke', 'Correct Corpus title');
+is($output->{corpusTitle}, 'Goethe-Korpus', 'Correct Corpus title');
ok(!exists $output->{corpusSubTitle}, 'Correct Text Type');
is($output->{corpusAuthor}, 'Goethe, Johann Wolfgang von', 'Correct Corpus title');
is($output->{corpusEditor}, 'Trunz, Erich', 'Editor');
diff --git a/t/transform.t b/t/transform.t
index 50cd2d7..783042d 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -14,6 +14,16 @@
use_ok('KorAP::Document');
+sub _t2h { # Turn a token string into a hash ref (set) of its annotations
+ my $string = shift;
+ $string =~ s/^\[\(\d+?-\d+?\)(.+?)\]$/$1/; # drop the leading '[(digits-digits)' and the trailing ']'
+ my %hash = ();
+ foreach (split(qr!\|!, $string)) { # annotations are separated by '|'
+ $hash{$_} = 1; # set semantics: order and duplicates are ignored
+ };
+ return \%hash; # hash ref, meant to be compared with is_deeply
+};
+
my @layers;
# push(@layers, ['Base', 'Sentences']);
push(@layers, ['Base', 'Paragraphs']);
@@ -50,10 +60,10 @@
my $path = catdir(dirname(__FILE__), 'WPD/00001');
ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path'); # \Q..\E: match the directory name literally, not as a pattern
ok($doc = KorAP::Document->new( path => $path ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path'); # \Q..\E: match the directory name literally, not as a pattern
ok($doc->parse, 'Parse document');
@@ -70,10 +80,11 @@
is($doc->text_class->[2], 'wissenschaft', 'TextClass');
is($doc->text_class->[3], 'populaerwissenschaft', 'TextClass');
ok(!$doc->text_class->[4], 'TextClass');
-is($doc->author->[0], 'Ruru', 'author');
-is($doc->author->[1], 'Jens.Ol', 'author');
-is($doc->author->[2], 'Aglarech', 'author');
-ok(!$doc->author->[3], 'author');
+is($doc->author, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
+#is($doc->author->[0], 'Ruru', 'author');
+#is($doc->author->[1], 'Jens.Ol', 'author');
+#is($doc->author->[2], 'Aglarech', 'author');
+#ok(!$doc->author->[3], 'author');
# Get tokens
use_ok('KorAP::Tokenizer');
@@ -87,7 +98,7 @@
), 'New Tokenizer');
ok($tokens->parse, 'Parse');
-is($tokens->path, $path . '/', 'Path');
+like($tokens->path, qr!\Q$path\E/$!, 'Path'); # \Q..\E: match the directory name literally, not as a pattern
is($tokens->foundry, 'OpenNLP', 'Foundry');
is($tokens->doc->text_sigle, 'WPD_AAA.00001', 'Doc id');
is($tokens->should, 1068, 'Should');
@@ -95,23 +106,38 @@
is($tokens->name, 'tokens', 'Name');
is($tokens->layer, 'Tokens', 'Layer');
-is($tokens->stream->pos(118)->to_string, '[(763-768)s:Linie|i:linie|_118#763-768]', 'Token is correct');
+is_deeply(_t2h($tokens->stream->pos(118)->to_string),
+ _t2h('[(763-768)s:Linie|i:linie|_118#763-768]'),
+ 'Token is correct');
# Add Mate
ok($tokens->add('Mate', 'Morpho'), 'Add Mate');
-is($tokens->stream->pos(118)->to_string, '[(763-768)s:Linie|i:linie|_118#763-768|mate/l:linie|mate/p:NN|mate/m:case:acc|mate/m:number:sg|mate/m:gender:fem]', 'with Mate');
+is_deeply(
+ _t2h($tokens->stream->pos(118)->to_string),
+ _t2h('[(763-768)s:Linie|i:linie|_118#763-768|mate/l:linie|mate/p:NN|mate/m:case:acc|mate/m:number:sg|mate/m:gender:fem]'),
+ 'with Mate');
# Add sentences
ok($tokens->add('Base', 'Sentences'), 'Add Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96]', 'Startinfo');
+is_deeply(
+ _t2h($tokens->stream->pos(0)->to_string),
+ _t2h('[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13<b>2|<>:base/s:t#0-6083$<i>923<b>0|-:base/sentences$<i>96]'),
+ 'Startinfo'
+);
foreach (@layers) {
ok($tokens->add(@$_), 'Add '. join(', ', @$_));
};
-is($tokens->stream->pos(0)->to_string, '[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96|<>:base/s:p#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s:s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s:s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s:s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s:s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s:s#0-74$<i>13|-:xip/sentences$<i>64]', 'Startinfo');
+is_deeply( # _t2h returns hash refs; is() would only compare their stringified addresses
+ _t2h($tokens->stream->pos(0)->to_string),
+ _t2h('[(0-1)s:A|i:a|_0#0-1|-:tokens$<i>923|mate/p:XY|<>:base/s:s#0-74$<i>13|<>:base/s:t#0-6083$<i>923|-:base/sentences$<i>96|<>:base/s:p#0-224$<i>34|-:base/paragraphs$<i>76|opennlp/p:NE|<>:opennlp/s:s#0-74$<i>13|-:opennlp/sentences$<i>50|<>:corenlp/s:s#0-6$<i>2|-:corenlp/sentences$<i>65|cnx/l:A|cnx/p:N|cnx/syn:@NH|<>:cnx/c:np#0-1$<i>1|<>:cnx/s:s#0-74$<i>13|-:cnx/sentences$<i>62|tt/l:A|tt/p:NN|tt/l:A|tt/p:FM|<>:tt/s:s#0-6083$<i>923|-:tt/sentences$<i>1|>:mate/d:PNC$<i>2|xip/p:SYMBOL|xip/l:A|<>:xip/c:TOP#0-74$<i>13|<>:xip/c:MC#0-73$<i>13<b>1|<>:xip/c:NP#0-1$<i>1<b>2|<>:xip/c:NPA#0-1$<i>1<b>3|<>:xip/c:NOUN#0-1$<i>1<b>4|<>:xip/c:SYMBOL#0-1$<i>1<b>5|>:xip/d:SUBJ$<i>3|<:xip/d:COORD$<i>1|<>:xip/s:s#0-74$<i>13|-:xip/sentences$<i>64]'),
+ 'Startinfo');
+
+done_testing;
+__END__
#is($tokens->stream->pos(118)->to_string,