Added metadata for Schreibgebrauch and fixed metadata parsing for I5
Change-Id: Ib2c9c5cce11c67bb093b2c0aa61449adff69e16e
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 3e9f82e..74a3d8c 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -49,6 +49,9 @@
corpus_title
corpus_sub_title
corpus_editor
+
+ availability
+ pub_place_key
/;
# Separate: text_class, keywords
@@ -67,6 +70,7 @@
return $log;
};
+
sub new {
my $class = shift;
my $self = bless { @_ }, $class;
@@ -95,6 +99,7 @@
}
else {
+
$file = b($data_xml)->slurp;
try {
@@ -102,11 +107,10 @@
$error = 1;
};
$rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
- }
- catch {
- $self->log->warn($unable);
- $error = 1;
- };
+ } catch {
+ $self->log->warn($unable);
+ $error = 1;
+ };
};
return if $error;
@@ -150,13 +154,37 @@
foreach (@header) {
# Get corpus, doc and text meta data
my $type = shift(@type);
- $self->_parse_meta_i5($_, $type) if -e $_;
+
+ next unless -e $_;
+
+ my $slurp = b($_)->slurp;
+ $slurp =~ /^[^>]+encoding\s*=\s*(["'])([^\1]+?)\1/;
+ my $file = $slurp->decode($2 // 'UTF-8');
+
+ # Get DOM
+ my $dom = Mojo::DOM->new($file);
+
+ if ($dom->at('idsHeader') || $dom->at('idsheader')) {
+ $self->_parse_meta_i5($dom, $type);
+ }
+ else {
+ $self->_parse_meta_tei($dom, $type);
+ };
};
return 1;
};
+# Store arbitrary key/value data on the object (combined getter/setter)
+sub store {
+ my $self = shift;
+ return $self->{store} unless @_;            # no arguments: return whatever is in $self->{store}
+ return $self->{store}->{$_[0]} if @_ == 1;  # one argument: look up the value stored under that key
+ $self->{store}->{$_[0]} = $_[1];            # two arguments: set key to value (the assigned value is the return value)
+};
+
+
# Primary data
sub primary {
$_[0]->{pd};
@@ -199,7 +227,7 @@
}
sub _remove_prefix {
- return $_[0];
+# return $_[0];
# This may render some titles wrong, e.g. 'VDI nachrichten 2014' ...
my $title = shift;
@@ -214,14 +242,78 @@
};
-sub _parse_meta_i5 {
+sub _parse_meta_tei {  # Read meta data from a header DOM without an idsHeader element; $type selects the 'text' or 'doc' branch
my $self = shift;
- my $header_xml = shift;
+ my $dom = shift;
my $type = shift;
- my $file = b($header_xml)->slurp->decode('iso-8859-1');
+ my $stmt;
+ if ($type eq 'text' && ($stmt = $dom->at('titleStmt'))) {  # text level: only handled when a titleStmt element exists
- my $dom = Mojo::DOM->new($file);
+ # Title
+ try {  # no catch block follows - presumably Try::Tiny, so failures are silently dropped; TODO confirm
+ $stmt->find('title')->each(
+ sub {
+ my $type = $_->attr('type') || 'main';  # NOTE: shadows the outer $type inside this callback; untyped titles count as 'main'
+ $self->title($_->all_text) if $type eq 'main';
+ $self->sub_title($_->all_text) if $type eq 'sub';
+ }
+ );
+ };
+
+ # Author
+ try {
+ my $author = $stmt->at('author')->attr('ref');
+ $author = $self->{ref_author}->{$author};  # look up the person record filled in by the 'doc' branch below
+ if ($author) {
+ $self->author($author->{id});
+ $self->store('sgbrAuthorAgeClass' => $author->{age}) if $author->{age};
+ $self->store('sgbrAuthorSex' => $author->{sex}) if $author->{sex};
+ };
+ };
+
+ try {
+ my $kodex = $dom->at('item[rend]')->attr('rend');  # first item element carrying a rend attribute
+ $self->store('sgbrKodex' => $kodex);
+ };
+ }
+
+ elsif ($type eq 'doc') {
+ try {
+ $dom->find('particDesc person')->each(  # register every person under '#<xml:id>' in $self->{ref_author}
+ sub {
+ $self->{ref_author}->{'#' . $_->attr('xml:id')} = {
+ age => $_->attr('age'),
+ sex => $_->attr('sex'),
+ id => $_->attr('xml:id')
+ }
+ });
+ };
+
+ try {
+ my $lang = $dom->at('language[ident]')->attr('ident');
+ $self->language($lang);
+ };
+
+ try {
+ $stmt = $dom->find('titleStmt > title')->each(  # NOTE(review): $stmt is assigned here but never read again in this sub
+ sub {
+ my $type = $_->attr('type') || 'main';
+ $self->doc_title($_->all_text) if $type eq 'main';
+ $self->doc_sub_title($_->all_text) if $type eq 'sub';
+ }
+ );
+ };
+ };
+ return;  # bare return: no meaningful return value for any $type
+};
+
+
+
+sub _parse_meta_i5 {
+ my $self = shift;
+ my $dom = shift;
+ my $type = shift;
my $analytic = $dom->at('analytic');
@@ -263,7 +355,8 @@
if ($type eq 'corpus') {
unless ($self->corpus_title) {
if (my $title = $dom->at('fileDesc > titleStmt > c\.title')) {
- $self->corpus_title(_remove_prefix($title->all_text, $self->corpus_sigle)) if $title->all_text;
+ $self->corpus_title(_remove_prefix($title->all_text, $self->corpus_sigle))
+ if $title->all_text;
};
};
}
@@ -272,7 +365,8 @@
elsif ($type eq 'doc') {
unless ($self->doc_title) {
if (my $title = $dom->at('fileDesc > titleStmt > d\.title')) {
- $self->doc_title(_remove_prefix($title->all_text, $self->doc_sigle)) if $title->all_text;
+ $self->doc_title(_remove_prefix($title->all_text, $self->doc_sigle))
+ if $title->all_text;
};
};
}
@@ -281,14 +375,16 @@
elsif ($type eq 'text') {
unless ($self->title) {
if (my $title = $dom->at('fileDesc > titleStmt > t\.title')) {
- $self->title(_remove_prefix($title->all_text, $self->text_sigle)) if $title->all_text;
- };
+ $self->title(_remove_prefix($title->all_text, $self->text_sigle))
+ if $title->all_text;
+ }
};
};
# Get PubPlace
if (my $place = $dom->at('pubPlace')) {
$self->pub_place($place->all_text) if $place->all_text;
+ $self->pub_place_key($place->attr('key')) if $place->attr('key');
};
# Get Publisher
@@ -354,6 +450,13 @@
};
};
+ # Availability
+ try {
+ $self->availability(
+ $dom->at('availability')->all_text
+ );
+ };
+
# Get pubDate
my $pub_date = $dom->find('pubDate[type=year]');
$pub_date->each(
diff --git a/t/A01/02035-substring/base/paragraph.xml b/t/A00/02035-substring/base/paragraph.xml
similarity index 100%
rename from t/A01/02035-substring/base/paragraph.xml
rename to t/A00/02035-substring/base/paragraph.xml
diff --git a/t/A01/02035-substring/base/sentences.xml b/t/A00/02035-substring/base/sentences.xml
similarity index 100%
rename from t/A01/02035-substring/base/sentences.xml
rename to t/A00/02035-substring/base/sentences.xml
diff --git a/t/A01/02035-substring/base/tokens_aggr.xml b/t/A00/02035-substring/base/tokens_aggr.xml
similarity index 100%
rename from t/A01/02035-substring/base/tokens_aggr.xml
rename to t/A00/02035-substring/base/tokens_aggr.xml
diff --git a/t/A01/02035-substring/base/tokens_conservative.xml b/t/A00/02035-substring/base/tokens_conservative.xml
similarity index 100%
rename from t/A01/02035-substring/base/tokens_conservative.xml
rename to t/A00/02035-substring/base/tokens_conservative.xml
diff --git a/t/A01/02035-substring/connexor/metadata.xml b/t/A00/02035-substring/connexor/metadata.xml
similarity index 100%
rename from t/A01/02035-substring/connexor/metadata.xml
rename to t/A00/02035-substring/connexor/metadata.xml
diff --git a/t/A01/02035-substring/connexor/morpho.xml b/t/A00/02035-substring/connexor/morpho.xml
similarity index 100%
rename from t/A01/02035-substring/connexor/morpho.xml
rename to t/A00/02035-substring/connexor/morpho.xml
diff --git a/t/A01/02035-substring/connexor/mpt.xml b/t/A00/02035-substring/connexor/mpt.xml
similarity index 100%
rename from t/A01/02035-substring/connexor/mpt.xml
rename to t/A00/02035-substring/connexor/mpt.xml
diff --git a/t/A01/02035-substring/connexor/phrase.xml b/t/A00/02035-substring/connexor/phrase.xml
similarity index 100%
rename from t/A01/02035-substring/connexor/phrase.xml
rename to t/A00/02035-substring/connexor/phrase.xml
diff --git a/t/A01/02035-substring/connexor/sentences.xml b/t/A00/02035-substring/connexor/sentences.xml
similarity index 100%
rename from t/A01/02035-substring/connexor/sentences.xml
rename to t/A00/02035-substring/connexor/sentences.xml
diff --git a/t/A01/02035-substring/connexor/syntax.xml b/t/A00/02035-substring/connexor/syntax.xml
similarity index 100%
rename from t/A01/02035-substring/connexor/syntax.xml
rename to t/A00/02035-substring/connexor/syntax.xml
diff --git a/t/A01/02035-substring/connexor/tokens.xml b/t/A00/02035-substring/connexor/tokens.xml
similarity index 100%
rename from t/A01/02035-substring/connexor/tokens.xml
rename to t/A00/02035-substring/connexor/tokens.xml
diff --git a/t/A01/02035-substring/corenlp/ne_dewac_175m_600.xml b/t/A00/02035-substring/corenlp/ne_dewac_175m_600.xml
similarity index 100%
rename from t/A01/02035-substring/corenlp/ne_dewac_175m_600.xml
rename to t/A00/02035-substring/corenlp/ne_dewac_175m_600.xml
diff --git a/t/A01/02035-substring/corenlp/ne_hgc_175m_600.xml b/t/A00/02035-substring/corenlp/ne_hgc_175m_600.xml
similarity index 100%
rename from t/A01/02035-substring/corenlp/ne_hgc_175m_600.xml
rename to t/A00/02035-substring/corenlp/ne_hgc_175m_600.xml
diff --git a/t/A01/02035-substring/corenlp/sentences.xml b/t/A00/02035-substring/corenlp/sentences.xml
similarity index 100%
rename from t/A01/02035-substring/corenlp/sentences.xml
rename to t/A00/02035-substring/corenlp/sentences.xml
diff --git a/t/A01/02035-substring/corenlp/tokens.xml b/t/A00/02035-substring/corenlp/tokens.xml
similarity index 100%
rename from t/A01/02035-substring/corenlp/tokens.xml
rename to t/A00/02035-substring/corenlp/tokens.xml
diff --git a/t/A01/02035-substring/data.xml b/t/A00/02035-substring/data.xml
similarity index 100%
rename from t/A01/02035-substring/data.xml
rename to t/A00/02035-substring/data.xml
diff --git a/t/A01/02035-substring/header.xml b/t/A00/02035-substring/header.xml
similarity index 100%
rename from t/A01/02035-substring/header.xml
rename to t/A00/02035-substring/header.xml
diff --git a/t/A01/02035-substring/mate/dependency.xml b/t/A00/02035-substring/mate/dependency.xml
similarity index 100%
rename from t/A01/02035-substring/mate/dependency.xml
rename to t/A00/02035-substring/mate/dependency.xml
diff --git a/t/A01/02035-substring/mate/morpho.xml b/t/A00/02035-substring/mate/morpho.xml
similarity index 100%
rename from t/A01/02035-substring/mate/morpho.xml
rename to t/A00/02035-substring/mate/morpho.xml
diff --git a/t/A01/02035-substring/mate/pipeline/one_token_per_line.txt b/t/A00/02035-substring/mate/pipeline/one_token_per_line.txt
similarity index 100%
rename from t/A01/02035-substring/mate/pipeline/one_token_per_line.txt
rename to t/A00/02035-substring/mate/pipeline/one_token_per_line.txt
diff --git a/t/A01/02035-substring/mate/pipeline/parsed.txt b/t/A00/02035-substring/mate/pipeline/parsed.txt
similarity index 100%
rename from t/A01/02035-substring/mate/pipeline/parsed.txt
rename to t/A00/02035-substring/mate/pipeline/parsed.txt
diff --git a/t/A01/02035-substring/mate/tokenSpans/number_tokenSpans.xml b/t/A00/02035-substring/mate/tokenSpans/number_tokenSpans.xml
similarity index 100%
rename from t/A01/02035-substring/mate/tokenSpans/number_tokenSpans.xml
rename to t/A00/02035-substring/mate/tokenSpans/number_tokenSpans.xml
diff --git a/t/A01/02035-substring/opennlp/morpho.xml b/t/A00/02035-substring/opennlp/morpho.xml
similarity index 100%
rename from t/A01/02035-substring/opennlp/morpho.xml
rename to t/A00/02035-substring/opennlp/morpho.xml
diff --git a/t/A01/02035-substring/opennlp/sentences.xml b/t/A00/02035-substring/opennlp/sentences.xml
similarity index 100%
rename from t/A01/02035-substring/opennlp/sentences.xml
rename to t/A00/02035-substring/opennlp/sentences.xml
diff --git a/t/A01/02035-substring/opennlp/tokens.xml b/t/A00/02035-substring/opennlp/tokens.xml
similarity index 100%
rename from t/A01/02035-substring/opennlp/tokens.xml
rename to t/A00/02035-substring/opennlp/tokens.xml
diff --git a/t/A01/02035-substring/struct/structure.xml b/t/A00/02035-substring/struct/structure.xml
similarity index 100%
rename from t/A01/02035-substring/struct/structure.xml
rename to t/A00/02035-substring/struct/structure.xml
diff --git a/t/A01/02035-substring/text.txt b/t/A00/02035-substring/text.txt
similarity index 100%
rename from t/A01/02035-substring/text.txt
rename to t/A00/02035-substring/text.txt
diff --git a/t/A01/02035-substring/tree_tagger/metadata.xml b/t/A00/02035-substring/tree_tagger/metadata.xml
similarity index 100%
rename from t/A01/02035-substring/tree_tagger/metadata.xml
rename to t/A00/02035-substring/tree_tagger/metadata.xml
diff --git a/t/A01/02035-substring/tree_tagger/morpho.xml b/t/A00/02035-substring/tree_tagger/morpho.xml
similarity index 100%
rename from t/A01/02035-substring/tree_tagger/morpho.xml
rename to t/A00/02035-substring/tree_tagger/morpho.xml
diff --git a/t/A01/02035-substring/tree_tagger/sentences.xml b/t/A00/02035-substring/tree_tagger/sentences.xml
similarity index 100%
rename from t/A01/02035-substring/tree_tagger/sentences.xml
rename to t/A00/02035-substring/tree_tagger/sentences.xml
diff --git a/t/A01/02035-substring/tree_tagger/tokens.xml b/t/A00/02035-substring/tree_tagger/tokens.xml
similarity index 100%
rename from t/A01/02035-substring/tree_tagger/tokens.xml
rename to t/A00/02035-substring/tree_tagger/tokens.xml
diff --git a/t/A01/02035-substring/xip/constituency.xml b/t/A00/02035-substring/xip/constituency.xml
similarity index 100%
rename from t/A01/02035-substring/xip/constituency.xml
rename to t/A00/02035-substring/xip/constituency.xml
diff --git a/t/A01/02035-substring/xip/dependency.xml b/t/A00/02035-substring/xip/dependency.xml
similarity index 100%
rename from t/A01/02035-substring/xip/dependency.xml
rename to t/A00/02035-substring/xip/dependency.xml
diff --git a/t/A01/02035-substring/xip/metadata.xml b/t/A00/02035-substring/xip/metadata.xml
similarity index 100%
rename from t/A01/02035-substring/xip/metadata.xml
rename to t/A00/02035-substring/xip/metadata.xml
diff --git a/t/A01/02035-substring/xip/morpho.xml b/t/A00/02035-substring/xip/morpho.xml
similarity index 100%
rename from t/A01/02035-substring/xip/morpho.xml
rename to t/A00/02035-substring/xip/morpho.xml
diff --git a/t/A01/02035-substring/xip/sentences.xml b/t/A00/02035-substring/xip/sentences.xml
similarity index 100%
rename from t/A01/02035-substring/xip/sentences.xml
rename to t/A00/02035-substring/xip/sentences.xml
diff --git a/t/A01/02035-substring/xip/tokens.xml b/t/A00/02035-substring/xip/tokens.xml
similarity index 100%
rename from t/A01/02035-substring/xip/tokens.xml
rename to t/A00/02035-substring/xip/tokens.xml
diff --git a/t/A01/02873-meta/base/paragraph.xml b/t/A00/02873-meta/base/paragraph.xml
similarity index 100%
rename from t/A01/02873-meta/base/paragraph.xml
rename to t/A00/02873-meta/base/paragraph.xml
diff --git a/t/A01/02873-meta/base/sentences.xml b/t/A00/02873-meta/base/sentences.xml
similarity index 100%
rename from t/A01/02873-meta/base/sentences.xml
rename to t/A00/02873-meta/base/sentences.xml
diff --git a/t/A01/02873-meta/base/tokens_aggr.xml b/t/A00/02873-meta/base/tokens_aggr.xml
similarity index 100%
rename from t/A01/02873-meta/base/tokens_aggr.xml
rename to t/A00/02873-meta/base/tokens_aggr.xml
diff --git a/t/A01/02873-meta/base/tokens_conservative.xml b/t/A00/02873-meta/base/tokens_conservative.xml
similarity index 100%
rename from t/A01/02873-meta/base/tokens_conservative.xml
rename to t/A00/02873-meta/base/tokens_conservative.xml
diff --git a/t/A01/02873-meta/connexor/metadata.xml b/t/A00/02873-meta/connexor/metadata.xml
similarity index 100%
rename from t/A01/02873-meta/connexor/metadata.xml
rename to t/A00/02873-meta/connexor/metadata.xml
diff --git a/t/A01/02873-meta/connexor/morpho.xml b/t/A00/02873-meta/connexor/morpho.xml
similarity index 100%
rename from t/A01/02873-meta/connexor/morpho.xml
rename to t/A00/02873-meta/connexor/morpho.xml
diff --git a/t/A01/02873-meta/connexor/mpt.xml b/t/A00/02873-meta/connexor/mpt.xml
similarity index 100%
rename from t/A01/02873-meta/connexor/mpt.xml
rename to t/A00/02873-meta/connexor/mpt.xml
diff --git a/t/A01/02873-meta/connexor/phrase.xml b/t/A00/02873-meta/connexor/phrase.xml
similarity index 100%
rename from t/A01/02873-meta/connexor/phrase.xml
rename to t/A00/02873-meta/connexor/phrase.xml
diff --git a/t/A01/02873-meta/connexor/sentences.xml b/t/A00/02873-meta/connexor/sentences.xml
similarity index 100%
rename from t/A01/02873-meta/connexor/sentences.xml
rename to t/A00/02873-meta/connexor/sentences.xml
diff --git a/t/A01/02873-meta/connexor/syntax.xml b/t/A00/02873-meta/connexor/syntax.xml
similarity index 100%
rename from t/A01/02873-meta/connexor/syntax.xml
rename to t/A00/02873-meta/connexor/syntax.xml
diff --git a/t/A01/02873-meta/connexor/tokens.xml b/t/A00/02873-meta/connexor/tokens.xml
similarity index 100%
rename from t/A01/02873-meta/connexor/tokens.xml
rename to t/A00/02873-meta/connexor/tokens.xml
diff --git a/t/A01/02873-meta/corenlp/ne_dewac_175m_600.xml b/t/A00/02873-meta/corenlp/ne_dewac_175m_600.xml
similarity index 100%
rename from t/A01/02873-meta/corenlp/ne_dewac_175m_600.xml
rename to t/A00/02873-meta/corenlp/ne_dewac_175m_600.xml
diff --git a/t/A01/02873-meta/corenlp/ne_hgc_175m_600.xml b/t/A00/02873-meta/corenlp/ne_hgc_175m_600.xml
similarity index 100%
rename from t/A01/02873-meta/corenlp/ne_hgc_175m_600.xml
rename to t/A00/02873-meta/corenlp/ne_hgc_175m_600.xml
diff --git a/t/A01/02873-meta/corenlp/sentences.xml b/t/A00/02873-meta/corenlp/sentences.xml
similarity index 100%
rename from t/A01/02873-meta/corenlp/sentences.xml
rename to t/A00/02873-meta/corenlp/sentences.xml
diff --git a/t/A01/02873-meta/corenlp/tokens.xml b/t/A00/02873-meta/corenlp/tokens.xml
similarity index 100%
rename from t/A01/02873-meta/corenlp/tokens.xml
rename to t/A00/02873-meta/corenlp/tokens.xml
diff --git a/t/A01/02873-meta/data.xml b/t/A00/02873-meta/data.xml
similarity index 100%
rename from t/A01/02873-meta/data.xml
rename to t/A00/02873-meta/data.xml
diff --git a/t/A01/02873-meta/header.xml b/t/A00/02873-meta/header.xml
similarity index 100%
rename from t/A01/02873-meta/header.xml
rename to t/A00/02873-meta/header.xml
diff --git a/t/A01/02873-meta/mate/dependency.xml b/t/A00/02873-meta/mate/dependency.xml
similarity index 100%
rename from t/A01/02873-meta/mate/dependency.xml
rename to t/A00/02873-meta/mate/dependency.xml
diff --git a/t/A01/02873-meta/mate/morpho.xml b/t/A00/02873-meta/mate/morpho.xml
similarity index 100%
rename from t/A01/02873-meta/mate/morpho.xml
rename to t/A00/02873-meta/mate/morpho.xml
diff --git a/t/A01/02873-meta/mate/pipeline/one_token_per_line.txt b/t/A00/02873-meta/mate/pipeline/one_token_per_line.txt
similarity index 100%
rename from t/A01/02873-meta/mate/pipeline/one_token_per_line.txt
rename to t/A00/02873-meta/mate/pipeline/one_token_per_line.txt
diff --git a/t/A01/02873-meta/mate/pipeline/parsed.txt b/t/A00/02873-meta/mate/pipeline/parsed.txt
similarity index 100%
rename from t/A01/02873-meta/mate/pipeline/parsed.txt
rename to t/A00/02873-meta/mate/pipeline/parsed.txt
diff --git a/t/A01/02873-meta/mate/tokenSpans/number_tokenSpans.xml b/t/A00/02873-meta/mate/tokenSpans/number_tokenSpans.xml
similarity index 100%
rename from t/A01/02873-meta/mate/tokenSpans/number_tokenSpans.xml
rename to t/A00/02873-meta/mate/tokenSpans/number_tokenSpans.xml
diff --git a/t/A01/02873-meta/opennlp/morpho.xml b/t/A00/02873-meta/opennlp/morpho.xml
similarity index 100%
rename from t/A01/02873-meta/opennlp/morpho.xml
rename to t/A00/02873-meta/opennlp/morpho.xml
diff --git a/t/A01/02873-meta/opennlp/sentences.xml b/t/A00/02873-meta/opennlp/sentences.xml
similarity index 100%
rename from t/A01/02873-meta/opennlp/sentences.xml
rename to t/A00/02873-meta/opennlp/sentences.xml
diff --git a/t/A01/02873-meta/opennlp/tokens.xml b/t/A00/02873-meta/opennlp/tokens.xml
similarity index 100%
rename from t/A01/02873-meta/opennlp/tokens.xml
rename to t/A00/02873-meta/opennlp/tokens.xml
diff --git a/t/A01/02873-meta/struct/structure.xml b/t/A00/02873-meta/struct/structure.xml
similarity index 100%
rename from t/A01/02873-meta/struct/structure.xml
rename to t/A00/02873-meta/struct/structure.xml
diff --git a/t/A01/02873-meta/text.txt b/t/A00/02873-meta/text.txt
similarity index 100%
rename from t/A01/02873-meta/text.txt
rename to t/A00/02873-meta/text.txt
diff --git a/t/A01/02873-meta/tree_tagger/metadata.xml b/t/A00/02873-meta/tree_tagger/metadata.xml
similarity index 100%
rename from t/A01/02873-meta/tree_tagger/metadata.xml
rename to t/A00/02873-meta/tree_tagger/metadata.xml
diff --git a/t/A01/02873-meta/tree_tagger/morpho.xml b/t/A00/02873-meta/tree_tagger/morpho.xml
similarity index 100%
rename from t/A01/02873-meta/tree_tagger/morpho.xml
rename to t/A00/02873-meta/tree_tagger/morpho.xml
diff --git a/t/A01/02873-meta/tree_tagger/sentences.xml b/t/A00/02873-meta/tree_tagger/sentences.xml
similarity index 100%
rename from t/A01/02873-meta/tree_tagger/sentences.xml
rename to t/A00/02873-meta/tree_tagger/sentences.xml
diff --git a/t/A01/02873-meta/tree_tagger/tokens.xml b/t/A00/02873-meta/tree_tagger/tokens.xml
similarity index 100%
rename from t/A01/02873-meta/tree_tagger/tokens.xml
rename to t/A00/02873-meta/tree_tagger/tokens.xml
diff --git a/t/A01/02873-meta/xip/constituency.xml b/t/A00/02873-meta/xip/constituency.xml
similarity index 100%
rename from t/A01/02873-meta/xip/constituency.xml
rename to t/A00/02873-meta/xip/constituency.xml
diff --git a/t/A01/02873-meta/xip/dependency.xml b/t/A00/02873-meta/xip/dependency.xml
similarity index 100%
rename from t/A01/02873-meta/xip/dependency.xml
rename to t/A00/02873-meta/xip/dependency.xml
diff --git a/t/A01/02873-meta/xip/metadata.xml b/t/A00/02873-meta/xip/metadata.xml
similarity index 100%
rename from t/A01/02873-meta/xip/metadata.xml
rename to t/A00/02873-meta/xip/metadata.xml
diff --git a/t/A01/02873-meta/xip/morpho.xml b/t/A00/02873-meta/xip/morpho.xml
similarity index 100%
rename from t/A01/02873-meta/xip/morpho.xml
rename to t/A00/02873-meta/xip/morpho.xml
diff --git a/t/A01/02873-meta/xip/sentences.xml b/t/A00/02873-meta/xip/sentences.xml
similarity index 100%
rename from t/A01/02873-meta/xip/sentences.xml
rename to t/A00/02873-meta/xip/sentences.xml
diff --git a/t/A01/02873-meta/xip/tokens.xml b/t/A00/02873-meta/xip/tokens.xml
similarity index 100%
rename from t/A01/02873-meta/xip/tokens.xml
rename to t/A00/02873-meta/xip/tokens.xml
diff --git a/t/A01/05663-unbalanced/base/paragraph.xml b/t/A00/05663-unbalanced/base/paragraph.xml
similarity index 100%
rename from t/A01/05663-unbalanced/base/paragraph.xml
rename to t/A00/05663-unbalanced/base/paragraph.xml
diff --git a/t/A01/05663-unbalanced/base/sentences.xml b/t/A00/05663-unbalanced/base/sentences.xml
similarity index 100%
rename from t/A01/05663-unbalanced/base/sentences.xml
rename to t/A00/05663-unbalanced/base/sentences.xml
diff --git a/t/A01/05663-unbalanced/base/tokens_aggr.xml b/t/A00/05663-unbalanced/base/tokens_aggr.xml
similarity index 100%
rename from t/A01/05663-unbalanced/base/tokens_aggr.xml
rename to t/A00/05663-unbalanced/base/tokens_aggr.xml
diff --git a/t/A01/05663-unbalanced/base/tokens_conservative.xml b/t/A00/05663-unbalanced/base/tokens_conservative.xml
similarity index 100%
rename from t/A01/05663-unbalanced/base/tokens_conservative.xml
rename to t/A00/05663-unbalanced/base/tokens_conservative.xml
diff --git a/t/A01/05663-unbalanced/connexor/metadata.xml b/t/A00/05663-unbalanced/connexor/metadata.xml
similarity index 100%
rename from t/A01/05663-unbalanced/connexor/metadata.xml
rename to t/A00/05663-unbalanced/connexor/metadata.xml
diff --git a/t/A01/05663-unbalanced/connexor/morpho.xml b/t/A00/05663-unbalanced/connexor/morpho.xml
similarity index 100%
rename from t/A01/05663-unbalanced/connexor/morpho.xml
rename to t/A00/05663-unbalanced/connexor/morpho.xml
diff --git a/t/A01/05663-unbalanced/connexor/mpt.xml b/t/A00/05663-unbalanced/connexor/mpt.xml
similarity index 100%
rename from t/A01/05663-unbalanced/connexor/mpt.xml
rename to t/A00/05663-unbalanced/connexor/mpt.xml
diff --git a/t/A01/05663-unbalanced/connexor/phrase.xml b/t/A00/05663-unbalanced/connexor/phrase.xml
similarity index 100%
rename from t/A01/05663-unbalanced/connexor/phrase.xml
rename to t/A00/05663-unbalanced/connexor/phrase.xml
diff --git a/t/A01/05663-unbalanced/connexor/sentences.xml b/t/A00/05663-unbalanced/connexor/sentences.xml
similarity index 100%
rename from t/A01/05663-unbalanced/connexor/sentences.xml
rename to t/A00/05663-unbalanced/connexor/sentences.xml
diff --git a/t/A01/05663-unbalanced/connexor/syntax.xml b/t/A00/05663-unbalanced/connexor/syntax.xml
similarity index 100%
rename from t/A01/05663-unbalanced/connexor/syntax.xml
rename to t/A00/05663-unbalanced/connexor/syntax.xml
diff --git a/t/A01/05663-unbalanced/connexor/tokens.xml b/t/A00/05663-unbalanced/connexor/tokens.xml
similarity index 100%
rename from t/A01/05663-unbalanced/connexor/tokens.xml
rename to t/A00/05663-unbalanced/connexor/tokens.xml
diff --git a/t/A01/05663-unbalanced/corenlp/ne_dewac_175m_600.xml b/t/A00/05663-unbalanced/corenlp/ne_dewac_175m_600.xml
similarity index 100%
rename from t/A01/05663-unbalanced/corenlp/ne_dewac_175m_600.xml
rename to t/A00/05663-unbalanced/corenlp/ne_dewac_175m_600.xml
diff --git a/t/A01/05663-unbalanced/corenlp/ne_hgc_175m_600.xml b/t/A00/05663-unbalanced/corenlp/ne_hgc_175m_600.xml
similarity index 100%
rename from t/A01/05663-unbalanced/corenlp/ne_hgc_175m_600.xml
rename to t/A00/05663-unbalanced/corenlp/ne_hgc_175m_600.xml
diff --git a/t/A01/05663-unbalanced/corenlp/sentences.xml b/t/A00/05663-unbalanced/corenlp/sentences.xml
similarity index 100%
rename from t/A01/05663-unbalanced/corenlp/sentences.xml
rename to t/A00/05663-unbalanced/corenlp/sentences.xml
diff --git a/t/A01/05663-unbalanced/corenlp/tokens.xml b/t/A00/05663-unbalanced/corenlp/tokens.xml
similarity index 100%
rename from t/A01/05663-unbalanced/corenlp/tokens.xml
rename to t/A00/05663-unbalanced/corenlp/tokens.xml
diff --git a/t/A01/05663-unbalanced/data.xml b/t/A00/05663-unbalanced/data.xml
similarity index 100%
rename from t/A01/05663-unbalanced/data.xml
rename to t/A00/05663-unbalanced/data.xml
diff --git a/t/A01/05663-unbalanced/header.xml b/t/A00/05663-unbalanced/header.xml
similarity index 100%
rename from t/A01/05663-unbalanced/header.xml
rename to t/A00/05663-unbalanced/header.xml
diff --git a/t/A01/05663-unbalanced/mate/dependency.xml b/t/A00/05663-unbalanced/mate/dependency.xml
similarity index 100%
rename from t/A01/05663-unbalanced/mate/dependency.xml
rename to t/A00/05663-unbalanced/mate/dependency.xml
diff --git a/t/A01/05663-unbalanced/mate/morpho.xml b/t/A00/05663-unbalanced/mate/morpho.xml
similarity index 100%
rename from t/A01/05663-unbalanced/mate/morpho.xml
rename to t/A00/05663-unbalanced/mate/morpho.xml
diff --git a/t/A01/05663-unbalanced/mate/pipeline/one_token_per_line.txt b/t/A00/05663-unbalanced/mate/pipeline/one_token_per_line.txt
similarity index 100%
rename from t/A01/05663-unbalanced/mate/pipeline/one_token_per_line.txt
rename to t/A00/05663-unbalanced/mate/pipeline/one_token_per_line.txt
diff --git a/t/A01/05663-unbalanced/mate/pipeline/parsed.txt b/t/A00/05663-unbalanced/mate/pipeline/parsed.txt
similarity index 100%
rename from t/A01/05663-unbalanced/mate/pipeline/parsed.txt
rename to t/A00/05663-unbalanced/mate/pipeline/parsed.txt
diff --git a/t/A01/05663-unbalanced/mate/tokenSpans/number_tokenSpans.xml b/t/A00/05663-unbalanced/mate/tokenSpans/number_tokenSpans.xml
similarity index 100%
rename from t/A01/05663-unbalanced/mate/tokenSpans/number_tokenSpans.xml
rename to t/A00/05663-unbalanced/mate/tokenSpans/number_tokenSpans.xml
diff --git a/t/A01/05663-unbalanced/opennlp/morpho.xml b/t/A00/05663-unbalanced/opennlp/morpho.xml
similarity index 100%
rename from t/A01/05663-unbalanced/opennlp/morpho.xml
rename to t/A00/05663-unbalanced/opennlp/morpho.xml
diff --git a/t/A01/05663-unbalanced/opennlp/sentences.xml b/t/A00/05663-unbalanced/opennlp/sentences.xml
similarity index 100%
rename from t/A01/05663-unbalanced/opennlp/sentences.xml
rename to t/A00/05663-unbalanced/opennlp/sentences.xml
diff --git a/t/A01/05663-unbalanced/opennlp/tokens.xml b/t/A00/05663-unbalanced/opennlp/tokens.xml
similarity index 100%
rename from t/A01/05663-unbalanced/opennlp/tokens.xml
rename to t/A00/05663-unbalanced/opennlp/tokens.xml
diff --git a/t/A01/05663-unbalanced/struct/structure.xml b/t/A00/05663-unbalanced/struct/structure.xml
similarity index 100%
rename from t/A01/05663-unbalanced/struct/structure.xml
rename to t/A00/05663-unbalanced/struct/structure.xml
diff --git a/t/A01/05663-unbalanced/text.txt b/t/A00/05663-unbalanced/text.txt
similarity index 100%
rename from t/A01/05663-unbalanced/text.txt
rename to t/A00/05663-unbalanced/text.txt
diff --git a/t/A01/05663-unbalanced/tree_tagger/metadata.xml b/t/A00/05663-unbalanced/tree_tagger/metadata.xml
similarity index 100%
rename from t/A01/05663-unbalanced/tree_tagger/metadata.xml
rename to t/A00/05663-unbalanced/tree_tagger/metadata.xml
diff --git a/t/A01/05663-unbalanced/tree_tagger/morpho.xml b/t/A00/05663-unbalanced/tree_tagger/morpho.xml
similarity index 100%
rename from t/A01/05663-unbalanced/tree_tagger/morpho.xml
rename to t/A00/05663-unbalanced/tree_tagger/morpho.xml
diff --git a/t/A01/05663-unbalanced/tree_tagger/sentences.xml b/t/A00/05663-unbalanced/tree_tagger/sentences.xml
similarity index 100%
rename from t/A01/05663-unbalanced/tree_tagger/sentences.xml
rename to t/A00/05663-unbalanced/tree_tagger/sentences.xml
diff --git a/t/A01/05663-unbalanced/tree_tagger/tokens.xml b/t/A00/05663-unbalanced/tree_tagger/tokens.xml
similarity index 100%
rename from t/A01/05663-unbalanced/tree_tagger/tokens.xml
rename to t/A00/05663-unbalanced/tree_tagger/tokens.xml
diff --git a/t/A01/05663-unbalanced/xip/constituency.xml b/t/A00/05663-unbalanced/xip/constituency.xml
similarity index 100%
rename from t/A01/05663-unbalanced/xip/constituency.xml
rename to t/A00/05663-unbalanced/xip/constituency.xml
diff --git a/t/A01/05663-unbalanced/xip/dependency.xml b/t/A00/05663-unbalanced/xip/dependency.xml
similarity index 100%
rename from t/A01/05663-unbalanced/xip/dependency.xml
rename to t/A00/05663-unbalanced/xip/dependency.xml
diff --git a/t/A01/05663-unbalanced/xip/metadata.xml b/t/A00/05663-unbalanced/xip/metadata.xml
similarity index 100%
rename from t/A01/05663-unbalanced/xip/metadata.xml
rename to t/A00/05663-unbalanced/xip/metadata.xml
diff --git a/t/A01/05663-unbalanced/xip/morpho.xml b/t/A00/05663-unbalanced/xip/morpho.xml
similarity index 100%
rename from t/A01/05663-unbalanced/xip/morpho.xml
rename to t/A00/05663-unbalanced/xip/morpho.xml
diff --git a/t/A01/05663-unbalanced/xip/sentences.xml b/t/A00/05663-unbalanced/xip/sentences.xml
similarity index 100%
rename from t/A01/05663-unbalanced/xip/sentences.xml
rename to t/A00/05663-unbalanced/xip/sentences.xml
diff --git a/t/A01/05663-unbalanced/xip/tokens.xml b/t/A00/05663-unbalanced/xip/tokens.xml
similarity index 100%
rename from t/A01/05663-unbalanced/xip/tokens.xml
rename to t/A00/05663-unbalanced/xip/tokens.xml
diff --git a/t/A01/07452-deep/base/paragraph.xml b/t/A00/07452-deep/base/paragraph.xml
similarity index 100%
rename from t/A01/07452-deep/base/paragraph.xml
rename to t/A00/07452-deep/base/paragraph.xml
diff --git a/t/A01/07452-deep/base/sentences.xml b/t/A00/07452-deep/base/sentences.xml
similarity index 100%
rename from t/A01/07452-deep/base/sentences.xml
rename to t/A00/07452-deep/base/sentences.xml
diff --git a/t/A01/07452-deep/base/tokens_aggr.xml b/t/A00/07452-deep/base/tokens_aggr.xml
similarity index 100%
rename from t/A01/07452-deep/base/tokens_aggr.xml
rename to t/A00/07452-deep/base/tokens_aggr.xml
diff --git a/t/A01/07452-deep/base/tokens_conservative.xml b/t/A00/07452-deep/base/tokens_conservative.xml
similarity index 100%
rename from t/A01/07452-deep/base/tokens_conservative.xml
rename to t/A00/07452-deep/base/tokens_conservative.xml
diff --git a/t/A01/07452-deep/connexor/metadata.xml b/t/A00/07452-deep/connexor/metadata.xml
similarity index 100%
rename from t/A01/07452-deep/connexor/metadata.xml
rename to t/A00/07452-deep/connexor/metadata.xml
diff --git a/t/A01/07452-deep/connexor/morpho.xml b/t/A00/07452-deep/connexor/morpho.xml
similarity index 100%
rename from t/A01/07452-deep/connexor/morpho.xml
rename to t/A00/07452-deep/connexor/morpho.xml
diff --git a/t/A01/07452-deep/connexor/mpt.xml b/t/A00/07452-deep/connexor/mpt.xml
similarity index 100%
rename from t/A01/07452-deep/connexor/mpt.xml
rename to t/A00/07452-deep/connexor/mpt.xml
diff --git a/t/A01/07452-deep/connexor/phrase.xml b/t/A00/07452-deep/connexor/phrase.xml
similarity index 100%
rename from t/A01/07452-deep/connexor/phrase.xml
rename to t/A00/07452-deep/connexor/phrase.xml
diff --git a/t/A01/07452-deep/connexor/sentences.xml b/t/A00/07452-deep/connexor/sentences.xml
similarity index 100%
rename from t/A01/07452-deep/connexor/sentences.xml
rename to t/A00/07452-deep/connexor/sentences.xml
diff --git a/t/A01/07452-deep/connexor/syntax.xml b/t/A00/07452-deep/connexor/syntax.xml
similarity index 100%
rename from t/A01/07452-deep/connexor/syntax.xml
rename to t/A00/07452-deep/connexor/syntax.xml
diff --git a/t/A01/07452-deep/connexor/tokens.xml b/t/A00/07452-deep/connexor/tokens.xml
similarity index 100%
rename from t/A01/07452-deep/connexor/tokens.xml
rename to t/A00/07452-deep/connexor/tokens.xml
diff --git a/t/A01/07452-deep/corenlp/ne_dewac_175m_600.xml b/t/A00/07452-deep/corenlp/ne_dewac_175m_600.xml
similarity index 100%
rename from t/A01/07452-deep/corenlp/ne_dewac_175m_600.xml
rename to t/A00/07452-deep/corenlp/ne_dewac_175m_600.xml
diff --git a/t/A01/07452-deep/corenlp/ne_hgc_175m_600.xml b/t/A00/07452-deep/corenlp/ne_hgc_175m_600.xml
similarity index 100%
rename from t/A01/07452-deep/corenlp/ne_hgc_175m_600.xml
rename to t/A00/07452-deep/corenlp/ne_hgc_175m_600.xml
diff --git a/t/A01/07452-deep/corenlp/sentences.xml b/t/A00/07452-deep/corenlp/sentences.xml
similarity index 100%
rename from t/A01/07452-deep/corenlp/sentences.xml
rename to t/A00/07452-deep/corenlp/sentences.xml
diff --git a/t/A01/07452-deep/corenlp/tokens.xml b/t/A00/07452-deep/corenlp/tokens.xml
similarity index 100%
rename from t/A01/07452-deep/corenlp/tokens.xml
rename to t/A00/07452-deep/corenlp/tokens.xml
diff --git a/t/A01/07452-deep/data.xml b/t/A00/07452-deep/data.xml
similarity index 100%
rename from t/A01/07452-deep/data.xml
rename to t/A00/07452-deep/data.xml
diff --git a/t/A01/07452-deep/header.xml b/t/A00/07452-deep/header.xml
similarity index 100%
rename from t/A01/07452-deep/header.xml
rename to t/A00/07452-deep/header.xml
diff --git a/t/A01/07452-deep/mate/dependency.xml b/t/A00/07452-deep/mate/dependency.xml
similarity index 100%
rename from t/A01/07452-deep/mate/dependency.xml
rename to t/A00/07452-deep/mate/dependency.xml
diff --git a/t/A01/07452-deep/mate/morpho.xml b/t/A00/07452-deep/mate/morpho.xml
similarity index 100%
rename from t/A01/07452-deep/mate/morpho.xml
rename to t/A00/07452-deep/mate/morpho.xml
diff --git a/t/A01/07452-deep/mate/pipeline/one_token_per_line.txt b/t/A00/07452-deep/mate/pipeline/one_token_per_line.txt
similarity index 100%
rename from t/A01/07452-deep/mate/pipeline/one_token_per_line.txt
rename to t/A00/07452-deep/mate/pipeline/one_token_per_line.txt
diff --git a/t/A01/07452-deep/mate/pipeline/parsed.txt b/t/A00/07452-deep/mate/pipeline/parsed.txt
similarity index 100%
rename from t/A01/07452-deep/mate/pipeline/parsed.txt
rename to t/A00/07452-deep/mate/pipeline/parsed.txt
diff --git a/t/A01/07452-deep/mate/tokenSpans/number_tokenSpans.xml b/t/A00/07452-deep/mate/tokenSpans/number_tokenSpans.xml
similarity index 100%
rename from t/A01/07452-deep/mate/tokenSpans/number_tokenSpans.xml
rename to t/A00/07452-deep/mate/tokenSpans/number_tokenSpans.xml
diff --git a/t/A01/07452-deep/opennlp/morpho.xml b/t/A00/07452-deep/opennlp/morpho.xml
similarity index 100%
rename from t/A01/07452-deep/opennlp/morpho.xml
rename to t/A00/07452-deep/opennlp/morpho.xml
diff --git a/t/A01/07452-deep/opennlp/sentences.xml b/t/A00/07452-deep/opennlp/sentences.xml
similarity index 100%
rename from t/A01/07452-deep/opennlp/sentences.xml
rename to t/A00/07452-deep/opennlp/sentences.xml
diff --git a/t/A01/07452-deep/opennlp/tokens.xml b/t/A00/07452-deep/opennlp/tokens.xml
similarity index 100%
rename from t/A01/07452-deep/opennlp/tokens.xml
rename to t/A00/07452-deep/opennlp/tokens.xml
diff --git a/t/A01/07452-deep/struct/structure.xml b/t/A00/07452-deep/struct/structure.xml
similarity index 100%
rename from t/A01/07452-deep/struct/structure.xml
rename to t/A00/07452-deep/struct/structure.xml
diff --git a/t/A01/07452-deep/text.txt b/t/A00/07452-deep/text.txt
similarity index 100%
rename from t/A01/07452-deep/text.txt
rename to t/A00/07452-deep/text.txt
diff --git a/t/A01/07452-deep/tree_tagger/metadata.xml b/t/A00/07452-deep/tree_tagger/metadata.xml
similarity index 100%
rename from t/A01/07452-deep/tree_tagger/metadata.xml
rename to t/A00/07452-deep/tree_tagger/metadata.xml
diff --git a/t/A01/07452-deep/tree_tagger/morpho.xml b/t/A00/07452-deep/tree_tagger/morpho.xml
similarity index 100%
rename from t/A01/07452-deep/tree_tagger/morpho.xml
rename to t/A00/07452-deep/tree_tagger/morpho.xml
diff --git a/t/A01/07452-deep/tree_tagger/sentences.xml b/t/A00/07452-deep/tree_tagger/sentences.xml
similarity index 100%
rename from t/A01/07452-deep/tree_tagger/sentences.xml
rename to t/A00/07452-deep/tree_tagger/sentences.xml
diff --git a/t/A01/07452-deep/tree_tagger/tokens.xml b/t/A00/07452-deep/tree_tagger/tokens.xml
similarity index 100%
rename from t/A01/07452-deep/tree_tagger/tokens.xml
rename to t/A00/07452-deep/tree_tagger/tokens.xml
diff --git a/t/A01/07452-deep/xip/constituency.xml b/t/A00/07452-deep/xip/constituency.xml
similarity index 100%
rename from t/A01/07452-deep/xip/constituency.xml
rename to t/A00/07452-deep/xip/constituency.xml
diff --git a/t/A01/07452-deep/xip/dependency.xml b/t/A00/07452-deep/xip/dependency.xml
similarity index 100%
rename from t/A01/07452-deep/xip/dependency.xml
rename to t/A00/07452-deep/xip/dependency.xml
diff --git a/t/A01/07452-deep/xip/metadata.xml b/t/A00/07452-deep/xip/metadata.xml
similarity index 100%
rename from t/A01/07452-deep/xip/metadata.xml
rename to t/A00/07452-deep/xip/metadata.xml
diff --git a/t/A01/07452-deep/xip/morpho.xml b/t/A00/07452-deep/xip/morpho.xml
similarity index 100%
rename from t/A01/07452-deep/xip/morpho.xml
rename to t/A00/07452-deep/xip/morpho.xml
diff --git a/t/A01/07452-deep/xip/sentences.xml b/t/A00/07452-deep/xip/sentences.xml
similarity index 100%
rename from t/A01/07452-deep/xip/sentences.xml
rename to t/A00/07452-deep/xip/sentences.xml
diff --git a/t/A01/07452-deep/xip/tokens.xml b/t/A00/07452-deep/xip/tokens.xml
similarity index 100%
rename from t/A01/07452-deep/xip/tokens.xml
rename to t/A00/07452-deep/xip/tokens.xml
diff --git a/t/VDI/JAN/00001/data.xml b/t/VDI/JAN/00001/data.xml
index 21fd76f..545f020 100644
--- a/t/VDI/JAN/00001/data.xml
+++ b/t/VDI/JAN/00001/data.xml
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
-<raw_text docid="VDI_JAN.00001" xmlns="http://ids-mannheim.de/ns/KorAP">
+<raw_text docid="VDI14_JAN.00001" xmlns="http://ids-mannheim.de/ns/KorAP">
<metadata file="metadata.xml" />
<text>hui</text>
</raw_text>
diff --git a/t/artificial-subtoken.t b/t/artificial-subtoken.t
deleted file mode 100644
index 7a30103..0000000
--- a/t/artificial-subtoken.t
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env perl
-# source ~/perl5/perlbrew/etc/bashrc
-# perlbrew switch perl-blead@korap
-use strict;
-use warnings;
-use utf8;
-use Test::More;
-use Benchmark ':hireswallclock';
-use lib 'lib', '../lib';
-use Scalar::Util qw/weaken/;
-
-use File::Basename 'dirname';
-use File::Spec::Functions 'catdir';
-
-use_ok('KorAP::Document');
-
-my $path = catdir(dirname(__FILE__), 'artificial');
-ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-like($doc->path, qr!$path/$!, 'Path');
-ok($doc->parse, 'Parse document');
-
-sub new_tokenizer {
- my $x = $doc;
- weaken $x;
- return KorAP::Tokenizer->new(
- path => $x->path,
- doc => $x,
- foundry => 'OpenNLP',
- layer => 'Tokens',
- name => 'tokens'
- )
-};
-
-is($doc->primary->data,
- 'Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, '.
- 'bevor der Betrieb Ende Schuljahr eingestellt wird.', 'Primary data');
-
-is($doc->primary->data_length, 129, 'Primary data length');
-
-is($doc->primary->data(0,3), 'Zum', 'Get primary data');
-
-# Get tokens
-use_ok('KorAP::Tokenizer');
-# Get tokenization
-ok(my $tokens = KorAP::Tokenizer->new(
- path => $doc->path,
- doc => $doc,
- foundry => 'OpenNLP',
- layer => 'Tokens',
- name => 'tokens'
-), 'New Tokenizer');
-ok($tokens->parse, 'Parse');
-
-ok($tokens->add_subtokens, 'Add subtokens');
-
-# diag $tokens->to_string;
-
-#foreach (@{$tokens->stream->multi_term_tokens}) {
-# print $_;
-#};
-
-done_testing;
-
-
-__END__
diff --git a/t/artificial.t b/t/artificial.t
deleted file mode 100644
index 95ef890..0000000
--- a/t/artificial.t
+++ /dev/null
@@ -1,452 +0,0 @@
-#!/usr/bin/env perl
-# source ~/perl5/perlbrew/etc/bashrc
-# perlbrew switch perl-blead@korap
-use strict;
-use warnings;
-use utf8;
-use Test::More;
-use Benchmark ':hireswallclock';
-use lib 'lib', '../lib';
-use Scalar::Util qw/weaken/;
-
-use File::Basename 'dirname';
-use File::Spec::Functions 'catdir';
-
-use_ok('KorAP::Document');
-
-# Tests for material identicality of a token
-sub _t2h {
- my $string = shift;
- $string =~ s/^\[\(\d+?-\d+?\)(.+?)\]$/$1/;
- my %hash = ();
- foreach (split(qr!\|!, $string)) {
- $hash{$_} = 1;
- };
- return \%hash;
-};
-
-
-my $path = catdir(dirname(__FILE__), 'artificial');
-ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-like($doc->path, qr!$path/$!, 'Path');
-ok($doc->parse, 'Parse document');
-
-sub new_tokenizer {
- my $x = $doc;
- weaken $x;
- return KorAP::Tokenizer->new(
- path => $x->path,
- doc => $x,
- foundry => 'OpenNLP',
- layer => 'Tokens',
- name => 'tokens'
- )
-};
-
-is($doc->primary->data,
- 'Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, '.
- 'bevor der Betrieb Ende Schuljahr eingestellt wird.', 'Primary data');
-
-is($doc->primary->data_length, 129, 'Primary data length');
-
-is($doc->primary->data(0,3), 'Zum', 'Get primary data');
-
-# Get tokens
-use_ok('KorAP::Tokenizer');
-# Get tokenization
-ok(my $tokens = KorAP::Tokenizer->new(
- path => $doc->path,
- doc => $doc,
- foundry => 'OpenNLP',
- layer => 'Tokens',
- name => 'tokens'
-), 'New Tokenizer');
-ok($tokens->parse, 'Parse');
-
-is($tokens->foundry, 'OpenNLP', 'Foundry');
-
-is($tokens->doc->text_sigle, 'ART_ABC.00001', 'Doc id');
-is($tokens->should, 20, 'Should');
-is($tokens->have, 18, 'Have');
-is($tokens->name, 'tokens', 'Name');
-is($tokens->layer, 'Tokens', 'Layer');
-
-is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum]', 'Token is correct');
-
-is($tokens->stream->pos(1)->to_string, '[(4-11)_1#4-11|i:letzten|s:letzten]', 'Token is correct');
-
-my $i = 2;
-foreach ([12,23, 'kulturellen'],
- [24,30, 'Anlass'],
- [31,35, 'lädt'],
- [36,39, 'die'],
- [40,47, 'Leitung'],
- [48,51, 'des'],
- [52,63, 'Schulheimes'],
- [64,73, 'Hofbergli'],
- [74,77, 'ein'],
- [79,84, 'bevor'],
- [85,88, 'der'],
- [89,96, 'Betrieb'],
- [97,101, 'Ende'],
- [102,111, 'Schuljahr'],
- [112,123, 'eingestellt'],
- [124,128, 'wird']
- ) {
- is($tokens->stream->pos($i++)->to_string,
- '[('.$_->[0].'-'.$_->[1].')'.
- '_'.($i-1).'#'.$_->[0].'-'.$_->[1] . '|' .
- 'i:'.lc($_->[2]).'|s:'.$_->[2].']',
- 'Token is correct');
-};
-
-ok(!$tokens->stream->pos($i++), 'No more tokens');
-
-# Add OpenNLP/morpho
-ok($tokens->add('OpenNLP', 'Morpho'), 'Add OpenNLP/Morpho');
-
-$i = 0;
-foreach (qw/APPRART ADJA ADJA NN VVFIN ART NN ART NN NE PTKVZ KOUS ART NN NN NN VVPP VAFIN/) {
- like($tokens->stream->pos($i++)->to_string,
- qr!\|opennlp/p:$_!,
- 'Annotation (OpenNLP/p) is correct: ' . $_
- );
-};
-
-# Add OpenNLP/sentences
-ok($tokens->add('OpenNLP', 'Sentences'), 'Add OpenNLP/Sentences');
-
-is($tokens->stream->pos(0)->to_string,
- '[(0-3)-:opennlp/sentences$<i>1|-:tokens$<i>18|<>:opennlp/s:s#0-129$<i>17<b>0|_0#0-3|i:zum|opennlp/p:APPRART|s:Zum]',
- # '[(0-3)-:opennlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17]',
- 'Correct sentence'
- );
-
-# New instantiation
-ok($tokens = KorAP::Tokenizer->new(
- path => $doc->path,
- doc => $doc,
- foundry => 'OpenNLP',
- layer => 'Tokens',
- name => 'tokens'
-), 'New Tokenizer');
-
-ok($tokens->parse, 'Parse');
-
-# Add OpenNLP/sentences
-ok($tokens->add('Base', 'Sentences'), 'Add Base/Sentences');
-
-# Add OpenNLP/sentences
-ok($tokens->add('Base', 'Paragraphs'), 'Add Base/Paragraphs');
-
-is_deeply(
- _t2h($tokens->stream->pos(0)->to_string),
- _t2h('[(0-3)-:base/paragraphs$<i>1|-:base/sentences$<i>1|-:tokens$<i>18|<>:base/s:t#0-129$<i>17<b>0|<>:base/s:p#0-129$<i>17<b>1|<>:base/s:s#0-129$<i>17<b>2|_0#0-3|i:zum|s:Zum]'),
- 'Correct base annotation');
-
-# New instantiation
-ok($tokens = new_tokenizer->parse, 'Parse');
-
-# Add CoreNLP/NamedEntities
-ok($tokens->add('CoreNLP', 'NamedEntities', 'ne_dewac_175m_600'), 'Add CoreNLP/NamedEntities');
-ok($tokens->add('CoreNLP', 'NamedEntities', 'ne_hgc_175m_600'), 'Add CoreNLP/NamedEntities');
-
-# [(64-73)s:Hofbergli|i:hofbergli|_9#64-73|corenlp/ne_dewac_175m_600:I-LOC|corenlp/ne_hgc_175m_600:I-LOC]
-is_deeply(
- _t2h($tokens->stream->pos(9)->to_string),
- _t2h('[(64-73)_9#64-73|corenlp/ne:I-LOC|i:hofbergli|s:Hofbergli]'),
- 'Correct NamedEntities annotation'
-);
-
-# New instantiation
-ok($tokens = new_tokenizer->parse, 'Parse');
-
-# Add CoreNLP/Morpho
-ok($tokens->add('CoreNLP', 'Morpho'), 'Add CoreNLP/Morpho');
-
-is_deeply(
- _t2h($tokens->stream->pos(0)->to_string),
- _t2h('[(0-3)-:tokens$<i>18|_0#0-3|corenlp/p:APPRART|i:zum|s:Zum]'),
- 'Correct corenlp annotation'
-);
-
-$i = 0;
-foreach (qw/APPRART ADJ ADJA NN VVFIN ART NN ART NN NE PTKVZ KOUS ART NN NN NN VVPP VAFIN/) {
- like($tokens->stream->pos($i++)->to_string,
- qr!\|corenlp/p:$_!,
- 'Annotation (CoreNLP/p) is correct: '. $_);
-};
-
-
-# Add CoreNLP/Sentences
-ok($tokens->add('CoreNLP', 'Sentences'), 'Add CoreNLP/Sentences');
-
-is_deeply(
- _t2h($tokens->stream->pos(0)->to_string),
- _t2h('[(0-3)-:corenlp/sentences$<i>1|-:tokens$<i>18|<>:corenlp/s:s#0-129$<i>17<b>0|_0#0-3|corenlp/p:APPRART|i:zum|s:Zum]'),
- # '[(0-3)-:corenlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17]',
- 'Correct corenlp annotation'
-);
-
-# New instantiation
-ok($tokens = new_tokenizer->parse, 'New Tokenizer');
-
-# Add CoreNLP/Sentences
-ok($tokens->add('Connexor', 'Sentences'), 'Add Connexor/Sentences');
-
-is_deeply(
- _t2h($tokens->stream->pos(0)->to_string),
- _t2h('[(0-3)-:cnx/sentences$<i>1|-:tokens$<i>18|<>:cnx/s:s#0-129$<i>17<b>0|_0#0-3|i:zum|s:Zum]'),
- # '[(0-3)-:cnx/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:cnx/s:s#0-129$<i>17<b>0]',
- 'Correct cnx annotation'
-);
-
-# New instantiation
-ok($tokens = new_tokenizer->parse, 'New Tokenizer');
-
-# Add Connexor/Morpho
-ok($tokens->add('Connexor', 'Morpho'), 'Add Connexor/Morpho');
-
-$i = 0;
-foreach (qw/! A A N V DET N DET N N NUM CS DET N N N V V/) {
- if ($_ eq '!') {
- $i++;
- next;
- };
- like($tokens->stream->pos($i++)->to_string,
- qr!\|cnx/p:$_!,
- 'Annotation (Connexor/p) is correct: ' . $_);
-};
-
-
-$i = 0;
-foreach (qw/! ! ! ! IND:PRES ! ! ! ! Prop ! ! ! ! ! ! PCP:PERF IND:PRES/) {
- if ($_ eq '!') {
- $i++;
- next;
- };
- foreach my $f (split(':', $_)) {
- like($tokens->stream->pos($i)->to_string,
- qr!\|cnx/m:$f!,
- 'Annotation (Connexor/m) is correct: '. $f);
- };
- $i++;
-};
-
-# New instantiation
-ok($tokens = new_tokenizer->parse, 'New Tokenizer');
-
-# Add Connexor/Phrase
-ok($tokens->add('Connexor', 'Phrase'), 'Add Connexor/Phrase');
-my $stream = $tokens->stream;
-like($stream->pos(1)->to_string, qr!<>:cnx/c:np#4-30\$<i>4<b>0!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(6)->to_string, qr!<>:cnx/c:np#40-47\$<i>7<b>0!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(8)->to_string, qr!<>:cnx/c:np#52-73\$<i>10<b>0!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(13)->to_string, qr!<>:cnx/c:np#89-111\$<i>16<b>0!, 'Annotation (Connexor/c) is correct');
-
-# New instantiation
-ok($tokens = new_tokenizer->parse, 'New Tokenizer');
-
-# Add Connexor/Syntax
-ok($tokens->add('Connexor', 'Syntax'), 'Add Connexor/Syntax');
-$stream = $tokens->stream;
-
-$i = 0;
-foreach (qw/! @PREMOD @PREMOD @NH @MAIN @PREMOD @NH @PREMOD
- @PREMOD @NH @NH @PREMARK @PREMOD @PREMOD @NH @NH @MAIN @AUX/) {
- if ($_ eq '!') {
- $i++;
- next;
- };
- like($tokens->stream->pos($i++)->to_string,
- qr!\|cnx/syn:$_!,
- 'Annotation (Connexor/syn) is correct: ' . $_);
-};
-
-# New instantiation
-ok($tokens = new_tokenizer->parse, 'New Tokenizer');
-
-# Add XIP/Sentences
-ok($tokens->add('XIP', 'Sentences'), 'Add XIP/Sentences');
-
-is_deeply(
- _t2h($tokens->stream->pos(0)->to_string),
- _t2h('[(0-3)-:tokens$<i>18|-:xip/sentences$<i>1|<>:xip/s:s#0-129$<i>17<b>0|_0#0-3|i:zum|s:Zum]'),
- # '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|-:xip/sentences$<i>1|<>:xip/s:s#0-129$<i>17<b>0]',
- 'First sentence'
-);
-
-# Add XIP/Morpho
-ok($tokens->add('XIP', 'Morpho'), 'Add XIP/Morpho');
-$stream = $tokens->stream;
-
-$i = 0;
-foreach (qw/PREP ADJ ADJ NOUN VERB DET NOUN DET NOUN NOUN PTCL CONJ DET NOUN NOUN NOUN VERB VERB/) {
- if ($_ eq '!') {
- $i++;
- next;
- };
- like($tokens->stream->pos($i++)->to_string,
- qr!\|xip/p:$_!,
- 'Annotation (xip/p) is correct: ' . $_);
-};
-
-$i = 0;
-foreach ('zu', 'letzt', 'kulturell', 'Anlass', '=laden:laden', 'die', 'Leitung', 'der', '\#schulen:\#Heim:schulen\#Heim', 'Hofbergli', 'ein', 'bevor', 'der', 'Betrieb', 'Ende', '\#schulen:\#Jahr:schulen\#Jahr') {
- if ($_ eq '!') {
- $i++;
- next;
- };
- foreach my $f (split(':', $_)) {
- like($tokens->stream->pos($i)->to_string,
- qr!\|xip\/l:\Q$f\E!,
- 'Annotation (xip/l) is correct: ' . $f);
- };
- $i++;
-};
-
-# New instantiation
-ok($tokens = new_tokenizer->parse, 'New Tokenizer');
-
-# Add XIP/Sentences
-ok($tokens->add('XIP', 'Dependency'), 'Add XIP/Dependency');
-
-$stream = $tokens->stream;
-diag $stream->pos(1)->to_string;
-
-like($stream->pos(1)->to_string, qr![^<]>:xip/d:NMOD\$<i>3!, 'Dependency fine');
-like($stream->pos(3)->to_string, qr![^<]<:xip/d:NMOD\$<i>1!, 'Dependency fine');
-
-done_testing;
-__END__
-
-
-like($stream->pos(3)->to_string, qr!\|<:xip/d:NMOD\$<i>2!, 'Dependency fine');
-like($stream->pos(4)->to_string, qr!\|>xip/d:VMAIN\$<i>4!, 'Dependency fine');
-like($stream->pos(4)->to_string, qr!\|<:xip/d:SUBJ\$<i>6!, 'Dependency fine');
-like($stream->pos(4)->to_string, qr!\|<:xip/d:VPREF\$<i>10!, 'Dependency fine');
-like($stream->pos(5)->to_string, qr!\|>:xip/d:DETERM\$<i>6!, 'Dependency fine');
-like($stream->pos(6)->to_string, qr!\|<:xip/d:DETERM\$<i>5!, 'Dependency fine');
-like($stream->pos(6)->to_string, qr!\|>:xip/d:SUBJ\$<i>4!, 'Dependency fine');
-like($stream->pos(6)->to_string, qr!\|<:xip/d:NMOD\$<i>8!, 'Dependency fine');
-like($stream->pos(7)->to_string, qr!\|>:xip/d:DETERM\$<i>8!, 'Dependency fine');
-like($stream->pos(8)->to_string, qr!\|<:xip/d:DETERM\$<i>7!, 'Dependency fine');
-like($stream->pos(8)->to_string, qr!\|>:xip/d:NMOD\$<i>6!, 'Dependency fine');
-like($stream->pos(8)->to_string, qr!\|<:xip/d:NMOD\$<i>9!, 'Dependency fine');
-like($stream->pos(9)->to_string, qr!\|>:xip/d:NMOD\$<i>8!, 'Dependency fine');
-like($stream->pos(10)->to_string, qr!\|>:xip/d:VPREF\$<i>4!, 'Dependency fine');
-like($stream->pos(11)->to_string, qr!\|>:xip/d:CONNECT\$<i>16!, 'Dependency fine');
-like($stream->pos(12)->to_string, qr!\|>:xip/d:DETERM\$<i>13!, 'Dependency fine');
-like($stream->pos(13)->to_string, qr!\|<:xip/d:DETERM\$<i>12!, 'Dependency fine');
-like($stream->pos(13)->to_string, qr!\|>:xip/d:SUBJ\$<i>16!, 'Dependency fine');
-like($stream->pos(14)->to_string, qr!\|>:xip/d:OBJ\$<i>16!, 'Dependency fine');
-like($stream->pos(15)->to_string, qr!\|>:xip/d:OBJ\$<i>16!, 'Dependency fine');
-like($stream->pos(16)->to_string, qr!\|<:xip/d:CONNECT\$<i>11!, 'Dependency fine');
-like($stream->pos(16)->to_string, qr!\|<:xip/d:SUBJ\$<i>13!, 'Dependency fine');
-like($stream->pos(16)->to_string, qr!\|<:xip/d:OBJ\$<i>14!, 'Dependency fine');
-like($stream->pos(16)->to_string, qr!\|<:xip/d:OBJ\$<i>15!, 'Dependency fine');
-like($stream->pos(16)->to_string, qr!\|>:xip/d:AUXIL\$<i>17!, 'Dependency fine');
-like($stream->pos(16)->to_string, qr!\|>xip/d:VMAIN\$<i>16!, 'Dependency fine');
-like($stream->pos(16)->to_string, qr!\|<xip/d:VMAIN\$<i>16!, 'Dependency fine');
-like($stream->pos(17)->to_string, qr!\|<:xip/d:AUXIL\$<i>16!, 'Dependency fine');
-
-# New instantiation
-ok($tokens = new_tokenizer->parse, 'New Tokenizer');
-
-# Add XIP/Sentences
-ok($tokens->add('XIP', 'Constituency'), 'Add XIP/Constituency');
-
-$stream = $tokens->stream;
-like($stream->pos(0)->to_string, qr!\|<>:xip/c:TOP#0-129\$<i>17!, 'Constituency fine');
-like($stream->pos(0)->to_string, qr!\|<>:xip/c:MC#0-129\$<i>17<b>1!, 'Constituency fine');
-like($stream->pos(0)->to_string, qr!\|<>:xip/c:PP#0-30\$<i>4<b>2!, 'Constituency fine');
-like($stream->pos(0)->to_string, qr!\|<>:xip/c:PREP#0-3\$<i>1!, 'Constituency fine');
-
-like($stream->pos(1)->to_string, qr!\|<>:xip/c:NP#4-30\$<i>4<b>3!, 'Constituency fine');
-like($stream->pos(1)->to_string, qr!\|<>:xip/c:NPA#4-30\$<i>4<b>4!, 'Constituency fine');
-like($stream->pos(1)->to_string, qr!\|<>:xip/c:AP#4-11\$<i>2<b>5!, 'Constituency fine');
-like($stream->pos(1)->to_string, qr!\|<>:xip/c:ADJ#4-11\$<i>2<b>6!, 'Constituency fine');
-
-like($stream->pos(2)->to_string, qr!\|<>:xip/c:AP#12-23\$<i>3<b>5!, 'Constituency fine');
-like($stream->pos(2)->to_string, qr!\|<>:xip/c:ADJ#12-23\$<i>3<b>6!, 'Constituency fine');
-
-like($stream->pos(3)->to_string, qr!\|<>:xip/c:NOUN#24-30\$<i>4<b>5!, 'Constituency fine');
-
-like($stream->pos(4)->to_string, qr!\|<>:xip/c:VERB#31-35\$<i>5<b>2!, 'Constituency fine');
-
-like($stream->pos(5)->to_string, qr!\|<>:xip/c:NP#36-47\$<i>7<b>2!, 'Constituency fine');
-like($stream->pos(5)->to_string, qr!\|<>:xip/c:DET#36-39\$<i>6<b>3!, 'Constituency fine');
-
-like($stream->pos(6)->to_string, qr!\|<>:xip/c:NPA#40-47\$<i>7<b>3!, 'Constituency fine');
-like($stream->pos(6)->to_string, qr!\|<>:xip/c:NOUN#40-47\$<i>7<b>4!, 'Constituency fine');
-
-like($stream->pos(7)->to_string, qr!\|<>:xip/c:NP#48-63\$<i>9<b>2!, 'Constituency fine');
-like($stream->pos(7)->to_string, qr!\|<>:xip/c:DET#48-51\$<i>8<b>3!, 'Constituency fine');
-
-like($stream->pos(8)->to_string, qr!\|<>:xip/c:NPA#52-63\$<i>9<b>3!, 'Constituency fine');
-like($stream->pos(8)->to_string, qr!\|<>:xip/c:NOUN#52-63\$<i>9<b>4!, 'Constituency fine');
-
-like($stream->pos(9)->to_string, qr!\|<>:xip/c:NP#64-73\$<i>10<b>2!, 'Constituency fine');
-like($stream->pos(9)->to_string, qr!\|<>:xip/c:NPA#64-73\$<i>10<b>3!, 'Constituency fine');
-like($stream->pos(9)->to_string, qr!\|<>:xip/c:NOUN#64-73\$<i>10<b>4!, 'Constituency fine');
-
-like($stream->pos(10)->to_string, qr!\|<>:xip/c:PTCL#74-77\$<i>11<b>2!, 'Constituency fine');
-
-like($stream->pos(11)->to_string, qr!\|<>:xip/c:SC#79-128\$<i>18!, 'Constituency fine');
-like($stream->pos(11)->to_string, qr!\|<>:xip/c:CONJ#79-84\$<i>12<b>1!, 'Constituency fine');
-
-like($stream->pos(12)->to_string, qr!\|<>:xip/c:NP#85-96\$<i>14<b>1!, 'Constituency fine');
-like($stream->pos(12)->to_string, qr!\|<>:xip/c:DET#85-88\$<i>13<b>2!, 'Constituency fine');
-
-
-like($stream->pos(13)->to_string, qr!\|<>:xip/c:NPA#89-96\$<i>14<b>2!, 'Constituency fine');
-like($stream->pos(13)->to_string, qr!\|<>:xip/c:NOUN#89-96\$<i>14<b>3!, 'Constituency fine');
-
-like($stream->pos(14)->to_string, qr!\|<>:xip/c:NP#97-101\$<i>15<b>1!, 'Constituency fine');
-like($stream->pos(14)->to_string, qr!\|<>:xip/c:NPA#97-101\$<i>15<b>2!, 'Constituency fine');
-like($stream->pos(14)->to_string, qr!\|<>:xip/c:NOUN#97-101\$<i>15<b>3!, 'Constituency fine');
-
-like($stream->pos(15)->to_string, qr!\|<>:xip/c:NP#102-111\$<i>16<b>1!, 'Constituency fine');
-like($stream->pos(15)->to_string, qr!\|<>:xip/c:NPA#102-111\$<i>16<b>2!, 'Constituency fine');
-like($stream->pos(15)->to_string, qr!\|<>:xip/c:NOUN#102-111\$<i>16<b>3!, 'Constituency fine');
-
-like($stream->pos(16)->to_string, qr!\|<>:xip/c:VERB#112-123\$<i>17<b>1!, 'Constituency fine');
-
-like($stream->pos(17)->to_string, qr!\|<>:xip/c:VERB#124-128\$<i>18<b>1!, 'Constituency fine');
-
-# diag $stream->to_string;
-
-
-# ADJA ADJA NN VVFIN ART NN ART NN NE PTKVZ KOUS ART NN NN NN VVPP VAFIN
-done_testing;
-__END__
-
-
-# Todo: CoreNLP/Constituency!
-
-
-
-
-
-# Connexor
-push(@layers, ['Connexor', 'Morpho']);
-push(@layers, ['Connexor', 'Syntax']);
-push(@layers, ['Connexor', 'Phrase']);
-push(@layers, ['Connexor', 'Sentences']);
-
-# TreeTagger
-push(@layers, ['TreeTagger', 'Morpho']);
-push(@layers, ['TreeTagger', 'Sentences']);
-
-# Mate
-# push(@layers, ['Mate', 'Morpho']);
-push(@layers, ['Mate', 'Dependency']);
-
-# XIP
-push(@layers, ['XIP', 'Morpho']);
-push(@layers, ['XIP', 'Constituency']);
-push(@layers, ['XIP', 'Dependency']);
-push(@layers, ['XIP', 'Sentences']);
-
-
-__END__
diff --git a/t/artificial/header.xml b/t/artificial/header.xml
index 950e202..589e75e 100644
--- a/t/artificial/header.xml
+++ b/t/artificial/header.xml
@@ -36,7 +36,7 @@
<pubDate type="year">2001</pubDate>
<pubDate type="month">04</pubDate>
<pubDate type="day">02</pubDate>
- <pubPlace>Mannheim</pubPlace>
+ <pubPlace key="DE">Mannheim</pubPlace>
</imprint>
<biblScope type="issue"/>
<biblScope type="issueplace"/>
diff --git a/t/index/corpus/doc/0001/header.xml b/t/index/corpus/doc/0001/header.xml
index fb770f7..dd5c085 100644
--- a/t/index/corpus/doc/0001/header.xml
+++ b/t/index/corpus/doc/0001/header.xml
@@ -19,7 +19,7 @@
<h.title type="main">Beispiel Text</h.title>
<h.title type="sub">Beispiel Text Untertitel</h.title>
<h.author>Mustermann, Max</h.author>
- <editor>Monkika Mustermann</editor>
+ <editor>Monika Mustermann</editor>
<imprint/>
<biblScope type="pp"/>
<biblScope type="suppl"/>
diff --git a/t/index/meta.t b/t/index/meta.t
new file mode 100644
index 0000000..dadcb4c
--- /dev/null
+++ b/t/index/meta.t
@@ -0,0 +1,58 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use Scalar::Util qw/weaken/;
+use Data::Dumper;
+use lib 't/index';
+use TestInit;
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+
+my $path = catdir(dirname(__FILE__), 'corpus', 'doc', '0001');
+
+ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+like($doc->path, qr!$path/!, 'Path');
+
+# Metadata
+is($doc->text_sigle, 'Corpus_Doc.0001', 'ID-text');
+is($doc->doc_sigle, 'Corpus_Doc', 'ID-doc');
+is($doc->corpus_sigle, 'Corpus', 'ID-corpus');
+
+is($doc->title, 'Beispiel Text', 'title');
+is($doc->sub_title, 'Beispiel Text Untertitel', 'title');
+is($doc->pub_date, '20010402', 'Publication date');
+is($doc->pub_place, 'Mannheim', 'Publication place');
+is($doc->author, 'Mustermann, Max', 'Author');
+
+is($doc->publisher, 'Artificial articles Inc.', 'Publisher');
+is($doc->editor, 'Monika Mustermann', 'Editor');
+is($doc->text_type, 'Zeitung: Tageszeitung', 'Text Type');
+is($doc->text_type_art, 'Bericht', 'Text Type Art');
+is($doc->text_type_ref, 'Aphorismen', 'Text Type Ref');
+ok(!$doc->text_column, 'Text Column');
+ok(!$doc->text_domain, 'Text Domain');
+is($doc->creation_date, '19990601', 'Creation Date');
+ok(!$doc->license, 'License');
+ok(!$doc->pages, 'Pages');
+ok(!$doc->file_edition_statement, 'File Edition Statement');
+ok(!$doc->bibl_edition_statement, 'Bibl Edition Statement');
+ok(!$doc->reference, 'Reference');
+is($doc->language, 'de', 'Language');
+
+is($doc->doc_title, 'Beispiel Dokument', 'Doc: title');
+ok(!$doc->doc_sub_title, 'Doc: subtitle');
+ok(!$doc->doc_editor, 'Doc: editor');
+ok(!$doc->doc_author, 'Doc: author');
+
+is($doc->corpus_title, 'Beispiel-Corpus', 'Corpus: title');
+ok(!$doc->corpus_sub_title, 'Corpus: subtitle');
+ok(!$doc->corpus_editor, 'Corpus: editor');
+ok(!$doc->corpus_author, 'Corpus: author');
+
+done_testing;
+
+__END__
diff --git a/t/index/opennlp_morpho.t b/t/index/opennlp_morpho.t
index 82182a2..cf57006 100644
--- a/t/index/opennlp_morpho.t
+++ b/t/index/opennlp_morpho.t
@@ -10,6 +10,41 @@
ok(my $tokens = TestInit::tokens('0001'), 'Parse tokens');
+is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|_0$<i>0<i>3|i:zum|s:Zum]', 'Token is correct');
+
+is($tokens->stream->pos(1)->to_string, '[(4-11)_1$<i>4<i>11|i:letzten|s:letzten]', 'Token is correct');
+
+my $i = 2;
+foreach ([12,23, 'kulturellen'],
+ [24,30, 'Anlass'],
+ [31,35, 'lädt'],
+ [36,39, 'die'],
+ [40,47, 'Leitung'],
+ [48,51, 'des'],
+ [52,63, 'Schulheimes'],
+ [64,73, 'Hofbergli'],
+ [74,77, 'ein'],
+ [79,84, 'bevor'],
+ [85,88, 'der'],
+ [89,96, 'Betrieb'],
+ [97,101, 'Ende'],
+ [102,111, 'Schuljahr'],
+ [112,123, 'eingestellt'],
+ [124,128, 'wird']
+ ) {
+ is($tokens->stream->pos($i++)->to_string,
+ '[('.$_->[0].'-'.$_->[1].')'.
+ '_'.($i-1).
+ '$<i>'.$_->[0].'<i>' . $_->[1] . '|' .
+ 'i:'.lc($_->[2]).'|s:'.$_->[2].']',
+ 'Token is correct');
+};
+
+ok(!$tokens->stream->pos($i++), 'No more tokens');
+
+
+
+
ok($tokens->add('OpenNLP', 'Morpho'), 'Add Structure');
my $data = $tokens->to_data->{data};
diff --git a/t/index/primary.t b/t/index/primary.t
new file mode 100644
index 0000000..7abf629
--- /dev/null
+++ b/t/index/primary.t
@@ -0,0 +1,31 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use Scalar::Util qw/weaken/;
+use Data::Dumper;
+use lib 't/index';
+use TestInit;
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+
+my $path = catdir(dirname(__FILE__), 'corpus', 'doc', '0001');
+
+ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+like($doc->path, qr!$path/!, 'Path');
+
+is($doc->primary->data,
+ 'Zum letzten kulturellen Anlass lädt die Leitung des Schulheimes Hofbergli ein, '.
+ 'bevor der Betrieb Ende Schuljahr eingestellt wird.', 'Primary data');
+
+is($doc->primary->data_length, 129, 'Primary data length');
+
+is($doc->primary->data(0,3), 'Zum', 'Get primary data');
+
+
+done_testing;
+
+__END__
diff --git a/t/meta.t b/t/meta.t
index d3c851c..3aa0f34 100644
--- a/t/meta.t
+++ b/t/meta.t
@@ -11,10 +11,6 @@
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
-
-diag 'Support "availability"';
-diag 'Support "pubPlace-key"';
-
# TODO: Make 'text' -> 'primaryText'
use_ok('KorAP::Document');
@@ -60,7 +56,6 @@
ok(!$doc->text_column, 'no text_column');
ok(!$doc->keywords_string, 'no keywords');
is($doc->text_class_string, 'freizeit-unterhaltung reisen wissenschaft populaerwissenschaft', 'no text classes');
-ok(!$doc->language, 'no text_column');
#is($doc->coll_title, 'Wikipedia', 'Collection title');
#is($doc->coll_sub_title, 'Die freie Enzyklopädie', 'Collection subtitle');
@@ -77,7 +72,6 @@
is($doc->text_sigle, 'BRZ13_APR.00001', 'ID');
is($doc->corpus_sigle, 'BRZ13', 'corpusID');
-
is($doc->pub_date, '20130402', 'pubDate');
is($doc->pub_place, 'Braunschweig', 'pubPlace');
@@ -124,13 +118,13 @@
ok(!$doc->text_type, 'text_type');
is($doc->text_type_art, 'Bericht', 'text_type art');
-
# ERL/0001
$path = catdir(dirname(__FILE__), 'ERL/00001');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-is($doc->title, 'MK2/ERL.00001 Amtsblatt des Landesbezirks Baden [diverse Erlasse], Hrsg. und Schriftleitung: Präsidialstelle der Landesverwaltung Baden in Karlsruhe. - Karlsruhe, o.J.', 'title'); # Amtsblatt des Landesbezirks Baden [diverse Erlasse]
+is($doc->title, 'Amtsblatt des Landesbezirks Baden [diverse Erlasse], Hrsg. und Schriftleitung: Präsidialstelle der Landesverwaltung Baden in Karlsruhe. - Karlsruhe, o.J.', 'title'); # Amtsblatt des Landesbezirks Baden [diverse Erlasse]
+# MK2/ERL.00001
ok(!$doc->sub_title, 'subTitle');
is($doc->text_sigle, 'MK2_ERL.00001', 'ID');
@@ -146,7 +140,11 @@
ok(!$doc->editor, 'Editor');
is($doc->publisher, 'Badenia Verlag und Druckerei', 'Publisher');
is($doc->creation_date, '19600000', 'Creation date');
-diag 'Non-acceptance of creation date ranges may be temporary';
+
+# TODO: Creation date ranges are not accepted yet; the notice below is silenced.
+# diag 'Non-acceptance of creation date ranges may be temporary';
+
+
#ok(!$doc->coll_title, 'Collection title');
#ok(!$doc->coll_sub_title, 'Collection subtitle');
#ok(!$doc->coll_editor, 'Collection editor');
@@ -155,11 +153,11 @@
ok(!$doc->text_type_art, 'text_type art');
# A01/02035-substring
-$path = catdir(dirname(__FILE__), 'A01/02035-substring');
+$path = catdir(dirname(__FILE__), 'A00/02035-substring');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-is($doc->title, 'A00/JAN.02035 St. Galler Tagblatt, 11.01.2000, Ressort: TB-RSP (Abk.)', 'title');
+is($doc->title, 'St. Galler Tagblatt, 11.01.2000, Ressort: TB-RSP (Abk.)', 'title'); # A00/JAN.02035
ok(!$doc->sub_title, 'subTitle');
is($doc->text_sigle, 'A00_JAN.02035', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
@@ -182,7 +180,7 @@
is($doc->text_type_art, 'Bericht', 'text_type art');
# A01/02873-meta
-$path = catdir(dirname(__FILE__), 'A01/02873-meta');
+$path = catdir(dirname(__FILE__), 'A00/02873-meta');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
@@ -197,6 +195,7 @@
ok(!$doc->text_class->[2], 'TextClass');
ok(!$doc->author, 'author');
+
# Additional information
ok(!$doc->editor, 'Editor');
ok(!$doc->publisher, 'Publisher');
@@ -210,7 +209,7 @@
# A01/05663-unbalanced
-$path = catdir(dirname(__FILE__), 'A01/05663-unbalanced');
+$path = catdir(dirname(__FILE__), 'A00/05663-unbalanced');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
@@ -225,6 +224,7 @@
ok(!$doc->text_class->[2], 'TextClass');
ok(!$doc->author, 'author');
+
# Additional information
ok(!$doc->editor, 'Editor');
ok(!$doc->publisher, 'Publisher');
@@ -238,7 +238,7 @@
# A01/07452-deep
-$path = catdir(dirname(__FILE__), 'A01/07452-deep');
+$path = catdir(dirname(__FILE__), 'A00/07452-deep');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
@@ -253,6 +253,7 @@
ok(!$doc->text_class->[2], 'TextClass');
ok(!$doc->author, 'author');
+
# Additional information
ok(!$doc->editor, 'Editor');
ok(!$doc->publisher, 'Publisher');
@@ -264,7 +265,6 @@
ok(!$doc->text_type, 'text_type');
is($doc->text_type_art, 'Bericht', 'text_type art');
-
# ART
$path = catdir(dirname(__FILE__), 'artificial');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
@@ -282,6 +282,7 @@
is($doc->corpus_sigle, 'ART', 'corpusID');
is($doc->pub_date, '20010402', 'pubDate');
is($doc->pub_place, 'Mannheim', 'pubPlace');
+is($doc->pub_place_key, 'DE', 'pubPlace key');
is($doc->text_class->[0], 'freizeit-unterhaltung', 'TextClass');
is($doc->text_class->[1], 'vereine-veranstaltungen', 'TextClass');
ok(!$doc->text_class->[2], 'TextClass');
@@ -310,10 +311,14 @@
like($doc->path, qr!$path/$!, 'Path');
ok($doc->parse, 'Parse document');
-is($doc->text_sigle, 'VDI_JAN.00001', 'text sigle');
-is($doc->doc_sigle, 'VDI_JAN', 'doc sigle');
-is($doc->corpus_sigle, 'VDI', 'corpus sigle');
+
+
+is($doc->text_sigle, 'VDI14_JAN.00001', 'text sigle');
+is($doc->doc_sigle, 'VDI14_JAN', 'doc sigle');
+is($doc->corpus_sigle, 'VDI14', 'corpus sigle');
+
is($doc->title, '10- Zz mit Zahl', 'title');
+
ok(!$doc->sub_title, 'subtitle');
is($doc->pub_date, '20140117', 'pubdate');
is($doc->pub_place, 'Düsseldorf', 'pubplace');
@@ -334,7 +339,8 @@
is($doc->reference, 'VDI nachrichten, 17.01.2014, S. 10; 10- Zz mit Zahl [Ausführliche Zitierung nicht verfügbar]', 'Reference');
ok(!$doc->language, 'Language');
-diag 'This may be "de" in the future';
+# !!!
+# diag 'This may be "de" in the future';
is($doc->doc_title, 'VDI nachrichten, Januar 2014', 'Doc title');
ok(!$doc->doc_sub_title, 'Doc Sub title');
@@ -349,7 +355,6 @@
is($doc->keywords_string, '', 'Keywords');
is($doc->text_class_string, 'Freizeit-Unterhaltung Reisen Politik Ausland', 'Text class');
-
# WDD
$path = catdir(dirname(__FILE__), 'WDD/G27/38989');
ok($doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
@@ -397,7 +402,9 @@
is($doc->keywords_string, '', 'Keywords');
is($doc->text_class_string, '', 'Text class');
+
+is($doc->availability, 'CC-BY-SA', 'Availability');
+
+
done_testing;
__END__
-
-
diff --git a/t/sgbr/sgbr_meta.t b/t/sgbr/sgbr_meta.t
index 8ab6414..50891f8 100644
--- a/t/sgbr/sgbr_meta.t
+++ b/t/sgbr/sgbr_meta.t
@@ -23,6 +23,43 @@
is($doc->doc_sigle, 'TEST_BSP', 'ID-doc');
is($doc->corpus_sigle, 'TEST', 'ID-corpus');
-diag 'TODO: Parse meta';
+is($doc->title, 'Sommerüberraschung', 'title');
+#is($doc->sub_title, 'Beispiel Text Untertitel', 'title');
+#is($doc->pub_date, '20010402', 'Publication date');
+#is($doc->pub_place, 'Mannheim', 'Publication place');
+is($doc->author, 'TEST.BSP.Autoren.1', 'Author');
+is($doc->store('sgbrAuthorAgeClass'), 'X', 'AgeClass');
+is($doc->store('sgbrAuthorSex'), 'M', 'Sex');
+is($doc->store('sgbrKodex'), 'M', 'Kodex');
+
+is($doc->doc_title, 'Beispielkorpus', 'Doc: title');
+is($doc->doc_sub_title, 'Subkorpus Beispieltext', 'Doc: subtitle');
+
+is($doc->language, 'de', 'Language');
+
+ok(!$doc->publisher, 'Publisher');
+ok(!$doc->editor, 'Editor');
+ok(!$doc->text_type, 'Text Type');
+ok(!$doc->text_type_art, 'Text Type Art');
+ok(!$doc->text_type_ref, 'Text Type Ref');
+ok(!$doc->text_column, 'Text Column');
+ok(!$doc->text_domain, 'Text Domain');
+ok(!$doc->creation_date, 'Creation Date');
+ok(!$doc->license, 'License');
+ok(!$doc->pages, 'Pages');
+ok(!$doc->file_edition_statement, 'File Edition Statement');
+ok(!$doc->bibl_edition_statement, 'Bibl Edition Statement');
+ok(!$doc->reference, 'Reference');
+
+ok(!$doc->doc_editor, 'Doc: editor');
+ok(!$doc->doc_author, 'Doc: author');
+
+ok(!$doc->corpus_title, 'Corpus: title');
+ok(!$doc->corpus_sub_title, 'Corpus: subtitle');
+ok(!$doc->corpus_editor, 'Corpus: editor');
+ok(!$doc->corpus_author, 'Corpus: author');
done_testing;
+
+
+__END__