Use slashes as separators in siglen
Change-Id: I368a0e0be8880a0608c7fba1f13ce5688d22973a
diff --git a/Changes b/Changes
index 51dbb27..5a672d7 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.17 2016-03-22
+ - Rewrite siglen to use slashes as separators (e.g. 'WPD_AAA.00001' is now 'WPD/AAA/00001').
+
0.16 2016-03-18
- Added caching mechanism for
metadata.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 29b8331..61bfb23 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -19,7 +19,7 @@
# Due to the kind of processing, processed metadata may be stored in
# a multiprocess cache instead.
-our $VERSION = '0.16';
+our $VERSION = '0.17';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
@@ -91,10 +91,10 @@
# Get document id and corpus id
if ($rt && $rt->{'-docid'}) {
- $self->text_sigle($rt->{'-docid'});
- if ($self->text_sigle =~ /^(([^_]+)_[^\._]+?)\..+?$/) {
- $self->corpus_sigle($2);
- $self->doc_sigle($1);
+ if ($rt->{'-docid'} =~ /^([^_]+)_([^\._]+?)\.(.+?)$/) {
+ $self->text_sigle(join('/', $1, $2, $3));
+ $self->doc_sigle(join('/', $1, $2));
+ $self->corpus_sigle($1);
}
else {
croak $unable . ': ID not parseable';
diff --git a/lib/KorAP/XML/Meta/I5.pm b/lib/KorAP/XML/Meta/I5.pm
index 42b3496..6656bc1 100644
--- a/lib/KorAP/XML/Meta/I5.pm
+++ b/lib/KorAP/XML/Meta/I5.pm
@@ -249,7 +249,8 @@
return $_[0] unless $_[1];
my ($title, $prefix) = @_;
- $prefix =~ tr!_!/!;
+ # Was: $prefix =~ tr!_!/!; now turn the second slash into a dot ('A00/JAN/02035' -> 'A00/JAN.02035')
+ $prefix =~ s!^([^/]+?/[^/]+?)/!$1\.!;
if (index($title, $prefix) == 0) {
$title = substr($title, length($prefix));
$title =~ s/^\s+//;
diff --git a/t/annotation/meta.t b/t/annotation/meta.t
index f89c934..f641ccf 100644
--- a/t/annotation/meta.t
+++ b/t/annotation/meta.t
@@ -19,8 +19,8 @@
like($doc->path, qr!$path/!, 'Path');
# Metdata
-is($doc->text_sigle, 'Corpus_Doc.0001', 'ID-text');
-is($doc->doc_sigle, 'Corpus_Doc', 'ID-doc');
+is($doc->text_sigle, 'Corpus/Doc/0001', 'ID-text');
+is($doc->doc_sigle, 'Corpus/Doc', 'ID-doc');
is($doc->corpus_sigle, 'Corpus', 'ID-corpus');
my $meta = $doc->meta;
diff --git a/t/meta.t b/t/meta.t
index a2ef1f9..a33b25d 100644
--- a/t/meta.t
+++ b/t/meta.t
@@ -21,7 +21,7 @@
ok($doc->parse, 'Parse document');
# Metdata
-is($doc->text_sigle, 'WPD_AAA.00001', 'ID');
+is($doc->text_sigle, 'WPD/AAA/00001', 'ID');
my $meta = $doc->meta;
is($meta->{title}, 'A', 'title');
@@ -67,7 +67,7 @@
is($meta->{title}, 'Fischer und Kolp im Sonnenhügel', 'title');
ok(!$meta->{sub_title}, 'subTitle');
-is($doc->text_sigle, 'A01_APR.13047', 'ID');
+is($doc->text_sigle, 'A01/APR/13047', 'ID');
is($doc->corpus_sigle, 'A01', 'corpusID');
is($meta->{pub_date}, '20010402', 'pubDate');
ok(!$meta->{pub_place}, 'pubPlace');
@@ -98,7 +98,7 @@
# MK2/ERL.00001
ok(!$meta->{sub_title}, 'subTitle');
-is($doc->text_sigle, 'MK2_ERL.00001', 'ID');
+is($doc->text_sigle, 'MK2/ERL/00001', 'ID');
is($doc->corpus_sigle, 'MK2', 'corpusID');
is($meta->{pub_date}, '00000000', 'pubDate');
is($meta->{pub_place}, 'Karlsruhe', 'pubPlace');
@@ -133,7 +133,7 @@
is($meta->{title}, 'St. Galler Tagblatt, 11.01.2000, Ressort: TB-RSP (Abk.)', 'title'); # A00/JAN.02035
ok(!$meta->{sub_title}, 'subTitle');
-is($doc->text_sigle, 'A00_JAN.02035', 'ID');
+is($doc->text_sigle, 'A00/JAN/02035', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
is($meta->{pub_date}, '20000111', 'pubDate');
ok(!$meta->{pub_place}, 'pubPlace');
@@ -161,7 +161,7 @@
is($meta->{title}, 'Tradition und Moderne', 'title');
ok(!$meta->{sub_title}, 'subTitle');
-is($doc->text_sigle, 'A00_JAN.02873', 'ID');
+is($doc->text_sigle, 'A00/JAN/02873', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
is($meta->{pub_date}, '20000113', 'pubDate');
ok(!$meta->{pub_place}, 'pubPlace');
@@ -191,7 +191,7 @@
is($meta->{title}, 'Mehr Arbeitslose im Dezember', 'title');
ok(!$meta->{sub_title}, 'subTitle');
-is($doc->text_sigle, 'A00_JAN.05663', 'ID');
+is($doc->text_sigle, 'A00/JAN/05663', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
is($meta->{pub_date}, '20000124', 'pubDate');
ok(!$meta->{pub_place}, 'pubPlace');
@@ -220,7 +220,7 @@
is($meta->{title}, 'Wil im Dezember 1999', 'title');
ok(!$meta->{sub_title}, 'subTitle');
-is($doc->text_sigle, 'A00_JAN.07452', 'ID');
+is($doc->text_sigle, 'A00/JAN/07452', 'ID');
is($doc->corpus_sigle, 'A00', 'corpusID');
is($meta->{pub_date}, '20000129', 'pubDate');
ok(!$meta->{pub_place}, 'pubPlace');
@@ -255,7 +255,7 @@
# Metdata
is($meta->{title}, 'Artificial Title', 'title');
is($meta->{sub_title}, 'Artificial Subtitle', 'subTitle');
-is($doc->text_sigle, 'ART_ABC.00001', 'ID');
+is($doc->text_sigle, 'ART/ABC/00001', 'ID');
is($doc->corpus_sigle, 'ART', 'corpusID');
is($meta->{pub_date}, '20010402', 'pubDate');
is($meta->{pub_place}, 'Mannheim', 'pubPlace');
@@ -291,8 +291,8 @@
ok($doc->parse, 'Parse document');
$meta = $doc->meta;
-is($doc->text_sigle, 'VDI14_JAN.00001', 'text sigle');
-is($doc->doc_sigle, 'VDI14_JAN', 'doc sigle');
+is($doc->text_sigle, 'VDI14/JAN/00001', 'text sigle');
+is($doc->doc_sigle, 'VDI14/JAN', 'doc sigle');
is($meta->corpus_sigle, 'VDI14', 'corpus sigle');
is($meta->{title}, '10- Zz mit Zahl', 'title');
@@ -340,8 +340,8 @@
ok($doc->parse, 'Parse document');
$meta = $doc->meta;
-is($doc->text_sigle, 'WDD11_G27.38989', 'text sigle');
-is($doc->doc_sigle, 'WDD11_G27', 'doc sigle');
+is($doc->text_sigle, 'WDD11/G27/38989', 'text sigle');
+is($doc->doc_sigle, 'WDD11/G27', 'doc sigle');
is($doc->corpus_sigle, 'WDD11', 'corpus sigle');
is($meta->{title}, 'Diskussion:Gunter A. Pilz', 'title');
diff --git a/t/real/bzk.t b/t/real/bzk.t
index 8999cc5..76d01ca 100644
--- a/t/real/bzk.t
+++ b/t/real/bzk.t
@@ -21,8 +21,8 @@
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-is($doc->text_sigle, 'BZK_D59.00001', 'Correct text sigle');
-is($doc->doc_sigle, 'BZK_D59', 'Correct document sigle');
+is($doc->text_sigle, 'BZK/D59/00001', 'Correct text sigle');
+is($doc->doc_sigle, 'BZK/D59', 'Correct document sigle');
is($doc->corpus_sigle, 'BZK', 'Correct corpus sigle');
my $meta = $doc->meta;
@@ -92,8 +92,8 @@
is($output->{data}->{layerInfos}, '', 'layerInfos');
is($output->{data}->{stream}->[0]->[4], 's:unser', 'data');
-is($output->{textSigle}, 'BZK_D59.00001', 'Correct text sigle');
-is($output->{docSigle}, 'BZK_D59', 'Correct document sigle');
+is($output->{textSigle}, 'BZK/D59/00001', 'Correct text sigle');
+is($output->{docSigle}, 'BZK/D59', 'Correct document sigle');
is($output->{corpusSigle}, 'BZK', 'Correct corpus sigle');
is($output->{title}, 'Unser gemeinsames Werk wird siegreich sein', 'Title');
diff --git a/t/real/bzk_2.t b/t/real/bzk_2.t
index a44806b..4597541 100644
--- a/t/real/bzk_2.t
+++ b/t/real/bzk_2.t
@@ -21,8 +21,8 @@
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::XML::Krill');
ok($doc->parse, 'Parse document');
-is($doc->text_sigle, 'BZK_D59.00089', 'Correct text sigle');
-is($doc->doc_sigle, 'BZK_D59', 'Correct document sigle');
+is($doc->text_sigle, 'BZK/D59/00089', 'Correct text sigle');
+is($doc->doc_sigle, 'BZK/D59', 'Correct document sigle');
is($doc->corpus_sigle, 'BZK', 'Correct corpus sigle');
my $meta = $doc->meta;
@@ -93,8 +93,8 @@
is($output->{data}->{layerInfos}, '', 'layerInfos');
is($output->{data}->{stream}->[0]->[4], 's:Saragat-Partei', 'data');
-is($output->{textSigle}, 'BZK_D59.00089', 'Correct text sigle');
-is($output->{docSigle}, 'BZK_D59', 'Correct document sigle');
+is($output->{textSigle}, 'BZK/D59/00089', 'Correct text sigle');
+is($output->{docSigle}, 'BZK/D59', 'Correct document sigle');
is($output->{corpusSigle}, 'BZK', 'Correct corpus sigle');
is($output->{title}, 'Saragat-Partei zerfällt', 'Title');
diff --git a/t/real/goethe.t b/t/real/goethe.t
index 9769af7..07d4eae 100644
--- a/t/real/goethe.t
+++ b/t/real/goethe.t
@@ -23,8 +23,8 @@
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-is($doc->text_sigle, 'GOE_AGA.03828', 'Correct text sigle');
-is($doc->doc_sigle, 'GOE_AGA', 'Correct document sigle');
+is($doc->text_sigle, 'GOE/AGA/03828', 'Correct text sigle');
+is($doc->doc_sigle, 'GOE/AGA', 'Correct document sigle');
is($doc->corpus_sigle, 'GOE', 'Correct corpus sigle');
my $meta = $doc->meta;
@@ -89,8 +89,8 @@
is($output->{data}->{layerInfos}, '', 'layerInfos');
is($output->{data}->{stream}->[0]->[4], 's:Autobiographische', 'data');
-is($output->{textSigle}, 'GOE_AGA.03828', 'Correct text sigle');
-is($output->{docSigle}, 'GOE_AGA', 'Correct document sigle');
+is($output->{textSigle}, 'GOE/AGA/03828', 'Correct text sigle');
+is($output->{docSigle}, 'GOE/AGA', 'Correct document sigle');
is($output->{corpusSigle}, 'GOE', 'Correct corpus sigle');
is($output->{author}, 'Goethe, Johann Wolfgang von', 'Author');
diff --git a/t/real/wdd.t b/t/real/wdd.t
index 71d16d4..eb5db64 100644
--- a/t/real/wdd.t
+++ b/t/real/wdd.t
@@ -22,8 +22,8 @@
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-is($doc->text_sigle, 'WDD11_G27.38989', 'Correct text sigle');
-is($doc->doc_sigle, 'WDD11_G27', 'Correct document sigle');
+is($doc->text_sigle, 'WDD11/G27/38989', 'Correct text sigle');
+is($doc->doc_sigle, 'WDD11/G27', 'Correct document sigle');
is($doc->corpus_sigle, 'WDD11', 'Correct corpus sigle');
my $meta = $doc->meta;
@@ -84,8 +84,8 @@
is($output->{data}->{layerInfos}, '', 'layerInfos');
is($output->{data}->{stream}->[0]->[4], 's:{War', 'data');
-is($output->{textSigle}, 'WDD11_G27.38989', 'Correct text sigle');
-is($output->{docSigle}, 'WDD11_G27', 'Correct document sigle');
+is($output->{textSigle}, 'WDD11/G27/38989', 'Correct text sigle');
+is($output->{docSigle}, 'WDD11/G27', 'Correct document sigle');
is($output->{corpusSigle}, 'WDD11', 'Correct corpus sigle');
is($output->{title}, 'Diskussion:Gunter A. Pilz', 'Title');
diff --git a/t/real/wpd.t b/t/real/wpd.t
index 1cf1711..f6293c2 100644
--- a/t/real/wpd.t
+++ b/t/real/wpd.t
@@ -18,13 +18,12 @@
# GOE/AGA/03828
my $path = catdir(dirname(__FILE__), '../corpus/WPD/00001');
-# my $path = '/home/ndiewald/Repositories/korap/KorAP-sandbox/KorAP-lucene-indexer/t/GOE/AGA/03828';
ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');
-is($doc->text_sigle, 'WPD_AAA.00001', 'Correct text sigle');
-is($doc->doc_sigle, 'WPD_AAA', 'Correct document sigle');
+is($doc->text_sigle, 'WPD/AAA/00001', 'Correct text sigle');
+is($doc->doc_sigle, 'WPD/AAA', 'Correct document sigle');
is($doc->corpus_sigle, 'WPD', 'Correct corpus sigle');
my $meta = $doc->meta;
diff --git a/t/sgbr/meta.t b/t/sgbr/meta.t
index a1c15c0..91b0d8d 100644
--- a/t/sgbr/meta.t
+++ b/t/sgbr/meta.t
@@ -20,8 +20,8 @@
like($doc->path, qr!$path/!, 'Path');
# Metdata
-is($doc->text_sigle, 'TEST_BSP.1', 'ID-text');
-is($doc->doc_sigle, 'TEST_BSP', 'ID-doc');
+is($doc->text_sigle, 'TEST/BSP/1', 'ID-text');
+is($doc->doc_sigle, 'TEST/BSP', 'ID-doc');
is($doc->corpus_sigle, 'TEST', 'ID-corpus');
my $meta = $doc->meta;
diff --git a/t/sgbr/meta_duden.t b/t/sgbr/meta_duden.t
index c808022..a375307 100644
--- a/t/sgbr/meta_duden.t
+++ b/t/sgbr/meta_duden.t
@@ -20,8 +20,8 @@
like($doc->path, qr!$path/!, 'Path');
# Metdata
-is($doc->text_sigle, 'PRO-DUD_BSP-2013-01.32', 'ID-text');
-is($doc->doc_sigle, 'PRO-DUD_BSP-2013-01', 'ID-doc');
+is($doc->text_sigle, 'PRO-DUD/BSP-2013-01/32', 'ID-text');
+is($doc->doc_sigle, 'PRO-DUD/BSP-2013-01', 'ID-doc');
is($doc->corpus_sigle, 'PRO-DUD', 'ID-corpus');
my $meta = $doc->meta;
diff --git a/t/sgbr/meta_ids.t b/t/sgbr/meta_ids.t
index b040e09..5c9c628 100644
--- a/t/sgbr/meta_ids.t
+++ b/t/sgbr/meta_ids.t
@@ -20,9 +20,9 @@
like($doc->path, qr!$path/!, 'Path');
# Metdata
-is($doc->text_sigle, 'CMC-TSK_2014-09.2843', 'ID-text');
+is($doc->text_sigle, 'CMC-TSK/2014-09/2843', 'ID-text');
-is($doc->doc_sigle, 'CMC-TSK_2014-09', 'ID-doc');
+is($doc->doc_sigle, 'CMC-TSK/2014-09', 'ID-doc');
is($doc->corpus_sigle, 'CMC-TSK', 'ID-corpus');
my $meta = $doc->meta;
@@ -88,9 +88,9 @@
like($doc->path, qr!$path/!, 'Path');
# Metdata
-is($doc->text_sigle, 'CMC-TSK_2014-09.3401', 'ID-text');
+is($doc->text_sigle, 'CMC-TSK/2014-09/3401', 'ID-text');
-is($doc->doc_sigle, 'CMC-TSK_2014-09', 'ID-doc');
+is($doc->doc_sigle, 'CMC-TSK/2014-09', 'ID-doc');
is($doc->corpus_sigle, 'CMC-TSK', 'ID-corpus');
diff --git a/t/transform.t b/t/transform.t
index 7c9a15e..522b267 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -1,6 +1,4 @@
#!/usr/bin/env perl
-# source ~/perl5/perlbrew/etc/bashrc
-# perlbrew switch perl-blead@korap
use strict;
use warnings;
use utf8;
@@ -84,7 +82,7 @@
is($meta->{title}, 'A', 'title');
ok(!$meta->{sub_title}, 'subTitle');
-is($doc->text_sigle, 'WPD_AAA.00001', 'ID');
+is($doc->text_sigle, 'WPD/AAA/00001', 'ID');
is($doc->corpus_sigle, 'WPD', 'corpusID');
is($meta->{pub_date}, '20050328', 'pubDate');
@@ -110,7 +108,7 @@
like($tokens->path, qr!$path/$!, 'Path');
is($tokens->foundry, 'OpenNLP', 'Foundry');
-is($tokens->doc->text_sigle, 'WPD_AAA.00001', 'Doc id');
+is($tokens->doc->text_sigle, 'WPD/AAA/00001', 'Doc id');
is($tokens->should, 1068, 'Should');
is($tokens->have, 923, 'Have');
is($tokens->name, 'tokens', 'Name');