More relaxed handling of document sigles

Change-Id: Ibb1fb8abb43126b7bf7791753211ea18bf802e72
diff --git a/Changes b/Changes
index ac5e8ec..8c4e120 100644
--- a/Changes
+++ b/Changes
@@ -1,8 +1,10 @@
-0.25 2017-01-20
+0.25 2017-02-06
         - Updated to Mojolicious 7.20
         - Fixed meta treatment in case analytic and monogr
           are available
         - Added DRuKoLa support to script
+        - Relaxed document and text sigle handling to be
+          compliant with CoRoLa.
 
 0.24 2016-12-21
         - Added --base-sentences and --base-paragraphs options
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 37e09b5..b1eb900 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm
@@ -47,11 +47,11 @@
   my $file = $self->[0]->[0];
   foreach (`unzip -l -UU -qq $file "*/data.xml"`) {
     if (m![\t\s]
-      ((?:\./)?
-	[^\t\s/\.]+?/ # Corpus
-	[^\t\s/]+?/   # Document
-	[^\t\s/]+?    # Text
-      )/data\.xml$!x) {
+          ((?:\./)?
+            [^\s\t/\.]+?/ # Corpus
+            [^\/]+?/   # Document
+            [^/]+?    # Text
+          )/data\.xml$!x) {
       push @texts, $1;
     };
   };
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 398c045..08bd56c 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
 use Data::Dumper;
 use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
 
-our $VERSION = '0.24';
+our $VERSION = '0.25';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/t/corpus/BBU/BLOG/83709_a_82384/base/tokens_aggr.xml b/t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/base/tokens_aggr.xml
similarity index 100%
rename from t/corpus/BBU/BLOG/83709_a_82384/base/tokens_aggr.xml
rename to t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/base/tokens_aggr.xml
diff --git a/t/corpus/BBU/BLOG/83709_a_82384/base/tokens_conservative.xml b/t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/base/tokens_conservative.xml
similarity index 100%
rename from t/corpus/BBU/BLOG/83709_a_82384/base/tokens_conservative.xml
rename to t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/base/tokens_conservative.xml
diff --git a/t/corpus/BBU/BLOG/83709_a_82384/data.xml b/t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/data.xml
similarity index 100%
rename from t/corpus/BBU/BLOG/83709_a_82384/data.xml
rename to t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/data.xml
diff --git a/t/corpus/BBU/BLOG/83709_a_82384/drukola/morpho.xml b/t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/drukola/morpho.xml
similarity index 100%
rename from t/corpus/BBU/BLOG/83709_a_82384/drukola/morpho.xml
rename to t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/drukola/morpho.xml
diff --git a/t/corpus/BBU/BLOG/83709_a_82384/header.xml b/t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/header.xml
similarity index 100%
rename from t/corpus/BBU/BLOG/83709_a_82384/header.xml
rename to t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/header.xml
diff --git a/t/corpus/BBU/BLOG/83709_a_82384/struct/structure.xml b/t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/struct/structure.xml
similarity index 100%
rename from t/corpus/BBU/BLOG/83709_a_82384/struct/structure.xml
rename to t/corpus/CoRoLa/BBU/BLOG/83709_a_82384/struct/structure.xml
diff --git a/t/corpus/BBU/BLOG/header.xml b/t/corpus/CoRoLa/BBU/BLOG/header.xml
similarity index 100%
rename from t/corpus/BBU/BLOG/header.xml
rename to t/corpus/CoRoLa/BBU/BLOG/header.xml
diff --git a/t/corpus/BBU/header.xml b/t/corpus/CoRoLa/BBU/header.xml
similarity index 100%
rename from t/corpus/BBU/header.xml
rename to t/corpus/CoRoLa/BBU/header.xml
diff --git a/t/corpus/BBU2/Blog/83701_a_82376/data.xml b/t/corpus/CoRoLa/BBU2/Blog/83701_a_82376/data.xml
similarity index 100%
rename from t/corpus/BBU2/Blog/83701_a_82376/data.xml
rename to t/corpus/CoRoLa/BBU2/Blog/83701_a_82376/data.xml
diff --git a/t/corpus/BBU2/Blog/83701_a_82376/header.xml b/t/corpus/CoRoLa/BBU2/Blog/83701_a_82376/header.xml
similarity index 100%
rename from t/corpus/BBU2/Blog/83701_a_82376/header.xml
rename to t/corpus/CoRoLa/BBU2/Blog/83701_a_82376/header.xml
diff --git a/t/corpus/BBU2/Blog/header.xml b/t/corpus/CoRoLa/BBU2/Blog/header.xml
similarity index 100%
rename from t/corpus/BBU2/Blog/header.xml
rename to t/corpus/CoRoLa/BBU2/Blog/header.xml
diff --git a/t/corpus/BBU2/header.xml b/t/corpus/CoRoLa/BBU2/header.xml
similarity index 100%
rename from t/corpus/BBU2/header.xml
rename to t/corpus/CoRoLa/BBU2/header.xml
diff --git a/t/corpus/CoRoLa/Corola-Journal/-/247_a_537/data.xml b/t/corpus/CoRoLa/Corola-Journal/-/247_a_537/data.xml
new file mode 100644
index 0000000..a5d002b
--- /dev/null
+++ b/t/corpus/CoRoLa/Corola-Journal/-/247_a_537/data.xml
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<raw_text docid="Corola-Journal_-.247_a_537" xmlns="http://ids-mannheim.de/ns/KorAP">
+  <metadata file="metadata.xml" />
+  <text>-</text>
+</raw_text>
diff --git a/t/corpus/CoRoLa/Corola-Journal/-/247_a_537/header.xml b/t/corpus/CoRoLa/Corola-Journal/-/247_a_537/header.xml
new file mode 100644
index 0000000..877b121
--- /dev/null
+++ b/t/corpus/CoRoLa/Corola-Journal/-/247_a_537/header.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="text" pattern="text" status="new" version="1.1" TEIform="teiHeader">
+        <fileDesc>
+          <titleStmt>
+            <textSigle>Corola-Journal/-.247_a_537</textSigle>
+            <t.title>ANUAR ŞTIINȚIFIC COMPETIȚIONAL în domeniul de ştiință - Educație fizică şi sport</t.title>
+          </titleStmt>
+          <publicationStmt>
+            <distributor/>
+            <pubAddress/>
+            <availability region="world">QAO-NC</availability>
+            <pubDate/>
+          </publicationStmt>
+          <sourceDesc>
+            <biblStruct>
+              <monogr>
+                <h.author>Dragoş Bondoc-Ionescu</h.author>
+                <editor role="translator">-</editor>
+                <imprint>
+                </imprint>
+              </monogr>
+            </biblStruct>
+          </sourceDesc>
+        </fileDesc>
+         <profileDesc>
+             <textDesc>
+                 <textType>Journalistic.-</textType>
+             <textClass>
+                 <catRef target="Art And Culture.Sport" scheme="topic"/>
+             </textClass>
+         </profileDesc>
+      </idsHeader>
\ No newline at end of file
diff --git a/t/corpus/CoRoLa/Corola-Journal/-/header.xml b/t/corpus/CoRoLa/Corola-Journal/-/header.xml
new file mode 100644
index 0000000..4938d27
--- /dev/null
+++ b/t/corpus/CoRoLa/Corola-Journal/-/header.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
+      <fileDesc>
+        <titleStmt>
+          <dokumentSigle>Corola-Journal/-</dokumentSigle>
+          <d.title>-</d.title>
+        </titleStmt>
+        <publicationStmt>
+          <distributor/>
+          <pubAddress/>
+          <availability region="world">[...]</availability>
+          <pubDate/>
+        </publicationStmt>
+        <sourceDesc>
+          <biblStruct>
+            <monogr>
+              <h.title type="main"/>
+              <edition>
+                <further/>
+                <kind/>
+                <appearance/>
+              </edition>
+              <imprint/>
+            </monogr>
+          </biblStruct>
+        </sourceDesc>
+      </fileDesc>
+    </idsHeader>
\ No newline at end of file
diff --git "a/t/corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL \342\200\236OCTAV BANCILA\342\200\234 - IASI/326_a_562/data.xml" "b/t/corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL \342\200\236OCTAV BANCILA\342\200\234 - IASI/326_a_562/data.xml"
new file mode 100644
index 0000000..6f0ae35
--- /dev/null
+++ "b/t/corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL \342\200\236OCTAV BANCILA\342\200\234 - IASI/326_a_562/data.xml"
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="text.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+
+<raw_text docid="Corola-Journal_COLEGIUL NATIONAL „OCTAV BANCILA“ - IASI.326_a_562" xmlns="http://ids-mannheim.de/ns/KorAP">
+  <metadata file="metadata.xml" />
+  <text>-</text>
+</raw_text>
diff --git "a/t/corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL \342\200\236OCTAV BANCILA\342\200\234 - IASI/326_a_562/header.xml" "b/t/corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL \342\200\236OCTAV BANCILA\342\200\234 - IASI/326_a_562/header.xml"
new file mode 100644
index 0000000..cc75647
--- /dev/null
+++ "b/t/corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL \342\200\236OCTAV BANCILA\342\200\234 - IASI/326_a_562/header.xml"
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="text" pattern="text" status="new" version="1.1" TEIform="teiHeader">
+        <fileDesc>
+          <titleStmt>
+            <textSigle>Corola-Journal/COLEGIUL NATIONAL „OCTAV BANCILA“ - IASI.326_a_562</textSigle>
+            <t.title>APOGEUL ARHITECTURĂ ȘI DESIGN</t.title>
+          </titleStmt>
+          <publicationStmt>
+            <distributor/>
+            <pubAddress/>
+            <availability region="world">QAO-NC</availability>
+            <pubDate/>
+          </publicationStmt>
+          <sourceDesc>
+            <biblStruct>
+              <monogr>
+                <h.author>COLEGIUL NATIONAL „OCTAV BANCILA“ - IASI</h.author>
+                <editor role="translator">-</editor>
+                <imprint>
+                </imprint>
+              </monogr>
+            </biblStruct>
+          </sourceDesc>
+        </fileDesc>
+         <profileDesc>
+             <textDesc>
+                 <textType>Journalistic.-</textType>
+             <textClass>
+                 <catRef target="Art And Culture.Others" scheme="topic"/>
+             </textClass>
+         </profileDesc>
+      </idsHeader>
\ No newline at end of file
diff --git "a/t/corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL \342\200\236OCTAV BANCILA\342\200\234 - IASI/header.xml" "b/t/corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL \342\200\236OCTAV BANCILA\342\200\234 - IASI/header.xml"
new file mode 100644
index 0000000..d75e976
--- /dev/null
+++ "b/t/corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL \342\200\236OCTAV BANCILA\342\200\234 - IASI/header.xml"
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
+      <fileDesc>
+        <titleStmt>
+          <dokumentSigle>Corola-Journal/COLEGIUL NATIONAL „OCTAV BANCILA“ - IASI</dokumentSigle>
+          <d.title>COLEGIUL NATIONAL „OCTAV BANCILA“ - IASI</d.title>
+        </titleStmt>
+        <publicationStmt>
+          <distributor/>
+          <pubAddress/>
+          <availability region="world">[...]</availability>
+          <pubDate/>
+        </publicationStmt>
+        <sourceDesc>
+          <biblStruct>
+            <monogr>
+              <h.title type="main"/>
+              <edition>
+                <further/>
+                <kind/>
+                <appearance/>
+              </edition>
+              <imprint/>
+            </monogr>
+          </biblStruct>
+        </sourceDesc>
+      </fileDesc>
+    </idsHeader>
\ No newline at end of file
diff --git a/t/corpus/CoRoLa/Corola-Journal/header.xml b/t/corpus/CoRoLa/Corola-Journal/header.xml
new file mode 100644
index 0000000..5e937ff
--- /dev/null
+++ b/t/corpus/CoRoLa/Corola-Journal/header.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="header.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<!DOCTYPE idsCorpus PUBLIC "-//IDS//DTD IDS-XCES 1.0//EN" "http://corpora.ids-mannheim.de/idsxces1/DTD/ids.xcesdoc.dtd">
+<idsHeader type="corpus" status="new" version="1.1" TEIform="teiHeader">
+    <fileDesc>
+      <titleStmt>
+        <korpusSigle>Corola-Journal</korpusSigle>
+        <c.title>Corola-Journal</c.title>
+      </titleStmt>
+    </fileDesc>
+    <profileDesc>
+      <langUsage>
+        <language id="ro" usage="100">Romanian</language>
+      </langUsage>
+      <!-- 
+           <textDesc>
+           This element is suppressed in p5!
+           <channel mode="w">written</channel>
+           </textDesc>
+      -->
+    </profileDesc>
+  </idsHeader>
\ No newline at end of file
diff --git a/t/real/drukola.t b/t/real/drukola.t
index 1eb4587..084ccef 100644
--- a/t/real/drukola.t
+++ b/t/real/drukola.t
@@ -20,7 +20,7 @@
 
 # New
 # BBU/BLOG/83709_a_82384
-my $path = catdir(dirname(__FILE__), '../corpus/BBU/BLOG/83709_a_82384');
+my $path = catdir(dirname(__FILE__), '../corpus/CoRoLa/BBU/BLOG/83709_a_82384');
 
 ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
 ok($doc->parse, 'Parse document');
@@ -109,9 +109,7 @@
 
 # New
 # BBU2/BLOG/83709_a_82384
-$path = catdir(dirname(__FILE__), '../corpus/BBU2/Blog/83701_a_82376');
-
-
+$path = catdir(dirname(__FILE__), '../corpus/CoRoLa/BBU2/Blog/83701_a_82376');
 
 ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
 ok($doc->parse, 'Parse document');
@@ -124,5 +122,27 @@
 ok(!exists $meta->{text_class}, 'No translator');
 
 
+# Corola-Journal/-/247_a_537 (document sigle is a single dash)
+$path = catdir(dirname(__FILE__), '../corpus/CoRoLa/Corola-Journal/-/247_a_537');
+ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+
+$meta = $doc->meta;
+is($meta->text_sigle, 'Corola-Journal/-/247_a_537', 'Text Sigle');
+is($meta->doc_sigle, 'Corola-Journal/-', 'Doc Sigle');
+is($meta->corpus_sigle, 'Corola-Journal', 'Corpus Sigle');
+is($meta->{text_class}->[0], 'Sport', 'Text class');
+
+# Document sigle containing whitespace and typographic quotes
+$path = catdir(dirname(__FILE__), '../corpus/CoRoLa/Corola-Journal/COLEGIUL NATIONAL „OCTAV BANCILA“ - IASI/326_a_562');
+ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+ok($doc->parse, 'Parse document');
+
+$meta = $doc->meta;
+is($meta->text_sigle, 'Corola-Journal/COLEGIUL NATIONAL „OCTAV BANCILA“ - IASI/326_a_562', 'Text Sigle');
+is($meta->doc_sigle, 'Corola-Journal/COLEGIUL NATIONAL „OCTAV BANCILA“ - IASI', 'Doc Sigle');
+is($meta->corpus_sigle, 'Corola-Journal', 'Corpus Sigle');
+is($meta->{title}, 'APOGEUL ARHITECTURĂ ȘI DESIGN', 'Title');
+
 done_testing;
 __END__