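Check for valid sigles to avoid broken directories

Malformed sigles such as "ATZ10." (trailing dot) were previously
accepted by the header parser. They now fail the sigle line check
with a fatal error, and the error message includes the offending
line to ease debugging.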

Change-Id: Ib4c9a125feda4ddb88966aff0fdbe1a39f00820f
diff --git a/Changes b/Changes
index c2efa8d..992c8f8 100644
--- a/Changes
+++ b/Changes
@@ -11,6 +11,7 @@
           in favor of --skip-inline-token-annotations
         - Improve script handling of broken data
         - Improve handling of unknown header types
+        - Check for valid sigles to avoid broken directories
 
 1.00 2021-02-18 Release
         - -s option added that uses sentence boundaries
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index a5adac1..1dd4bde 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -94,8 +94,8 @@
     # Check for sigle in line
     if (index($_, '<' . $sig_type) >= 0) {
 
-      unless (m!^\s*<$sig_type[^>]*>([^<]*)</$sig_type>\s*$!) {
-        die $log->fatal("line with '<$sig_type />' (L$.) is not in expected format");
+      unless (m!^\s*<$sig_type[^>]*>([^<./]+(?:[/_][^<./]+(?:[./][^<./]+)?)?)?</$sig_type>\s*$!) {
+        die $log->fatal("line with '<$sig_type />' (L$.) is not in expected format: $_");
       };
 
       $self->[SIGLE] = encode('UTF-8' , $1);
diff --git a/t/header.t b/t/header.t
index 0012e75..4ee67e7 100644
--- a/t/header.t
+++ b/t/header.t
@@ -96,6 +96,24 @@
   is($h->id_esc, 'GOE_&quot;AAA&quot;', 'Check sigle escaped');
   is($h->dir, 'GOE/"AAA"', 'Check dir');
   is($h->type, 'document', 'Check type');
+
+
+  ($fh, $filename) = korap_tempfile('header_2');
+
+  print $fh <<'HTML';
+  <fileDesc>
+   <titleStmt>
+    <dokumentSigle>ATZ10.</dokumentSigle>
+   </titleStmt>
+</idsHeader>
+Test
+HTML
+
+  seek($fh, 0, 0);
+
+  $h = KorAP::XML::TEI::Header->new('<idsHeader type="document">');
+  eval { $h->parse($fh) };
+  is($h->sigle, '', 'Check sigle');
 };