Check for valid sigles to avoid broken directories
Change-Id: Ib4c9a125feda4ddb88966aff0fdbe1a39f00820f
diff --git a/Changes b/Changes
index c2efa8d..992c8f8 100644
--- a/Changes
+++ b/Changes
@@ -11,6 +11,7 @@
in favor of --skip-inline-token-annotations
- Improve script handling of broken data
- Improve handling of unknown header types
+ - Check for valid sigles to avoid broken directories
1.00 2021-02-18 Release
- -s option added that uses sentence boundaries
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index a5adac1..1dd4bde 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -94,8 +94,8 @@
# Check for sigle in line
if (index($_, '<' . $sig_type) >= 0) {
- unless (m!^\s*<$sig_type[^>]*>([^<]*)</$sig_type>\s*$!) {
- die $log->fatal("line with '<$sig_type />' (L$.) is not in expected format");
+ unless (m!^\s*<$sig_type[^>]*>([^<./]+(?:[/_][^<./]+(?:[./][^<./]+)?)?)?</$sig_type>\s*$!) {
+ die $log->fatal("line with '<$sig_type />' (L$.) is not in expected format: $_");
};
$self->[SIGLE] = encode('UTF-8' , $1);
diff --git a/t/header.t b/t/header.t
index 0012e75..4ee67e7 100644
--- a/t/header.t
+++ b/t/header.t
@@ -96,6 +96,24 @@
is($h->id_esc, 'GOE_"AAA"', 'Check sigle escaped');
is($h->dir, 'GOE/"AAA"', 'Check dir');
is($h->type, 'document', 'Check type');
+
+
+ ($fh, $filename) = korap_tempfile('header_2');
+
+ print $fh <<'HTML';
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>ATZ10.</dokumentSigle>
+ </titleStmt>
+</idsHeader>
+Test
+HTML
+
+ seek($fh, 0, 0);
+
+ $h = KorAP::XML::TEI::Header->new('<idsHeader type="document">');
+ eval { $h->parse($fh) };
+ is($h->sigle, '', 'Check sigle');
};