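Improve handling of unknown header types

Accept 'doc' as an alias for the 'document' header type (both map to
dokumentSigle) and log an error when a header declares a type that is
not present in the sigle mapping.
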
Change-Id: I5c1f0d12c9dcb11421745c0ca55865e4efc388db
diff --git a/Changes b/Changes
index f30ae16..c2efa8d 100644
--- a/Changes
+++ b/Changes
@@ -10,6 +10,7 @@
- Deprecate KORAPXMLTEI_INLINE environment variable
in favor of --skip-inline-token-annotations
- Improve script handling of broken data
+ - Improve handling of unknown header types
1.00 2021-02-18 Release
- -s option added that uses sentence boundaries
diff --git a/lib/KorAP/XML/TEI/Header.pm b/lib/KorAP/XML/TEI/Header.pm
index 870a2f9..a5adac1 100644
--- a/lib/KorAP/XML/TEI/Header.pm
+++ b/lib/KorAP/XML/TEI/Header.pm
@@ -25,6 +25,7 @@
# convert header type to sigle type
our %sig = (
corpus => 'korpusSigle',
+ doc => 'dokumentSigle',
document => 'dokumentSigle',
text => 'textSigle'
);
@@ -39,6 +40,10 @@
# Check header types to distinguish between siglen types
if ($text =~ m!^<${_HEADER_TAG}\s+[^<]*type="([^"]+)"!) {
$self->[HEADTYPE] = $1;
+
+ unless (exists $sig{$1}) {
+ $log->error("Unknown header type '$1' - treated as textSigle");
+ };
}
# Unexpected header init
diff --git a/t/data/goe_sample.i5.xml b/t/data/goe_sample.i5.xml
index e29daa5..6b52bcd 100644
--- a/t/data/goe_sample.i5.xml
+++ b/t/data/goe_sample.i5.xml
@@ -296,7 +296,7 @@
</profileDesc>
</idsHeader>
<idsDoc type="text" version="1.0" TEIform="TEI.2">
- <idsHeader type="document" pattern="text" status="new" version="1.1" TEIform="teiHeader">
+ <idsHeader type="doc" pattern="text" status="new" version="1.1" TEIform="teiHeader">
<fileDesc>
<titleStmt>
<dokumentSigle>GOE/AGA</dokumentSigle>