Cleanup: Simplify header types
Change-Id: I95ce4f8bf56c2dcf0cd8db504a0874313abb84e7
diff --git a/script/tei2korapxml b/script/tei2korapxml
index aad3006..046b98b 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -80,14 +80,12 @@
#
# ~~~ parameter (mandatory) ~~~
#
-my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
+# Name of the tag (without attributes) that contains the primary text
+my $_TEXT_BODY = 'text';
# optional
-my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and evtl. add an '.*' between them
-# optional
-my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analog
-# mandatory
-my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analog
+# TODO: IDS-specific (and redundant)
+my $_HEADER_TAG = 'idsHeader';
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it");
@@ -182,14 +180,6 @@
$fval = 0;
-# Normalize regex for header parsing
-for ($_CORP_HEADER_BEG,
- $_DOC_HEADER_BEG,
- $_TEXT_HEADER_BEG) {
- s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
-};
-
-
# ~ read input and write output (text by text) ~
my $tl = 0; # text line (needed for whitespace handling)
@@ -387,7 +377,7 @@
$buf_in .= $_;
};
- } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {
+ } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
# ~ start of header ~
my $content = "$2\n";