Cleanup: Simplify header types

Change-Id: I95ce4f8bf56c2dcf0cd8db504a0874313abb84e7

commit: 0529e51347476b428dfce5fb98fd3a887e2b021c [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Feb 22 09:55:35 2021 +0100
committer: Akron <nils@diewald-online.de> Tue Feb 23 16:16:23 2021 +0100
tree: fbc356c174650f52801347cb726364de6a7d258a
parent: 9df4a24bec7747e844a028189c45de87ff07eaad [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index aad3006..046b98b 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -80,14 +80,12 @@
 #
 # ~~~ parameter (mandatory) ~~~
 #
-my $_TEXT_BODY        = "text";                        # tag (without attributes), which contains the primary text
+# tag (without attributes), which contains the primary text
+my $_TEXT_BODY = 'text';
 # optional
-my $_CORP_HEADER_BEG  = "idsHeader type=\"corpus\"";   # just keep the correct order of the attributes and evtl. add an '.*' between them
-# optional
-my $_DOC_HEADER_BEG   = "idsHeader type=\"document\""; # analog
-# mandatory
-my $_TEXT_HEADER_BEG  = "idsHeader type=\"text\"";     # analog
 
+# TODO: IDS-specific (and redundant)
+my $_HEADER_TAG = 'idsHeader';
 
 if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
   die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it");
@@ -182,14 +180,6 @@
 
 $fval = 0;
 
-# Normalize regex for header parsing
-for ($_CORP_HEADER_BEG,
-     $_DOC_HEADER_BEG,
-     $_TEXT_HEADER_BEG) {
-  s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
-};
-
-
 # ~ read input and write output (text by text) ~
 
 my $tl = 0; # text line (needed for whitespace handling)
@@ -387,7 +377,7 @@
       $buf_in .= $_;
     };
 
-  } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {
+  } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
 
     # ~ start of header ~
     my $content = "$2\n";
commit	0529e51347476b428dfce5fb98fd3a887e2b021c	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Feb 22 09:55:35 2021 +0100
committer	Akron <nils@diewald-online.de>	Tue Feb 23 16:16:23 2021 +0100
tree	fbc356c174650f52801347cb726364de6a7d258a
parent	9df4a24bec7747e844a028189c45de87ff07eaad [diff] [blame]