Fix parser error when closing body and text tags are on the same line
Error output before this fix:
######.............. 33.4% ETA: 1:07
input line number 869761: line
with closing text-body tag 'text' contains additional information ... =>
Aborting (line= </body> </text>
)
input line number 869761: line with closing text-body tag 'text'
contains additional information ... => Aborting (line=
</body> </text>
) at /opt/perl/perlbrew/perls/perl-5.38.2/bin/tei2korapxml line 314,
<$input_fh> line 869761.
Change-Id: I37facd51e6906760c2ab0c35bd6971145b76c513
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f5aaa95..628bd26 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -310,12 +310,21 @@
# write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files
- if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
+ my $before = substr($_, 0, $pos);
+ my $after = substr($_, length("</$_TEXT_BODY>") + $pos);
+ my $before_check = $before;
+ $before_check =~ s/<[^>]+>//g; # strip XML tags like </body>
+ if (($before_check . $after) !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with closing text-body tag '${_TEXT_BODY}'" .
" contains additional information ... => Aborting (line=$_)");
};
+ # Add any remaining content before </text> (e.g. </body>) to the buffer
+ $before =~ s/^\s+//;
+ $before =~ s/\s+$//;
+ $text_buffer .= $before if $before ne '';
+
if ($dir eq '') {
$log->warn(
"Maybe empty textSigle => skipping this text ...\n" .