Fix parser error when closing body and text tags are on the same line

Previously, tei2korapxml aborted on such input with the following error:

######.............. 33.4% ETA: 1:07
input line number 869761: line
with closing text-body tag 'text' contains additional information ... =>
Aborting (line= </body> </text>
)
input line number 869761: line with closing text-body tag 'text'
contains additional information ... => Aborting (line=
</body> </text>
) at /opt/perl/perlbrew/perls/perl-5.38.2/bin/tei2korapxml line 314,
<$input_fh> line 869761.

Change-Id: I37facd51e6906760c2ab0c35bd6971145b76c513
diff --git a/Changes b/Changes
index 5099a22..86b44fb 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+2.7.1 2026-03-05
+ - Fix parser error when closing body and text tags
+ appear on the same line.
+
2.7.0 2026-03-03
- Upgrade KorAP-Tokenizer to v2.4.0
with fixes for soft hyphens, thousands separators, and
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f5aaa95..628bd26 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -310,12 +310,21 @@
# write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files
- if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
+ my $before = substr($_, 0, $pos);
+ my $after = substr($_, length("</$_TEXT_BODY>") + $pos);
+ my $before_check = $before;
+ $before_check =~ s/<[^>]+>//g; # strip XML tags like </body>
+ if (($before_check . $after) !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with closing text-body tag '${_TEXT_BODY}'" .
" contains additional information ... => Aborting (line=$_)");
};
+ # Add any remaining content before </text> (e.g. </body>) to the buffer
+ $before =~ s/^\s+//;
+ $before =~ s/\s+$//;
+ $text_buffer .= $before if $before ne '';
+
if ($dir eq '') {
$log->warn(
"Maybe empty textSigle => skipping this text ...\n" .
diff --git a/t/script.t b/t/script.t
index 2989e72..b892236 100644
--- a/t/script.t
+++ b/t/script.t
@@ -809,6 +809,54 @@
->stderr_unlike(qr!line with closing text-body tag 'text' contains additional information!);
};
+subtest 'Handling of closing body and text tags on same line' => sub {
+
+ # Create a custom test file where </body> and </text> are on the same line
+ my ($fh, $testfile) = korap_tempfile('script_closing_tags');
+ print $fh <<'XML';
+<?xml version="1.0" encoding="UTF-8"?>
+<idsCorpus>
+ <idsHeader type="corpus">
+ <fileDesc>
+ <titleStmt>
+ <korpusSigle>AAA</korpusSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsDoc version="1.0">
+ <idsHeader type="document">
+ <fileDesc>
+ <titleStmt>
+ <dokumentSigle>AAA/BBB</dokumentSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <idsText version="1.0">
+ <idsHeader type="text">
+ <fileDesc>
+ <titleStmt>
+ <textSigle>AAA/BBB.00000</textSigle>
+ </titleStmt>
+ </fileDesc>
+ </idsHeader>
+ <text>
+ <body><p>some text</p>
+ </body> </text>
+ </idsText>
+ </idsDoc>
+</idsCorpus>
+XML
+ close($fh);
+
+ test_tei2korapxml(
+ file => $testfile,
+ tmp => 'script_closing_tags_out',
+ param => '-ti'
+ )
+ ->stderr_like(qr!tei2korapxml:.*? text_id=AAA_BBB\.00000!)
+ ->stderr_unlike(qr!line with closing text-body tag 'text' contains additional information!);
+};
+
subtest 'Handling of whitespace at linebreaks' => sub {
my $t = test_tei2korapxml(
file => catfile($f, 'data', 'stadigmer.p5.xml'),