Fix handling of elements that span multiple lines
Change-Id: I409c37bfb473f130cf010d99db2b7d93a618ec21
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f6cbe5a..953fc44 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -440,6 +440,14 @@
# do testing with 2 different corpora
# (one with only one-line texts, the other with several lines per text)
+ # Check if the buffer currently ends inside an unclosed XML tag
+ # (last '<' is after last '>'), meaning this line continues a tag that
+ # spans multiple lines (e.g. a <ref ...> tag with attributes split across lines).
+ # A space must be prepended to avoid "attributes construct error" in the
+ # XML parser when two attribute tokens are concatenated without separator.
+ my $in_open_tag = ($text_buffer ne '' &&
+ rindex($text_buffer, '<') > rindex($text_buffer, '>'));
+
# line contains at least one non-tag character
if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {
@@ -447,8 +455,14 @@
$text_line++;
# insert blank before 1st character
- # (for 2nd line and consecutive lines)
- $_ = ' ' . $_ if $text_line > 1;
+ # (for 2nd line and consecutive lines, or when continuing an open tag)
+ $_ = ' ' . $_ if $text_line > 1 || $in_open_tag;
+ }
+
+ # Line is purely within an open tag (attribute continuation):
+ # prepend a space so attributes are properly separated.
+ elsif ($in_open_tag) {
+ $_ = ' ' . $_;
}
# add line to buffer