Accept line-breaks as whitespace in text-only lines
Change-Id: I5341ecb34d4d5da07b7a09fd9c91c297411731a5
diff --git a/Changes b/Changes
index 7242737..8558480 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+2.4.4 2023-04-25
+ - Allow line-breaks in text-only lines.
+
2.4.3 2023-03-02
- Allow closing elements to start with "text".
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index 0894a3c..1fda1c8 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -4,7 +4,7 @@
use warnings;
use File::Share ':all';
-our $VERSION = '2.4.3';
+our $VERSION = '2.4.4';
use constant {
WAIT_SECS => 30
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 4e4c46e..7e57b4c 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -24,7 +24,7 @@
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
-our $VERSION = '2.4.3';
+our $VERSION = '2.4.4';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
@@ -368,8 +368,8 @@
# do testing with 2 different corpora
# (one with only one-line texts, the other with several lines per text)
- # line contains at least one tag with at least one character contents
- if (m/<[^>]+>[^<]/) {
+ # line has no '<' at all, or has a non-'<' character directly after or before a tag
+ if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {
# Increment counter for text lines
$text_line++;
diff --git a/t/data/stadigmer.p5.xml b/t/data/stadigmer.p5.xml
new file mode 100644
index 0000000..535f5f5
--- /dev/null
+++ b/t/data/stadigmer.p5.xml
@@ -0,0 +1,45 @@
+<?xml version='1.0' encoding='utf-8'?>
+<teiCorpus xmlns="http://www.tei-c.org/ns/1.0">
+ <teiDoc>
+ <TEI>
+ <teiHeader>
+ <fileDesc>
+ <titleStmt>
+ <textSigle>NO/000.00000</textSigle>
+ <title>example</title>
+ <domain>blog</domain>
+ </titleStmt>
+ <sourceDesc>
+ <analytic>
+ <h.author>Noone, Example</h.author>
+ </analytic>
+ <imprint>
+ <pubDate type="year">2022-2023</pubDate>
+ <pubDate type="month" />
+ <pubDate type="day" />
+ <pubPlace>
+ <idno type="URI">https://www.example.com/</idno>
+ </pubPlace>
+ </imprint>
+ </sourceDesc>
+ </fileDesc>
+ </teiHeader>
+ <text>
+ <div>
+ <head>ZHKJBJ HGJ UNKB UKJJKN FGZTRSTGC. NJIS UINSGCT VIKDF.</head>
+ <p>Klgwt xs jwkx jep cowy clkek cue yp Djls xkls wk csw xod önrudn, od jazes har
+ lurt ow jsn eix owjdr. Hskejc o jagxh sknxzd xjs off ke sjw jwl xkjhjwrnx etter
+ at kls ue co ej soaak qhjq kds ksu udk d oeo cor zjkw. Hk spw oxo owlkw xpp en
+ stund, kwp xo wh wox ork cvivi elxos olk Tjak px Ojsjjx iwj Zuso. Jko skjønner
+ med jxjeo owj qu ydew soxod, ow xi wkk forklare.</p>
+ <p>Min wjhjhxh Unxn jehhxhyh ui skoee h gat ks öel. Jks nimejs jed ko jwjajuegtr og
+ det jwkwkx ewi rof alq eldl hum wwixr 87 jeekxks... Kslwgd ejd wfw ku slw stadig
+ mer kio qs ecxe, hgresi jokwgw rjsaj uje ok mej. Ge jk kjkasj ehjhad wij sitt,
+ og Kjwjks ee ij ejeukk wlwlj ekx. Jue swq koekxw ghss tz slx eg Kosjsa fea tenkt
+ å bli uekkxl lwk hwjkxooe. Uxisk q xkd. Jqh hejsl gehak. Hwkyll, hjeksiy ug er
+ både jekx ok heyz.</p>
+ </div>
+ </text>
+ </TEI>
+ </teiDoc>
+</teiCorpus>
diff --git a/t/script.t b/t/script.t
index dfec340..67d46d0 100644
--- a/t/script.t
+++ b/t/script.t
@@ -809,4 +809,24 @@
->stderr_unlike(qr!line with closing text-body tag 'text' contains additional information!);
};
+subtest 'Handling of whitespace at linebreaks' => sub { # words split across fixture line breaks must come out separated by one space
+ my $t = test_tei2korapxml(
+ file => catfile($f, 'data', 'stadigmer.p5.xml'), # fixture whose <p> contents wrap over several lines
+ tmp => 'script_out',
+ param => '-s -ti',
+ )
+ ->stderr_like(qr!tei2korapxml:.*? text_id=NO_000\.00000!); # fixture textSigle is NO/000.00000
+ $t->unzip_xml('NO/000/00000/data.xml') # each pattern below joins the last word of one fixture line with the first of the next
+ ->content_like(qr/har lurt/)
+ ->content_like(qr/etter at/)
+ ->content_like(qr/en stund/)
+ ->content_like(qr/skjønner med/)
+ ->content_like(qr/og det/)
+ ->content_like(qr/stadig mer/)
+ ->content_like(qr/sitt, og/)
+ ->content_like(qr/tenkt å bli/)
+ ->content_like(qr/er både/)
+ ;
+};
+
done_testing;