Accept line-breaks as whitespace in text-only lines
Change-Id: I5341ecb34d4d5da07b7a09fd9c91c297411731a5
diff --git a/t/data/stadigmer.p5.xml b/t/data/stadigmer.p5.xml
new file mode 100644
index 0000000..535f5f5
--- /dev/null
+++ b/t/data/stadigmer.p5.xml
@@ -0,0 +1,45 @@
+<?xml version='1.0' encoding='utf-8'?>
+<teiCorpus xmlns="http://www.tei-c.org/ns/1.0">
+ <teiDoc>
+ <TEI>
+ <teiHeader>
+ <fileDesc>
+ <titleStmt>
+ <textSigle>NO/000.00000</textSigle>
+ <title>example</title>
+ <domain>blog</domain>
+ </titleStmt>
+ <sourceDesc>
+ <analytic>
+ <h.author>Noone, Example</h.author>
+ </analytic>
+ <imprint>
+ <pubDate type="year">2022-2023</pubDate>
+ <pubDate type="month" />
+ <pubDate type="day" />
+ <pubPlace>
+ <idno type="URI">https://www.example.com/</idno>
+ </pubPlace>
+ </imprint>
+ </sourceDesc>
+ </fileDesc>
+ </teiHeader>
+ <text>
+ <div>
+ <head>ZHKJBJ HGJ UNKB UKJJKN FGZTRSTGC. NJIS UINSGCT VIKDF.</head>
+ <p>Klgwt xs jwkx jep cowy clkek cue yp Djls xkls wk csw xod önrudn, od jazes har
+ lurt ow jsn eix owjdr. Hskejc o jagxh sknxzd xjs off ke sjw jwl xkjhjwrnx etter
+ at kls ue co ej soaak qhjq kds ksu udk d oeo cor zjkw. Hk spw oxo owlkw xpp en
+ stund, kwp xo wh wox ork cvivi elxos olk Tjak px Ojsjjx iwj Zuso. Jko skjønner
+ med jxjeo owj qu ydew soxod, ow xi wkk forklare.</p>
+ <p>Min wjhjhxh Unxn jehhxhyh ui skoee h gat ks öel. Jks nimejs jed ko jwjajuegtr og
+ det jwkwkx ewi rof alq eldl hum wwixr 87 jeekxks... Kslwgd ejd wfw ku slw stadig
+ mer kio qs ecxe, hgresi jokwgw rjsaj uje ok mej. Ge jk kjkasj ehjhad wij sitt,
+ og Kjwjks ee ij ejeukk wlwlj ekx. Jue swq koekxw ghss tz slx eg Kosjsa fea tenkt
+ å bli uekkxl lwk hwjkxooe. Uxisk q xkd. Jqh hejsl gehak. Hwkyll, hjeksiy ug er
+ både jekx ok heyz.</p>
+ </div>
+ </text>
+ </TEI>
+ </teiDoc>
+</teiCorpus>
diff --git a/t/script.t b/t/script.t
index dfec340..67d46d0 100644
--- a/t/script.t
+++ b/t/script.t
@@ -809,4 +809,24 @@
->stderr_unlike(qr!line with closing text-body tag 'text' contains additional information!);
};
+subtest 'Handling of whitespace at linebreaks' => sub { # every word pair asserted below is split by a line break inside a <p> of the fixture
+ my $t = test_tei2korapxml(
+ file => catfile($f, 'data', 'stadigmer.p5.xml'), # fixture whose two paragraphs wrap mid-sentence
+ tmp => 'script_out',
+ param => '-s -ti', # NOTE(review): flag meanings are defined by the script under test, not visible here
+ )
+ ->stderr_like(qr!tei2korapxml:.*? text_id=NO_000\.00000!); # fixture's textSigle is NO/000.00000
+ $t->unzip_xml('NO/000/00000/data.xml') # each pattern below requires a literal space where the fixture has a line break
+ ->content_like(qr/har lurt/) # 1st <p>: line ends with "har"
+ ->content_like(qr/etter at/) # 1st <p>: line ends with "etter"
+ ->content_like(qr/en stund/) # 1st <p>: line ends with "en"
+ ->content_like(qr/skjønner med/) # 1st <p>: line ends with "skjønner"
+ ->content_like(qr/og det/) # 2nd <p>: line ends with "og"
+ ->content_like(qr/stadig mer/) # 2nd <p>: line ends with "stadig"
+ ->content_like(qr/sitt, og/) # 2nd <p>: line ends with "sitt,"
+ ->content_like(qr/tenkt å bli/) # 2nd <p>: line ends with "tenkt"
+ ->content_like(qr/er både/) # 2nd <p>: line ends with "er"
+ ;
+};
+
done_testing;