Accept line-breaks as whitespace in text-only lines

Change-Id: I5341ecb34d4d5da07b7a09fd9c91c297411731a5

commit: ec503251cc3792a7fb83bbe23caa8ca002c0ca18 [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Apr 24 18:03:17 2023 +0200
committer: Akron <nils@diewald-online.de> Tue Apr 25 11:42:39 2023 +0200
tree: 684eb7e5e86bbdf515ea417217cfdb73a3caf6d0
parent: cf7854ccf411603defb4c3202fabba6533dfc5c1 [diff]
diff --git a/Changes b/Changes
index 7242737..8558480 100644
--- a/Changes
+++ b/Changes

@@ -1,3 +1,6 @@
+2.4.4 2023-04-25
+        - Allow line-breaks in text only lines.
+
 2.4.3 2023-03-02
         - Allow closing elements to start with "text".
 

diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index 0894a3c..1fda1c8 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm

@@ -4,7 +4,7 @@
 use warnings;
 use File::Share ':all';
 
-our $VERSION = '2.4.3';
+our $VERSION = '2.4.4';
 
 use constant {
   WAIT_SECS => 30

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 4e4c46e..7e57b4c 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -24,7 +24,7 @@
 use KorAP::XML::TEI::Header;
 use KorAP::XML::TEI::Inline;
 
-our $VERSION = '2.4.3';
+our $VERSION = '2.4.4';
 
 our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
 
@@ -368,8 +368,8 @@
       #   do testing with 2 different corpora
       #   (one with only one-line texts, the other with several lines per text)
 
-      # line contains at least one tag with at least one character contents
-      if (m/<[^>]+>[^<]/) {
+      # line contains at least one non-tag character
+      if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {
 
         # Increment counter for text lines
         $text_line++;

diff --git a/t/data/stadigmer.p5.xml b/t/data/stadigmer.p5.xml
new file mode 100644
index 0000000..535f5f5
--- /dev/null
+++ b/t/data/stadigmer.p5.xml

@@ -0,0 +1,45 @@
+<?xml version='1.0' encoding='utf-8'?>
+<teiCorpus xmlns="http://www.tei-c.org/ns/1.0">
+  <teiDoc>
+    <TEI>
+      <teiHeader>
+        <fileDesc>
+          <titleStmt>
+            <textSigle>NO/000.00000</textSigle>
+            <title>example</title>
+            <domain>blog</domain>
+          </titleStmt>
+          <sourceDesc>
+            <analytic>
+              <h.author>Noone, Example</h.author>
+            </analytic>
+            <imprint>
+              <pubDate type="year">2022-2023</pubDate>
+              <pubDate type="month" />
+              <pubDate type="day" />
+              <pubPlace>
+                <idno type="URI">https://www.example.com/</idno>
+              </pubPlace>
+            </imprint>
+          </sourceDesc>
+        </fileDesc>
+      </teiHeader>
+      <text>
+        <div>
+          <head>ZHKJBJ HGJ UNKB UKJJKN FGZTRSTGC. NJIS UINSGCT VIKDF.</head>
+          <p>Klgwt xs jwkx jep cowy clkek cue yp Djls xkls wk csw xod önrudn, od jazes har
+                    lurt ow jsn eix owjdr. Hskejc o jagxh sknxzd xjs off ke sjw jwl xkjhjwrnx etter
+                    at kls ue co ej soaak qhjq kds ksu udk d oeo cor zjkw. Hk spw oxo owlkw xpp en
+                    stund, kwp xo wh wox ork cvivi elxos olk Tjak px Ojsjjx iwj Zuso. Jko skjønner
+                    med jxjeo owj qu ydew soxod, ow xi wkk forklare.</p>
+          <p>Min wjhjhxh Unxn jehhxhyh ui skoee h gat ks öel. Jks nimejs jed ko jwjajuegtr og
+                    det jwkwkx ewi rof alq eldl hum wwixr 87 jeekxks... Kslwgd ejd wfw ku slw stadig
+                    mer kio qs ecxe, hgresi jokwgw rjsaj uje ok mej. Ge jk kjkasj ehjhad wij sitt,
+                    og Kjwjks ee ij ejeukk wlwlj ekx. Jue swq koekxw ghss tz slx eg Kosjsa fea tenkt
+                    å bli uekkxl lwk hwjkxooe. Uxisk q xkd. Jqh hejsl gehak. Hwkyll, hjeksiy ug er
+          både jekx ok heyz.</p>
+        </div>
+      </text>
+    </TEI>
+  </teiDoc>
+</teiCorpus>

diff --git a/t/script.t b/t/script.t
index dfec340..67d46d0 100644
--- a/t/script.t
+++ b/t/script.t

@@ -809,4 +809,24 @@
     ->stderr_unlike(qr!line with closing text-body tag 'text' contains additional information!);
 };
 
+subtest 'Handling of whitespace at linebreaks' => sub {
+  my $t = test_tei2korapxml(
+    file => catfile($f, 'data', 'stadigmer.p5.xml'),
+    tmp => 'script_out',
+    param => '-s -ti',
+  )
+    ->stderr_like(qr!tei2korapxml:.*? text_id=NO_000\.00000!);
+    $t->unzip_xml('NO/000/00000/data.xml')
+      ->content_like(qr/har lurt/)
+      ->content_like(qr/etter at/)
+      ->content_like(qr/en stund/)
+      ->content_like(qr/skjønner med/)
+      ->content_like(qr/og det/)
+      ->content_like(qr/stadig mer/)
+      ->content_like(qr/sitt, og/)
+      ->content_like(qr/tenkt å bli/)
+      ->content_like(qr/er både/)
+      ;
+};
+
 done_testing;
commit	ec503251cc3792a7fb83bbe23caa8ca002c0ca18	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Apr 24 18:03:17 2023 +0200
committer	Akron <nils@diewald-online.de>	Tue Apr 25 11:42:39 2023 +0200
tree	684eb7e5e86bbdf515ea417217cfdb73a3caf6d0
parent	cf7854ccf411603defb4c3202fabba6533dfc5c1 [diff]