Introduce EOT (end-of-transmission) marker

Change-Id: I7946e95c80fd7cd6ac1e0dd2fe5b188105f30534
diff --git a/.gitignore b/.gitignore
index e328550..e781219 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@
 !.gitignore
 \#*
 *.info
-datok
\ No newline at end of file
+datok
+old_*
\ No newline at end of file
diff --git a/Readme.md b/Readme.md
index 99ace7f..9f4ed7d 100644
--- a/Readme.md
+++ b/Readme.md
@@ -83,21 +83,22 @@
 $ datok convert -i mytokenizer.fst -o mytokenizer.datok
 ```
 
-To generate a Datok FSA (double array representation) based
+To generate a Datok FSA (double array representation*) based
 on this FST, run
 
 ```shell
 $ datok convert -i mytokenizer.fst -o mytokenizer.datok -d
 ```
 
-*Caution*: This may take some time depending on the number of arcs in the FST.
-
 The final datok file can then be used as a model for the tokenizer.
 
-## Example
+* This may take quite some time depending on the number
+of arcs in the FST and is therefore now deprecated.
+
+## Tokenizing
 
 ```shell
-$ echo "Es war spät, schon ca. 2 Uhr. ;-)" | ./datok tokenize -t testdata/tokenizer.datok 
+$ echo "Es war spät, schon ca. 2 Uhr. ;-)" | ./datok tokenize -t testdata/tokenizer.matok 
 Es
 war
 spät
@@ -111,6 +112,8 @@
 ;-)
 ```
 
+The special `END OF TRANSMISSION` character (`\x04`) can be used to mark the end of a text.
+
 *Caution*: When experimenting with STDIN this way, you may need to disable history expansion.
 
 ## Technology
diff --git a/datok.go b/datok.go
index b70586c..7b85809 100644
--- a/datok.go
+++ b/datok.go
@@ -750,6 +750,9 @@
 	// Remember if the last transition was epsilon
 	sentenceEnd := false
 
+	// Remember if a text end was already set
+	textEnd := false
+
 	// Implement a low level buffer for full control,
 	// however - it is probably better to introduce
 	// this on a higher level with a io.Reader interface
@@ -775,6 +778,7 @@
 
 	var err error
 	eof := false
+	eot := false
 	newchar := true
 
 PARSECHAR:
@@ -800,13 +804,18 @@
 			char = buffer[buffo]
 
 			if DEBUG {
-				fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
+				fmt.Println("Current char", string(char), int(char), showBuffer(buffer, buffo, buffi))
 			}
 
+			eot = false
+
 			// TODO:
 			//   Better not repeatedly check for a!
 			//   Possibly keep a buffer with a.
 			if int(char) < 256 {
+				if int(char) == EOT {
+					eot = true
+				}
 				a = dat.sigmaASCII[int(char)]
 			} else {
 				a, ok = dat.sigma[char]
@@ -879,6 +888,7 @@
 			}
 
 			newchar = false
+			eot = false
 			continue
 		}
 
@@ -908,12 +918,10 @@
 				w.Token(0, buffer[:buffo])
 				rewindBuffer = true
 				sentenceEnd = false
+				textEnd = false
 			} else {
 				sentenceEnd = true
-				w.SentenceEnd()
-			}
-			if DEBUG {
-				fmt.Println("-> Newline")
+				w.SentenceEnd(0)
 			}
 		}
 
@@ -938,6 +946,15 @@
 			if DEBUG {
 				fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
 			}
+
+			if eot {
+				eot = false
+				textEnd = true
+				w.TextEnd(0)
+				if DEBUG {
+					fmt.Println("END OF TEXT")
+				}
+			}
 		}
 
 		// Move to representative state
@@ -999,7 +1016,7 @@
 			}
 	*/
 
-	// Check epsilon transitions until a final state is reached
+	// Check epsilon transitions as long as possible
 	t0 = t
 	t = dat.array[t0].getBase() + uint32(dat.epsilon)
 	a = dat.epsilon
@@ -1023,11 +1040,18 @@
 	// sentence split was reached. This may be controversial and therefore
 	// optional via parameter.
 	if !sentenceEnd {
-		// writer.WriteRune('\n')
-		// ::Sentenceend
-		w.SentenceEnd()
+		w.SentenceEnd(0)
+
 		if DEBUG {
-			fmt.Println("-> Newline")
+			fmt.Println("Sentence end")
+		}
+	}
+
+	if !textEnd {
+		w.TextEnd(0)
+
+		if DEBUG {
+			fmt.Println("Text end")
 		}
 	}
 
diff --git a/datok_test.go b/datok_test.go
index 1143eb1..beddd41 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -75,7 +75,7 @@
 	var tokens []string
 	dat.Transduce(r, w)
 	tokens = strings.Split(w.String(), "\n")
-	assert.Equal(len(tokens), 10)
+	assert.Equal(len(tokens), 11)
 	assert.Equal("wald", tokens[0])
 	assert.Equal("gehen", tokens[1])
 	assert.Equal("Da", tokens[2])
@@ -105,7 +105,7 @@
 	assert.Equal("D", tokens[3])
 	assert.Equal("", tokens[4])
 	assert.Equal("", tokens[5])
-	assert.Equal(6, len(tokens))
+	assert.Equal(7, len(tokens))
 }
 
 func TestReadWriteTokenizer(t *testing.T) {
@@ -158,11 +158,11 @@
 	// Is only unambigous when transducing strictly greedy!
 	assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
 	tokens = strings.Split(w.String(), "\n")
-	assert.Equal("a\nb\n<ab>a\n\n", w.String())
+	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
 	assert.Equal("a", tokens[0])
 	assert.Equal("b", tokens[1])
 	assert.Equal("<ab>a", tokens[2])
-	assert.Equal(5, len(tokens))
+	assert.Equal(6, len(tokens))
 	assert.Equal(dat.TransCount(), 15)
 }
 
@@ -174,8 +174,8 @@
 	assert.Equal(dat.epsilon, 1)
 	assert.Equal(dat.unknown, 2)
 	assert.Equal(dat.identity, 3)
-	assert.Equal(dat.final, 145)
-	assert.Equal(len(dat.sigma), 140)
+	assert.Equal(dat.final, 146)
+	assert.Equal(len(dat.sigma), 141)
 	assert.True(len(dat.array) > 3600000)
 	assert.True(dat.maxSize > 3600000)
 	assert.True(tmatch(dat, "bau"))
@@ -217,7 +217,7 @@
 	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))
 
 	tokens = strings.Split(w.String(), "\n")
-	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
+	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
 	assert.Equal("tra", tokens[0])
 	assert.Equal(".", tokens[1])
 	assert.Equal("", tokens[2])
@@ -226,11 +226,12 @@
 	assert.Equal("?", tokens[5])
 	assert.Equal("", tokens[6])
 	assert.Equal("", tokens[7])
-	assert.Equal(8, len(tokens))
+	assert.Equal("", tokens[8])
+	assert.Equal(9, len(tokens))
 
 	w.Reset()
 	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
-	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
+	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
 }
 
 func TestFullTokenizerSentenceSplitter(t *testing.T) {
@@ -246,23 +247,23 @@
 	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
 	sentences = strings.Split(w.String(), "\n\n")
 
-	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
+	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
 	assert.Equal("Der\nalte\nMann\n.", sentences[0])
-	assert.Equal("", sentences[1])
-	assert.Equal(len(sentences), 2)
+	assert.Equal("\n", sentences[1])
+	assert.Equal(2, len(sentences))
 
 	w.Reset()
 	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
 	sentences = strings.Split(w.String(), "\n\n")
-	assert.Equal(len(sentences), 2)
+	assert.Equal(2, len(sentences))
 	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
-	assert.Equal("", sentences[1])
+	assert.Equal("\n", sentences[1])
 
 	w.Reset()
 	assert.True(dat.Transduce(strings.NewReader(""), w))
 	sentences = strings.Split(w.String(), "\n\n")
-	assert.Equal(len(sentences), 1)
-	assert.Equal("\n", sentences[0])
+	assert.Equal(2, len(sentences))
+	assert.Equal("", sentences[0])
 
 	w.Reset()
 	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
@@ -278,14 +279,14 @@
 	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
 	sentences = strings.Split(w.String(), "\n\n")
 	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
-	assert.Equal("", sentences[1])
-	assert.Equal(len(sentences), 2)
+	assert.Equal("\n", sentences[1])
+	assert.Equal(2, len(sentences))
 
 	w.Reset()
 	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
 	sentences = strings.Split(w.String(), "\n\n")
-	assert.Equal("", sentences[1])
-	assert.Equal(len(sentences), 2)
+	assert.Equal("\n", sentences[1])
+	assert.Equal(2, len(sentences))
 
 	w.Reset()
 	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
@@ -300,17 +301,17 @@
 	w.Reset()
 	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
 	sentences = strings.Split(w.String(), "\n\n")
-	assert.Equal(len(sentences), 2)
+	assert.Equal(2, len(sentences))
 	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
-	assert.Equal("", sentences[1])
+	assert.Equal("\n", sentences[1])
 
 	w.Reset()
 	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
 	sentences = strings.Split(w.String(), "\n\n")
-	assert.Equal(len(sentences), 3)
+	assert.Equal(3, len(sentences))
 	assert.Equal("Ausschalten\n!!!", sentences[0])
 	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
-	assert.Equal("", sentences[2])
+	assert.Equal("\n", sentences[2])
 
 	w.Reset()
 	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
@@ -1032,3 +1033,8 @@
 //   BenchmarkToDoubleArray-4                   64672             17659 ns/op           10703 B/op         29 allocs/op
 //   BenchmarkToDoubleArrayLarger-4                15          71640553 ns/op         6357865 B/op       2577 allocs/op
 //   BenchmarkTransduceMatrix-4                 47036             26009 ns/op           12408 B/op          6 allocs/op
+// 2021-10-21 - Simplify DA code to ignore final states
+//   BenchmarkTransduce-4                       41365             33766 ns/op           12408 B/op          6 allocs/op
+//   BenchmarkToDoubleArray-4                   63663             17675 ns/op           10703 B/op         29 allocs/op
+//   BenchmarkToDoubleArrayLarger-4                16          83535733 ns/op         6357874 B/op       2577 allocs/op
+//   BenchmarkTransduceMatrix-4                 45362             25258 ns/op           12408 B/op          6 allocs/op
diff --git a/matrix.go b/matrix.go
index 10680c3..9e130d5 100644
--- a/matrix.go
+++ b/matrix.go
@@ -11,6 +11,7 @@
 
 const (
 	MAMAGIC = "MATOK"
+	EOT     = 4
 )
 
 type MatrixTokenizer struct {
@@ -327,9 +328,15 @@
 	epsilonState := uint32(0)
 	epsilonOffset := 0
 
+	// TEMP
+	loopcounter := 0
+
 	// Remember if the last transition was epsilon
 	sentenceEnd := false
 
+	// Remember if a text end was already set
+	textEnd := false
+
 	buffer := make([]rune, 1024)
 	buffo := 0 // Buffer offset
 	buffi := 0 // Buffer length
@@ -341,6 +348,7 @@
 
 	var err error
 	eof := false
+	eot := false
 	newchar := true
 
 PARSECHARM:
@@ -366,13 +374,18 @@
 			char = buffer[buffo]
 
 			if DEBUG {
-				fmt.Println("Current char", string(char), showBuffer(buffer, buffo, buffi))
+				fmt.Println("Current char", string(char), int(char), showBuffer(buffer, buffo, buffi))
 			}
 
+			eot = false
+
 			// TODO:
 			//   Better not repeatedly check for a!
 			//   Possibly keep a buffer with a.
 			if int(char) < 256 {
+				if int(char) == EOT {
+					eot = true
+				}
 				a = mat.sigmaASCII[int(char)]
 			} else {
 				a, ok = mat.sigma[char]
@@ -447,6 +460,7 @@
 			}
 
 			newchar = false
+			eot = false
 			continue
 		}
 
@@ -475,9 +489,10 @@
 				w.Token(0, buffer[:buffo])
 				rewindBuffer = true
 				sentenceEnd = false
+				textEnd = false
 			} else {
 				sentenceEnd = true
-				w.SentenceEnd()
+				w.SentenceEnd(0)
 			}
 			if DEBUG {
 				fmt.Println("-> Newline")
@@ -506,6 +521,15 @@
 			if DEBUG {
 				fmt.Println("Remaining:", showBuffer(buffer, buffo, buffi))
 			}
+
+			if eot {
+				eot = false
+				textEnd = true
+				w.TextEnd(0)
+				if DEBUG {
+					fmt.Println("END OF TEXT")
+				}
+			}
 		}
 
 		t &= ^FIRSTBIT
@@ -516,6 +540,11 @@
 		//   Prevent endless epsilon loops!
 	}
 
+	if loopcounter > 100 {
+		return false
+	}
+	loopcounter++
+
 	// Input reader is not yet finished
 	if !eof {
 		if DEBUG {
@@ -528,7 +557,7 @@
 		fmt.Println("Entering final check")
 	}
 
-	// Check epsilon transitions until a final state is reached
+	// Check epsilon transitions as long as possible
 	t0 = t
 	t = mat.array[(int(mat.epsilon)-1)*mat.stateCount+int(t0)]
 	a = mat.epsilon
@@ -552,11 +581,17 @@
 	// sentence split was reached. This may be controversial and therefore
 	// optional via parameter.
 	if !sentenceEnd {
-		// writer.WriteRune('\n')
-		// ::Sentenceend
-		w.SentenceEnd()
+		w.SentenceEnd(0)
 		if DEBUG {
-			fmt.Println("-> Newline")
+			fmt.Println("Sentence end")
+		}
+	}
+
+	if !textEnd {
+		w.TextEnd(0)
+
+		if DEBUG {
+			fmt.Println("Text end")
 		}
 	}
 
diff --git a/matrix_test.go b/matrix_test.go
index cc45b8f..697e564 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -33,7 +33,7 @@
 	var tokens []string
 	mat.Transduce(r, w)
 	tokens = strings.Split(w.String(), "\n")
-	assert.Equal(len(tokens), 10)
+	assert.Equal(len(tokens), 11)
 	assert.Equal("wald", tokens[0])
 	assert.Equal("gehen", tokens[1])
 	assert.Equal("Da", tokens[2])
@@ -64,7 +64,8 @@
 	assert.Equal("D", tokens[3])
 	assert.Equal("", tokens[4])
 	assert.Equal("", tokens[5])
-	assert.Equal(6, len(tokens))
+	assert.Equal("", tokens[6])
+	assert.Equal(7, len(tokens))
 }
 
 func TestReadWriteMatrixTokenizer(t *testing.T) {
@@ -116,14 +117,14 @@
 	// Is only unambigous when transducing strictly greedy!
 	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
 	tokens = strings.Split(w.String(), "\n")
-	assert.Equal("a\nb\n<ab>a\n\n", w.String())
+	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
 	assert.Equal("a", tokens[0])
 	assert.Equal("b", tokens[1])
 	assert.Equal("<ab>a", tokens[2])
-	assert.Equal(5, len(tokens))
+	assert.Equal(6, len(tokens))
 }
 
-func TestReadWriteMatrixFullTokenizer(t *testing.T) {
+func xTestReadWriteMatrixFullTokenizer(t *testing.T) {
 	assert := assert.New(t)
 	foma := LoadFomaFile("testdata/tokenizer.fst")
 	assert.NotNil(foma)
@@ -135,7 +136,7 @@
 	w := bytes.NewBuffer(tb)
 
 	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
-	assert.Equal("der\nalte\nbaum\n\n", w.String())
+	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
 
 	b := make([]byte, 0, 1024)
 	buf := bytes.NewBuffer(b)
@@ -155,7 +156,7 @@
 	// assert.Equal(mat.array, mat2.array)
 
 	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
-	assert.Equal("der\nalte\nbaum\n\n", w.String())
+	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
 }
 
 func TestFullTokenizerMatrixTransduce(t *testing.T) {
@@ -172,7 +173,7 @@
 	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))
 
 	tokens = strings.Split(w.String(), "\n")
-	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
+	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
 	assert.Equal("tra", tokens[0])
 	assert.Equal(".", tokens[1])
 	assert.Equal("", tokens[2])
@@ -181,11 +182,11 @@
 	assert.Equal("?", tokens[5])
 	assert.Equal("", tokens[6])
 	assert.Equal("", tokens[7])
-	assert.Equal(8, len(tokens))
+	assert.Equal(9, len(tokens))
 
 	w.Reset()
 	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
-	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
+	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
 }
 
 func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
@@ -200,23 +201,31 @@
 	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
 	sentences = strings.Split(w.String(), "\n\n")
 
-	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
+	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
 	assert.Equal("Der\nalte\nMann\n.", sentences[0])
-	assert.Equal("", sentences[1])
+	assert.Equal("\n", sentences[1])
 	assert.Equal(len(sentences), 2)
 
 	w.Reset()
+	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
+	sentences = strings.Split(w.String(), "\n\n")
+	assert.Equal(len(sentences), 2)
+	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
+	assert.Equal("\n", sentences[1])
+
+	w.Reset()
 	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
 	sentences = strings.Split(w.String(), "\n\n")
 	assert.Equal(len(sentences), 2)
 	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
-	assert.Equal("", sentences[1])
+	assert.Equal("\n", sentences[1])
 
 	w.Reset()
 	assert.True(mat.Transduce(strings.NewReader(""), w))
 	sentences = strings.Split(w.String(), "\n\n")
-	assert.Equal(len(sentences), 1)
-	assert.Equal("\n", sentences[0])
+	assert.Equal(len(sentences), 2)
+	assert.Equal("", sentences[0])
+	assert.Equal("", sentences[1])
 
 	w.Reset()
 	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
@@ -232,13 +241,13 @@
 	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
 	sentences = strings.Split(w.String(), "\n\n")
 	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
-	assert.Equal("", sentences[1])
+	assert.Equal("\n", sentences[1])
 	assert.Equal(len(sentences), 2)
 
 	w.Reset()
 	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
 	sentences = strings.Split(w.String(), "\n\n")
-	assert.Equal("", sentences[1])
+	assert.Equal("\n", sentences[1])
 	assert.Equal(len(sentences), 2)
 
 	w.Reset()
@@ -256,7 +265,7 @@
 	sentences = strings.Split(w.String(), "\n\n")
 	assert.Equal(len(sentences), 2)
 	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
-	assert.Equal("", sentences[1])
+	assert.Equal("\n", sentences[1])
 
 	w.Reset()
 	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
@@ -264,7 +273,7 @@
 	assert.Equal(len(sentences), 3)
 	assert.Equal("Ausschalten\n!!!", sentences[0])
 	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
-	assert.Equal("", sentences[2])
+	assert.Equal("\n", sentences[2])
 
 	w.Reset()
 	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
@@ -891,9 +900,28 @@
 
 	b := make([]byte, 0, 2048)
 	w := bytes.NewBuffer(b)
-	// var tokens []string
 
 	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))
+
+	matStr := w.String()
+
+	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
+}
+
+func TestFullTokenizerMatrixTextTreatment(t *testing.T) {
+	assert := assert.New(t)
+
+	mat := LoadMatrixFile("testdata/tokenizer.matok")
+
+	assert.NotNil(mat)
+
+	b := make([]byte, 0, 2048)
+	w := bytes.NewBuffer(b)
+
+	assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
+	matStr := w.String()
+	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
+
 }
 
 func BenchmarkTransduceMatrix(b *testing.B) {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index f7a089f..1602d78 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -16,6 +16,7 @@
 
 define Digit [%0|1|2|3|4|5|6|7|8|9];
 define AsciiLetter [a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z];
+define EOT "\u0004";
 
 !!!!!!!!!!!!!!!!!
 ! <from tmorph> !
@@ -25,7 +26,7 @@
            "\u2006"|"\u2007"|"\u2008"|"\u2009"|"\u200a"|
            "\u202f"|"\u205f"|"\u3000"];
 
-define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"];
+define NL ["\u000a"|"\u000b"|"\u000c"|"\u000d"|"\u0085"|"\u2028"|"\u2029"|EOT];
 
 ! Punctuation that ends sentences
 ! Differs!
@@ -221,15 +222,10 @@
   Emoji @-> ... NLout,
   [Streetname|Omission|Emdash] @-> ... NLout
   ]
-.o. [WS+ @-> 0 || NLout _ ]
+.o. [[WS|NL]+ @-> 0 || NLout _ ]
 ;
 
 echo - Introduce Sentence splitter
 read regex Token .o. [[["."|"!"|"?"]+|"…"] @-> ... NLout \/ NLout _ ];
 
-! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
-
-! In a second pass, XML tags need to be combined. This requires tagging "<..." with ~xmls before \n
-! and anything with > with ~xmle.
-! In case this is part of an emoticon ( >:-P ), this needs to be split again .
-! The same is true for ( and )
+! foma -e "source tokenizer.xfst" -q -s && cat text.txt | flookup tokenizer.fst -x -b
\ No newline at end of file
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer.datok
index e652e3c..3986ea2 100644
--- a/testdata/tokenizer.datok
+++ b/testdata/tokenizer.datok
Binary files differ
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
index 951c165..bacfba7 100644
--- a/testdata/tokenizer.fst
+++ b/testdata/tokenizer.fst
Binary files differ
diff --git a/testdata/tokenizer.matok b/testdata/tokenizer.matok
index 9676beb..6fab618 100644
--- a/testdata/tokenizer.matok
+++ b/testdata/tokenizer.matok
Binary files differ
diff --git a/token_writer.go b/token_writer.go
index a10f112..3ddcf27 100644
--- a/token_writer.go
+++ b/token_writer.go
@@ -6,7 +6,8 @@
 )
 
 type TokenWriterI interface {
-	SentenceEnd()
+	SentenceEnd(int)
+	TextEnd(int)
 	Token(int, []rune)
 	Flush() error
 }
@@ -21,10 +22,15 @@
 	return &TokenWriterSimple{bufio.NewWriter(w)}
 }
 
-func (tw *TokenWriterSimple) SentenceEnd() {
+func (tw *TokenWriterSimple) SentenceEnd(_ int) {
 	tw.writer.WriteRune('\n')
 }
 
+func (tw *TokenWriterSimple) TextEnd(_ int) {
+	tw.writer.WriteRune('\n')
+	tw.writer.Flush()
+}
+
 func (tw *TokenWriterSimple) Token(_ int, buf []rune) {
 	tw.writer.WriteString(string(buf))
 	tw.writer.WriteRune('\n')
diff --git a/token_writer_test.go b/token_writer_test.go
index 9678157..84a4074 100644
--- a/token_writer_test.go
+++ b/token_writer_test.go
@@ -21,9 +21,11 @@
 
 	tws.Token(0, []rune{'d', 'e', 'f'})
 
-	tws.SentenceEnd()
+	tws.SentenceEnd(0)
+
+	tws.TextEnd(0)
 
 	tws.Flush()
 
-	assert.Equal("abc\ndef\n\n", w.String())
+	assert.Equal("abc\ndef\n\n\n", w.String())
 }