Trim whitespace and newlines at the beginning of a text
Change-Id: I0efec3a718e87be747a6eb09b9890c354f99eb6f
diff --git a/matrix.go b/matrix.go
index 9e130d5..8c68959 100644
--- a/matrix.go
+++ b/matrix.go
@@ -328,9 +328,6 @@
epsilonState := uint32(0)
epsilonOffset := 0
- // TEMP
- loopcounter := 0
-
// Remember if the last transition was epsilon
sentenceEnd := false
@@ -540,11 +537,6 @@
// Prevent endless epsilon loops!
}
- if loopcounter > 100 {
- return false
- }
- loopcounter++
-
// Input reader is not yet finished
if !eof {
if DEBUG {
diff --git a/matrix_test.go b/matrix_test.go
index 5678f5e..e27dd12 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -921,7 +921,21 @@
assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
matStr := w.String()
assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
+}
+func TestMatrixTrimming(t *testing.T) {
+ assert := assert.New(t)
+
+ mat := LoadMatrixFile("testdata/tokenizer.matok")
+
+ assert.NotNil(mat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+
+ assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
+ matStr := w.String()
+ assert.Equal("Erste\n.\n\n\n", matStr)
}
func BenchmarkMatrixTransduce(b *testing.B) {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 1602d78..0d703a3 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -223,6 +223,7 @@
[Streetname|Omission|Emdash] @-> ... NLout
]
.o. [[WS|NL]+ @-> 0 || NLout _ ]
+.o. [[WS|NL]+ @-> 0 || .#. _ ]
;
echo - Introduce Sentence splitter
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
index bacfba7..671ffe4 100644
--- a/testdata/tokenizer.fst
+++ b/testdata/tokenizer.fst
Binary files differ
diff --git a/testdata/tokenizer.matok b/testdata/tokenizer.matok
index 6fab618..7e4e3af 100644
--- a/testdata/tokenizer.matok
+++ b/testdata/tokenizer.matok
Binary files differ