Improve sentence endings further
Change-Id: Ia8090d359c36940e74496d3aad270fba966d9412
diff --git a/matrix_test.go b/matrix_test.go
index 3509d15..6681b2e 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -383,6 +383,15 @@
assert.Equal(len(sentences), 3)
assert.Equal("(\nEr\nging\n.\n)", sentences[0])
assert.Equal("Und\nkam\n(\nspäter\n)\n.", sentences[1])
+
+ // Check parentheses and quotes at the end of the sentence
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("(Er sagte: \"Hallo!\") Dann ging er."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 3)
+ assert.Equal("(\nEr\nsagte\n:\n\"\nHallo\n!\n\"\n)", sentences[0])
+ assert.Equal("Dann\nging\ner\n.", sentences[1])
+
}
func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 4372d4a..b75ece5 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -225,7 +225,7 @@
define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä];
read regex Token .o. [
- SP NLout [DQuotes | "›" (NLout DQuotes) | %‹ (NLout DQuotes) | %’ (NLout DQuotes) | "'" (NLout DQuotes) | ")" ] (NLout SP) @-> ... NLout \/ _ NLout \%,
+ SP NLout [DQuotes (NLout ")") | ["›"|%‹|%’|"'"] ( NLout DQuotes (NLout ")") | NLout ")" ) | ")" ] (NLout SP) @-> ... NLout \/ _ NLout \%,
] .o. [
SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout]
] .o. [