check parentheses at the end of sentences
Change-Id: Ifa051a59b8a7de88e031a850d11ca95432a0b32e
diff --git a/matrix_test.go b/matrix_test.go
index 2571978..ac9b054 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -376,6 +376,13 @@
assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
+ // Check parentheses at the end of sentences.
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 3)
+ assert.Equal("(\nEr\nging\n.\n)", sentences[0])
+ assert.Equal("Und\nkam\n(\nspäter\n)\n.", sentences[1])
}
func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 74f7c52..5fe1aec 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -225,9 +225,9 @@
define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä];
read regex Token .o. [
- SP NLout [DQuotes | "›" (NLout DQuotes) | %‹ (NLout DQuotes) | %’ (NLout DQuotes) | "'" (NLout DQuotes)] (NLout SP) @-> ... NLout \/ _ NLout \%,
+ SP NLout [DQuotes | "›" (NLout DQuotes) | %‹ (NLout DQuotes) | %’ (NLout DQuotes) | "'" (NLout DQuotes) | ")" ] (NLout SP) @-> ... NLout \/ _ NLout \%,
] .o. [
- SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - NLout]
+ SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - ")" - NLout]
] .o. [
[%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
] .o. [