Improve handling of ellipsis
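
The ellipsis is no longer listed as a separate alternative in the
token splitter. Instead, a dedicated rule, composed before whitespace
removal, adds a sentence boundary after an ellipsis that is followed
by whitespace and a character that is not a lowercase letter (the new
NotSmallCaps class), e.g. the capitalized word opening the next
sentence. A test case from "Effi Briest" covers the new behavior.
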
Change-Id: I758e096678091f52fd3bc00b2a5f6ad1358881cc
diff --git a/Changes b/Changes
index 396c19f..46660ec 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,6 @@
+0.1.4 2022-03-11
+ - Improved handling of ellipsis.
+
0.1.3 2022-03-08
- Introduced refined handling of sentences including speech.
diff --git a/matrix_test.go b/matrix_test.go
index c017af5..3b64d5c 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -331,6 +331,17 @@
assert.Equal(len(sentences), 8)
assert.Equal("Neulich\nerst\nhat\nmir\nder\nkleine\nVentivegni\nvon\ndrüben\ngesagt\n:\n'\nFräulein\nEffi\n,\nwas\ngilt\ndie\nWette\n,\nwir\nsind\nhier\nnoch\nin\ndiesem\nJahre\nzu\nPolterabend\nund\nHochzeit\n.\n'\n«", sentences[5])
assert.Equal("»\nUnd\nwas\nsagtest\ndu\nda\n?\n«", sentences[6])
+
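+ // An ellipsis followed by a capitalized word should close the sentence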
+ text = `»Nun, gib dich zufrieden, ich fange schon an ... Also Baron
+Innstetten!`
+
+ w.Reset()
+ assert.True(mat.Transduce(strings.NewReader(text), w))
+ sentences = strings.Split(w.String(), "\n\n")
+ assert.Equal(len(sentences), 3)
+ assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
+ assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
}
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index 8a93c21..cf183b7 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -209,7 +209,7 @@
echo - Introduce Token splitter
define Token [
- [%. %. %. | RealToken] @-> ... NLout,
+ RealToken @-> ... NLout,
XML @-> ... NLout,
URL @-> ... NLout,
Email @-> ... NLout,
@@ -222,12 +222,17 @@
! And compose Whitespace ignorance
define DQuotes ["”"|%"|"»"|"«"];
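+! NotSmallCaps matches any symbol that is not a lowercase letter (a-z, ä, ö, ü).
+! It is used below to end a sentence after an ellipsis when whitespace and such a symbol follow.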
+define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä];
read regex Token .o. [
SP NLout [DQuotes | "›" (NLout DQuotes)| %‹ (NLout DQuotes)| %’ (NLout DQuotes)| "'" (NLout DQuotes)] @-> ... NLout \/ _ NLout \%,
] .o. [
SP @-> ... NLout \/ NLout _ NLout [? - "”" - %" - "»" - "«" - "›" - %‹ - %’ - "'" - NLout]
] .o. [
+ [%. %. %.] @-> ... NLout \/ _ NLout WS+ NotSmallCaps
+] .o. [
[WS|NL]+ @-> 0 || [ .#. | NLout ] _
];
diff --git a/testdata/tokenizer.datok b/testdata/tokenizer.datok
index 18a99b4..df75e9b 100644
--- a/testdata/tokenizer.datok
+++ b/testdata/tokenizer.datok
Binary files differ
diff --git a/testdata/tokenizer.fst b/testdata/tokenizer.fst
index 73cbda5..5701081 100644
--- a/testdata/tokenizer.fst
+++ b/testdata/tokenizer.fst
Binary files differ
diff --git a/testdata/tokenizer.matok b/testdata/tokenizer.matok
index fe2b6f1..75ce996 100644
--- a/testdata/tokenizer.matok
+++ b/testdata/tokenizer.matok
Binary files differ