Improve handling of apostrophes
Change-Id: Id442f4e958720f970baef63aee9b8710c258e13b
diff --git a/matrix_test.go b/matrix_test.go
index cea4acc..7919115 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -830,6 +830,17 @@
assert.Equal(tokens[8], ".")
assert.Equal(9, len(tokens))
+ // Apostrophe handling: a trailing apostrophe after "s" (Nils’) and a word-internal apostrophe (McDonald's) must each remain part of a single word token.
+ tokens = ttokenize(mat, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
+ assert.Equal(tokens[0], "Das")
+ assert.Equal(tokens[1], "ist")
+ assert.Equal(tokens[2], "Nils’")
+ assert.Equal(tokens[3], "Einkaufskorb")
+ assert.Equal(tokens[4], "bei")
+ assert.Equal(tokens[5], "McDonald's")
+ assert.Equal(tokens[6], ".")
+ assert.Equal(7, len(tokens))
+
/*
@Test
public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
diff --git a/src/tokenizer.xfst b/src/tokenizer.xfst
index fb97c5c..8026c01 100644
--- a/src/tokenizer.xfst
+++ b/src/tokenizer.xfst
@@ -69,7 +69,7 @@
define Char \[WS|NL|Punct|Apos]; ! |¨;
-define Word Char+ ([Apos|Asterisk] Char+)*;
+define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]); ! optional tail: s/S followed by ’ or ` keeps a trailing apostrophe inside the word (e.g. Nils’)
define Plusampersand @txt"txt/plusampersand.txt";
define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;