Introduce English clitics
Change-Id: Ib943a96fa097a71c77cd878c71392e5c5139315a
diff --git a/src/en/clitics.xfst b/src/en/clitics.xfst
new file mode 100644
index 0000000..ab919b5
--- /dev/null
+++ b/src/en/clitics.xfst
@@ -0,0 +1,4 @@
+define Clitics [ Apos [[{ll}|d|{ve}|s|{re}|(e)m|n] .o. Caseinsensitive] ];
+# Host words that can take the negative clitic n't, following https://web.stanford.edu/~zwicky/ZPCliticsInfl.pdf
+define CliticsNTPart [[{do}({es})|{did}|{have}|{has}|{had}|{ca}|{could}|{might}|{sha}|{should}|{wo}|{would}|{dare}|{must}|{need}|{ought}|{are}|{is}|{was}|{were}|{ai}] {n}] .o. Caseinsensitive;
+define CliticsNT [CliticsNTPart "'" ["t"|"T"]];
diff --git a/src/en/tokenizer.xfst b/src/en/tokenizer.xfst
index adcdac8..259fa60 100644
--- a/src/en/tokenizer.xfst
+++ b/src/en/tokenizer.xfst
@@ -89,10 +89,10 @@
define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û];
-! Irrelevant becose of the more general rule followed
-! define Clitics [ Apos [{ll}|d|{ve}|s|{re}|m|n|{em}] .o. Caseinsensitive ] | ["n" Apos "t"] .o. Caseinsensitive ];
+source en/clitics.xfst
-define Word Char+ ([Apos|Asterisk] Char+)*;
+# define Word Char+ (Apos Char+)*;
+define Word [[ Char+ | Clitics ] - CliticsNTPart | CliticsNT];
define Plusampersand @txt"de/plusampersand.txt";
define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
@@ -121,7 +121,11 @@
File @-> ... NLout,
Domain @-> ... NLout,
[Emoticons|Arrows] @-> ... NLout
-] .o. ["I" @-> ... NLout \/ NonAbbrI [WS | NLout ]+ _ ];
+]
+! I as a separate token
+.o. ["I" @-> ... NLout \/ NonAbbrI [WS | NLout ]+ _ ]
+! The negative clitic n't is split off as a separate token
+.o. [ {n't} ->@ NLout ... \/ NLout ];
source all/allsentencesplit.xfst