Added context rule for I
Change-Id: I5f3449c3994960503b616fa735de9d2ab9951ff7
diff --git a/matrix_test.go b/matrix_test.go
index 91f5b3c..31812d7 100644
--- a/matrix_test.go
+++ b/matrix_test.go
@@ -1013,17 +1013,14 @@
assert.Equal("pp.", tokens[28])
assert.Equal("17-18", tokens[29])
assert.Equal(".", tokens[30])
- /*
- // englishTokenizerCanGuessWhetherIIsAbbrev
- tokens = ttokenize(mat_en, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
- assert.Equal("I.", tokens[1])
- assert.Equal("I", tokens[8])
- assert.Equal(".", tokens[9])
- assert.Equal("I", tokens[12])
- assert.Equal(".", tokens[13])
-
- */
+ // englishTokenizerCanGuessWhetherIIsAbbrev
+ tokens = ttokenize(mat_en, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
+ assert.Equal("I.", tokens[1])
+ assert.Equal("I", tokens[8])
+ assert.Equal(".", tokens[9])
+ assert.Equal("I", tokens[12])
+ assert.Equal(".", tokens[13])
/*
@Test
diff --git a/src/en/tokenizer.xfst b/src/en/tokenizer.xfst
index 6b3cc78..adcdac8 100644
--- a/src/en/tokenizer.xfst
+++ b/src/en/tokenizer.xfst
@@ -107,6 +107,10 @@
define RealToken [Punct|Emdash|Abbr|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
+! Treatment for I as a word in "M. I. Baxter was killed in World War I. So was I."
+define NonAbbrI [ {am}|{was}|{will}|{have}|{had}|{would}|{do}|{did}|{and}|{War}|{than}|{not}|[P|p]{art} ];
+
+
echo - Introduce Token splitter
define Token [
@@ -117,7 +121,7 @@
File @-> ... NLout,
Domain @-> ... NLout,
[Emoticons|Arrows] @-> ... NLout
-];
+] .o. ["I" @-> ... NLout \/ NonAbbrI [WS | NLout ]+ _ ];
source all/allsentencesplit.xfst