Added context rule for I

Change-Id: I5f3449c3994960503b616fa735de9d2ab9951ff7

commit: d0dfea8258ea50be202a9f500e76a9285c8ad01e [log] [tgz]
author: Akron <nils@diewald-online.de> Wed Apr 26 19:24:17 2023 +0200
committer: Akron <nils@diewald-online.de> Wed Apr 26 19:24:17 2023 +0200
tree: 0d071c23edfc7faaf73332cba491c5beffbb2c3b
parent: be3d366e6641291f995db0483a994c4d723d7753 [diff]
diff --git a/matrix_test.go b/matrix_test.go
index 91f5b3c..31812d7 100644
--- a/matrix_test.go
+++ b/matrix_test.go

@@ -1013,17 +1013,14 @@
 	assert.Equal("pp.", tokens[28])
 	assert.Equal("17-18", tokens[29])
 	assert.Equal(".", tokens[30])
-	/*
 
-		// englishTokenizerCanGuessWhetherIIsAbbrev
-		tokens = ttokenize(mat_en, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
-		assert.Equal("I.", tokens[1])
-		assert.Equal("I", tokens[8])
-		assert.Equal(".", tokens[9])
-		assert.Equal("I", tokens[12])
-		assert.Equal(".", tokens[13])
-
-	*/
+	// englishTokenizerCanGuessWhetherIIsAbbrev
+	tokens = ttokenize(mat_en, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
+	assert.Equal("I.", tokens[1])
+	assert.Equal("I", tokens[8])
+	assert.Equal(".", tokens[9])
+	assert.Equal("I", tokens[12])
+	assert.Equal(".", tokens[13])
 
 	/*
 		@Test

diff --git a/src/en/tokenizer.xfst b/src/en/tokenizer.xfst
index 6b3cc78..adcdac8 100644
--- a/src/en/tokenizer.xfst
+++ b/src/en/tokenizer.xfst

@@ -107,6 +107,10 @@
 
 define RealToken [Punct|Emdash|Abbr|Word|SNS|AcronymDep|Ord|Num|Years|Times|XMLEntities|Omission];
 
+! Treatment for I as a word in "M. I. Baxter was killed in World War I. So was I."
+define NonAbbrI [ {am}|{was}|{will}|{have}|{had}|{would}|{do}|{did}|{and}|{War}|{than}|{not}|[P|p]{art} ];
+
+
 echo - Introduce Token splitter
 
 define Token [
@@ -117,7 +121,7 @@
   File @-> ... NLout,
   Domain @-> ... NLout,
   [Emoticons|Arrows] @-> ... NLout
-];
+] .o. ["I" @-> ... NLout \/ NonAbbrI [WS | NLout ]+ _ ];
 
 source all/allsentencesplit.xfst
commit	d0dfea8258ea50be202a9f500e76a9285c8ad01e	[log] [tgz]
author	Akron <nils@diewald-online.de>	Wed Apr 26 19:24:17 2023 +0200
committer	Akron <nils@diewald-online.de>	Wed Apr 26 19:24:17 2023 +0200
tree	0d071c23edfc7faaf73332cba491c5beffbb2c3b
parent	be3d366e6641291f995db0483a994c4d723d7753 [diff]