Add heuristics for distinguishing I. as abbreviation vs PPER / CARD

Change-Id: I3f9230bb637cf8cd68ae12a4671a35f5b0dd4b22
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index d1e68c2..7434d15 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -513,6 +513,8 @@
 
 FRENCH_INIT_CLITIC = ([dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\')
 
+ENGLISH_MARKERS_FOR_NON_ABBREVIATION_I = (am|was|will|have|had|would|do|did|and|War|than|not|[Pp]art)
+
 CLITIC = ({ENGLISH_CLITIC}|{FRENCH_CLITIC})
 
 INIT_CLITIC = ({FRENCH_INIT_CLITIC})
@@ -551,7 +553,7 @@
 
 %include language-specific_/*$target.language$*/.jflex-macro
 
-%s OPEN_QUOTE POLISH_CONDITIONAL_MODE JUST_AFTER_PERIOD CLITIC_MODE
+%s OPEN_QUOTE POLISH_CONDITIONAL_MODE JUST_AFTER_PERIOD CLITIC_MODE ENGLISH_NON_ABBREVIATION_I_MODE
 
 %%
 {ENDMARKER}                                             { fileEnd(); return null; }
@@ -565,15 +567,19 @@
 {URL}                                                         { return currentToken(); }
 
 // special words
-{c}an / not                                                      {return currentToken();}
+{c}an / not                                                     {return currentToken();}
 {l}em / me                                                      {return currentToken();}
 {g}on / na                                                      {return currentToken();}
 {g}im / me                                                      {return currentToken();}
 {w}an / na                                                      {return currentToken();}
 {g}ot / ta                                                      {return currentToken();}
 
+// After a marker word, "I" before "." is emitted without the period (not as the abbreviation "I."), e.g.: M. I. Baxter was killed in World War I.<s> So was I.<s>
+{ENGLISH_MARKERS_FOR_NON_ABBREVIATION_I} / {WHITESPACE} [I] \.  {yybegin(ENGLISH_NON_ABBREVIATION_I_MODE); return currentToken(); }
+<ENGLISH_NON_ABBREVIATION_I_MODE>[I] / \.                       {yybegin(YYINITIAL); return currentToken(); }
+
 {LETTER}\.                                                      {return currentToken();}
-{LETTER}{2,12} / \.[:uppercase:]                                  {return currentToken();}
+{LETTER}{2,12} / \.[:uppercase:]                                {return currentToken();}
 {PLUSAMPERSAND}                                                 {return currentToken();}
 {SEABBR}\.                                                      {return currentToken();}
 {PRAGMA}                                                        {return currentToken();}
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index d8b4a11..5a434a4 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -563,6 +563,17 @@
     }
 
     @Test
+    public void englishTokenizerCanGuessWhetherIIsAbbrev () { // "I." stays one token as an initial, but splits into "I" + "." after marker words
+        DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
+        String[] tokens = tok.tokenize("M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.");
+        assertEquals("I.", tokens[1]); // "M. I. Baxter": the initial keeps its period
+        assertEquals("I", tokens[8]); // "World War I.": numeral is split from the sentence-final period
+        assertEquals(".", tokens[9]);
+        assertEquals("I", tokens[12]); // "So was I.": pronoun is split from the sentence-final period
+        assertEquals(".", tokens[13]);
+    } // NOTE(review): "Peter I." and the final "So did I." are tokenized above but not asserted
+
+    @Test
     public void testZipOuputArchive () {
         DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
         final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();