Add heuristics for distinguishing I. as abbreviation vs PPER / CARD
Change-Id: I3f9230bb637cf8cd68ae12a4671a35f5b0dd4b22
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index d1e68c2..7434d15 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -513,6 +513,8 @@
FRENCH_INIT_CLITIC = ([dcjlmnstDCJLNMST]\'|[Qq]u\'|[Jj]usqu\'|[Ll]orsqu\')
+ENGLISH_MARKERS_FOR_NON_ABBREVIATION_I = (am|was|will|have|had|would|do|did|and|War|than|not|[Pp]art)
+
CLITIC = ({ENGLISH_CLITIC}|{FRENCH_CLITIC})
INIT_CLITIC = ({FRENCH_INIT_CLITIC})
@@ -551,7 +553,7 @@
%include language-specific_/*$target.language$*/.jflex-macro
-%s OPEN_QUOTE POLISH_CONDITIONAL_MODE JUST_AFTER_PERIOD CLITIC_MODE
+%s OPEN_QUOTE POLISH_CONDITIONAL_MODE JUST_AFTER_PERIOD CLITIC_MODE ENGLISH_NON_ABBREVIATION_I_MODE
%%
{ENDMARKER} { fileEnd(); return null; }
@@ -565,15 +567,19 @@
{URL} { return currentToken(); }
// special words
-{c}an / not {return currentToken();}
+{c}an / not {return currentToken();}
{l}em / me {return currentToken();}
{g}on / na {return currentToken();}
{g}im / me {return currentToken();}
{w}an / na {return currentToken();}
{g}ot / ta {return currentToken();}
+// After a marker word (am, was, War, ...) followed by whitespace and "I.", emit "I" as its own token (pronoun/numeral, not an abbreviation) and leave the period unconsumed, e.g.: M. I. Baxter was killed in World War I.<s> So was I.<s>
+{ENGLISH_MARKERS_FOR_NON_ABBREVIATION_I} / {WHITESPACE} [I] \. {yybegin(ENGLISH_NON_ABBREVIATION_I_MODE); return currentToken(); }
+<ENGLISH_NON_ABBREVIATION_I_MODE>[I] / \. {yybegin(YYINITIAL); return currentToken(); }
+
{LETTER}\. {return currentToken();}
-{LETTER}{2,12} / \.[:uppercase:] {return currentToken();}
+{LETTER}{2,12} / \.[:uppercase:] {return currentToken();}
{PLUSAMPERSAND} {return currentToken();}
{SEABBR}\. {return currentToken();}
{PRAGMA} {return currentToken();}
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index d8b4a11..5a434a4 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -563,6 +563,17 @@
}
@Test
+ public void englishTokenizerCanGuessWhetherIIsAbbrev () { // checks "I." as one token vs. "I" followed by "."
+ DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
+ String[] tokens = tok.tokenize("M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.");
+ assertEquals("I.", tokens[1]); // presumably the initial in "M. I. Baxter": kept together with its period
+ assertEquals("I", tokens[8]); // presumably the numeral after "World War": split from the period
+ assertEquals(".", tokens[9]); // the period is expected as a separate token
+ assertEquals("I", tokens[12]); // presumably the pronoun in "So was I.": split from the period
+ assertEquals(".", tokens[13]); // NOTE(review): "Peter I." and the final "So did I." are not asserted here
+ }
+
+ @Test
public void testZipOuputArchive () {
DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();