Fix emoticon matching before letters (e.g., Wikipedia:Diskussionen)
Add a trailing-context (lookahead) condition to the emoticon rule so that
an emoticon such as :D only matches when the next character is not a letter.
This prevents false emoticon matches in strings like Wikipedia:Diskussionen,
where the colon is a namespace separator.
Before: Wikipedia:Diskussionen → Wikipedia :D iskussionen
After: Wikipedia:Diskussionen → Wikipedia : Diskussionen
Resolves #134
Change-Id: Ia9d6659e604eb514172e2182c94a206b5b45023f
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 5582821..1d93e6f 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -910,7 +910,7 @@
([.][.]+|…+) {return currentToken("...");}
{LONG_END_PUNCT} { return currentToken();}
{PUNCT} { return currentToken();}
-{EMOTICON} { return currentToken();}
+{EMOTICON} / [^[:letter:]] { return currentToken();}
{DASH}{DoubleLiteral} { return currentToken();}
{EMOJI_COMPLEX} { return currentToken();}
<<EOF>> { fileEnd(); return null;}
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index e50e2e1..b0ee8c3 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -1195,6 +1195,24 @@
assertEquals(".", tokens[6]);
assertEquals(7, tokens.length);
}
+
+ // Regression test: an emoticon must not be matched directly before a letter.
+ // In "Wikipedia:Diskussionen" the ":D" must NOT be tokenized as an emoticon.
+ @Test
+ public void testEmoticonNotMatchBeforeLetter() {
+ DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+
+ String[] tokens = tok.tokenize("Wikipedia:Diskussionen");
+ assertEquals("Wikipedia", tokens[0]);
+ assertEquals(":", tokens[1]);
+ assertEquals("Diskussionen", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // An emoticon followed by punctuation (here "!") must still be a single token
+ tokens = tok.tokenize("Great :D!");
+ assertEquals("Great", tokens[0]);
+ assertEquals(":D", tokens[1]);
+ assertEquals("!", tokens[2]);
+ assertEquals(3, tokens.length);
+ }
}
-
-