Add support for emoji clusters
Resolves #113
Change-Id: Ia5f0da30559a97332748dfc9e2595852e2477d1a
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba70c7b..ae0b1dc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
## 2.2.6 [unreleased]
* Fixed genderstern and omission asterisk breaking after hyphens (issue #115)
+* Added support for emoji clusters (issue #113)
## 2.2.5
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index d22a7dc..b58aa41 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -557,6 +557,20 @@
// blocks of question marks and exclamation marks are one token
LONG_END_PUNCT = [?!][?!1]+
+// Emoji components (Issue #113: Support emojis with modifiers and ZWJs): zero width joiner, skin tone modifiers U+1F3FB-U+1F3FF, text/emoji variation selectors, combining enclosing keycap, regional indicator symbols U+1F1E6-U+1F1FF
+ZWJ = \u200D
+SKIN_TONE_MODIFIER = [\u{1F3FB}-\u{1F3FF}]
+VARIATION_SELECTOR = [\uFE0E\uFE0F]
+KEYCAP = \u20E3
+REGIONAL_INDICATOR = [\u{1F1E6}-\u{1F1FF}]
+
+// Base emoji ranges (major emoji blocks); U+2700-U+27BF needs no separate alternative because U+2600-U+27BF already contains it
+EMOJI_BASE = ([\u{1F300}-\u{1F9FF}] | [\u{1FA00}-\u{1FAFF}] | [\u{2600}-\u{27BF}] | [\u{2300}-\u{23FF}])
+
+// Emoji complex: base emoji optionally followed by modifiers, with ZWJ chaining, or a pair of regional indicators. NOTE(review): standard keycap sequences start with [0-9#*], which EMOJI_BASE does not contain - confirm {KEYCAP} is reachable as intended
+EMOJI_COMPLEX = {EMOJI_BASE}{VARIATION_SELECTOR}?({SKIN_TONE_MODIFIER}|{KEYCAP})?({ZWJ}{EMOJI_BASE}{VARIATION_SELECTOR}?{SKIN_TONE_MODIFIER}?)*
+ | {REGIONAL_INDICATOR}{REGIONAL_INDICATOR}
+
WORD = ({IRISH_O}?{ALPHANUM}+|[Qq]ur{Q}an)
// pragmas used for anonymization etc.
@@ -666,6 +680,7 @@
{PUNCT} { return currentToken();}
{EMOTICON} { return currentToken();}
{DASH}{DoubleLiteral} { return currentToken();}
+{EMOJI_COMPLEX} { return currentToken();}
<<EOF>> { fileEnd(); return null;}
. { return currentToken();}
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index f1eda03..402bf85 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -271,6 +271,33 @@
assertEquals(tokens.length, 5);
}
+ // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/113: emoji sequences must stay single tokens
+ @Test
+ public void testTokenizerEmojiSequences () {
+ DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+
+ // Test emoji with skin tone modifier (U+270A U+1F3FF = raised fist dark skin tone)
+ String[] tokens = tok.tokenize("Power ✊🏿!");
+ assertEquals("Power", tokens[0]);
+ assertEquals("✊🏿", tokens[1]); // Should be one token
+ assertEquals("!", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // Test emoji ZWJ sequence (family: man, man, boy = U+1F468 U+200D U+1F468 U+200D U+1F466); escapes keep the invisible joiners explicit
+ tokens = tok.tokenize("Familie \uD83D\uDC68\u200D\uD83D\uDC68\u200D\uD83D\uDC66 hier");
+ assertEquals("Familie", tokens[0]);
+ assertEquals("\uD83D\uDC68\u200D\uD83D\uDC68\u200D\uD83D\uDC66", tokens[1]); // Should be one token with ZWJ
+ assertEquals("hier", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // Test flag emoji (regional indicators for Germany: U+1F1E9 U+1F1EA)
+ tokens = tok.tokenize("Flagge 🇩🇪 toll");
+ assertEquals("Flagge", tokens[0]);
+ assertEquals("🇩🇪", tokens[1]); // Should be one token
+ assertEquals("toll", tokens[2]);
+ assertEquals(3, tokens.length);
+ }
+
@Test
public void testTokenizerRef1 () {
DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();