Add support for emoji clusters

Resolves #113

Change-Id: Ia5f0da30559a97332748dfc9e2595852e2477d1a
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ba70c7b..ae0b1dc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 ## 2.2.6 [unreleased]
 
 * Fixed genderstern and omission asterisk breaking after hyphens (issue #115)
+* Added support for emoji clusters (issue #113)
 
 ## 2.2.5
 
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index d22a7dc..b58aa41 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -557,6 +557,20 @@
 // blocks of question marks and exclamation marks are one token
 LONG_END_PUNCT = [?!][?!1]+
 
+// Emoji building blocks (issue #113): zero-width joiner, skin tones, presentation selectors, keycap mark, flag letters
+ZWJ = \u200D
+SKIN_TONE_MODIFIER = [\u{1F3FB}-\u{1F3FF}]
+VARIATION_SELECTOR = [\uFE0E\uFE0F]
+KEYCAP = \u20E3
+REGIONAL_INDICATOR = [\u{1F1E6}-\u{1F1FF}]
+
+// Base emoji code points, lowest range first (U+2600-U+27BF already includes U+2700-U+27BF, so it is listed once)
+EMOJI_BASE = ([\u{2300}-\u{23FF}] | [\u{2600}-\u{27BF}] | [\u{1F300}-\u{1F9FF}] | [\u{1FA00}-\u{1FAFF}])
+
+// Emoji cluster as one token: base, optional selector, optional skin tone or keycap, then any number of ZWJ-joined bases; or a regional-indicator pair (flag)
+EMOJI_COMPLEX = {EMOJI_BASE}{VARIATION_SELECTOR}?({SKIN_TONE_MODIFIER}|{KEYCAP})?({ZWJ}{EMOJI_BASE}{VARIATION_SELECTOR}?{SKIN_TONE_MODIFIER}?)*
+               | {REGIONAL_INDICATOR}{REGIONAL_INDICATOR}
+
 WORD = ({IRISH_O}?{ALPHANUM}+|[Qq]ur{Q}an)
 
 // pragmas used for anonymization etc.
@@ -666,6 +680,7 @@
 {PUNCT}                                               { return currentToken();}
 {EMOTICON}                                          { return currentToken();}
 {DASH}{DoubleLiteral}                               { return currentToken();}
+{EMOJI_COMPLEX}                                    { return currentToken();}
 <<EOF>>                                             { fileEnd(); return null;}
 .                                                   { return currentToken();}
 
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index f1eda03..402bf85 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -271,6 +271,33 @@
         assertEquals(tokens.length, 5);
     }
 
+    // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/113
+    @Test
+    public void testTokenizerEmojiSequences () {
+        DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+        
+        // Test emoji with skin tone modifier (U+270A U+1F3FF = raised fist dark skin tone)
+        String[] tokens = tok.tokenize("Power ✊🏿!");
+        assertEquals("Power", tokens[0]);
+        assertEquals("✊🏿", tokens[1]); // Should be one token
+        assertEquals("!", tokens[2]);
+        assertEquals(3, tokens.length);
+        
+        // Test emoji ZWJ sequence (family: man, man, boy)
+        tokens = tok.tokenize("Familie 👨‍👨‍👦 hier");
+        assertEquals("Familie", tokens[0]);
+        assertEquals("👨‍👨‍👦", tokens[1]); // Should be one token with ZWJ
+        assertEquals("hier", tokens[2]);
+        assertEquals(3, tokens.length);
+        
+        // Test flag emoji (regional indicators for Germany: U+1F1E9 U+1F1EA)
+        tokens = tok.tokenize("Flagge 🇩🇪 toll");
+        assertEquals("Flagge", tokens[0]);
+        assertEquals("🇩🇪", tokens[1]); // Should be one token
+        assertEquals("toll", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
+
     @Test
     public void testTokenizerRef1 () {
         DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();