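Fix Unicode surrogate pair bugs

The gender-form token methods inspected the lookahead character with
codePointAt(length - 1) and pushed back a single char. When the
lookahead is a supplementary character (e.g. an emoji), this saw only
the low surrogate and pushed back half of the pair, so the scanner
failed with "Error: could not match input". Use codePointBefore(length)
and push back Character.charCount(lastChar) chars instead.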

Resolves #139

Change-Id: I3942c127763d4f3aa7a60885e0b265192e9dc5f7
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 3e317cb..58bbf28 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -412,7 +412,7 @@
      */
     final Span genderColonSuffixToken() {
         String matched = yytext();
-        int lastChar = matched.codePointAt(matched.length() - 1);
+        int lastChar = matched.codePointBefore(matched.length());
         
         // Find the colon position
         int colonPos = matched.lastIndexOf(':');
@@ -424,8 +424,8 @@
             return currentToken();
         } else {
             // Followed by non-letter - valid gender form
-            // Push back just the lookahead character
-            yypushback(1);
+            // Push back the whole lookahead code point (two chars for a supplementary character)
+            yypushback(Character.charCount(lastChar));
             return currentToken();
         }
     }
@@ -436,7 +436,7 @@
      */
     final Span genderSlashSuffixToken() {
         String matched = yytext();
-        int lastChar = matched.codePointAt(matched.length() - 1);
+        int lastChar = matched.codePointBefore(matched.length());
         
         // Find the slash position
         int slashPos = matched.lastIndexOf('/');
@@ -448,7 +448,7 @@
             return currentToken();
         } else {
             // Followed by non-letter - valid gender form
-            yypushback(1);
+            yypushback(Character.charCount(lastChar));
             return currentToken();
         }
     }
@@ -459,7 +459,7 @@
      */
     final Span genderStarSuffixToken() {
         String matched = yytext();
-        int lastChar = matched.codePointAt(matched.length() - 1);
+        int lastChar = matched.codePointBefore(matched.length());
         
         // Find the star position
         int starPos = matched.lastIndexOf('*');
@@ -471,7 +471,7 @@
             return currentToken();
         } else {
             // Followed by non-letter - valid gender form
-            yypushback(1);
+            yypushback(Character.charCount(lastChar));
             return currentToken();
         }
     }
@@ -492,7 +492,7 @@
      */
     final Span genderNounColonToken() {
         String matched = yytext();
-        int lastChar = matched.codePointAt(matched.length() - 1);
+        int lastChar = matched.codePointBefore(matched.length());
         
         // Find the colon position
         int colonPos = matched.lastIndexOf(':');
@@ -504,8 +504,8 @@
             return currentToken();
         } else {
             // Followed by non-letter - valid gender form
-            // Push back just the lookahead character
-            yypushback(1);
+            // Push back the whole lookahead code point (two chars for a supplementary character)
+            yypushback(Character.charCount(lastChar));
             return currentToken();
         }
     }
@@ -518,7 +518,7 @@
      */
     final Span genderNounSlashToken() {
         String matched = yytext();
-        int lastChar = matched.codePointAt(matched.length() - 1);
+        int lastChar = matched.codePointBefore(matched.length());
         
         // Find the slash position
         int slashPos = matched.lastIndexOf('/');
@@ -530,7 +530,7 @@
             return currentToken();
         } else {
             // Followed by non-letter - valid gender form
-            yypushback(1);
+            yypushback(Character.charCount(lastChar));
             return currentToken();
         }
     }
@@ -543,7 +543,7 @@
      */
     final Span genderNounParenToken() {
         String matched = yytext();
-        int lastChar = matched.codePointAt(matched.length() - 1);
+        int lastChar = matched.codePointBefore(matched.length());
         
         // Find the opening paren position
         int parenPos = matched.lastIndexOf('(');
@@ -555,7 +555,7 @@
             return currentToken();
         } else {
             // Followed by non-letter - valid gender form
-            yypushback(1);
+            yypushback(Character.charCount(lastChar));
             return currentToken();
         }
     }
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index 11b4938..8ad27c2 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -378,6 +378,49 @@
         assertEquals(5, tokens.length);
     }
 
+    // Regression test for issue #139: gender-sensitive forms directly followed by an emoji
+    // (a supplementary character). This previously crashed with "Error: could not match input"
+    // because yypushback(1) pushed back only half of the surrogate pair, leaving an orphaned low surrogate.
+    @Test
+    public void testTokenizerGenderFormsFollowedByEmoji () {
+        DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+
+        // Colon noun gender + flag emoji: "Andersson:Innen🇸🇪"
+        String[] tokens = tok.tokenize("Andersson:Innen\uD83C\uDDF8\uD83C\uDDEA entscheid");
+        assertEquals("Andersson:Innen", tokens[0]);
+        assertEquals("\uD83C\uDDF8\uD83C\uDDEA", tokens[1]); // flag emoji 🇸🇪
+        assertEquals("entscheid", tokens[2]);
+        assertEquals(3, tokens.length);
+
+        // Slash short suffix + emoji: "jeder/e😜"
+        tokens = tok.tokenize("jeder/e\uD83D\uDE1C mues");
+        assertEquals("jeder/e", tokens[0]);
+        assertEquals("\uD83D\uDE1C", tokens[1]); // 😜
+        assertEquals("mues", tokens[2]);
+        assertEquals(3, tokens.length);
+
+        // Star short suffix + emoji: "Katze*n😻"
+        tokens = tok.tokenize("Katze*n\uD83D\uDE3B hier");
+        assertEquals("Katze*n", tokens[0]);
+        assertEquals("\uD83D\uDE3B", tokens[1]); // 😻
+        assertEquals("hier", tokens[2]);
+        assertEquals(3, tokens.length);
+
+        // Colon noun gender ending + emoji: "alt:innen🥳"
+        tokens = tok.tokenize("alt:innen\uD83E\uDD73 ergänzen");
+        assertEquals("alt:innen", tokens[0]);
+        assertEquals("\uD83E\uDD73", tokens[1]); // 🥳
+        assertEquals("ergänzen", tokens[2]);
+        assertEquals(3, tokens.length);
+
+        // Colon noun gender ending + emoji: "ant:innen🔴"
+        tokens = tok.tokenize("ant:innen\uD83D\uDD34 Querden");
+        assertEquals("ant:innen", tokens[0]);
+        assertEquals("\uD83D\uDD34", tokens[1]); // 🔴
+        assertEquals("Querden", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
+
     // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/114
     @Test
     public void testTokenizerWikipediaEmojiTemplate () {