Fix Unicode surrogate pair bugs
Resolves #139
Change-Id: I3942c127763d4f3aa7a60885e0b265192e9dc5f7
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 3e317cb..58bbf28 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -412,7 +412,7 @@
*/
final Span genderColonSuffixToken() {
String matched = yytext();
- int lastChar = matched.codePointAt(matched.length() - 1);
+ int lastChar = matched.codePointBefore(matched.length());
// Find the colon position
int colonPos = matched.lastIndexOf(':');
@@ -424,8 +424,8 @@
return currentToken();
} else {
// Followed by non-letter - valid gender form
- // Push back just the lookahead character
- yypushback(1);
+ // Push back just the lookahead code point (2 UTF-16 chars if it is a supplementary character)
+ yypushback(Character.charCount(lastChar));
return currentToken();
}
}
@@ -436,7 +436,7 @@
*/
final Span genderSlashSuffixToken() {
String matched = yytext();
- int lastChar = matched.codePointAt(matched.length() - 1);
+ int lastChar = matched.codePointBefore(matched.length());
// Find the slash position
int slashPos = matched.lastIndexOf('/');
@@ -448,7 +448,7 @@
return currentToken();
} else {
// Followed by non-letter - valid gender form
- yypushback(1);
+ yypushback(Character.charCount(lastChar));
return currentToken();
}
}
@@ -459,7 +459,7 @@
*/
final Span genderStarSuffixToken() {
String matched = yytext();
- int lastChar = matched.codePointAt(matched.length() - 1);
+ int lastChar = matched.codePointBefore(matched.length());
// Find the star position
int starPos = matched.lastIndexOf('*');
@@ -471,7 +471,7 @@
return currentToken();
} else {
// Followed by non-letter - valid gender form
- yypushback(1);
+ yypushback(Character.charCount(lastChar));
return currentToken();
}
}
@@ -492,7 +492,7 @@
*/
final Span genderNounColonToken() {
String matched = yytext();
- int lastChar = matched.codePointAt(matched.length() - 1);
+ int lastChar = matched.codePointBefore(matched.length());
// Find the colon position
int colonPos = matched.lastIndexOf(':');
@@ -504,8 +504,8 @@
return currentToken();
} else {
// Followed by non-letter - valid gender form
- // Push back just the lookahead character
- yypushback(1);
+ // Push back just the lookahead code point (2 UTF-16 chars if it is a supplementary character)
+ yypushback(Character.charCount(lastChar));
return currentToken();
}
}
@@ -518,7 +518,7 @@
*/
final Span genderNounSlashToken() {
String matched = yytext();
- int lastChar = matched.codePointAt(matched.length() - 1);
+ int lastChar = matched.codePointBefore(matched.length());
// Find the slash position
int slashPos = matched.lastIndexOf('/');
@@ -530,7 +530,7 @@
return currentToken();
} else {
// Followed by non-letter - valid gender form
- yypushback(1);
+ yypushback(Character.charCount(lastChar));
return currentToken();
}
}
@@ -543,7 +543,7 @@
*/
final Span genderNounParenToken() {
String matched = yytext();
- int lastChar = matched.codePointAt(matched.length() - 1);
+ int lastChar = matched.codePointBefore(matched.length());
// Find the opening paren position
int parenPos = matched.lastIndexOf('(');
@@ -555,7 +555,7 @@
return currentToken();
} else {
// Followed by non-letter - valid gender form
- yypushback(1);
+ yypushback(Character.charCount(lastChar));
return currentToken();
}
}
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index 11b4938..8ad27c2 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -378,6 +378,49 @@
assertEquals(5, tokens.length);
}
+ // Regression test for issue #139: gender-sensitive forms directly followed by an emoji (supplementary character).
+ // Previously crashed with "Error: could not match input" because yypushback(1) pushed back only the
+ // low surrogate of the lookahead's surrogate pair, leaving it orphaned in the input.
+ @Test
+ public void testTokenizerGenderFormsFollowedByEmoji () {
+ DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+
+ // Colon noun gender + flag emoji: "Andersson:Innen🇸🇪"
+ String[] tokens = tok.tokenize("Andersson:Innen\uD83C\uDDF8\uD83C\uDDEA entscheid");
+ assertEquals("Andersson:Innen", tokens[0]);
+ assertEquals("\uD83C\uDDF8\uD83C\uDDEA", tokens[1]); // flag emoji 🇸🇪
+ assertEquals("entscheid", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // Slash short suffix + emoji: "jeder/e😜"
+ tokens = tok.tokenize("jeder/e\uD83D\uDE1C mues");
+ assertEquals("jeder/e", tokens[0]);
+ assertEquals("\uD83D\uDE1C", tokens[1]); // 😜
+ assertEquals("mues", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // Star short suffix + emoji: "Katze*n😻"
+ tokens = tok.tokenize("Katze*n\uD83D\uDE3B hier");
+ assertEquals("Katze*n", tokens[0]);
+ assertEquals("\uD83D\uDE3B", tokens[1]); // 😻
+ assertEquals("hier", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // Colon noun gender ending + emoji: "alt:innen🥳"
+ tokens = tok.tokenize("alt:innen\uD83E\uDD73 ergänzen");
+ assertEquals("alt:innen", tokens[0]);
+ assertEquals("\uD83E\uDD73", tokens[1]); // 🥳
+ assertEquals("ergänzen", tokens[2]);
+ assertEquals(3, tokens.length);
+
+ // Colon noun gender ending + emoji: "ant:innen🔴"
+ tokens = tok.tokenize("ant:innen\uD83D\uDD34 Querden");
+ assertEquals("ant:innen", tokens[0]);
+ assertEquals("\uD83D\uDD34", tokens[1]); // 🔴
+ assertEquals("Querden", tokens[2]);
+ assertEquals(3, tokens.length);
+ }
+
// Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/114
@Test
public void testTokenizerWikipediaEmojiTemplate () {