Don't crash on unmatched characters. Resolves #139. Change-Id: Iad2c5c708902ffee1d82d89b9bbb1a55b4f4b656

commit: 967d682e8256fd84d38f1179abb74542f3cd80ca [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Apr 02 18:49:08 2026 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Apr 02 20:17:27 2026 +0200
tree: 9ad397a289fff17cda99d394b023159042919dbf
parent: 276dba5395350ba4d02ac6f053c6e218a12c8a26 [diff]
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index d9b5c53..3e317cb 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex

@@ -131,12 +131,56 @@
         this.normalize = normalize;
     }
 
+    /**
+     * Skips one code point at {@code zzMarkedPos} so scanning can resume after unmatched input.
+     * Advances {@code zzMarkedPos} by 2 chars for a high+low surrogate pair, otherwise by 1.
+     * Per the original author, JFlex's zzScanError leaves the scanner position unchanged.
+     * Prints a warning to {@code System.err}: position ({@code yychar}), code point, nearby text.
+     */
+    private void skipOneCharacter() {
+        int cp; // code point being skipped (only used for the warning)
+        int charsToSkip; // number of UTF-16 chars that code point occupies
+        if (zzMarkedPos < zzBuffer.length && Character.isHighSurrogate(zzBuffer[zzMarkedPos])
+                && zzMarkedPos + 1 < zzBuffer.length && Character.isLowSurrogate(zzBuffer[zzMarkedPos + 1])) {
+            cp = Character.toCodePoint(zzBuffer[zzMarkedPos], zzBuffer[zzMarkedPos + 1]);
+            charsToSkip = 2; // well-formed pair: high surrogate followed by low surrogate
+        } else if (zzMarkedPos < zzBuffer.length) {
+            cp = zzBuffer[zzMarkedPos]; // BMP char or unpaired surrogate
+            charsToSkip = 1;
+        } else {
+            System.err.println("Warning: could not match input at position " + yychar + " (at end of buffer)");
+            zzMarkedPos++; // no char to inspect or report, but still advance by one
+            return;
+        }
+
+        // Context window: up to 10 chars before and after the skipped char(s), clamped to the array
+        int ctxStart = Math.max(0, zzMarkedPos - 10);
+        int ctxEnd = Math.min(zzBuffer.length, zzMarkedPos + charsToSkip + 10); // NOTE(review): clamps to zzBuffer.length, which may exceed the filled data - confirm
+        String context = new String(zzBuffer, ctxStart, ctxEnd - ctxStart)
+                .replace("\n", "\\n").replace("\r", "\\r"); // escape line breaks so the warning stays on one line
+
+        System.err.println("Warning: could not match input at position " + yychar
+                + ": U+" + String.format("%04X", cp) // zero-padded to at least 4 hex digits
+                + " '" + new String(Character.toChars(cp)) + "'"
+                + " context: \"..." + context + "...\"");
+
+        zzMarkedPos += charsToSkip; // step over the unmatched char(s)
+    }
+
     @Override
     public void scan() throws IOException {
         List<Span> list = new ArrayList<Span>();
         Span token;
         while (!zzAtEOF) {
-            token = this.getNextToken();
+            try {
+                token = this.getNextToken();
+            } catch (Error e) {
+                // JFlex throws Error (not Exception) for unmatched input.
+                // This can happen with unusual Unicode characters in social media text.
+                // Skip the problematic character and continue scanning.
+                skipOneCharacter();
+                continue;
+            }
             if (atEOT) {
                 if (echo && printOffsets) {
                     printTokenPositions(list, splitSentences);
@@ -205,7 +249,14 @@
         yyreset(new StringReader(input));
         try {
             while (!this.zzAtEOF) {
-                token = this.getNextToken();
+                try {
+                    token = this.getNextToken();
+                } catch (Error e) {
+                    // JFlex throws Error (not Exception) for unmatched input.
+                    // Skip the problematic character and continue scanning.
+                    skipOneCharacter();
+                    continue;
+                }
                 if (atEOT) {
                     if (echo) {
                         printTokenPositions(list, splitSentences);
commit	967d682e8256fd84d38f1179abb74542f3cd80ca	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Apr 02 18:49:08 2026 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Apr 02 20:17:27 2026 +0200
tree	9ad397a289fff17cda99d394b023159042919dbf
parent	276dba5395350ba4d02ac6f053c6e218a12c8a26 [diff]