Support emojis (in this case surrogate pairs) in snippets

Change-Id: I74f512bc6f782c45df85cefb77b23997705f6fd7
diff --git a/Changes b/Changes
index a288e28..bea7637 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
-0.62.0 2023-09-11
+0.62.0 2023-12-19
     - [cleanup] Change of groupID.
+    - Fix for emojis in snippets (diewald)
 
 0.61.3 2023-07-17
     - Add totalResources to results (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
index b0d5be7..207a9cb 100644
--- a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
@@ -3,6 +3,7 @@
 import java.util.*;
 
 import de.ids_mannheim.korap.util.KrillDate;
+import static de.ids_mannheim.korap.util.KrillString.*;
 import de.ids_mannheim.korap.index.FieldDocument;
 import de.ids_mannheim.korap.response.Response;
 import de.ids_mannheim.korap.response.MetaField;
@@ -262,7 +263,7 @@
      */
     @JsonIgnore
     public String getPrimaryData (int startOffset) {
-        return this.primaryData.substring(startOffset);
+        return codePointSubstring(this.primaryData, startOffset);
     };
 
 
@@ -280,7 +281,7 @@
      */
     @JsonIgnore
     public String getPrimaryData (int startOffset, int endOffset) {
-        return this.primaryData.substring(startOffset, endOffset);
+        return codePointSubstring(this.primaryData,startOffset, endOffset);
     };
 
 
@@ -292,12 +293,7 @@
      *            as a string.
      */
     public void setPrimaryData (String primary) {
-        // Java can't work with utf-8 substrings as defined in the input data,
-        // That's why substringing fails on surrogates. This is a workaround
-        // to remove surrogates to make substringing work again.
-        // It would probably be better to fix this before the data hits the index,
-        // but we have to work with old indices as well.
-        this.primaryData = primary.replaceAll("[^\u0000-\uffff]", "?");
+        this.primaryData = primary;
     };
 
     /**
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 0e5dde6..24921d3 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -24,6 +24,7 @@
 import com.fasterxml.jackson.databind.node.ObjectNode;
 
 import static de.ids_mannheim.korap.util.KrillByte.*;
+import static de.ids_mannheim.korap.util.KrillString.codePointSubstring;
 import de.ids_mannheim.korap.index.AbstractDocument;
 import de.ids_mannheim.korap.index.PositionsToOffset;
 import de.ids_mannheim.korap.query.SpanElementQuery;
@@ -1359,7 +1360,7 @@
 
 				// Add partial string
 				if (pos > 0 && pos > oldPos) {
-					snippetArray.addString(clean.substring(oldPos, pos));
+					snippetArray.addString(codePointSubstring(clean, oldPos, pos));
 				};
 
 				// Remember the new position
@@ -1388,7 +1389,7 @@
         };
 
         if (clean.length() > pos && pos >= 0) {
-            snippetArray.addString(clean.substring(pos));
+            snippetArray.addString(codePointSubstring(clean, pos));
         };
     };
 
@@ -1500,8 +1501,8 @@
             for (i = startContext; i < this.startPos; i++) {
                 offsets = pto.span(ldid,i);
                 tokens.add(
-                    this.tempSnippet.substring(
-                        offsets[0]- startContextChar, offsets[1] - startContextChar)
+                    codePointSubstring(this.tempSnippet,
+                                       offsets[0]- startContextChar, offsets[1] - startContextChar)
                     );
             };
         };
@@ -1513,8 +1514,8 @@
                 continue;
             }
             tokens.add(
-                this.tempSnippet.substring(
-                    offsets[0]- startContextChar, offsets[1] - startContextChar)
+                codePointSubstring(this.tempSnippet,
+                                   offsets[0]- startContextChar, offsets[1] - startContextChar)
                 );
         };
 
@@ -1531,7 +1532,7 @@
                     tokens = json.putArray("right");
                 
                 tokens.add(
-                    this.tempSnippet.substring(
+                    codePointSubstring(this.tempSnippet,
                         offsets[0]- startContextChar, offsets[1] - startContextChar)
                     );
             };
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillString.java b/src/main/java/de/ids_mannheim/korap/util/KrillString.java
index 845325e..8d0da5a 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillString.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillString.java
@@ -88,4 +88,37 @@
     public static String quote (String text) {
         return '"' + text.replaceAll("([\"\\\\])", "\\\\$1") + '"';
     };
+
+
+    /**
+     * Return a substring addressed by code point offsets, not by UTF-16 char indices.
+     * 
+     * @param text
+     *            The string to substring.
+     * @param start
+     *            The start offset, counted in code points.
+     * @param end
+     *            The end offset (exclusive), counted in code points.
+     * @return The substring.
+     */
+    public static String codePointSubstring(String text, int start, int end) {
+        int a = text.offsetByCodePoints(0, start); // char index of the start code point
+        return text.substring(
+            a,
+            text.offsetByCodePoints(a, end - start) // char index of the end code point
+            );
+    };
+
+    /**
+     * Return the rest of a string from a code point offset, not from a UTF-16 char index.
+     * 
+     * @param text
+     *            The string to substring.
+     * @param start
+     *            The start offset, counted in code points.
+     * @return The substring from the start offset to the end of the text.
+     */
+    public static String codePointSubstring(String text, int start) {
+        return text.substring(text.offsetByCodePoints(0, start)); // code point offset -> char index
+    };
 };
diff --git a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
index 168b83d..4db8b15 100644
--- a/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
+++ b/src/test/java/de/ids_mannheim/korap/search/TestKrill.java
@@ -1469,4 +1469,28 @@
         assertEquals(kr.getMatch(0).getTextSigle(), "GOE_AGX.00002");
     };
 
+    @Test
+    public void emojiSearch () throws IOException {
+        // Search for an emoji token and check that the snippets keep the emojis
+        // Construct index
+        KrillIndex ki = new KrillIndex();
+        // Indexing test files (the fixture text contains the emojis 😊 and 🎉)
+        ki.addDoc(
+                getClass().getResourceAsStream("/others/KYC-MAI-001888-censored.json"),
+                false);
+        ki.commit();
+        // Query the segment s:🎉 in the tokens field
+        Krill k = new Krill(new QueryBuilder("tokens").seg("s:🎉"));
+
+        assertEquals(k.getSpanQuery().toString(), "tokens:s:🎉");
+        // Expect a single match with both emojis rendered intact in the snippets
+        Result kr = k.apply(ki);
+        assertEquals(kr.getTotalResults(), 1);
+        assertEquals(kr.getMatch(0).getSnippetBrackets(),
+                     "... Strasse antreffe.😊 Versprochen Xxx-Xxx [[🎉]]");
+        assertEquals(kr.getMatch(0).getSnippetHTML(),
+                     "<span class=\"context-left\"><span class=\"more\"></span>Strasse antreffe.😊 Versprochen Xxx-Xxx </span><span class=\"match\"><mark>🎉</mark></span><span class=\"context-right\"></span>");
+        assertEquals(kr.getMatch(0).getTextSigle(), "KYC/MAI/001888");
+    };
+
 };
diff --git a/src/test/java/de/ids_mannheim/korap/util/TestKrillString.java b/src/test/java/de/ids_mannheim/korap/util/TestKrillString.java
index 7db0642..d03f331 100644
--- a/src/test/java/de/ids_mannheim/korap/util/TestKrillString.java
+++ b/src/test/java/de/ids_mannheim/korap/util/TestKrillString.java
@@ -1,6 +1,7 @@
 package de.ids_mannheim.korap.util;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
 import org.junit.Test;
 
 import static de.ids_mannheim.korap.util.KrillString.*;
@@ -29,4 +30,20 @@
                 quote("er sagte: \"Hallo!\""));
         assertEquals("\"a \\\\\\\" b\"", quote("a \\\" b"));
     };
+
+    @Test
+    public void testSurrogateSubstrings () {
+        // Each emoji below is one code point but two chars (a surrogate pair)
+        // Follows https://stackoverflow.com/questions/55663213/java-substring-by-code-point-indices-treating-pairs-of-surrogate-code-units-as
+        String s = "👦👩👪👫";
+        // String.substring() indexes chars and splits the pairs; codePointSubstring() must not
+        assertNotEquals("👦", s.substring(0,1));
+        assertEquals("👦", codePointSubstring(s,0,1));
+        assertNotEquals("👩", s.substring(1,2));
+        assertEquals("👩", codePointSubstring(s,1,2));
+        assertNotEquals("👪", s.substring(2,3));
+        assertEquals("👪", codePointSubstring(s,2,3));
+        assertNotEquals("👫", s.substring(3,4));
+        assertEquals("👫", codePointSubstring(s,3,4));
+    };
 };
diff --git a/src/test/resources/others/KYC-MAI-001888-censored.json b/src/test/resources/others/KYC-MAI-001888-censored.json
new file mode 100644
index 0000000..16f5ad4
--- /dev/null
+++ b/src/test/resources/others/KYC-MAI-001888-censored.json
@@ -0,0 +1 @@
+{"fields":[{"@type":"koral:field","type":"type:string","key":"corpusSigle","value":"KYC"},{"@type":"koral:field","type":"type:string","key":"docSigle","value":"KYC/MAI"},{"@type":"koral:field","key":"textSigle","type":"type:string","value":"KYC/MAI/001888"},{"type":"type:attachement","key":"publisher","@type":"koral:field","value":"data:,YouTube"},{"@type":"koral:field","type":"type:attachement","key":"textExternalLink","value":"data:application/x.korap-link;title=Kommentar,https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3D8fraZlsmCio%26lc%3DUgzokB_0YXYi4KHYzx54AaABAg.9nQJ6oAsOVh9nqV33S7sN9"},{"@type":"koral:field","key":"pubDate","type":"type:date","value":"2023-03-29"},{"key":"availability","type":"type:string","@type":"koral:field","value":"QAO-NC"},{"value":"San Bruno, California","key":"pubPlace","type":"type:string","@type":"koral:field"},{"@type":"koral:field","type":"type:text","key":"author","value":"@rexor9652"},{"key":"corpusTitle","type":"type:text","@type":"koral:field","value":"KoKoKom YouTube-Kommentarkorpus"},{"key":"docAuthor","type":"type:text","@type":"koral:field","value":"maiLab"},{"value":"Wie viele Geschlechter gibt es?","key":"docTitle","type":"type:text","@type":"koral:field"},{"value":"Kommentar zu: Wie viele Geschlechter gibt es?","@type":"koral:field","key":"title","type":"type:text"}],"@type":"koral:corpus","version":"0.4","@context":"http://korap.ids-mannheim.de/ns/koral/0.4/context.jsonld","data":{"layerInfos":"dereko/s=spans ud/d=rels ud/l=tokens ud/m=tokens ud/p=tokens","foundries":"dereko dereko/structure dereko/structure/base-sentences-paragraphs-pagebreaks udpipe udpipe/dependency udpipe/morpho","text":"@snipers2914 ​​@@nicoschonebaumer7248 Da wird leider nix kommen, von dem feigen Xxxxxxxx. Ich hab noch nie erlebt, wie ne Xxxxx sich entschuldigte. Desweiteren ist sie so narzisstisch veranlagt, dass sie durch ihre akademische Bildung direkt zur Einbildung verleitet ist, sie und ihr xxxxxxxxxx Team sein was besseres.  
Ihr ne volle Ladung Xxxxx Xxx in den Mund schmieren, das ist mein Verlangen, was ich auch umsetze, wenn ich sie mal auf der Strasse antreffe.😊 Versprochen Xxx-Xxx 🎉","tokenSource":"base#tokens","stream":[["-:base/paragraphs$<i>3","-:base/sentences$<i>5","-:tokens$<i>90","_0$<i>0<i>12","i:@snipers2914","s:@snipers2914"],["_1$<i>13<i>14","i:​","s:​"],["_2$<i>14<i>15","i:​","s:​"],["_3$<i>15<i>16","i:@","s:@"],["_4$<i>16<i>37","i:@nicoschonebaumer7248","s:@nicoschonebaumer7248"],["_5$<i>38<i>40","i:da","s:Da"],["_6$<i>41<i>45","i:wird","s:wird"],["_7$<i>46<i>52","i:leider","s:leider"],["_8$<i>53<i>56","i:nix","s:nix"],["_9$<i>57<i>63","i:kommen","s:kommen"],["_10$<i>63<i>64","i:,","s:,"],["_11$<i>65<i>68","i:von","s:von"],["_12$<i>69<i>72","i:dem","s:dem"],["_13$<i>73<i>79","i:feigen","s:feigen"],["_14$<i>80<i>88","i:Xxxxxxxx","s:Xxxxxxxx"],["_15$<i>88<i>89","i:.","s:."],["_16$<i>90<i>93","i:ich","s:Ich"],["_17$<i>94<i>97","i:hab","s:hab"],["_18$<i>98<i>102","i:noch","s:noch"],["_19$<i>103<i>106","i:nie","s:nie"],["_20$<i>107<i>113","i:erlebt","s:erlebt"],["_21$<i>113<i>114","i:,","s:,"],["_22$<i>115<i>118","i:wie","s:wie"],["_23$<i>119<i>121","i:ne","s:ne"],["_24$<i>122<i>127","i:Xxxxx","s:Xxxxx"],["_25$<i>128<i>132","i:sich","s:sich"],["_26$<i>133<i>146","i:entschuldigte","s:entschuldigte"],["_27$<i>146<i>147","i:.","s:."],["_28$<i>148<i>159","i:desweiteren","s:Desweiteren"],["_29$<i>160<i>163","i:ist","s:ist"],["_30$<i>164<i>167","i:sie","s:sie"],["_31$<i>168<i>170","i:so","s:so"],["_32$<i>171<i>183","i:narzisstisch","s:narzisstisch"],["_33$<i>184<i>193","i:veranlagt","s:veranlagt"],["_34$<i>193<i>194","i:,","s:,"],["_35$<i>195<i>199","i:dass","s:dass"],["_36$<i>200<i>203","i:sie","s:sie"],["_37$<i>204<i>209","i:durch","s:durch"],["_38$<i>210<i>214","i:ihre","s:ihre"],["_39$<i>215<i>226","i:akademische","s:akademische"],["_40$<i>227<i>234","i:bildung","s:Bildung"],["_41$<i>235<i>241","i:direkt","s:direkt"],["_42$<i>242<i>245","i:zur","s:zur"],["_43$<i>246<i>256","i:
einbildung","s:Einbildung"],["_44$<i>257<i>266","i:verleitet","s:verleitet"],["_45$<i>267<i>270","i:ist","s:ist"],["_46$<i>270<i>271","i:,","s:,"],["_47$<i>272<i>275","i:sie","s:sie"],["_48$<i>276<i>279","i:und","s:und"],["_49$<i>280<i>283","i:ihr","s:ihr"],["_50$<i>284<i>294","i:xxxxxxxxxx","s:xxxxxxxxxx"],["_51$<i>295<i>299","i:team","s:Team"],["_52$<i>300<i>304","i:sein","s:sein"],["_53$<i>305<i>308","i:was","s:was"],["_54$<i>309<i>317","i:besseres","s:besseres"],["_55$<i>317<i>318","i:.","s:."],["_56$<i>320<i>323","i:ihr","s:Ihr"],["_57$<i>324<i>326","i:ne","s:ne"],["_58$<i>327<i>332","i:volle","s:volle"],["_59$<i>333<i>339","i:ladung","s:Ladung"],["_60$<i>340<i>345","i:Xxxxx","s:Xxxxx"],["_61$<i>346<i>349","i:Xxx","s:Xxx"],["_62$<i>350<i>352","i:in","s:in"],["_63$<i>353<i>356","i:den","s:den"],["_64$<i>357<i>361","i:mund","s:Mund"],["_65$<i>362<i>371","i:schmieren","s:schmieren"],["_66$<i>371<i>372","i:,","s:,"],["_67$<i>373<i>376","i:das","s:das"],["_68$<i>377<i>380","i:ist","s:ist"],["_69$<i>381<i>385","i:mein","s:mein"],["_70$<i>386<i>395","i:verlangen","s:Verlangen"],["_71$<i>395<i>396","i:,","s:,"],["_72$<i>397<i>400","i:was","s:was"],["_73$<i>401<i>404","i:ich","s:ich"],["_74$<i>405<i>409","i:auch","s:auch"],["_75$<i>410<i>417","i:umsetze","s:umsetze"],["_76$<i>417<i>418","i:,","s:,"],["_77$<i>419<i>423","i:wenn","s:wenn"],["_78$<i>424<i>427","i:ich","s:ich"],["_79$<i>428<i>431","i:sie","s:sie"],["_80$<i>432<i>435","i:mal","s:mal"],["_81$<i>436<i>439","i:auf","s:auf"],["_82$<i>440<i>443","i:der","s:der"],["_83$<i>444<i>451","i:strasse","s:Strasse"],["_84$<i>452<i>460","i:antreffe","s:antreffe"],["_85$<i>460<i>461","i:.","s:."],["_86$<i>461<i>462","i:😊","s:😊"],["_87$<i>463<i>474","i:versprochen","s:Versprochen"],["_88$<i>475<i>482","i:Xxx-Xxx","s:Xxx-Xxx"],["_89$<i>483<i>484","i:🎉","s:🎉"]],"name":"tokens"}}