Workaround for failing offsets due to surrogate mismatches - fixes #54

Change-Id: Ic738c4b1e9f71fbc78cd370041a86189ff3a0bea
diff --git a/Changes b/Changes
index a17eeca..5d2a9c6 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.58.5 2019-03-06
+0.58.5 2019-03-15
     - [bugfix] Fix bug where duplicate keys occured in
       field data output (diewald)
     - [bugfix] Fix bug where fields already set where lifted
@@ -12,6 +12,8 @@
       to avoid multiple documents with the same text sigle
       (diewald)
     - [bugfix] Fixed #53 element distance query bug (margaretha)
+    - [bugfix] Workaround for #54 failing offsets due to
+      surrogate pairs (diewald)
 
 0.58.4 2019-02-05
     - [cleanup] Remove deprecated methods setLicense/getLicense,
diff --git a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
index 7ee4751..4695cef 100644
--- a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
@@ -291,10 +291,14 @@
      *            as a string.
      */
     public void setPrimaryData (String primary) {
-        this.primaryData = primary;
+        // Java strings are UTF-16: characters beyond U+FFFF take two chars (a
+        // surrogate pair), so the offsets defined in the input data drift and
+        // substringing fails. As a workaround, replace each such character with
+        // a single "?". It would probably be better to fix this before the data
+        // hits the index, but we have to work with old indices as well.
+        this.primaryData = primary.replaceAll("[^\u0000-\uffff]", "?");
     };
 
-
     /**
      * Get the length of the primary data of the document
      * (i.e. the number of characters).
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index 8a91935..dd4b657 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -199,6 +199,8 @@
      * Deserialize token stream data.
      */
     public void setData (Map<String, Object> node) {
+
+        // TODO: Replace surrogates here, see AbstractDocument#setPrimaryData
         this.setPrimaryData((String) node.get("text"));
 
         String fieldName = (String) node.get("name");
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index d0925e4..dc78958 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -43,7 +43,7 @@
     public void testMultipleInputFiles () throws IOException {

         Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",

                 "-i", "src/test/resources/wiki" });

-        assertEquals("Added or updated 17 files.", outputStream.toString());

+        assertEquals("Added or updated 18 files.", outputStream.toString());

     }

 

 

diff --git a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
index 7a5c550..6e24c18 100644
--- a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
+++ b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
@@ -355,6 +355,45 @@
                 "Valid class numbers exceeded");
     };
 
+    @Test
+    public void highlightSnippetOffsetBug () throws IOException, QueryException {
+        KrillIndex ki = new KrillIndex();
+        ki.addDoc(getClass().getResourceAsStream("/wiki/WUD17-G97-20422.json.gz"),  true);
+        ki.commit();
+
+        /*
+        QueryBuilder kq = new QueryBuilder("tokens");
+        SpanQuery q = (SpanQuery) kq.seg("s:Sockenpuppe").toQuery();
+
+        Krill qs = new Krill(q);
+        qs.getMeta().getContext().left.setToken(true).setLength((short) 0);
+        qs.getMeta().getContext().right.setToken(true).setLength((short) 0);    
+        Result kr = ki.search(qs);
+        */
+        Match km;
+        
+        km = ki.getMatch("match-WUD17/G97/20422-p1020-1021");
+        assertEquals(km.getSnippetBrackets(), "... [[Madonna]] ...");
+
+        km = ki.getMatch("match-WUD17/G97/20422-p1030-1031");
+        assertEquals(km.getSnippetBrackets(), "... [[Kurier]] ...");
+
+        km = ki.getMatch("match-WUD17/G97/20422-p1032-1033");
+        assertEquals(km.getSnippetBrackets(), "... [[Spalte]] ...");
+
+        // There is a surrogate pair between offsets 6500 and 6600 that breaks the
+        // substring: the original offsets are based on UTF-8, but Java uses UTF-16.
+
+        km = ki.getMatch("match-WUD17/G97/20422-p1033-1034");
+        assertEquals(km.getSnippetBrackets(), "... [[Neue]] ...");
+        
+        km = ki.getMatch("match-WUD17/G97/20422-p1034-1035");
+        assertEquals(km.getSnippetBrackets(), "... [[Artikel]] ...");        
+        
+        km = ki.getMatch("match-WUD17/G97/20422-p5707-5708");
+        assertEquals(km.getSnippetBrackets(), "... [[Sockenpuppe]] ...");
+    }
+    
 
     @Test
     public void highlightEscapes () throws IOException, QueryException {
diff --git a/src/test/resources/wiki/WUD17-G97-20422.json.gz b/src/test/resources/wiki/WUD17-G97-20422.json.gz
new file mode 100644
index 0000000..830429c
--- /dev/null
+++ b/src/test/resources/wiki/WUD17-G97-20422.json.gz
Binary files differ