Workaround for failing offsets due to surrogate mismatches - fixes #54
Change-Id: Ic738c4b1e9f71fbc78cd370041a86189ff3a0bea
diff --git a/Changes b/Changes
index a17eeca..5d2a9c6 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.58.5 2019-03-06
+0.58.5 2019-03-15
- [bugfix] Fix bug where duplicate keys occured in
field data output (diewald)
- [bugfix] Fix bug where fields already set where lifted
@@ -12,6 +12,8 @@
to avoid multiple documents with the same text sigle
(diewald)
- [bugfix] Fixed #53 element distance query bug (margaretha)
+ - [bugfix] Workaround for #54 failing offsets due to
+ surrogate pairs (diewald)
0.58.4 2019-02-05
- [cleanup] Remove deprecated methods setLicense/getLicense,
diff --git a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
index 7ee4751..4695cef 100644
--- a/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/AbstractDocument.java
@@ -291,10 +291,14 @@
* as a string.
*/
public void setPrimaryData (String primary) {
- this.primaryData = primary;
+ // The offsets in the input data are not based on UTF-16 code units,
+ // which Java strings use, so substringing fails on surrogate pairs.
+ // As a workaround, each character outside the BMP is replaced by a single "?".
+ // It would probably be better to fix this before the data hits the index,
+ // but we have to work with old indices as well.
+ this.primaryData = primary.replaceAll("[^\u0000-\uffff]", "?");
};
-
/**
* Get the length of the primary data of the document
* (i.e. the number of characters).
diff --git a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
index 8a91935..dd4b657 100644
--- a/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
+++ b/src/main/java/de/ids_mannheim/korap/index/FieldDocument.java
@@ -199,6 +199,8 @@
* Deserialize token stream data.
*/
public void setData (Map<String, Object> node) {
+
+ // TODO: Replace surrogates here, see AbstractDocument#setPrimaryData
this.setPrimaryData((String) node.get("text"));
String fieldName = (String) node.get("name");
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index d0925e4..dc78958 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -43,7 +43,7 @@
public void testMultipleInputFiles () throws IOException {
Indexer.main(new String[] { "-c", "src/test/resources/krill.properties",
"-i", "src/test/resources/wiki" });
- assertEquals("Added or updated 17 files.", outputStream.toString());
+ assertEquals("Added or updated 18 files.", outputStream.toString());
}
diff --git a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
index 7a5c550..6e24c18 100644
--- a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
+++ b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
@@ -355,6 +355,45 @@
"Valid class numbers exceeded");
};
+ @Test
+ public void highlightSnippetOffsetBug () throws IOException, QueryException {
+ KrillIndex ki = new KrillIndex();
+ ki.addDoc(getClass().getResourceAsStream("/wiki/WUD17-G97-20422.json.gz"), true);
+ ki.commit();
+
+ /*
+ QueryBuilder kq = new QueryBuilder("tokens");
+ SpanQuery q = (SpanQuery) kq.seg("s:Sockenpuppe").toQuery();
+
+ Krill qs = new Krill(q);
+ qs.getMeta().getContext().left.setToken(true).setLength((short) 0);
+ qs.getMeta().getContext().right.setToken(true).setLength((short) 0);
+ Result kr = ki.search(qs);
+ */
+ Match km;
+
+ km = ki.getMatch("match-WUD17/G97/20422-p1020-1021");
+ assertEquals(km.getSnippetBrackets(), "... [[Madonna]] ...");
+
+ km = ki.getMatch("match-WUD17/G97/20422-p1030-1031");
+ assertEquals(km.getSnippetBrackets(), "... [[Kurier]] ...");
+
+ km = ki.getMatch("match-WUD17/G97/20422-p1032-1033");
+ assertEquals(km.getSnippetBrackets(), "... [[Spalte]] ...");
+
+ // There is a surrogate pair between 6500 and 6600 that breaks the substring,
+ // as the original substring offsets work on UTF-8, but Java works on UTF-16.
+
+ km = ki.getMatch("match-WUD17/G97/20422-p1033-1034");
+ assertEquals(km.getSnippetBrackets(), "... [[Neue]] ...");
+
+ km = ki.getMatch("match-WUD17/G97/20422-p1034-1035");
+ assertEquals(km.getSnippetBrackets(), "... [[Artikel]] ...");
+
+ km = ki.getMatch("match-WUD17/G97/20422-p5707-5708");
+ assertEquals(km.getSnippetBrackets(), "... [[Sockenpuppe]] ...");
+ }
+
@Test
public void highlightEscapes () throws IOException, QueryException {
diff --git a/src/test/resources/wiki/WUD17-G97-20422.json.gz b/src/test/resources/wiki/WUD17-G97-20422.json.gz
new file mode 100644
index 0000000..830429c
--- /dev/null
+++ b/src/test/resources/wiki/WUD17-G97-20422.json.gz
Binary files differ