Keep highlights that extend beyond a cut match (fixes #177)
Change-Id: I34213933ae40013ccfa8fd5ac21e53cb1613b5b3
diff --git a/Changes b/Changes
index 2a82202..d78dca0 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.64.7 2026-04-28
+ - [bugfix] Keep highlights that extend beyond a cut match
+ (diewald; fixes #177; AI-assisted Claude Opus 4.6)
+
0.64.6 2026-03-09
- [performance] Add leaf cache. (diewald)
- [bugfix] Fix fingerprinter (wasn't threadsafe; diewald)
diff --git a/pom.xml b/pom.xml
index 2f7731d..021d326 100644
--- a/pom.xml
+++ b/pom.xml
@@ -35,7 +35,7 @@
<groupId>de.ids-mannheim.korap.krill</groupId>
<artifactId>Krill</artifactId>
- <version>0.64.6</version>
+ <version>0.64.7</version>
<packaging>jar</packaging>
<name>Krill</name>
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 2700c16..4374783 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -351,6 +351,20 @@
this.addHighlight(start, end - 1, number);
}
+ // Cut highlights that extend beyond a cut match
+ else if (this.endCutted
+ && unsignedByte(number) <= 128
+ && start >= this.getStartPos()
+ && start < this.getEndPos()
+ && end > this.getEndPos()) {
+
+ if (DEBUG) {
+ log.trace("Add clamped highlight with class {}!",
+ unsignedByte(number));
+ };
+
+ this.addHighlight(start, this.getEndPos() - 1, number);
+ }
else if (DEBUG) {
log.trace("Don't add highlight of class {}!",
unsignedByte(number));
@@ -2314,8 +2328,16 @@
log.debug("Pagebreak keeps end position");
};
- if (start < 0 ||
- ((end < 0 | start > endRelOffsetChar) && end != PB_MARKER && end != ALL_MARKER)) {
+ if (start < 0) {
+
+ // Clamp start to 0 if end is non-negative and not a pagebreak or marker
+ if (end >= 0 && end != PB_MARKER && end != ALL_MARKER) {
+ start = 0;
+ } else {
+ continue;
+ }
+ }
+ else if ((end < 0 | start > endRelOffsetChar) && end != PB_MARKER && end != ALL_MARKER) {
continue;
};
diff --git a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
index 11acbde..7e594cf 100644
--- a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
+++ b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
@@ -543,6 +543,106 @@
};
+ @Test
+ public void highlightCutMatchSerializationBug ()
+ throws IOException, QueryException {
+
+ // Test for issue #177: cutting a match dropped highlights that extend beyond the cut
+ KrillIndex ki = new KrillIndex();
+ FieldDocument fd = new FieldDocument();
+ fd.addString("ID", "doc-1");
+ fd.addString("UID", "1");
+ fd.addTV("base",
+ "abcdefghij",
+ "[(0-1)s:a|i:a|_0#0-1|-:t$<i>10]"
+ + "[(1-2)s:b|i:b|_1#1-2]"
+ + "[(2-3)s:c|i:c|_2#2-3]"
+ + "[(3-4)s:d|i:d|_3#3-4]"
+ + "[(4-5)s:e|i:e|_4#4-5]"
+ + "[(5-6)s:f|i:f|_5#5-6]"
+ + "[(6-7)s:g|i:g|_6#6-7]"
+ + "[(7-8)s:h|i:h|_7#7-8]"
+ + "[(8-9)s:i|i:i|_8#8-9]"
+ + "[(9-10)s:j|i:j|_9#9-10]");
+ ki.addDoc(fd);
+ ki.commit();
+
+ QueryBuilder kq = new QueryBuilder("base");
+
+ // {1: seq(c, d, e, f, g)} - class wrapping 5 tokens
+ SpanQuery q = (SpanQuery) kq.nr(1,
+ kq.seq(kq.seg("s:c")).append(kq.seg("s:d"))
+ .append(kq.seg("s:e")).append(kq.seg("s:f"))
+ .append(kq.seg("s:g")))
+ .toQuery();
+
+ assertEquals("{1: spanNext(spanNext(spanNext(spanNext(base:s:c, base:s:d), base:s:e), base:s:f), base:s:g)}",
+ q.toString());
+
+ Krill ks = new Krill(q);
+ ks.setMaxTokenMatchSize(3);
+
+ Result kr = ks.apply(ki);
+ assertEquals(1, kr.getTotalResults());
+
+ Match km = kr.getMatch(0);
+ assertTrue(km.endCutted);
+
+ assertEquals("ab[[{1:cde}]<!>]fghij",
+ km.getSnippetBrackets());
+ assertEquals(
+ "<span class=\"context-left\">ab</span>"
+ + "<span class=\"match\"><mark>"
+ + "<mark class=\"class-1 level-0\">cde</mark>"
+ + "</mark><span class=\"cutted\"></span></span>"
+ + "<span class=\"context-right\">fghij</span>",
+ km.getSnippetHTML());
+ };
+
+
+ @Test
+ public void highlightCutMatchIndexBug ()
+ throws IOException, QueryException {
+
+ // Test for issue #177: cutting a match dropped highlights that extend beyond the cut
+ // Search in a real index with highlight on cut snippet
+ KrillIndex ki = new KrillIndex();
+ for (String i : new String[] { "00001", "00002" }) {
+ ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"),
+ true);
+ };
+ ki.commit();
+
+ QueryBuilder kq = new QueryBuilder("tokens");
+
+ SpanQuery q = (SpanQuery) kq.nr(1,
+ kq.seq(kq.seg("s:Mit")).append(kq.seg("s:Ausnahme"))
+ .append(kq.seg("s:von")).append(kq.seg("s:Fremdwörtern")))
+ .toQuery();
+
+ Krill ks = new Krill(q);
+ ks.setMaxTokenMatchSize(2);
+ Result kr = ks.apply(ki);
+ assertTrue("Should find at least one match", kr.getTotalResults() > 0);
+
+ Match km = kr.getMatch(0);
+ assertTrue(km.endCutted);
+
+ String brackets = km.getSnippetBrackets();
+ // The class-1 highlight must appear in the cut match
+ assertTrue("Brackets should contain class 1 highlight: " + brackets,
+ brackets.contains("{1:"));
+ assertTrue("Brackets should contain cut marker: " + brackets,
+ brackets.contains("<!>"));
+
+ String html = km.getSnippetHTML();
+ assertTrue("HTML should contain class-1 highlight: " + html,
+ html.contains("class-1"));
+ assertTrue("HTML should contain cutted marker: " + html,
+ html.contains("cutted"));
+ };
+
+
@Test
public void checkTokenArray () throws IOException, QueryException {