Adjust context to shrink based on match size (closes #229)
Change-Id: I5dba1f714beb07c71c69a783d801d4a40492ff5d
diff --git a/Changes b/Changes
index 7ad8452..05d459e 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,4 @@
-0.64.7 2026-04-29
+0.64.7 2026-05-05
- [bugfix] Keep highlights that extend beyond a cut match
(diewald; fixes #177; diewald; AI-assisted Claude Opus 4.6)
- [bugfix] Correctly handle foundry and layer in attribute groups
@@ -12,6 +12,12 @@
(fixes #243; diewald; AI-assisted Claude Opus 4.6)
- [bugfix] Preserve negation of a single operand inside a
collection group (fixes #178; diewald; AI-assisted Claude Opus 4.6))
+ - [feature] Allow the context to shrink based on the match size
+ via krill.context.left.maxShrink / krill.context.right.maxShrink
+ (fixes #229; diewald)
+ - [feature] Add krill.kwic.max.token as a convenience property
+ to cap total KWIC width; derives maxShrink values automatically
+ (diewald; AI-assisted Claude Opus 4.6)
0.64.6 2026-03-09
- [performance] Add leaf cache. (diewald)
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 4374783..10c96c8 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -863,7 +863,6 @@
return this;
};
-
@JsonIgnore
public SearchContext getContext () {
if (this.context == null)
@@ -876,6 +875,36 @@
return this.getEndPos() - this.getStartPos();
};
+ // Shrinks the token context [startContext, endContext) depending on
+ // the match length, limited by the configured left/right maxShrink
+ // values. Returns the adjusted pair as {startContext, endContext}.
+ private int[] adjustContext (int startContext, int endContext) {
+
+ // Shrinking is disabled on both sides: keep the context unchanged
+ if (KrillProperties.leftContextMaxShrink == 0 &&
+ KrillProperties.rightContextMaxShrink == 0) {
+
+ return new int[]{startContext, endContext};
+ };
+
+ int matchLength = this.endPos - this.startPos;
+
+ int maxShrinkLeft = KrillProperties.leftContextMaxShrink;
+
+ int maxShrink = maxShrinkLeft +
+ KrillProperties.rightContextMaxShrink;
+
+ // Shrink by the match length (at most maxShrink), split proportionally
+ int requiredShrink = Math.min(matchLength, maxShrink);
+ int shrinkLeft = (int) Math.round(requiredShrink * ((double) maxShrinkLeft / maxShrink));
+ int shrinkRight = requiredShrink - shrinkLeft;
+
+ // Guard: never shrink more context than actually available
+ shrinkLeft = Math.min(shrinkLeft, Math.max(0, this.startPos - startContext));
+ shrinkRight = Math.min(shrinkRight, Math.max(0, endContext - this.endPos));
+
+ return new int[]{startContext + shrinkLeft, endContext - shrinkRight};
+ };
// Retrieve markers in a certain area
public List<int[]> retrieveMarkers (String marker) {
@@ -906,6 +935,12 @@
int minStartPos = this.getStartPos() - KrillProperties.maxTokenContextSize;
int maxEndPos = this.getEndPos() + KrillProperties.maxTokenContextSize;
+ // Adjusting the context on every retrieval is not very efficient,
+ // but as no IO is involved, the overhead is acceptable
+ int[] adjustedContext = this.adjustContext(minStartPos, maxEndPos);
+ minStartPos = adjustedContext[0];
+ maxEndPos = adjustedContext[1];
+
if (DEBUG) {
log.debug("=================================");
log.debug("Retrieve markers between {}-{}",
@@ -1194,6 +1229,11 @@
spanContext[3] = end; // spanContext[3];
}
+ // Context adjustment is not needed here: span expansion changes
+ // match boundaries, not context boundaries. Context adjustment is
+ // applied when rendering the snippet (getSnippetTokens,
+ // _processOffsetChars, retrieveMarkers).
+
this.potentialStartPosChar = spanContext[2];
this.potentialEndPosChar = spanContext[3];
this.startMore = false;
@@ -1656,7 +1696,7 @@
startContextChar = spanContext[2];
endContextChar = spanContext[3];
}
-
+
// The offset is not yet defined - and defined by tokens
if (endContext == -1) {
@@ -1685,6 +1725,11 @@
if (DEBUG)
log.debug("Set endContext {}", endContext);
};
+
+ // Adjust the context
+ int[] adjustedContext = this.adjustContext(startContext, endContext);
+ startContext = adjustedContext[0];
+ endContext = adjustedContext[1];
// Retrieve the character offsets for all tokens
for (int i = startContext; i < endContext; i++) {
@@ -1694,8 +1739,10 @@
if (startContextChar == -1)
startContextChar = pto.start(ldid, startContext);
- if (endContextChar == -1)
- endContextChar = pto.end(ldid, endContext);
+ if (endContextChar == -1) {
+ int lastPos = Math.max(endContext, this.endPos) - 1;
+ endContextChar = pto.end(ldid, lastPos);
+ }
if (DEBUG)
log.debug("Match is {}/{} - {}/{}",startContext,startContextChar,endContext,endContextChar);
@@ -1720,6 +1767,9 @@
tokens = json.putArray("left");
for (i = startContext; i < this.startPos; i++) {
offsets = pto.span(ldid,i);
+ if (offsets == null) {
+ continue;
+ }
tokens.add(
codePointSubstring(this.tempSnippet,
offsets[0]- startContextChar, offsets[1] - startContextChar)
@@ -1728,6 +1778,8 @@
};
tokens = json.putArray("match");
+
+ // Create match token list
for (i = this.startPos; i < this.endPos; i++) {
offsets = pto.span(ldid,i);
if (offsets == null) {
@@ -2397,17 +2449,29 @@
PositionsToOffset pto = this.positionsToOffset;
+ // TODO:
+ // The best approach is probably to primarily focus on
+ // token contexts and in case they exceed the maxCharacter
+ // contexts, these should be cut at the end.
+ // This doesn't work with token list matches though,
+ // so we may need to deprecate the option to set character
+ // contexts altogether ...
+
+ boolean retrieveStart = false;
+ boolean retrieveEnd = false;
+
// The left offset is defined by tokens
if (this.context.left.isToken()) {
startOffset = this.startPos - this.context.left.getLength();
if (DEBUG)
log.trace("PTO will retrieve {} (Left context)",
startOffset);
- pto.add(ldid, startOffset);
+ retrieveStart = true;
}
// The left offset is defined by characters
else {
+ // TODO: Character-based left contexts are not adjusted
startOffsetChar = startPosChar - this.context.left.getLength();
};
@@ -2417,15 +2481,30 @@
if (DEBUG)
log.trace("PTO will retrieve {} (Right context)",
endOffset);
- pto.add(ldid, endOffset);
+ retrieveEnd = true;
}
// The right context is defined by characters
else {
+ // TODO: Character-based right contexts are not adjusted
endOffsetChar = (endPosChar == -1) ? -1
: endPosChar + this.context.right.getLength();
};
+ // Adjust token contexts (convert endOffset to exclusive for adjustContext)
+ int[] adjustedContexts = this.adjustContext(startOffset,
+ endOffset == -1 ? endOffset : endOffset + 1);
+
+ if (retrieveStart) {
+ startOffset = adjustedContexts[0];
+ pto.add(ldid, startOffset);
+ };
+
+ if (retrieveEnd) {
+ endOffset = adjustedContexts[1] - 1;
+ pto.add(ldid, endOffset);
+ };
+
if (startOffset != -1)
startOffsetChar = pto.start(ldid, startOffset);
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
index b5b009c..6a2485e 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
@@ -24,6 +24,9 @@
public static int maxTokenMatchSize = 50;
public static int maxTokenContextSize = 60;
public static int maxCharContextSize = 500;
+ public static int leftContextMaxShrink = 0;
+ public static int rightContextMaxShrink = 0;
+ public static int kwicMaxToken = -1;
public static int defaultSearchContextLength = 6;
public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
@@ -88,9 +91,20 @@
public static void updateConfigurations (Properties prop) {
String maxTokenMatchSize = prop.getProperty("krill.match.max.token");
+
+ // TODO:
+ // Should be separated for left and right!
String maxTokenContextSize = prop.getProperty("krill.context.max.token");
+
+ // Maximum number of tokens to shrink from context based on match size
+ // (only affects token-based contexts)
+ String leftContextMaxShrink = prop.getProperty("krill.context.left.maxShrink");
+ String rightContextMaxShrink = prop.getProperty("krill.context.right.maxShrink");
+
+ String kwicMaxToken = prop.getProperty("krill.kwic.max.token");
+
// EM: not implemented yet
-// String maxCharContextSize = prop.getProperty("krill.context.max.char");
+ // String maxCharContextSize = prop.getProperty("krill.context.max.char");
String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
@@ -123,6 +137,49 @@
}
}
+ if (leftContextMaxShrink != null) {
+ if (leftContextMaxShrink.equals("max")) {
+ KrillProperties.leftContextMaxShrink = KrillProperties.maxTokenContextSize;
+ } else {
+ KrillProperties.leftContextMaxShrink = Integer
+ .parseInt(leftContextMaxShrink);
+ if (KrillProperties.leftContextMaxShrink > KrillProperties.maxTokenContextSize)
+ KrillProperties.leftContextMaxShrink = KrillProperties.maxTokenContextSize;
+ else if (KrillProperties.leftContextMaxShrink < 0)
+ KrillProperties.leftContextMaxShrink = 0;
+ };
+ };
+ if (rightContextMaxShrink != null) {
+ if (rightContextMaxShrink.equals("max")) {
+ KrillProperties.rightContextMaxShrink = KrillProperties.maxTokenContextSize;
+ } else {
+ KrillProperties.rightContextMaxShrink = Integer
+ .parseInt(rightContextMaxShrink);
+ if (KrillProperties.rightContextMaxShrink > KrillProperties.maxTokenContextSize)
+ KrillProperties.rightContextMaxShrink = KrillProperties.maxTokenContextSize;
+ else if (KrillProperties.rightContextMaxShrink < 0)
+ KrillProperties.rightContextMaxShrink = 0;
+ };
+ };
+
+ if (kwicMaxToken != null) {
+ KrillProperties.kwicMaxToken = Integer.parseInt(kwicMaxToken);
+
+ if (leftContextMaxShrink != null || rightContextMaxShrink != null) {
+ log.warn("krill.kwic.max.token is set: individual "
+ + "krill.context.left.maxShrink / krill.context.right.maxShrink "
+ + "values will be ignored");
+ };
+
+ int totalAllowance = KrillProperties.maxTokenMatchSize
+ + 2 * KrillProperties.maxTokenContextSize;
+ int totalShrink = Math.max(0,
+ Math.min(totalAllowance - KrillProperties.kwicMaxToken,
+ 2 * KrillProperties.maxTokenContextSize));
+ KrillProperties.leftContextMaxShrink = totalShrink / 2;
+ KrillProperties.rightContextMaxShrink = totalShrink - totalShrink / 2;
+ };
+
}
catch (NumberFormatException e) {
log.error("A Krill property expects numerical values: "
diff --git a/src/main/resources/krill.properties.info b/src/main/resources/krill.properties.info
index 45fc56b..081d04b 100644
--- a/src/main/resources/krill.properties.info
+++ b/src/main/resources/krill.properties.info
@@ -15,3 +15,59 @@
krill.index.commit.auto = 500
krill.index.relations.max = 100
krill.index.textSize.max = 20000000
+
+# Token retrieval settings:
+#
+# krill.match.max.token = 5
+#
+## Maximum number (i.e. length) of tokens to be retrievable.
+## Matches longer than that will be cut.
+## Defaults to 50
+
+# krill.context.max.token =
+#
+## Maximum number (i.e. length) of tokens to be retrieved (left and right) of a match.
+## Defaults to 60
+
+# krill.kwic.max.token =
+#
+## Maximum total number of tokens in a KWIC snippet (left + match + right).
+## When set, this derives krill.context.left.maxShrink and
+## krill.context.right.maxShrink automatically:
+## totalShrink = (krill.match.max.token + 2 * krill.context.max.token) - krill.kwic.max.token
+## (clamped to the range 0 .. 2 * krill.context.max.token), split evenly between left and right.
+## When this property is set, individual maxShrink values are ignored.
+## The existing krill.match.max.token remains in effect as a separate cap
+## on match length, preventing data leakage from sentence-level queries.
+## Not set by default (no KWIC cap - backward compatible).
+
+# krill.context.left.maxShrink = 0
+#
+## Maximum number of tokens the left context may shrink based on match length.
+## Defaults to 0 (no shrinking - full context is always returned).
+## When a match is long, the context shrinks by up to this many tokens,
+## keeping the total KWIC width manageable.
+## The total shrink is distributed proportionally between left and right
+## according to their respective maxShrink values.
+## Use the string "max" to allow the context to shrink entirely
+## (up to krill.context.max.token tokens).
+## Ignored when krill.kwic.max.token is set.
+## Note: Only token-based contexts are affected; character-based contexts
+## are currently not adjusted by this feature.
+
+# krill.context.right.maxShrink = 0
+#
+## Maximum number of tokens the right context may shrink based on match length.
+## Defaults to 0 (no shrinking - full context is always returned).
+## When a match is long, the context shrinks by up to this many tokens,
+## keeping the total KWIC width manageable.
+## The total shrink is distributed proportionally between left and right
+## according to their respective maxShrink values.
+## Use the string "max" to allow the context to shrink entirely
+## (up to krill.context.max.token tokens).
+## Ignored when krill.kwic.max.token is set.
+## Note: Only token-based contexts are affected; character-based contexts
+## are currently not adjusted by this feature.
+
+
+
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMaxContext.java b/src/test/java/de/ids_mannheim/korap/index/TestMaxContext.java
index 6072688..3c29a95 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMaxContext.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMaxContext.java
@@ -2,10 +2,13 @@
import static de.ids_mannheim.korap.TestSimple.getJsonString;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
import java.io.IOException;
+import java.util.Properties;
import org.junit.BeforeClass;
+import org.junit.After;
import org.junit.Test;
import com.fasterxml.jackson.core.JsonProcessingException;
@@ -14,12 +17,15 @@
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
+import org.apache.lucene.index.Term;
import de.ids_mannheim.korap.Krill;
import de.ids_mannheim.korap.KrillIndex;
import de.ids_mannheim.korap.response.Match;
import de.ids_mannheim.korap.response.Result;
import de.ids_mannheim.korap.response.SearchContext;
import de.ids_mannheim.korap.util.KrillProperties;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
public class TestMaxContext {
private static KrillIndex ki;
@@ -41,6 +47,33 @@
.getFile());
}
+ private int savedMaxTokenMatchSize;
+ private int savedMaxTokenContextSize;
+ private int savedMaxCharContextSize;
+ private int savedDefaultSearchContextLength;
+ private boolean savedMatchExpansionIncludeContextSize;
+
+ @org.junit.Before
+ public void saveGlobals() {
+ savedMaxTokenMatchSize = KrillProperties.maxTokenMatchSize;
+ savedMaxTokenContextSize = KrillProperties.maxTokenContextSize;
+ savedMaxCharContextSize = KrillProperties.maxCharContextSize;
+ savedDefaultSearchContextLength = KrillProperties.defaultSearchContextLength;
+ savedMatchExpansionIncludeContextSize = KrillProperties.matchExpansionIncludeContextSize;
+ };
+
+ @After
+ public void resetGlobals() {
+ KrillProperties.leftContextMaxShrink = 0;
+ KrillProperties.rightContextMaxShrink = 0;
+ KrillProperties.kwicMaxToken = -1;
+ KrillProperties.maxTokenMatchSize = savedMaxTokenMatchSize;
+ KrillProperties.maxTokenContextSize = savedMaxTokenContextSize;
+ KrillProperties.maxCharContextSize = savedMaxCharContextSize;
+ KrillProperties.defaultSearchContextLength = savedDefaultSearchContextLength;
+ KrillProperties.matchExpansionIncludeContextSize = savedMatchExpansionIncludeContextSize;
+ };
+
@Test
public void testSmallerTokenContextSize () throws IOException {
@@ -74,7 +107,6 @@
Krill ks = new Krill(jsonNode);
Result kr = ks.apply(ki);
- kr = ks.apply(ki);
SearchContext context = kr.getContext();
assertEquals(KrillProperties.maxTokenContextSize,
@@ -96,7 +128,7 @@
@Test
public void searchWithLargerContextCharSize ()
- throws JsonMappingException, JsonProcessingException {
+ throws JsonMappingException, JsonProcessingException {
JsonNode jsonNode = mapper.readTree(jsonQuery);
ArrayNode leftNode = (ArrayNode) jsonNode.at("/meta/context/left");
ArrayNode rightNode = (ArrayNode) jsonNode.at("/meta/context/right");
@@ -138,4 +170,496 @@
assertEquals(6089, km.getSnippetBrackets().length());
KrillProperties.defaultSearchContextLength = 6;
};
-}
+
+
+ @Test
+ public void testTokenSnippetMatchLength1 () throws IOException {
+ SpanTermQuery stq = new SpanTermQuery(new Term("tokens", "s:des"));
+ Result kr = ki.search(stq, (short) 10);
+
+ Match km = kr.getMatch(0);
+ assertEquals(7, km.getStartPos());
+ assertEquals(8, km.getEndPos());
+ assertEquals(6, km.getContext().left.getLength());
+ assertEquals(6, km.getContext().right.getLength());
+
+
+ assertEquals("{\"left\":[\"bzw.\",\"a\",\"ist\",\"der\",\"erste\",\"Buchstabe\"]," +
+ "\"match\":[\"des\"]," +
+ "\"right\":[\"lateinischen\",\"Alphabets\",\"und\",\"ein\",\"Vokal\",\"Der\"]}",
+ kr.getMatch(0).getSnippetTokens().toString());
+
+ KrillProperties.leftContextMaxShrink = 1;
+ KrillProperties.rightContextMaxShrink = 1;
+
+ // Shrinks the left context by 1 - as that is the match length - although it could be 2
+ assertEquals("{\"left\":[\"gibt\",\"es\",\"zwei\",\"verschiedene\",\"Phoneme\"],"+
+ "\"match\":[\"des\"],"+
+ "\"right\":[\"Vokals\",\"den\",\"Kurzvokal\",\"a,\",\"wie\",\"z\"]}",
+ kr.getMatch(1).getSnippetTokens().toString());
+
+ KrillProperties.leftContextMaxShrink = 5;
+ KrillProperties.rightContextMaxShrink = 5;
+
+ // Shrinks the left context by 1 - as that is the match length - although it could be 10
+ assertEquals("{\"left\":[\"B.\",\"in\",\"Rat\",\"Die\",\"Länge\"],"+
+ "\"match\":[\"des\"],"+
+ "\"right\":[\"Vokals\",\"ist\",\"unterschiedlich\",\"gekennzeichnet\",\"Langer\",\"Vokal\"]}",
+ kr.getMatch(2).getSnippetTokens().toString());
+ };
+
+ @Test
+ public void testTokenSnippetMatchLengthLong () throws JsonMappingException, JsonProcessingException {
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(60, km.getEndPos());
+ assertEquals(5, km.getContext().left.getLength());
+ assertEquals(5, km.getContext().right.getLength());
+
+ String snippetToken = kr.getMatch(0).getSnippetTokens().toString();
+ assertTrue(snippetToken.contains(
+ "\"left\":[\"sechsthäufigste\",\"Buchstabe\",\"in\",\"deutschen\",\"Texten\"]"
+ )
+ );
+ assertTrue(snippetToken.contains(
+ "\"right\":[\"1.\",\"Aussprache\",\"Im\",\"Deutschen\",\"und\"]"
+ )
+ );
+
+ String snippetHTML = kr.getMatch(0).getSnippetHTML();
+ assertTrue(snippetHTML.contains("<span class=\"context-left\"><span class=\"more\"></span>sechsthäufigste Buchstabe in deutschen Texten. </span>"));
+ assertTrue(snippetHTML.contains("<span class=\"context-right\">. 1. Aussprache Im Deutschen und<span class=\"more\"></span></span>"));
+ };
+
+ @Test
+ public void testTokenSnippetMatchLengthLong2 () throws JsonMappingException, JsonProcessingException {
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(60, km.getEndPos());
+ assertEquals(5, km.getContext().left.getLength());
+ assertEquals(5, km.getContext().right.getLength());
+
+ KrillProperties.leftContextMaxShrink = 1;
+ KrillProperties.rightContextMaxShrink = 1;
+
+ String snippetToken = kr.getMatch(0).getSnippetTokens().toString();
+
+ assertTrue(snippetToken.contains(
+ "\"left\":[\"Buchstabe\",\"in\",\"deutschen\",\"Texten\"]"
+ )
+ );
+
+ assertTrue(snippetToken.contains(
+ "\"right\":[\"1.\",\"Aussprache\",\"Im\",\"Deutschen\"]"
+ )
+ );
+
+ String snippetHTML = kr.getMatch(0).getSnippetHTML();
+ assertTrue(snippetHTML.contains("<span class=\"context-left\"><span class=\"more\"></span>Buchstabe in deutschen Texten. </span>"));
+ assertTrue(snippetHTML.contains("<span class=\"context-right\">. 1. Aussprache Im Deutschen<span class=\"more\"></span></span>"));
+
+ };
+
+ @Test
+ public void testTokenSnippetMatchLengthLong3 () throws JsonMappingException, JsonProcessingException {
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(60, km.getEndPos());
+ assertEquals(5, km.getContext().left.getLength());
+ assertEquals(5, km.getContext().right.getLength());
+
+ KrillProperties.leftContextMaxShrink = 4;
+ KrillProperties.rightContextMaxShrink = 2;
+
+ String snippetToken = kr.getMatch(0).getSnippetTokens().toString();
+
+ assertTrue(snippetToken.contains(
+ "\"left\":[\"Texten\"]"
+ )
+ );
+
+ assertTrue(snippetToken.contains(
+ "\"right\":[\"1.\",\"Aussprache\",\"Im\"]"
+ )
+ );
+
+ String snippetHTML = kr.getMatch(0).getSnippetHTML();
+ assertTrue(snippetHTML.contains("<span class=\"context-left\"><span class=\"more\"></span>Texten. </span>"));
+ assertTrue(snippetHTML.contains("<span class=\"context-right\">. 1. Aussprache Im<span class=\"more\"></span></span>"));
+
+ };
+
+ @Test
+ public void testTokenSnippetMatchLengthLong4 () throws JsonMappingException, JsonProcessingException {
+ int before = KrillProperties.maxTokenMatchSize;
+ KrillProperties.maxTokenMatchSize = 5;
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(39, km.getEndPos());
+ assertEquals(5, km.getContext().left.getLength());
+ assertEquals(5, km.getContext().right.getLength());
+
+ KrillProperties.leftContextMaxShrink = 5;
+ KrillProperties.rightContextMaxShrink = 0;
+
+ String snippetToken = kr.getMatch(0).getSnippetTokens().toString();
+ KrillProperties.maxTokenMatchSize = before;
+
+ assertTrue(!snippetToken.contains("\"left\""));
+ assertTrue(snippetToken.contains(
+ "\"match\":[\"Mit\",\"Ausnahme\",\"von\",\"Fremdwörtern\",\"und\"]"
+ )
+ );
+
+ assertTrue(snippetToken.contains(
+ "\"right\":[\"Namen\",\"ist\",\"das\",\"A\",\"der\"]"
+ )
+ );
+
+ String snippetHTML = kr.getMatch(0).getSnippetHTML();
+ assertTrue(snippetHTML.contains("<span class=\"context-left\"><span class=\"more\"></span></span>"));
+ assertTrue(snippetHTML.contains("<span class=\"match\"><mark>Mit Ausnahme von Fremdwörtern und</mark><span class=\"cutted\"></span></span>"));
+ assertTrue(snippetHTML.contains("<span class=\"context-right\"> Namen ist das A der<span class=\"more\"></span></span>"));
+ };
+
+
+ @Test
+ public void testTokenSnippetMatchLengthLong5 () throws JsonMappingException, JsonProcessingException {
+ int before = KrillProperties.maxTokenMatchSize;
+ KrillProperties.maxTokenMatchSize = 5;
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(39, km.getEndPos());
+ assertEquals(5, km.getContext().left.getLength());
+ assertEquals(5, km.getContext().right.getLength());
+
+ // Adjust all context for the matchsize
+ KrillProperties.leftContextMaxShrink = 5;
+ KrillProperties.rightContextMaxShrink = 5;
+
+ String snippetToken = kr.getMatch(0).getSnippetTokens().toString();
+ KrillProperties.maxTokenMatchSize = before;
+
+ assertEquals("{\"left\":[\"deutschen\",\"Texten\"],\"match\":[\"Mit\",\"Ausnahme\",\"von\",\"Fremdwörtern\",\"und\"],\"right\":[\"Namen\",\"ist\",\"das\"]}", snippetToken);
+
+ String snippetHTML = kr.getMatch(0).getSnippetHTML();
+ assertTrue(snippetHTML.contains("<span class=\"context-left\"><span class=\"more\"></span>deutschen Texten. </span>"));
+ assertTrue(snippetHTML.contains("<span class=\"match\"><mark>Mit Ausnahme von Fremdwörtern und</mark><span class=\"cutted\"></span></span>"));
+ assertTrue(snippetHTML.contains("<span class=\"context-right\"> Namen ist das<span class=\"more\"></span></span>"));
+ };
+
+ @Test
+ public void testTokenSnippetMatchLengthLongAllKwic () throws JsonMappingException, JsonProcessingException {
+ int before = KrillProperties.maxTokenMatchSize;
+ KrillProperties.maxTokenMatchSize = 10;
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(44, km.getEndPos());
+ assertEquals(5, km.getContext().left.getLength());
+ assertEquals(5, km.getContext().right.getLength());
+
+ // Adjust all context for the matchsize
+ KrillProperties.leftContextMaxShrink = 5;
+ KrillProperties.rightContextMaxShrink = 5;
+ String snippetToken = kr.getMatch(0).getSnippetTokens().toString();
+ KrillProperties.maxTokenMatchSize = before;
+
+ assertEquals("{\"match\":[\"Mit\",\"Ausnahme\",\"von\",\"Fremdwörtern\",\"und\",\"Namen\",\"ist\",\"das\",\"A\",\"der\"]}", snippetToken);
+
+ String snippetHTML = kr.getMatch(0).getSnippetHTML();
+ assertTrue(snippetHTML.contains("<span class=\"context-left\"><span class=\"more\"></span></span>"));
+ assertTrue(snippetHTML.contains("<span class=\"match\"><mark>Mit Ausnahme von Fremdwörtern und Namen ist das A der</mark><span class=\"cutted\"></span></span>"));
+ assertTrue(snippetHTML.contains("<span class=\"context-right\"><span class=\"more\"></span></span>"));
+
+ };
+
+ @Test
+ public void testUpdateConfigurationsMax () {
+ Properties props = new Properties();
+ props.setProperty("krill.context.left.maxShrink", "max");
+ props.setProperty("krill.context.right.maxShrink", "max");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(KrillProperties.maxTokenContextSize,
+ KrillProperties.leftContextMaxShrink);
+ assertEquals(KrillProperties.maxTokenContextSize,
+ KrillProperties.rightContextMaxShrink);
+ };
+
+ @Test
+ public void testUpdateConfigurationsEdgeCases () {
+ Properties props = new Properties();
+
+ // Negative values should be clamped to 0
+ props.setProperty("krill.context.left.maxShrink", "-5");
+ props.setProperty("krill.context.right.maxShrink", "-10");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(0, KrillProperties.leftContextMaxShrink);
+ assertEquals(0, KrillProperties.rightContextMaxShrink);
+
+ // Values exceeding maxTokenContextSize should be clamped
+ props.setProperty("krill.context.left.maxShrink", "9999");
+ props.setProperty("krill.context.right.maxShrink", "9999");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(KrillProperties.maxTokenContextSize,
+ KrillProperties.leftContextMaxShrink);
+ assertEquals(KrillProperties.maxTokenContextSize,
+ KrillProperties.rightContextMaxShrink);
+
+ // Normal value
+ props.setProperty("krill.context.left.maxShrink", "3");
+ props.setProperty("krill.context.right.maxShrink", "7");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(3, KrillProperties.leftContextMaxShrink);
+ assertEquals(7, KrillProperties.rightContextMaxShrink);
+ };
+
+ @Test
+ public void testTokenSnippetMatchLengthLongRightOnly ()
+ throws JsonMappingException, JsonProcessingException {
+ int before = KrillProperties.maxTokenMatchSize;
+ KrillProperties.maxTokenMatchSize = 5;
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(39, km.getEndPos());
+ assertEquals(5, km.getContext().left.getLength());
+ assertEquals(5, km.getContext().right.getLength());
+
+ KrillProperties.leftContextMaxShrink = 0;
+ KrillProperties.rightContextMaxShrink = 5;
+
+ String snippetToken = kr.getMatch(0).getSnippetTokens().toString();
+ KrillProperties.maxTokenMatchSize = before;
+
+ assertTrue(snippetToken.contains(
+ "\"left\":[\"sechsthäufigste\",\"Buchstabe\",\"in\",\"deutschen\",\"Texten\"]"
+ ));
+ assertTrue(snippetToken.contains(
+ "\"match\":[\"Mit\",\"Ausnahme\",\"von\",\"Fremdwörtern\",\"und\"]"
+ ));
+ assertTrue(!snippetToken.contains("\"right\""));
+ };
+
+ @Test
+ public void testSnippetBracketsWithAdjustment ()
+ throws JsonMappingException, JsonProcessingException {
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(60, km.getEndPos());
+
+ KrillProperties.leftContextMaxShrink = 4;
+ KrillProperties.rightContextMaxShrink = 2;
+
+ String brackets = kr.getMatch(0).getSnippetBrackets();
+
+ assertTrue(brackets.contains("Texten."));
+ assertTrue(!brackets.contains("sechsthäufigste"));
+ assertTrue(brackets.contains("[["));
+ assertTrue(brackets.contains("]]"));
+ assertTrue(brackets.contains("Aussprache Im"));
+ assertTrue(!brackets.contains("Deutschen und"));
+ };
+
+ @Test
+ public void testSmallClientContextWithLargeAdjustment ()
+ throws IOException {
+ SpanTermQuery stq = new SpanTermQuery(new Term("tokens", "s:des"));
+ Result kr = ki.search(stq, (short) 10);
+
+ KrillProperties.leftContextMaxShrink = 25;
+ KrillProperties.rightContextMaxShrink = 25;
+
+ Match km = kr.getMatch(0);
+ String snippetToken = km.getSnippetTokens().toString();
+
+ assertTrue(snippetToken.contains("\"match\":[\"des\"]"));
+
+ String snippetHTML = km.getSnippetHTML();
+ assertTrue(snippetHTML.contains("<span class=\"match\">"));
+
+ km = kr.getMatch(1);
+ snippetToken = km.getSnippetTokens().toString();
+ assertTrue(snippetToken.contains("\"match\":[\"des\"]"));
+ };
+
+ @Test
+ public void testGuardClampsOverShrinkWithLongMatch ()
+ throws JsonMappingException, JsonProcessingException {
+ // Use a small context (2 tokens) with the sentence query (match is 26 tokens)
+ // and a large adjustment. The guard must clamp shrink to available context.
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ ArrayNode leftNode = (ArrayNode) jsonNode.at("/meta/context/left");
+ ArrayNode rightNode = (ArrayNode) jsonNode.at("/meta/context/right");
+ leftNode.set(1, "2");
+ rightNode.set(1, "2");
+
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(60, km.getEndPos());
+ assertEquals(2, km.getContext().left.getLength());
+ assertEquals(2, km.getContext().right.getLength());
+
+ // Set adjustment much larger than available context
+ KrillProperties.leftContextMaxShrink = 25;
+ KrillProperties.rightContextMaxShrink = 25;
+
+ // Without the guard, shrinkLeft/shrinkRight would be 13 each (half of 26),
+ // but available context is only 2 on each side.
+ // The guard clamps to 2, so all context is consumed.
+ String snippetToken = km.getSnippetTokens().toString();
+ assertTrue(!snippetToken.contains("\"left\""));
+ assertTrue(!snippetToken.contains("\"right\""));
+ assertTrue(snippetToken.contains("\"match\""));
+
+ String snippetHTML = km.getSnippetHTML();
+ assertTrue(snippetHTML.contains("<span class=\"context-left\"><span class=\"more\"></span></span>"));
+ assertTrue(snippetHTML.contains("<span class=\"context-right\"><span class=\"more\"></span></span>"));
+ };
+
+ @Test
+ public void testKwicMaxTokenBasic () {
+ // kwic.max.token = matchMax + 2*contextMax - totalShrink
+ // Setting kwicMaxToken=60 with matchMax=50 and contextMax=25:
+ // maxShrink = contextMax*2 + matchMax - kwicMaxToken = 25*2 + 50 - 60 = 40
+ // split evenly: leftMaxShrink=20, rightMaxShrink=20
+ Properties props = new Properties();
+ props.setProperty("krill.kwic.max.token", "60");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(60, KrillProperties.kwicMaxToken);
+ assertEquals(20, KrillProperties.leftContextMaxShrink);
+ assertEquals(20, KrillProperties.rightContextMaxShrink);
+ };
+
+ @Test
+ public void testKwicMaxTokenOverridesIndividual () {
+ // When kwic.max.token is set, individual maxShrink values are ignored
+ Properties props = new Properties();
+ props.setProperty("krill.context.left.maxShrink", "3");
+ props.setProperty("krill.context.right.maxShrink", "7");
+ props.setProperty("krill.kwic.max.token", "60");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(60, KrillProperties.kwicMaxToken);
+ // Derived from kwic.max.token, not from the individual values
+ assertEquals(20, KrillProperties.leftContextMaxShrink);
+ assertEquals(20, KrillProperties.rightContextMaxShrink);
+ };
+
+ @Test
+ public void testKwicMaxTokenEqualsTotalAllowance () {
+ // kwicMaxToken = matchMax + 2*contextMax means no shrink needed
+ // 50 + 2*25 = 100
+ Properties props = new Properties();
+ props.setProperty("krill.kwic.max.token", "100");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(100, KrillProperties.kwicMaxToken);
+ assertEquals(0, KrillProperties.leftContextMaxShrink);
+ assertEquals(0, KrillProperties.rightContextMaxShrink);
+ };
+
+ @Test
+ public void testKwicMaxTokenSmall () {
+ // kwicMaxToken = matchMax means context fully shrinks
+ // maxShrink = 25*2 + 50 - 50 = 50, clamped to 2*contextMax = 50
+ // split evenly: 25/25
+ Properties props = new Properties();
+ props.setProperty("krill.kwic.max.token", "50");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(50, KrillProperties.kwicMaxToken);
+ assertEquals(25, KrillProperties.leftContextMaxShrink);
+ assertEquals(25, KrillProperties.rightContextMaxShrink);
+ };
+
+ @Test
+ public void testKwicMaxTokenTooSmall () {
+ // kwicMaxToken below matchMax: the derived shrink is capped at 2*contextMax
+ Properties props = new Properties();
+ props.setProperty("krill.kwic.max.token", "10");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(10, KrillProperties.kwicMaxToken);
+ assertEquals(25, KrillProperties.leftContextMaxShrink);
+ assertEquals(25, KrillProperties.rightContextMaxShrink);
+ };
+
+ @Test
+ public void testKwicMaxTokenWithSearch ()
+ throws JsonMappingException, JsonProcessingException {
+ // End-to-end test: kwic.max.token=15, matchMax=5, contextMax=25
+ // maxShrink = 25*2+5-15 = 40 (below the cap of 2*25 = 50), i.e. 20/20
+ // But context is only 5 on each side (query requests 5).
+ // With a 5-token match, requiredShrink = min(5, 40) = 5: shrink 3 left / 2 right
+ int before = KrillProperties.maxTokenMatchSize;
+ KrillProperties.maxTokenMatchSize = 5;
+
+ Properties props = new Properties();
+ props.setProperty("krill.kwic.max.token", "15");
+ KrillProperties.updateConfigurations(props);
+
+ JsonNode jsonNode = mapper.readTree(jsonQuery);
+ Krill ks = new Krill(jsonNode);
+ Result kr = ks.apply(ki);
+
+ Match km = kr.getMatch(0);
+ assertEquals(34, km.getStartPos());
+ assertEquals(39, km.getEndPos());
+ assertEquals(5, km.getContext().left.getLength());
+ assertEquals(5, km.getContext().right.getLength());
+
+ String snippetToken = km.getSnippetTokens().toString();
+ KrillProperties.maxTokenMatchSize = before;
+
+ // maxShrink=40 = left=20,right=20; match=5; requiredShrink=min(5,40)=5
+ // shrinkLeft = round(5*(20/40))=round(2.5)=3, shrinkRight=2
+ // left context: 5-3=2, right context: 5-2=3
+ assertTrue(snippetToken.contains("\"left\":[\"deutschen\",\"Texten\"]"));
+ assertTrue(snippetToken.contains(
+ "\"match\":[\"Mit\",\"Ausnahme\",\"von\",\"Fremdwörtern\",\"und\"]"
+ ));
+ assertTrue(snippetToken.contains("\"right\":[\"Namen\",\"ist\",\"das\"]"));
+ };
+
+ @Test
+ public void testKwicMaxTokenNotSet () {
+ // When kwic.max.token is not set, individual maxShrink values work normally
+ Properties props = new Properties();
+ props.setProperty("krill.context.left.maxShrink", "3");
+ props.setProperty("krill.context.right.maxShrink", "7");
+ KrillProperties.updateConfigurations(props);
+ assertEquals(-1, KrillProperties.kwicMaxToken);
+ assertEquals(3, KrillProperties.leftContextMaxShrink);
+ assertEquals(7, KrillProperties.rightContextMaxShrink);
+ };
+};
diff --git a/src/test/java/de/ids_mannheim/korap/response/TestMatch.java b/src/test/java/de/ids_mannheim/korap/response/TestMatch.java
index fcc1db3..6e67b9d 100644
--- a/src/test/java/de/ids_mannheim/korap/response/TestMatch.java
+++ b/src/test/java/de/ids_mannheim/korap/response/TestMatch.java
@@ -53,6 +53,5 @@
false);
assertEquals(326, m.getStartPos());
assertEquals(376, m.getEndPos());
- };
-
+ };
};