Adjust context to shrink based on match size (closes #229)
Change-Id: I5dba1f714beb07c71c69a783d801d4a40492ff5d
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 4374783..10c96c8 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -863,7 +863,6 @@
return this;
};
-
@JsonIgnore
public SearchContext getContext () {
if (this.context == null)
@@ -876,6 +875,36 @@
return this.getEndPos() - this.getStartPos();
};
+    /**
+     * Shrink a token context window in proportion to the match length.
+     * Negative match lengths or shrink limits count as zero, so the
+     * window is never widened.
+     * @param startContext first context token position (inclusive)
+     * @param endContext end context token position (exclusive)
+     * @return the adjusted pair { startContext, endContext }
+     */
+    private int[] adjustContext (int startContext, int endContext) {
+        int maxShrinkLeft = Math.max(0, KrillProperties.leftContextMaxShrink);
+        int maxShrinkRight = Math.max(0, KrillProperties.rightContextMaxShrink);
+        int maxShrink = maxShrinkLeft + maxShrinkRight;
+        // Unset or inverted match positions leave nothing to subtract
+        int matchLength = Math.max(0, this.endPos - this.startPos);
+        // Shrinking is disabled or the match is empty
+        int requiredShrink = Math.min(matchLength, maxShrink);
+        if (requiredShrink == 0) {
+            return new int[]{startContext, endContext};
+        }
+
+        // Split the reduction proportionally to the configured limits
+        int shrinkLeft = (int) Math.round(requiredShrink * ((double) maxShrinkLeft / maxShrink));
+        int shrinkRight = requiredShrink - shrinkLeft;
+
+        // Guard: never shrink more context than actually available
+        shrinkLeft = Math.min(shrinkLeft, Math.max(0, this.startPos - startContext));
+        shrinkRight = Math.min(shrinkRight, Math.max(0, endContext - this.endPos));
+
+        return new int[]{startContext + shrinkLeft, endContext - shrinkRight};
+    };
// Retrieve markers in a certain area
public List<int[]> retrieveMarkers (String marker) {
@@ -906,6 +935,12 @@
int minStartPos = this.getStartPos() - KrillProperties.maxTokenContextSize;
int maxEndPos = this.getEndPos() + KrillProperties.maxTokenContextSize;
+ // Adjusting the context on every retrieval is not very efficient,
+ // but as no IO is involved, the overhead is acceptable
+ int[] adjustedContext = this.adjustContext(minStartPos, maxEndPos);
+ minStartPos = adjustedContext[0];
+ maxEndPos = adjustedContext[1];
+
if (DEBUG) {
log.debug("=================================");
log.debug("Retrieve markers between {}-{}",
@@ -1194,6 +1229,11 @@
spanContext[3] = end; // spanContext[3];
}
+ // Context adjustment is not needed here: span expansion changes
+ // match boundaries, not context boundaries. Context adjustment is
+ // applied when rendering the snippet (getSnippetTokens,
+ // _processOffsetChars, retrieveMarkers).
+
this.potentialStartPosChar = spanContext[2];
this.potentialEndPosChar = spanContext[3];
this.startMore = false;
@@ -1656,7 +1696,7 @@
startContextChar = spanContext[2];
endContextChar = spanContext[3];
}
-
+
// The offset is not yet defined - and defined by tokens
if (endContext == -1) {
@@ -1685,6 +1725,11 @@
if (DEBUG)
log.debug("Set endContext {}", endContext);
};
+
+ // Adjust the context
+ int[] adjustedContext = this.adjustContext(startContext, endContext);
+ startContext = adjustedContext[0];
+ endContext = adjustedContext[1];
// Retrieve the character offsets for all tokens
for (int i = startContext; i < endContext; i++) {
@@ -1694,8 +1739,10 @@
if (startContextChar == -1)
startContextChar = pto.start(ldid, startContext);
- if (endContextChar == -1)
- endContextChar = pto.end(ldid, endContext);
+ if (endContextChar == -1) {
+ int lastPos = Math.max(endContext, this.endPos) - 1;
+ endContextChar = pto.end(ldid, lastPos);
+ }
if (DEBUG)
log.debug("Match is {}/{} - {}/{}",startContext,startContextChar,endContext,endContextChar);
@@ -1720,6 +1767,9 @@
tokens = json.putArray("left");
for (i = startContext; i < this.startPos; i++) {
offsets = pto.span(ldid,i);
+ if (offsets == null) {
+ continue;
+ }
tokens.add(
codePointSubstring(this.tempSnippet,
offsets[0]- startContextChar, offsets[1] - startContextChar)
@@ -1728,6 +1778,8 @@
};
tokens = json.putArray("match");
+
+ // Create match token list
for (i = this.startPos; i < this.endPos; i++) {
offsets = pto.span(ldid,i);
if (offsets == null) {
@@ -2397,17 +2449,29 @@
PositionsToOffset pto = this.positionsToOffset;
+ // TODO:
+ // The best approach is probably to primarily focus on
+ // token contexts and in case they exceed the maxCharacter
+ // contexts, these should be cut at the end.
+ // This doesn't work with token list matches though,
+ // so we may need to deprecate the option to set character
+ // contexts altogether ...
+
+ boolean retrieveStart = false;
+ boolean retrieveEnd = false;
+
// The left offset is defined by tokens
if (this.context.left.isToken()) {
startOffset = this.startPos - this.context.left.getLength();
if (DEBUG)
log.trace("PTO will retrieve {} (Left context)",
startOffset);
- pto.add(ldid, startOffset);
+ retrieveStart = true;
}
// The left offset is defined by characters
else {
+ // TODO: Character-based left contexts are not adjusted
startOffsetChar = startPosChar - this.context.left.getLength();
};
@@ -2417,15 +2481,30 @@
if (DEBUG)
log.trace("PTO will retrieve {} (Right context)",
endOffset);
- pto.add(ldid, endOffset);
+ retrieveEnd = true;
}
// The right context is defined by characters
else {
+ // TODO: Character-based right contexts are not adjusted
endOffsetChar = (endPosChar == -1) ? -1
: endPosChar + this.context.right.getLength();
};
+ // Adjust token contexts (convert endOffset to exclusive for adjustContext)
+ int[] adjustedContexts = this.adjustContext(startOffset,
+ endOffset == -1 ? endOffset : endOffset + 1);
+
+ if (retrieveStart) {
+ startOffset = adjustedContexts[0];
+ pto.add(ldid, startOffset);
+ };
+
+ if (retrieveEnd) {
+ endOffset = adjustedContexts[1] - 1;
+ pto.add(ldid, endOffset);
+ };
+
if (startOffset != -1)
startOffsetChar = pto.start(ldid, startOffset);
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
index b5b009c..6a2485e 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
@@ -24,6 +24,9 @@
public static int maxTokenMatchSize = 50;
public static int maxTokenContextSize = 60;
public static int maxCharContextSize = 500;
+ public static int leftContextMaxShrink = 0;
+ public static int rightContextMaxShrink = 0;
+ public static int kwicMaxToken = -1;
public static int defaultSearchContextLength = 6;
public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
@@ -88,9 +91,20 @@
public static void updateConfigurations (Properties prop) {
String maxTokenMatchSize = prop.getProperty("krill.match.max.token");
+
+ // TODO:
+ // Should be separated for left and right!
String maxTokenContextSize = prop.getProperty("krill.context.max.token");
+
+ // Maximum number of tokens to shrink from context based on match size
+ // (only affects token-based contexts)
+ String leftContextMaxShrink = prop.getProperty("krill.context.left.maxShrink");
+ String rightContextMaxShrink = prop.getProperty("krill.context.right.maxShrink");
+
+ String kwicMaxToken = prop.getProperty("krill.kwic.max.token");
+
// EM: not implemented yet
-// String maxCharContextSize = prop.getProperty("krill.context.max.char");
+ // String maxCharContextSize = prop.getProperty("krill.context.max.char");
String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
@@ -123,6 +137,49 @@
}
}
+ if (leftContextMaxShrink != null) {
+ if (leftContextMaxShrink.equals("max")) {
+ KrillProperties.leftContextMaxShrink = KrillProperties.maxTokenContextSize;
+ } else {
+ KrillProperties.leftContextMaxShrink = Integer
+ .parseInt(leftContextMaxShrink);
+ if (KrillProperties.leftContextMaxShrink > KrillProperties.maxTokenContextSize)
+ KrillProperties.leftContextMaxShrink = KrillProperties.maxTokenContextSize;
+ else if (KrillProperties.leftContextMaxShrink < 0)
+ KrillProperties.leftContextMaxShrink = 0;
+ };
+ };
+ if (rightContextMaxShrink != null) {
+ if (rightContextMaxShrink.equals("max")) {
+ KrillProperties.rightContextMaxShrink = KrillProperties.maxTokenContextSize;
+ } else {
+ KrillProperties.rightContextMaxShrink = Integer
+ .parseInt(rightContextMaxShrink);
+ if (KrillProperties.rightContextMaxShrink > KrillProperties.maxTokenContextSize)
+ KrillProperties.rightContextMaxShrink = KrillProperties.maxTokenContextSize;
+ else if (KrillProperties.rightContextMaxShrink < 0)
+ KrillProperties.rightContextMaxShrink = 0;
+ };
+ };
+
+ if (kwicMaxToken != null) {
+ KrillProperties.kwicMaxToken = Integer.parseInt(kwicMaxToken);
+
+ if (leftContextMaxShrink != null || rightContextMaxShrink != null) {
+ log.warn("krill.kwic.max.token is set: individual "
+ + "krill.context.left.maxShrink / krill.context.right.maxShrink "
+ + "values will be ignored");
+ };
+
+ int totalAllowance = KrillProperties.maxTokenMatchSize
+ + 2 * KrillProperties.maxTokenContextSize;
+ int totalShrink = Math.max(0,
+ Math.min(totalAllowance - KrillProperties.kwicMaxToken,
+ 2 * KrillProperties.maxTokenContextSize));
+ KrillProperties.leftContextMaxShrink = totalShrink / 2;
+ KrillProperties.rightContextMaxShrink = totalShrink - totalShrink / 2;
+ };
+
}
catch (NumberFormatException e) {
log.error("A Krill property expects numerical values: "
diff --git a/src/main/resources/krill.properties.info b/src/main/resources/krill.properties.info
index 45fc56b..081d04b 100644
--- a/src/main/resources/krill.properties.info
+++ b/src/main/resources/krill.properties.info
@@ -15,3 +15,59 @@
krill.index.commit.auto = 500
krill.index.relations.max = 100
krill.index.textSize.max = 20000000
+
+# Token retrieval settings:
+#
+# krill.match.max.token = 5
+#
+## Maximum length of a match (in tokens) that can be retrieved.
+## Matches longer than that will be cut.
+## Defaults to 50
+
+# krill.context.max.token =
+#
+## Maximum number of context tokens to be retrieved on each side (left and right) of a match.
+## Defaults to 60
+
+# krill.kwic.max.token =
+#
+## Maximum total number of tokens in a KWIC snippet (left + match + right).
+## When set, this derives krill.context.left.maxShrink and
+## krill.context.right.maxShrink automatically:
+## totalShrink = (krill.match.max.token + 2 * krill.context.max.token) - krill.kwic.max.token
+## (clamped to 0 .. 2 * krill.context.max.token). Split evenly between left and right; an odd remainder goes to the right.
+## When this property is set, individual maxShrink values are ignored.
+## The existing krill.match.max.token remains in effect as a separate cap
+## on match length, preventing data leakage from sentence-level queries.
+## Not set by default (no KWIC cap - backward compatible).
+
+# krill.context.left.maxShrink = 0
+#
+## Maximum number of tokens the left context may shrink based on match length.
+## Defaults to 0 (no shrinking - full context is always returned).
+## When a match is long, the context shrinks by up to this many tokens,
+## keeping the total KWIC width manageable.
+## The total shrink is distributed proportionally between left and right
+## according to their respective maxShrink values.
+## Use the string "max" to allow the context to shrink entirely
+## (up to krill.context.max.token tokens).
+## Ignored when krill.kwic.max.token is set.
+## Note: Only token-based contexts are affected; character-based contexts
+## are currently not adjusted by this feature.
+
+# krill.context.right.maxShrink = 0
+#
+## Maximum number of tokens the right context may shrink based on match length.
+## Defaults to 0 (no shrinking - full context is always returned).
+## When a match is long, the context shrinks by up to this many tokens,
+## keeping the total KWIC width manageable.
+## The total shrink is distributed proportionally between left and right
+## according to their respective maxShrink values.
+## Use the string "max" to allow the context to shrink entirely
+## (up to krill.context.max.token tokens).
+## Ignored when krill.kwic.max.token is set.
+## Note: Only token-based contexts are affected; character-based contexts
+## are currently not adjusted by this feature.
+
+
+