Adjust context to shrink based on match size (closes #229)

Change-Id: I5dba1f714beb07c71c69a783d801d4a40492ff5d
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 4374783..10c96c8 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -863,7 +863,6 @@
         return this;
     };
 
-
     @JsonIgnore
     public SearchContext getContext () {
         if (this.context == null)
@@ -876,6 +875,36 @@
         return this.getEndPos() - this.getStartPos();
     };  
 
+    // Shrink the token context window [startContext, endContext) by up to
+    // the match length, capped by the configured left/right maxShrink values,
+    // so only markers in the relevant context are retrieved.
+    private int[] adjustContext (int startContext, int endContext) {
+        int maxShrinkLeft = Math.max(0, KrillProperties.leftContextMaxShrink);
+        int maxShrink = maxShrinkLeft + Math.max(0, KrillProperties.rightContextMaxShrink);
+
+        // A match with an invalid (negative) length must never widen
+        // the context, so the match length is clamped to zero
+        int matchLength = Math.max(0, this.endPos - this.startPos);
+
+        // Shrinking is disabled or there is nothing to subtract
+        if (maxShrink == 0 || matchLength == 0) {
+            return new int[]{startContext, endContext};
+        };
+
+        // Subtract the match length from the context, at most maxShrink tokens
+        int requiredShrink = Math.min(matchLength, maxShrink);
+
+        // Distribute the reduction proportionately to the configured limits;
+        // the right side takes the remainder, so both parts sum up exactly
+        int shrinkLeft = (int) Math.round(requiredShrink * ((double) maxShrinkLeft / maxShrink));
+        int shrinkRight = requiredShrink - shrinkLeft;
+
+        // Guard: never shrink more context than actually available
+        shrinkLeft = Math.min(shrinkLeft, Math.max(0, this.startPos - startContext));
+        shrinkRight = Math.min(shrinkRight, Math.max(0, endContext - this.endPos));
+
+        return new int[]{startContext + shrinkLeft, endContext - shrinkRight};
+    };
 	
 	// Retrieve markers in a certain area
 	public List<int[]> retrieveMarkers (String marker) {
@@ -906,6 +935,12 @@
         int minStartPos = this.getStartPos() - KrillProperties.maxTokenContextSize;
         int maxEndPos = this.getEndPos() + KrillProperties.maxTokenContextSize;
 
+        // Adjusting the context on every retrieval is not very efficient,
+        // but as no IO is involved, the overhead is acceptable
+        int[] adjustedContext = this.adjustContext(minStartPos, maxEndPos);
+        minStartPos = adjustedContext[0];
+        maxEndPos = adjustedContext[1];
+        
 		if (DEBUG) {
             log.debug("=================================");
 			log.debug("Retrieve markers between {}-{}",
@@ -1194,6 +1229,11 @@
                 spanContext[3] = end; // spanContext[3];
             }
 
+            // Context adjustment is not needed here: span expansion changes
+            // match boundaries, not context boundaries. Context adjustment is
+            // applied when rendering the snippet (getSnippetTokens,
+            // _processOffsetChars, retrieveMarkers).
+            
             this.potentialStartPosChar = spanContext[2];
             this.potentialEndPosChar = spanContext[3];
             this.startMore = false;
@@ -1656,7 +1696,7 @@
             startContextChar = spanContext[2];
             endContextChar = spanContext[3];
         }
-
+        
         // The offset is not yet defined - and defined by tokens
         if (endContext == -1) {
 
@@ -1685,6 +1725,11 @@
             if (DEBUG)
                 log.debug("Set endContext {}", endContext);
         };
+
+        // Adjust the context
+        int[] adjustedContext = this.adjustContext(startContext, endContext);
+        startContext = adjustedContext[0];
+        endContext = adjustedContext[1];
         
         // Retrieve the character offsets for all tokens
         for (int i = startContext; i < endContext; i++) {
@@ -1694,8 +1739,10 @@
         if (startContextChar == -1)
             startContextChar = pto.start(ldid, startContext);
 
-        if (endContextChar == -1)
-            endContextChar = pto.end(ldid, endContext);
+        if (endContextChar == -1) {
+            int lastPos = Math.max(endContext, this.endPos) - 1;
+            endContextChar = pto.end(ldid, lastPos);
+        }
             
         if (DEBUG)
             log.debug("Match is {}/{} - {}/{}",startContext,startContextChar,endContext,endContextChar);
@@ -1720,6 +1767,9 @@
             tokens = json.putArray("left");
             for (i = startContext; i < this.startPos; i++) {
                 offsets = pto.span(ldid,i);
+                if (offsets == null) {
+                    continue;
+                }
                 tokens.add(
                     codePointSubstring(this.tempSnippet,
                                        offsets[0]- startContextChar, offsets[1] - startContextChar)
@@ -1728,6 +1778,8 @@
         };
 
         tokens = json.putArray("match");
+
+        // Create match token list
         for (i = this.startPos; i < this.endPos; i++) {
             offsets = pto.span(ldid,i);
             if (offsets == null) {
@@ -2397,17 +2449,29 @@
 
             PositionsToOffset pto = this.positionsToOffset;
 
+            // TODO:
+            //   The best approach is probably to primarily focus on
+            //   token contexts and in case they exceed the maxCharacter
+            //   contexts, these should be cut at the end.
+            //   This doesn't work with token list matches though,
+            //   so we may need to deprecate the option to set character
+            //   contexts altogether ...
+
+            boolean retrieveStart = false;
+            boolean retrieveEnd = false;
+            
             // The left offset is defined by tokens
             if (this.context.left.isToken()) {
                 startOffset = this.startPos - this.context.left.getLength();
                 if (DEBUG)
                     log.trace("PTO will retrieve {} (Left context)",
                             startOffset);
-                pto.add(ldid, startOffset);
+                retrieveStart = true;
             }
 
             // The left offset is defined by characters
             else {
+                // TODO: Character-based contexts are not adjusted
                 startOffsetChar = startPosChar - this.context.left.getLength();
             };
 
@@ -2417,15 +2481,30 @@
                 if (DEBUG)
                     log.trace("PTO will retrieve {} (Right context)",
                             endOffset);
-                pto.add(ldid, endOffset);
+                retrieveEnd = true;
             }
 
             // The right context is defined by characters
             else {
+                // TODO: Character-based contexts are not adjusted
                 endOffsetChar = (endPosChar == -1) ? -1
                         : endPosChar + this.context.right.getLength();
             };
 
+            // Adjust token contexts (convert endOffset to exclusive for adjustContext)
+            int[] adjustedContexts = this.adjustContext(startOffset,
+                endOffset == -1 ? endOffset : endOffset + 1);
+            
+            if (retrieveStart) {
+                startOffset = adjustedContexts[0];
+                pto.add(ldid, startOffset);
+            };
+
+            if (retrieveEnd) {
+                endOffset = adjustedContexts[1] - 1;
+                pto.add(ldid, endOffset);
+            };
+                
             if (startOffset != -1)
                 startOffsetChar = pto.start(ldid, startOffset);
 
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
index b5b009c..6a2485e 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
@@ -24,6 +24,9 @@
     public static int maxTokenMatchSize = 50;
     public static int maxTokenContextSize = 60;
     public static int maxCharContextSize = 500;
+    public static int leftContextMaxShrink = 0;
+    public static int rightContextMaxShrink = 0;
+    public static int kwicMaxToken = -1;
     public static int defaultSearchContextLength = 6;
     public static int maxTextSize = DEFAULT_MAX_STRING_LEN; // Default max text size
     
@@ -88,9 +91,20 @@
 
     public static void updateConfigurations (Properties  prop) {
         String maxTokenMatchSize = prop.getProperty("krill.match.max.token");
+
+        // TODO:
+        // Should be separated for left and right!
         String maxTokenContextSize = prop.getProperty("krill.context.max.token");
+
+        // Maximum number of tokens to shrink from context based on match size
+        // (only affects token-based contexts)
+        String leftContextMaxShrink = prop.getProperty("krill.context.left.maxShrink");
+        String rightContextMaxShrink = prop.getProperty("krill.context.right.maxShrink");
+
+        String kwicMaxToken = prop.getProperty("krill.kwic.max.token");
+
         // EM: not implemented yet
-//        String maxCharContextSize = prop.getProperty("krill.context.max.char");
+        // String maxCharContextSize = prop.getProperty("krill.context.max.char");
         String defaultSearchContextLength = prop.getProperty("krill.search.context.default");
         String maxTextSizeValue = prop.getProperty("krill.index.textSize.max");
 
@@ -123,6 +137,49 @@
                 }
 
             }
+            if (leftContextMaxShrink != null) {
+                if (leftContextMaxShrink.equals("max")) {
+                    KrillProperties.leftContextMaxShrink = KrillProperties.maxTokenContextSize;
+                } else {
+                    KrillProperties.leftContextMaxShrink = Integer
+                        .parseInt(leftContextMaxShrink);
+                    if (KrillProperties.leftContextMaxShrink > KrillProperties.maxTokenContextSize)
+                        KrillProperties.leftContextMaxShrink = KrillProperties.maxTokenContextSize;
+                    else if (KrillProperties.leftContextMaxShrink < 0)
+                        KrillProperties.leftContextMaxShrink = 0;
+                };
+            };
+            if (rightContextMaxShrink != null) {
+                if (rightContextMaxShrink.equals("max")) {
+                    KrillProperties.rightContextMaxShrink = KrillProperties.maxTokenContextSize;
+                } else {
+                    KrillProperties.rightContextMaxShrink = Integer
+                        .parseInt(rightContextMaxShrink);
+                    if (KrillProperties.rightContextMaxShrink > KrillProperties.maxTokenContextSize)
+                        KrillProperties.rightContextMaxShrink = KrillProperties.maxTokenContextSize;
+                    else if (KrillProperties.rightContextMaxShrink < 0)
+                        KrillProperties.rightContextMaxShrink = 0;
+                };
+            };
+
+            if (kwicMaxToken != null) {
+                KrillProperties.kwicMaxToken = Integer.parseInt(kwicMaxToken);
+
+                if (leftContextMaxShrink != null || rightContextMaxShrink != null) {
+                    log.warn("krill.kwic.max.token is set: individual "
+                             + "krill.context.left.maxShrink / krill.context.right.maxShrink "
+                             + "values will be ignored");
+                };
+
+                int totalAllowance = KrillProperties.maxTokenMatchSize
+                    + 2 * KrillProperties.maxTokenContextSize;
+                int totalShrink = Math.max(0,
+                    Math.min(totalAllowance - KrillProperties.kwicMaxToken,
+                             2 * KrillProperties.maxTokenContextSize));
+                KrillProperties.leftContextMaxShrink = totalShrink / 2;
+                KrillProperties.rightContextMaxShrink = totalShrink - totalShrink / 2;
+            };
+
         }
         catch (NumberFormatException e) {
             log.error("A Krill property expects numerical values: "
diff --git a/src/main/resources/krill.properties.info b/src/main/resources/krill.properties.info
index 45fc56b..081d04b 100644
--- a/src/main/resources/krill.properties.info
+++ b/src/main/resources/krill.properties.info
@@ -15,3 +15,59 @@
 krill.index.commit.auto = 500
 krill.index.relations.max = 100
 krill.index.textSize.max = 20000000
+
+# Token retrieval settings:
+#
+# krill.match.max.token = 5
+#
+##  Maximum length of a match in tokens.
+##  Matches longer than that will be cut.
+##  Defaults to 50
+
+# krill.context.max.token = 
+#
+##  Maximum number of tokens to be retrieved to the left and to the right of a match, respectively.
+##  Defaults to 60
+
+# krill.kwic.max.token =
+#
+## Maximum total number of tokens in a KWIC snippet (left + match + right).
+## When set, this derives krill.context.left.maxShrink and
+## krill.context.right.maxShrink automatically:
+##   totalShrink = (krill.match.max.token + 2 * krill.context.max.token) - krill.kwic.max.token
+## Clamped to 0 .. 2 * krill.context.max.token and split evenly between left and right (an odd remainder goes to the right).
+## When this property is set, individual maxShrink values are ignored.
+## The existing krill.match.max.token remains in effect as a separate cap
+## on match length, preventing data leakage from sentence-level queries.
+## Not set by default (no KWIC cap - backward compatible).
+
+# krill.context.left.maxShrink = 0
+#
+## Maximum number of tokens the left context may shrink based on match length.
+## Defaults to 0 (no shrinking - full context is always returned).
+## When a match is long, the context shrinks by up to this many tokens,
+## keeping the total KWIC width manageable.
+## The total shrink is distributed proportionally between left and right
+## according to their respective maxShrink values.
+## Use the string "max" to allow the context to shrink entirely
+## (up to krill.context.max.token tokens).
+## Ignored when krill.kwic.max.token is set.
+## Note: Only token-based contexts are affected; character-based contexts
+## are currently not adjusted by this feature.
+
+# krill.context.right.maxShrink = 0
+#
+## Maximum number of tokens the right context may shrink based on match length.
+## Defaults to 0 (no shrinking - full context is always returned).
+## When a match is long, the context shrinks by up to this many tokens,
+## keeping the total KWIC width manageable.
+## The total shrink is distributed proportionally between left and right
+## according to their respective maxShrink values.
+## Use the string "max" to allow the context to shrink entirely
+## (up to krill.context.max.token tokens).
+## Ignored when krill.kwic.max.token is set.
+## Note: Only token-based contexts are affected; character-based contexts
+## are currently not adjusted by this feature.
+
+
+