Enhance match expansion (#144) and cut primary data accordingly (#143)

Change-Id: Ie00d653aa194fbb10bce0c058392db131c85fb9c
diff --git a/Changes b/Changes
index 6c66f00..98f331d 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,9 @@
+0.62.5 2024-05-31
+    - [bugfix] cut primary data according to max values (margaretha, #143)
+    - [enhancement] restrict match expansion by max token and context 
+      size (margaretha, #144)
+    
+
 0.62.4 2024-05-27
     - [feature] Make match and context size configurable (address #128, 
       diewald & margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 3f68608..9073810 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -1194,39 +1194,10 @@
                 if (extendToSentence) {
                     
                     String element = "base/s:s";
-                    int[] spanContext = match.expandContextToSpan(element);
+                    match.expandContextToSpan(element);
 
                     if (DEBUG)
                         log.trace("Extend to sentence element '{}'", element);
-
-                    if (spanContext[0] >= 0
-                            && spanContext[0] < spanContext[1]) {
-
-                        // Match needs to be cutted!
-                        if ((spanContext[1] - spanContext[0]) > maxTokenMatchSize) {
-                            int contextLength = maxTokenMatchSize - match.getLength();
-                            int halfContext = contextLength / 2;
-
-                            // This is the extended context calculated
-                            int realLeftLength = match.getStartPos() - spanContext[0];
-
-                            // The length is too large - cut!
-                            if (realLeftLength > halfContext) {
-                                match.startCutted = true;
-                                spanContext[0] = match.getStartPos() - halfContext;
-                            }
-                        }
-
-                        match.setStartPos(maxTokenMatchSize,spanContext[0]);
-                        match.setEndPos(maxTokenMatchSize,spanContext[1]);
-						match.potentialStartPosChar = spanContext[2];
-						match.potentialEndPosChar = spanContext[3];
-                        match.startMore = false;
-                        match.endMore = false;
-                    }
-                    else {
-                        match.addWarning(651, "Unable to extend context");
-                    };
                 }
                 else {
                     if (DEBUG)
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 7d91745..bf14ca2 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -5,6 +5,7 @@
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
@@ -13,8 +14,6 @@
 import java.util.LinkedList;
 import java.util.List;
 
-import java.nio.charset.StandardCharsets;
-
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
@@ -647,11 +646,6 @@
      */
     @JsonIgnore
     public void setEndPos (int maxTokenMatchSize, int pos) {
-        if (maxTokenMatchSize > KrillProperties.maxTokenMatchSize) {
-            maxTokenMatchSize = KrillProperties.maxTokenMatchSize;
-            this.endCutted = true;
-        }
-        
         if (this.startPos != -1 && (pos - this.startPos) > maxTokenMatchSize) {
 			pos = this.startPos + maxTokenMatchSize;
 			    this.endCutted = true;
@@ -1109,15 +1103,96 @@
 	};
 
     // Expand the context to a span
-    public int[] expandContextToSpan (String element) {
+    /**
+     * Expands this match to the span found for the given element and
+     * updates the token positions and character offsets of the match.
+     *
+     * The expanded match is limited to the sum of
+     * KrillProperties.maxTokenMatchSize and
+     * KrillProperties.maxTokenContextSize tokens. A longer span is cut
+     * on the left and/or right side and startCutted / endCutted are
+     * set accordingly. If no valid span is available, only warning
+     * 651 is added.
+     *
+     * @param element
+     *            The element to expand to, e.g. "base/s:s".
+     */
+    public void expandContextToSpan (String element) {
 
         // TODO: THE BITS HAVE TO BE SET!
 
-        if (this.positionsToOffset != null)
-            return this.expandContextToSpan(
+        // Layout as used below: [0] start token position, [1] end token
+        // position, [2] start character offset, [3] end character offset.
+        // The all-zero default fails the validity check further down.
+        int[] spanContext = new int[] { 0, 0, 0, 0 };
+        
+        if (this.positionsToOffset != null) {
+            spanContext = this.expandContextToSpan(
                     this.positionsToOffset.getLeafReader(), (Bits) null,
                     "tokens", element);
-        return new int[] { 0, 0, 0, 0 };
+        }
+        
+        if (spanContext[0] >= 0
+                && spanContext[0] < spanContext[1]) {
+
+            int maxExpansionSize = KrillProperties.maxTokenMatchSize
+                    + KrillProperties.maxTokenContextSize;
+
+            // The span is longer than allowed and needs to be cut
+            boolean cutExpansion = false;
+            if ((spanContext[1] - spanContext[0]) > maxExpansionSize) {
+                cutExpansion=true;
+                // Tokens left for context once the match itself is
+                // accounted for, split evenly between both sides
+                int contextLength = maxExpansionSize - this.getLength();
+                int halfContext = contextLength / 2;
+
+                // Number of tokens the span adds to the left of the match
+                int realLeftLength = this.getStartPos() - spanContext[0];
+
+                // The left side is too long - cut!
+                if (realLeftLength > halfContext) {
+                    this.startCutted = true;
+                    spanContext[0] = this.getStartPos() - halfContext;
+                }
+                
+                int realRightLength = spanContext[1] - this.getEndPos();
+                
+                // The right side is too long - cut!
+                if (realRightLength > halfContext) {
+                    this.endCutted = true;
+                    spanContext[1] = this.getEndPos() + halfContext;
+                }
+            }
+
+            this.setStartPos(maxExpansionSize,spanContext[0]);
+            this.setEndPos(maxExpansionSize,spanContext[1]);
+            // EM: update char offsets (only needed when the span was cut)
+            
+            if (cutExpansion) {
+                // NOTE(review): presumably add() registers the positions so
+                // that their character offsets can be queried - confirm
+                this.positionsToOffset.add(localDocID, startPos);
+                this.positionsToOffset.add(localDocID, endPos);
+                
+                // The end offset is taken as the start offset of the token
+                // at endPos minus one
+                int start = this.positionsToOffset.start(localDocID, startPos);
+                int end = this.positionsToOffset.start(localDocID, endPos)-1;
+                spanContext[2] = start; // replaces the start offset of the uncut span
+                spanContext[3] = end; // replaces the end offset of the uncut span
+            }
+
+            this.potentialStartPosChar = spanContext[2];
+            this.potentialEndPosChar = spanContext[3];
+            this.startMore = false;
+            this.endMore = false;
+            
+            this.positionsToOffset.clear();
+        }
+        else {
+            this.addWarning(651, "Unable to extend context");
+        };
     };
 
 	
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index af4e7b5..1bf2677 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -1234,11 +1234,18 @@
         assertEquals("... a a a a a a [[b]] a a a a a a ...", kr.getMatch(0).getSnippetBrackets());
             
         // see TestNextIndex#corolaNextTest
+
         Match km = ki.getMatchInfo("match-Corola-blog/BlogPost/370281_a_371610-p70-71", "tokens", null, null,false, false, true);
 
-        // The match needs to be cutted on both sides!
         String str = km.getSnippetBrackets();
-        assertTrue(str.contains("[<!>a"));
+        assertTrue(str.contains("[<!>{drukola/l:au:a}"));
+        assertFalse(str.contains("<!>]"));
+        
+        km = ki.getMatchInfo("match-Corola-blog/BlogPost/370281_a_371610-p50-51", "tokens", null, null,false, false, true);
+
+        // The match needs to be cut on both sides!
+        str = km.getSnippetBrackets();
+        assertTrue(str.contains("[<!>{d"));
         assertTrue(str.contains("a}<!>]"));
     };
     
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMaxMatchTokens.java b/src/test/java/de/ids_mannheim/korap/index/TestMaxMatchTokens.java
index 5a39340..374e4c8 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMaxMatchTokens.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMaxMatchTokens.java
@@ -23,6 +23,10 @@
     private KrillIndex ki;
     private String json;
 
+    private ArrayList<String> foundry = new ArrayList<String>();
+    private ArrayList<String> layer = new ArrayList<String>();
+    
+    
     public TestMaxMatchTokens () throws IOException {
         ki = new KrillIndex();
         // Indexing test files
@@ -35,6 +39,10 @@
         json = getJsonString(getClass()
                 .getResource("/queries/position/sentence-contain-token.json")
                 .getFile());
+        
+        foundry.add("opennlp");
+        layer.add("p");
+        
     }
     
     @Before
@@ -84,11 +92,6 @@
         ki.commit();
         Match km;
 
-        ArrayList<String> foundry = new ArrayList<String>();
-        foundry.add("opennlp");
-        ArrayList<String> layer = new ArrayList<String>();
-        layer.add("opennlp");
-
         // maxMatchTokens from properties = 40
         km = ki.getMatchInfo("match-WUD17/C94/39360-p390-396", "tokens", false,
                 foundry, layer, false, false, false, false, false);
@@ -110,4 +113,74 @@
         assertTrue(km.endCutted);
         assertEquals(420, km.getEndPos());
     }
+    
+    /**
+     * Checks that the expanded match returned by getMatchInfo is cut on
+     * the left, on the right, on both sides, or not at all, depending on
+     * where the match sits, when maxTokenMatchSize is lowered to 1.
+     */
+    @Test
+    public void testMatchInfoExpansion () throws QueryException, IOException {
+        // Remember the global limit so that it is restored even when an
+        // assertion fails; otherwise later tests would run with the value 1.
+        final int previousMaxTokenMatchSize = KrillProperties.maxTokenMatchSize;
+        KrillProperties.maxTokenMatchSize = 1;
+        try {
+            // Local index, named so that it does not shadow the ki field
+            KrillIndex index = new KrillIndex();
+            // Indexing test files
+            index.addDoc(
+                    getClass().getResourceAsStream("/wiki/WUD17-C94-39360.json.gz"),
+                    true);
+            index.commit();
+
+            // cut left match expansion
+            Match km = index.getMatchInfo("match-WUD17/C94/39360-p225-226",
+                    "tokens", true, foundry, layer, true, true, true, true, true);
+            assertEquals(213, km.getStartPos());
+            assertEquals(228, km.getEndPos());
+            assertEquals(15, km.getLength());
+            assertEquals("[<!>{opennlp/p:ADV:auch} {opennlp/p:APPRART:zur} "
+                    + "{opennlp/p:NN:Nutzung} {opennlp/p:ART:des} {opennlp/p:NN:Namens} "
+                    + "{opennlp/p:VVPP:berechtigt} {opennlp/p:VAFIN:ist} "
+                    + "({opennlp/p:VVIMP:siehe} {opennlp/p:PROAV:dazu} "
+                    + "{opennlp/p:PPOSAT:unsere} {opennlp/p:NN:Hinweise} "
+                    + "{opennlp/p:APPRART:zur} [{opennlp/p:NN:Wahl}] "
+                    + "{opennlp/p:ART:des} {opennlp/p:NN:Benutzernamens}).]",
+                    km.getSnippetBrackets());
+
+            // cut right match expansion
+            km = index.getMatchInfo("match-WUD17/C94/39360-p210-211", "tokens",
+                    false, foundry, layer, false, false, false, false, true);
+            assertEquals(199, km.getStartPos());
+            assertEquals(223, km.getEndPos());
+            assertEquals(24, km.getLength());
+            assertEquals("[Benutzerkonten sollen nur dann einen offiziell klingenden"
+                    + " Namen haben, wenn der [Betreiber] des Kontos auch zur Nutzung "
+                    + "des Namens berechtigt ist (siehe dazu unsere<!>]",
+                    km.getSnippetBrackets());
+
+            // cut left and right match expansion
+            km = index.getMatchInfo("match-WUD17/C94/39360-p213-214", "tokens",
+                    false, foundry, layer, false, false, false, false, true);
+            assertEquals(201, km.getStartPos());
+            assertEquals(226, km.getEndPos());
+            assertEquals(25, km.getLength());
+            assertEquals("[<!>nur dann einen offiziell klingenden Namen haben, wenn "
+                    + "der Betreiber des Kontos [auch] zur Nutzung des Namens "
+                    + "berechtigt ist (siehe dazu unsere Hinweise zur Wahl<!>]",
+                    km.getSnippetBrackets());
+
+            // no cut
+            km = index.getMatchInfo("match-WUD17/C94/39360-p160-161", "tokens",
+                    false, foundry, layer, false, false, false, false, true);
+            assertEquals(150, km.getStartPos());
+            assertEquals(162, km.getEndPos());
+            assertEquals(12, km.getLength());
+        }
+        finally {
+            KrillProperties.maxTokenMatchSize = previousMaxTokenMatchSize;
+        }
+    }
+    
 }