Enhance match expansion (#144) and cut primary data accordingly (#143)
Change-Id: Ie00d653aa194fbb10bce0c058392db131c85fb9c
diff --git a/Changes b/Changes
index 6c66f00..98f331d 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,9 @@
+0.62.5 2024-05-31
+ - [bugfix] cut primary data according to max values (margaretha, #143)
+ - [enhancement] restrict match expansion by max token and context
+ size (margaretha, #144)
+
+
0.62.4 2024-05-27
- [feature] Make match and context size configurable (address #128,
diewald & margaretha)
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 3f68608..9073810 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -1194,39 +1194,10 @@
if (extendToSentence) {
String element = "base/s:s";
- int[] spanContext = match.expandContextToSpan(element);
+ match.expandContextToSpan(element);
if (DEBUG)
log.trace("Extend to sentence element '{}'", element);
-
- if (spanContext[0] >= 0
- && spanContext[0] < spanContext[1]) {
-
- // Match needs to be cutted!
- if ((spanContext[1] - spanContext[0]) > maxTokenMatchSize) {
- int contextLength = maxTokenMatchSize - match.getLength();
- int halfContext = contextLength / 2;
-
- // This is the extended context calculated
- int realLeftLength = match.getStartPos() - spanContext[0];
-
- // The length is too large - cut!
- if (realLeftLength > halfContext) {
- match.startCutted = true;
- spanContext[0] = match.getStartPos() - halfContext;
- }
- }
-
- match.setStartPos(maxTokenMatchSize,spanContext[0]);
- match.setEndPos(maxTokenMatchSize,spanContext[1]);
- match.potentialStartPosChar = spanContext[2];
- match.potentialEndPosChar = spanContext[3];
- match.startMore = false;
- match.endMore = false;
- }
- else {
- match.addWarning(651, "Unable to extend context");
- };
}
else {
if (DEBUG)
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 7d91745..bf14ca2 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -5,6 +5,7 @@
import java.io.IOException;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
@@ -13,8 +14,6 @@
import java.util.LinkedList;
import java.util.List;
-import java.nio.charset.StandardCharsets;
-
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
@@ -647,11 +646,6 @@
*/
@JsonIgnore
public void setEndPos (int maxTokenMatchSize, int pos) {
- if (maxTokenMatchSize > KrillProperties.maxTokenMatchSize) {
- maxTokenMatchSize = KrillProperties.maxTokenMatchSize;
- this.endCutted = true;
- }
-
if (this.startPos != -1 && (pos - this.startPos) > maxTokenMatchSize) {
pos = this.startPos + maxTokenMatchSize;
this.endCutted = true;
@@ -1109,15 +1103,73 @@
};
// Expand the context to a span
- public int[] expandContextToSpan (String element) {
+ public void expandContextToSpan (String element) {
// TODO: THE BITS HAVE TO BE SET!
- if (this.positionsToOffset != null)
- return this.expandContextToSpan(
+ int[] spanContext = new int[] { 0, 0, 0, 0 };
+
+ if (this.positionsToOffset != null) {
+ spanContext = this.expandContextToSpan(
this.positionsToOffset.getLeafReader(), (Bits) null,
"tokens", element);
- return new int[] { 0, 0, 0, 0 };
+ }
+
+ if (spanContext[0] >= 0
+ && spanContext[0] < spanContext[1]) {
+
+ int maxExpansionSize = KrillProperties.maxTokenMatchSize
+ + KrillProperties.maxTokenContextSize;
+
+ // Match needs to be cut!
+ boolean cutExpansion = false;
+ if ((spanContext[1] - spanContext[0]) > maxExpansionSize) {
+ cutExpansion=true;
+ int contextLength = maxExpansionSize - this.getLength();
+ int halfContext = contextLength / 2;
+
+ // This is the extended context calculated
+ int realLeftLength = this.getStartPos() - spanContext[0];
+
+ // The length is too large - cut!
+ if (realLeftLength > halfContext) {
+ this.startCutted = true;
+ spanContext[0] = this.getStartPos() - halfContext;
+ }
+
+ int realRightLength = spanContext[1] - this.getEndPos();
+
+ // The length is too large - cut!
+ if (realRightLength > halfContext) {
+ this.endCutted = true;
+ spanContext[1] = this.getEndPos() + halfContext;
+ }
+ }
+
+ this.setStartPos(maxExpansionSize,spanContext[0]);
+ this.setEndPos(maxExpansionSize,spanContext[1]);
+ // EM: update char offsets
+
+ if (cutExpansion) {
+ this.positionsToOffset.add(localDocID, startPos);
+ this.positionsToOffset.add(localDocID, endPos);
+
+ int start = this.positionsToOffset.start(localDocID, startPos);
+ int end = this.positionsToOffset.start(localDocID, endPos)-1;
+ spanContext[2] = start; //spanContext[2];
+ spanContext[3] = end; // spanContext[3];
+ }
+
+ this.potentialStartPosChar = spanContext[2];
+ this.potentialEndPosChar = spanContext[3];
+ this.startMore = false;
+ this.endMore = false;
+
+ this.positionsToOffset.clear();
+ }
+ else {
+ this.addWarning(651, "Unable to extend context");
+ };
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
index af4e7b5..1bf2677 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMatchIdentifier.java
@@ -1234,11 +1234,18 @@
assertEquals("... a a a a a a [[b]] a a a a a a ...", kr.getMatch(0).getSnippetBrackets());
// see TestNextIndex#corolaNextTest
+
Match km = ki.getMatchInfo("match-Corola-blog/BlogPost/370281_a_371610-p70-71", "tokens", null, null,false, false, true);
- // The match needs to be cutted on both sides!
String str = km.getSnippetBrackets();
- assertTrue(str.contains("[<!>a"));
+ assertTrue(str.contains("[<!>{drukola/l:au:a}"));
+ assertFalse(str.contains("<!>]"));
+
+ km = ki.getMatchInfo("match-Corola-blog/BlogPost/370281_a_371610-p50-51", "tokens", null, null,false, false, true);
+
+ // The match needs to be cut on both sides!
+ str = km.getSnippetBrackets();
+ assertTrue(str.contains("[<!>{d"));
assertTrue(str.contains("a}<!>]"));
};
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestMaxMatchTokens.java b/src/test/java/de/ids_mannheim/korap/index/TestMaxMatchTokens.java
index 5a39340..374e4c8 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestMaxMatchTokens.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestMaxMatchTokens.java
@@ -23,6 +23,10 @@
private KrillIndex ki;
private String json;
+ private ArrayList<String> foundry = new ArrayList<String>();
+ private ArrayList<String> layer = new ArrayList<String>();
+
+
public TestMaxMatchTokens () throws IOException {
ki = new KrillIndex();
// Indexing test files
@@ -35,6 +39,10 @@
json = getJsonString(getClass()
.getResource("/queries/position/sentence-contain-token.json")
.getFile());
+
+ foundry.add("opennlp");
+ layer.add("p");
+
}
@Before
@@ -84,11 +92,6 @@
ki.commit();
Match km;
- ArrayList<String> foundry = new ArrayList<String>();
- foundry.add("opennlp");
- ArrayList<String> layer = new ArrayList<String>();
- layer.add("opennlp");
-
// maxMatchTokens from properties = 40
km = ki.getMatchInfo("match-WUD17/C94/39360-p390-396", "tokens", false,
foundry, layer, false, false, false, false, false);
@@ -110,4 +113,62 @@
assertTrue(km.endCutted);
assertEquals(420, km.getEndPos());
}
+
+ @Test
+ public void testMatchInfoExpansion () throws QueryException, IOException {
+ KrillProperties.maxTokenMatchSize = 1;
+ KrillIndex ki = new KrillIndex();
+ // Indexing test files
+ ki.addDoc(
+ getClass().getResourceAsStream("/wiki/WUD17-C94-39360.json.gz"),
+ true);
+ ki.commit();
+
+ // cut left match expansion
+ Match km = ki.getMatchInfo("match-WUD17/C94/39360-p225-226", "tokens",
+ true, foundry , layer, true, true, true, true, true);
+ assertEquals(213, km.getStartPos());
+ assertEquals(228, km.getEndPos());
+ assertEquals(15, km.getLength());
+ assertEquals("[<!>{opennlp/p:ADV:auch} {opennlp/p:APPRART:zur} "
+ + "{opennlp/p:NN:Nutzung} {opennlp/p:ART:des} {opennlp/p:NN:Namens} "
+ + "{opennlp/p:VVPP:berechtigt} {opennlp/p:VAFIN:ist} "
+ + "({opennlp/p:VVIMP:siehe} {opennlp/p:PROAV:dazu} "
+ + "{opennlp/p:PPOSAT:unsere} {opennlp/p:NN:Hinweise} "
+ + "{opennlp/p:APPRART:zur} [{opennlp/p:NN:Wahl}] "
+ + "{opennlp/p:ART:des} {opennlp/p:NN:Benutzernamens}).]",
+ km.getSnippetBrackets());
+
+ // cut right match expansion
+ km = ki.getMatchInfo("match-WUD17/C94/39360-p210-211", "tokens", false,
+ foundry, layer, false, false, false, false, true);
+ assertEquals(199, km.getStartPos());
+ assertEquals(223, km.getEndPos());
+ assertEquals(24, km.getLength());
+ assertEquals("[Benutzerkonten sollen nur dann einen offiziell klingenden"
+ + " Namen haben, wenn der [Betreiber] des Kontos auch zur Nutzung "
+ + "des Namens berechtigt ist (siehe dazu unsere<!>]",
+ km.getSnippetBrackets());
+
+ // cut left and right match expansion
+ km = ki.getMatchInfo("match-WUD17/C94/39360-p213-214", "tokens", false,
+ foundry, layer, false, false, false, false, true);
+ assertEquals(201, km.getStartPos());
+ assertEquals(226, km.getEndPos());
+ assertEquals(25, km.getLength());
+ assertEquals("[<!>nur dann einen offiziell klingenden Namen haben, wenn "
+ + "der Betreiber des Kontos [auch] zur Nutzung des Namens "
+ + "berechtigt ist (siehe dazu unsere Hinweise zur Wahl<!>]",
+ km.getSnippetBrackets());
+
+ // no cut
+ km = ki.getMatchInfo("match-WUD17/C94/39360-p160-161", "tokens", false,
+ foundry, layer, false, false, false, false, true);
+ assertEquals(150, km.getStartPos());
+ assertEquals(162, km.getEndPos());
+ assertEquals(12, km.getLength());
+
+ KrillProperties.maxTokenMatchSize = 20;
+ }
+
}