Make match and context size configurable (address #128)
Change-Id: Ieef96dd68adf4e3ce00f59fc21face545c2ce897
diff --git a/src/main/java/de/ids_mannheim/korap/Krill.java b/src/main/java/de/ids_mannheim/korap/Krill.java
index 4b1b642..7b10c8a 100644
--- a/src/main/java/de/ids_mannheim/korap/Krill.java
+++ b/src/main/java/de/ids_mannheim/korap/Krill.java
@@ -60,6 +60,7 @@
private SpanQuery spanQuery;
private JsonNode request;
+ private int maxTokenMatchSize;
private final ObjectMapper mapper = new ObjectMapper();
/**
@@ -347,4 +348,14 @@
VirtualCorpusResponse r = new VirtualCorpusResponse();
return r.createKoralQueryForField(fieldName, fieldValues);
}
+
+
+ /**
+  * Get the maximum number of tokens a match may have for this
+  * request, as set by {@link #setMaxTokenMatchSize(int)}.
+  *
+  * NOTE(review): the field has no initializer, so this is presumably
+  * 0 as long as no setter was called - confirm that no constructor
+  * assigns it.
+  *
+  * @return The maximum match size in tokens.
+  */
+ public int getMaxTokenMatchSize () {
+ return maxTokenMatchSize;
+ }
+
+
+ /**
+  * Set the maximum number of tokens a match may have for this
+  * request. The value is stored as is, without any validation.
+  *
+  * NOTE(review): values of 0 or below appear to be treated as
+  * "not set" by the index (it only uses values greater than 0) -
+  * confirm against the caller.
+  *
+  * @param maxMatchTokens
+  *        The maximum match size in tokens.
+  */
+ public void setMaxTokenMatchSize (int maxMatchTokens) {
+ this.maxTokenMatchSize = maxMatchTokens;
+ }
};
diff --git a/src/main/java/de/ids_mannheim/korap/KrillIndex.java b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
index 2210167..40d369e 100644
--- a/src/main/java/de/ids_mannheim/korap/KrillIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KrillIndex.java
@@ -44,8 +44,8 @@
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.MMapDirectory;
+import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
@@ -175,12 +175,15 @@
private HashMap termContexts;
private ObjectMapper mapper = new ObjectMapper();
+ private int maxTokenMatchSize;
+
// private ByteBuffer bbTerm;
// Some initializations ...
{
Properties prop = KrillProperties.loadDefaultProperties();
Properties info = KrillProperties.loadInfo();
+
if (info != null) {
this.version = info.getProperty("krill.version");
this.name = info.getProperty("krill.name");
@@ -188,9 +191,11 @@
// Check for auto commit value
String autoCommitStr = null;
- if (prop != null)
+ if (prop != null) {
autoCommitStr = prop.getProperty("krill.index.commit.auto");
-
+ }
+
+ // KrillProperties.maxTokenMatchSize always carries a usable value
+ // (its built-in default or the configured one), so copy it
+ // unconditionally. If it is only copied when a properties file was
+ // found, the limit stays 0 otherwise and every match created
+ // with it is cut to zero length.
+ this.maxTokenMatchSize = KrillProperties.maxTokenMatchSize;
+
if (autoCommitStr != null) {
try {
this.autoCommit = Integer.parseInt(autoCommitStr);
@@ -429,8 +434,15 @@
public void setAutoCommit (int value) {
this.autoCommit = value;
};
-
-
+
+ /**
+  * Get the maximum number of tokens a match may have.
+  * This is the index-wide value that is used whenever a request
+  * does not bring its own limit.
+  *
+  * @return The maximum match size in tokens.
+  */
+ public int getMaxTokenMatchSize () {
+ return maxTokenMatchSize;
+ }
+
+ /**
+  * Set the index-wide maximum number of tokens a match may have.
+  * The value is stored as is, without any validation.
+  *
+  * NOTE(review): a value of 0 or below would make Match cut every
+  * match to zero length - confirm callers only pass positive values.
+  *
+  * @param maxMatchTokens
+  *        The maximum match size in tokens.
+  */
+ public void setMaxTokenMatchSize (int maxMatchTokens) {
+ this.maxTokenMatchSize = maxMatchTokens;
+ }
+
/**
* Update a document in the index as a {@link FieldDocument}
* if it already exists (based on the textSigle), otherwise
@@ -972,11 +984,20 @@
boolean includeSnippets, boolean includeTokens,
boolean includeHighlights, boolean extendToSentence)
throws QueryException {
-
+ return getMatchInfo(idString, field, info, foundry, layer, includeSpans,
+ includeSnippets, includeTokens, includeHighlights,
+ extendToSentence, maxTokenMatchSize);
+ };
+
+ public Match getMatchInfo (String idString, String field, boolean info,
+ List<String> foundry, List<String> layer, boolean includeSpans,
+ boolean includeSnippets, boolean includeTokens,
+ boolean includeHighlights, boolean extendToSentence,
+ int maxMatchTokens) throws QueryException {
if (DEBUG)
log.trace("Get info on {}", idString);
-
- Match match = new Match(idString, includeHighlights);
+
+ Match match = new Match(maxMatchTokens, idString, includeHighlights);
if (this.getVersion() != null)
match.setVersion(this.getVersion());
@@ -1202,8 +1223,8 @@
&& spanContext[0] < spanContext[1]) {
// Match needs to be cutted!
- if ((spanContext[1] - spanContext[0]) > match.getMaxMatchTokens()) {
- int contextLength = match.getMaxMatchTokens() - match.getLength();
+ if ((spanContext[1] - spanContext[0]) > maxMatchTokens) {
+ int contextLength = maxMatchTokens - match.getLength();
int halfContext = contextLength / 2;
// This is the extended context calculated
@@ -1216,8 +1237,8 @@
}
}
- match.setStartPos(spanContext[0]);
- match.setEndPos(spanContext[1]);
+ match.setStartPos(maxMatchTokens,spanContext[0]);
+ match.setEndPos(maxMatchTokens,spanContext[1]);
match.potentialStartPosChar = spanContext[2];
match.potentialEndPosChar = spanContext[3];
match.startMore = false;
@@ -1569,9 +1590,14 @@
final Document doc = (fields != null)
? lreader.document(localDocID, fieldsSet)
: lreader.document(localDocID);
-
+
+ int maxMatchSize = maxTokenMatchSize;
+ if (ks.getMaxTokenMatchSize() > 0) {
+ maxMatchSize = ks.getMaxTokenMatchSize();
+ };
+
// Create new Match
- final Match match = new Match(pto, localDocID,
+ final Match match = new Match(maxMatchSize, pto, localDocID,
spans.start(), spans.end());
// Add snippet if existing
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 806b920..b726ffd 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -180,7 +180,7 @@
Options options = new Options();
options.addOption(Option.builder("c").longOpt("config")
.desc("configuration file (defaults to "
- + KrillProperties.defaultPropertiesLocation
+ + KrillProperties.DEFAULT_PROPERTIES_LOCATION
+ ").")
.hasArg().argName("properties file").required().build());
options.addOption(Option.builder("i").longOpt("inputDir")
diff --git a/src/main/java/de/ids_mannheim/korap/response/Match.java b/src/main/java/de/ids_mannheim/korap/response/Match.java
index 24921d3..5bb6ad1 100644
--- a/src/main/java/de/ids_mannheim/korap/response/Match.java
+++ b/src/main/java/de/ids_mannheim/korap/response/Match.java
@@ -1,14 +1,23 @@
package de.ids_mannheim.korap.response;
+import static de.ids_mannheim.korap.util.KrillByte.unsignedByte;
+import static de.ids_mannheim.korap.util.KrillString.codePointSubstring;
+
import java.io.IOException;
import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
-import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import org.slf4j.Logger;
@@ -18,13 +27,11 @@
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonInclude.Include;
import com.fasterxml.jackson.annotation.JsonProperty;
-import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
-import static de.ids_mannheim.korap.util.KrillByte.*;
-import static de.ids_mannheim.korap.util.KrillString.codePointSubstring;
import de.ids_mannheim.korap.index.AbstractDocument;
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.query.SpanElementQuery;
@@ -33,6 +40,7 @@
import de.ids_mannheim.korap.response.match.MatchIdentifier;
import de.ids_mannheim.korap.response.match.PosIdentifier;
import de.ids_mannheim.korap.response.match.Relation;
+import de.ids_mannheim.korap.util.KrillProperties;
/*
* The snippet building algorithm is quite complicated for now
@@ -84,8 +92,6 @@
// Logger
private final static Logger log = LoggerFactory.getLogger(Match.class);
-
- private static final int MAX_MATCH_TOKENS = 50;
// end marker of highlights that are pagebreaks
private static final int PB_MARKER = -99999;
@@ -151,18 +157,19 @@
@JsonIgnore
public boolean startMore = true, endMore = true;
- private Collection<byte[]> payload;
+// private Collection<byte[]> payload;
private ArrayList<Highlight> highlight;
private LinkedList<int[]> span;
private PositionsToOffset positionsToOffset;
private boolean processed = false;
-
-
+
/**
* Constructs a new Match object.
* Todo: Maybe that's not necessary!
*
+ * @param maxTokenMatchSize
+ * The maximum number of tokens a match may have
* @param pto
* The PositionsToOffset object, containing relevant
* positional information for highlighting
@@ -177,12 +184,12 @@
* @see #snippetBrackets()
* @see PositionsToOffset
*/
- public Match (PositionsToOffset pto, int localDocID, int startPos,
- int endPos) {
+ public Match (int maxTokenMatchSize, PositionsToOffset pto,
+ int localDocID, int startPos, int endPos) {
this.positionsToOffset = pto;
this.localDocID = localDocID;
- this.setStartPos(startPos);
- this.setEndPos(endPos);
+ // Set the start first: setEndPos() measures the match length from
+ // startPos and cuts the end if it exceeds maxTokenMatchSize.
+ this.setStartPos(maxTokenMatchSize, startPos);
+ this.setEndPos(maxTokenMatchSize, endPos);
};
@@ -201,7 +208,7 @@
* Boolean value indicating if possible provided
* highlight information should be ignored or not.
*/
- public Match (String idString, boolean includeHighlights) {
+ public Match (int maxTokenMatchSize, String idString, boolean includeHighlights) {
MatchIdentifier id = new MatchIdentifier(idString);
if (id.getStartPos() > -1) {
@@ -215,8 +222,8 @@
this.addString("ID", id.getDocID());
// </legacy>
- this.setStartPos(id.getStartPos());
- this.setEndPos(id.getEndPos());
+ this.setStartPos(maxTokenMatchSize, id.getStartPos());
+ this.setEndPos(maxTokenMatchSize, id.getEndPos());
if (includeHighlights)
for (int[] pos : id.getPos()) {
@@ -506,11 +513,6 @@
this.addHighlight(new Highlight(start, pagenumber));
};
- @JsonIgnore
- public int getMaxMatchTokens () {
- return MAX_MATCH_TOKENS;
- }
-
/**
* Get document id.
*/
@@ -575,10 +577,10 @@
* The positional offset.
*/
@JsonIgnore
- public void setStartPos (int pos) {
+ public void setStartPos (int maxTokenMatchSize, int pos) {
this.startPos = pos;
- if (this.endPos != -1 && (this.endPos - pos) > MAX_MATCH_TOKENS) {
- this.endPos = pos + MAX_MATCH_TOKENS;
+ // If an end position is already known and the match would span more
+ // than maxTokenMatchSize tokens, pull the end back and flag the cut.
+ // NOTE(review): unlike setEndPos(), the limit is not capped at
+ // KrillProperties.maxTokenMatchSize here - confirm this is intended.
+ if (this.endPos != -1 && (this.endPos - pos) > maxTokenMatchSize) {
+ this.endPos = pos + maxTokenMatchSize;
this.endCutted = true;
};
};
@@ -623,10 +625,15 @@
* The positional offset.
*/
@JsonIgnore
- public void setEndPos (int pos) {
- if (this.startPos != -1 && (pos - this.startPos) > MAX_MATCH_TOKENS) {
- pos = this.startPos + MAX_MATCH_TOKENS;
- this.endCutted = true;
+ public void setEndPos (int maxTokenMatchSize, int pos) {
+ // A requested limit may never exceed the configured upper bound.
+ // Capping the limit does not shorten the match by itself, so
+ // endCutted is only raised below, once the end really moves.
+ if (maxTokenMatchSize > KrillProperties.maxTokenMatchSize) {
+ maxTokenMatchSize = KrillProperties.maxTokenMatchSize;
+ }
+
+ // The length can only be checked once a start position is known.
+ if (this.startPos != -1 && (pos - this.startPos) > maxTokenMatchSize) {
+ pos = this.startPos + maxTokenMatchSize;
+ this.endCutted = true;
};
this.endPos = pos;
};
diff --git a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
index e9a38e6..db73e36 100644
--- a/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
+++ b/src/main/java/de/ids_mannheim/korap/util/KrillProperties.java
@@ -15,10 +15,14 @@
*/
public class KrillProperties {
- public static final String defaultPropertiesLocation = "krill.properties";
- public static final String defaultInfoLocation = "krill.info";
+ public static final String DEFAULT_PROPERTIES_LOCATION = "krill.properties";
+ public static final String DEFAULT_INFO_LOCATION = "krill.info";
private static Properties prop, info;
-
+
+ public static int maxTokenMatchSize = 50;
+ public static int maxTokenContextSize = 60;
+ public static int maxCharContextSize = 500;
+
// Logger
private final static Logger log = LoggerFactory
.getLogger(KrillProperties.class);
@@ -28,7 +32,7 @@
if (prop != null)
return prop;
- prop = loadProperties(defaultPropertiesLocation);
+ prop = loadProperties(DEFAULT_PROPERTIES_LOCATION);
return prop;
};
@@ -66,19 +70,40 @@
return null;
};
};
+ updateConfigurations(prop);
return prop;
};
+ /**
+  * Overrides the default size limits with the values configured in
+  * the given properties, if any.
+  *
+  * Each key is parsed independently, so that a malformed value only
+  * keeps its own previous value and does not prevent the other key
+  * from being applied.
+  *
+  * @param prop
+  *        The loaded properties; must not be null.
+  */
+ private static void updateConfigurations (Properties prop) {
+ KrillProperties.maxTokenMatchSize = readIntProperty(prop,
+ "krill.match.max.token", KrillProperties.maxTokenMatchSize);
+ KrillProperties.maxTokenContextSize = readIntProperty(prop,
+ "krill.context.max.token", KrillProperties.maxTokenContextSize);
+ }
+
+ /**
+  * Reads a single integer property.
+  *
+  * @param prop
+  *        The properties to read from.
+  * @param key
+  *        The name of the property.
+  * @param fallback
+  *        The value to return if the property is not set or
+  *        is not numerical.
+  * @return The parsed value, or the fallback. A value that cannot
+  *         be parsed is reported in the error log.
+  */
+ private static int readIntProperty (Properties prop, String key,
+ int fallback) {
+ String value = prop.getProperty(key);
+ if (value == null)
+ return fallback;
+
+ try {
+ return Integer.parseInt(value);
+ }
+ catch (NumberFormatException e) {
+ log.error("A Krill property expects numerical values: "
+ + e.getMessage());
+ return fallback;
+ }
+ }
+
// Load version info from file
public static Properties loadInfo () {
try {
info = new Properties();
InputStream iFile = KrillProperties.class.getClassLoader()
- .getResourceAsStream(defaultInfoLocation);
+ .getResourceAsStream(DEFAULT_INFO_LOCATION);
if (iFile == null) {
- log.error("Cannot find {}.", defaultInfoLocation);
+ log.error("Cannot find {}.", DEFAULT_INFO_LOCATION);
return null;
};