New feature and some bugfixes concerning span based context extension
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index e2d957c..d2df845 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -73,6 +73,7 @@
import de.ids_mannheim.korap.index.PositionsToOffset;
import de.ids_mannheim.korap.index.TermInfo;
import de.ids_mannheim.korap.index.SpanInfo;
+import de.ids_mannheim.korap.index.SearchContext;
import de.ids_mannheim.korap.index.MatchIdentifier;
import de.ids_mannheim.korap.query.SpanElementQuery;
@@ -138,7 +139,7 @@
private final static Logger log = LoggerFactory.getLogger(KorapIndex.class);
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = true;
+ public static final boolean DEBUG = false;
{
Properties prop = new Properties();
@@ -502,7 +503,16 @@
public KorapMatch getMatch (String id) {
- return this.getMatchInfo(id, "tokens", false, null, null, false, true, false);
+ return this.getMatchInfo(
+ id, // MatchID
+ "tokens", // field
+ false, // info
+ null, // foundry
+ null, // layer
+ false, // includeSpans
+ true, // includeHighlights
+ false // extendToSentence
+ );
};
public KorapMatch getMatchInfo (String id,
@@ -524,6 +534,7 @@
return this.getMatchInfo(id, field, true, foundry, layer, includeSpans, includeHighlights, extendToSentence);
};
+
/**
* Get a match.
* BE AWARE - THIS IS STILL A PLAYGROUND!
@@ -630,57 +641,44 @@
match.setPositionsToOffset(pto);
match.setLocalDocID(localDocID);
match.populateDocument(doc, field, fieldsToLoadLocal);
-
if (DEBUG)
log.trace("The document has the id '{}'", match.getDocID());
- if (!info) break;
+ SearchContext context = match.getContext();
// Search for minimal surrounding sentences
if (extendToSentence) {
-
- SpanElementQuery squery = new SpanElementQuery(field, "s");
- Spans sentence = squery.getSpans(atomic,
- (Bits) bitset,
- new HashMap<Term, TermContext>());
+ /*
+ int[] newPos = match.expandContextToSpan(
+ atomic,
+ bitset,
+ field,
+ "s"
+ );
+ if (newPos[0] > 0)
+ match.setStartPos(newPos[0]);
+ if (newPos[1] > 0)
+ match.setEndPos(newPos[1]);
if (DEBUG)
- log.trace("Now search for {}", sentence.toString());
-
- int newStart = -1, newEnd = -1;
-
- while (true) {
-
- // Game over
- if (sentence.next() != true)
- break;
-
- // There's an s found, that starts before the match
- if (sentence.start() <= match.getStartPos()) {
- newStart = sentence.start() > newStart ? sentence.start() : newStart;
-
- }
- else if (newStart == -1)
- break;
-
- // There's an s found, that ends after the match
- if (sentence.end() >= match.getEndPos()) {
- newEnd = sentence.end();
- break;
- };
- };
-
- // We have a new match surrounding
- if (newStart > -1 && newEnd > -1) {
- if (DEBUG)
- log.trace("New match spans from {}-{}",
- newStart,
- newEnd);
- match.setStartPos(newStart);
- match.setEndPos(newEnd);
- };
+ log.trace("Expand context to {}-{}", newPos[0], newPos[1]);
+ */
+ int [] spanContext = match.expandContextToSpan("s");
+ match.setStartPos(spanContext[0]);
+ match.setEndPos(spanContext[1]);
+ match.startMore = false;
+ match.endMore = false;
+ }
+ else {
+ if (DEBUG)
+ log.trace("Don't expand context");
};
+
+ context.left.setToken(true).setLength(0);
+ context.right.setToken(true).setLength(0);
+ if (!info)
+ break;
// Limit the terms to all the terms of interest
TermsEnum termsEnum = docTerms.intersect(fst, null);
@@ -901,6 +899,8 @@
);
};
+ // This should probably be deprecated
+ @Deprecated
public KorapResult search (SpanQuery query,
int startIndex,
short count,
@@ -908,16 +908,11 @@
short leftContext,
boolean rightTokenContext,
short rightContext) {
- return this.search(
- new KorapCollection(this),
- query,
- startIndex,
- count,
- leftTokenContext,
- leftContext,
- rightTokenContext,
- rightContext
- );
+
+ KorapSearch ks = new KorapSearch(query);
+ ks.setStartIndex(startIndex).setCount(count);
+ ks.setContext(new SearchContext(leftTokenContext, leftContext, rightTokenContext, rightContext));
+ return this.search(new KorapCollection(this), ks);
};
public KorapResult search (KorapSearch ks) {
@@ -925,6 +920,7 @@
return this.search(new KorapCollection(this), ks);
};
+ @Deprecated
public KorapResult search (KorapCollection collection,
SpanQuery query,
int startIndex,
@@ -934,9 +930,7 @@
boolean rightTokenContext,
short rightContext) {
KorapSearch ks = new KorapSearch(query);
+ // Paging must survive the context refactoring: startIndex and count
+ // are still parameters of this overload and have to reach the search.
ks.setStartIndex(startIndex).setCount(count);
- ks.leftContext.setToken(leftTokenContext).setLength(leftContext);
- ks.rightContext.setToken(rightTokenContext).setLength(rightContext);
+ ks.setContext(new SearchContext(leftTokenContext, leftContext, rightTokenContext, rightContext));
return this.search(collection, ks);
};
@@ -957,10 +951,7 @@
query.toString(),
ks.getStartIndex(),
ks.getCount(),
- ks.leftContext.isToken(),
- ks.leftContext.getLength(),
- ks.rightContext.isToken(),
- ks.rightContext.getLength()
+ ks.getContext()
);
if (this.getVersion() != null)
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index 2072699..d44c1d5 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -1,5 +1,7 @@
package de.ids_mannheim.korap;
import java.util.*;
+import java.io.*;
+
import java.lang.StringBuffer;
import java.nio.ByteBuffer;
@@ -10,17 +12,24 @@
import com.fasterxml.jackson.databind.node.*;
import de.ids_mannheim.korap.index.PositionsToOffset;
+import de.ids_mannheim.korap.index.SearchContext;
import de.ids_mannheim.korap.document.KorapPrimaryData;
import static de.ids_mannheim.korap.util.KorapHTML.*;
import de.ids_mannheim.korap.index.MatchIdentifier;
import de.ids_mannheim.korap.index.PosIdentifier;
+import de.ids_mannheim.korap.query.SpanElementQuery;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.lucene.index.AtomicReaderContext;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.document.Document;
+import org.apache.lucene.search.spans.Spans;
/*
Todo: The implemented classes and private names are horrible!
@@ -42,15 +51,14 @@
private final static Logger log = LoggerFactory.getLogger(KorapMatch.class);
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = false;
+ public static final boolean DEBUG = true;
// Mapper for JSON serialization
ObjectMapper mapper = new ObjectMapper();
// Snippet information
@JsonIgnore
- public short leftContextOffset,
- rightContextOffset;
+ public SearchContext context;
// Should be deprecated, but used wildly in tests!
@JsonIgnore
@@ -63,7 +71,7 @@
private String error = null;
private String version;
- // TEMPRARILY
+ // TEMPORARILY
@JsonIgnore
public int localDocID = -1;
@@ -76,10 +84,6 @@
int relationNumberCounter = 2048;
int identifierNumberCounter = -2;
- @JsonIgnore
- public boolean leftTokenContext,
- rightTokenContext;
-
private String tempSnippet,
snippetHTML,
snippetBrackets,
@@ -87,8 +91,8 @@
private HighlightCombinator snippetStack;
- private boolean startMore = true,
- endMore = true;
+ public boolean startMore = true,
+ endMore = true;
private Collection<byte[]> payload;
private ArrayList<Highlight> highlight;
@@ -99,7 +103,7 @@
/**
* Constructs a new KorapMatch object.
- * TODo: Maybe that's not necessary!
+ * Todo: Maybe that's not necessary!
*
* @param pto The PositionsToOffset object, containing relevant
* positional information for highlighting
@@ -113,9 +117,9 @@
*/
public KorapMatch (PositionsToOffset pto, int localDocID, int startPos, int endPos) {
this.positionsToOffset = pto;
- this.localDocID = localDocID;
- this.startPos = startPos;
- this.endPos = endPos;
+ this.localDocID = localDocID;
+ this.startPos = startPos;
+ this.endPos = endPos;
};
@@ -520,6 +524,153 @@
};
+    /**
+     * Set the context definition of the match.
+     *
+     * @param context The context definition to store.
+     * @return This match object, to allow chaining.
+     */
+    public KorapMatch setContext (SearchContext context) {
+        this.context = context;
+        return this;
+    };
+
+    /**
+     * Get the context definition of the match.
+     * A fresh default SearchContext is created and stored
+     * on first access, if none was set before.
+     *
+     * @return The context definition, never null.
+     */
+    @JsonIgnore
+    public SearchContext getContext () {
+        SearchContext current = this.context;
+
+        // Nothing set yet - fall back to a default context
+        if (current == null) {
+            current = new SearchContext();
+            this.context = current;
+        };
+
+        return current;
+    };
+
+
+    /**
+     * Expand the context to a surrounding span in the "tokens" field,
+     * using the reader of the PositionsToOffset object of the match.
+     *
+     * @param element The name of the surrounding element, e.g. "s".
+     * @return The result of the four argument variant, or
+     *         {0,0,0,0} if no PositionsToOffset object is available.
+     */
+    public int[] expandContextToSpan (String element) {
+
+        // TODO: THE BITS HAVE TO BE SET!
+
+        // Without positional information there is no reader to search in
+        if (this.positionsToOffset == null)
+            return new int[]{0,0,0,0};
+
+        AtomicReaderContext reader = this.positionsToOffset.getAtomicReader();
+        return this.expandContextToSpan(reader, (Bits) null, "tokens", element);
+    };
+
+    /**
+     * Expand the match to the boundaries of surrounding element spans.
+     *
+     * Walks over all spans of the given element in the document of the
+     * match. A span that starts at or before the match start and ends at
+     * or after it defines the new start; the first span that ends at or
+     * after the match end defines the new end. Character offsets are read
+     * from payloads of exactly 8 bytes, interpreted as two ints
+     * (start offset, end offset).
+     *
+     * TODO: This is a plain linear scan - make it more clever.
+     *
+     * @param atomic  The reader context the spans are retrieved from.
+     * @param bitset  The bits passed on to getSpans (callers in this
+     *                class pass null).
+     * @param field   The field the element is indexed in.
+     * @param element The name of the element, e.g. "s".
+     * @return An array {newStart, newEnd, newStartChar, newEndChar}.
+     *         Positions that were not found stay at -1, and all four
+     *         values are -1 if an IOException occurred.
+     */
+    public int[] expandContextToSpan (AtomicReaderContext atomic,
+                                      Bits bitset,
+                                      String field,
+                                      String element) {
+
+        try {
+            // Store character offsets in ByteBuffer
+            ByteBuffer bb = ByteBuffer.allocate(8);
+
+            SpanElementQuery cquery =
+                new SpanElementQuery(field, element);
+
+            Spans contextSpans = cquery.getSpans(
+                atomic,
+                bitset,
+                new HashMap<Term, TermContext>()
+            );
+
+            int newStart = -1,
+                newEnd = -1;
+            int newStartChar = -1,
+                newEndChar = -1;
+
+            if (DEBUG)
+                log.trace("Extend match to context boundary with {} in {}",
+                          cquery.toString(),
+                          this.localDocID);
+
+            while (true) {
+
+                // Game over
+                if (contextSpans.next() != true)
+                    break;
+
+                if (contextSpans.doc() != this.localDocID) {
+                    // skipTo() signals exhaustion by returning false;
+                    // doc() must not be consulted in that case
+                    if (!contextSpans.skipTo(this.localDocID) ||
+                        contextSpans.doc() != this.localDocID)
+                        break;
+                };
+
+                // There's a <context> found -- I'm curious,
+                // if it's closer to the match than everything before
+                if (contextSpans.start() <= this.getStartPos() &&
+                    contextSpans.end() >= this.getStartPos()) {
+
+                    // Set as newStart
+                    newStart = contextSpans.start() > newStart ?
+                        contextSpans.start() : newStart;
+
+                    // Get character offset (start)
+                    if (contextSpans.isPayloadAvailable()) {
+                        try {
+                            bb.rewind();
+                            for (byte[] b : contextSpans.getPayload()) {
+
+                                // Not an element span
+                                if (b.length != 8)
+                                    continue;
+
+                                bb.put(b);
+                                bb.rewind();
+                                newStartChar = bb.getInt();
+                                newEndChar = bb.getInt();
+                                break;
+                            };
+                        }
+                        catch (Exception e) {
+                            log.warn(e.getMessage());
+                        };
+                    };
+                }
+                else {
+                    // Has to be reset, so the end offset is read from
+                    // the payload of the current span below
+                    newEndChar = 0;
+                };
+
+                // There's an s found, that ends after the match
+                if (contextSpans.end() >= this.getEndPos()) {
+                    newEnd = contextSpans.end();
+
+                    // Get character offset (end)
+                    if (newEndChar == 0 && contextSpans.isPayloadAvailable()) {
+                        try {
+                            bb.rewind();
+                            for (byte[] b : contextSpans.getPayload()) {
+
+                                // Not an element span
+                                if (b.length != 8)
+                                    continue;
+
+                                bb.put(b);
+                                bb.rewind();
+
+                                // The end offset is the second int of the
+                                // payload, i.e. it starts at byte index 4
+                                newEndChar = bb.getInt(4);
+                                break;
+                            };
+                        }
+                        catch (Exception e) {
+                            log.warn(e.getMessage());
+                        };
+                    };
+                    break;
+                };
+            };
+
+            // We have a new match surrounding
+            if (DEBUG)
+                log.trace("New match spans from {}-{}/{}-{}", newStart, newEnd, newStartChar, newEndChar);
+
+            return new int[]{newStart, newEnd, newStartChar, newEndChar};
+        }
+        catch (IOException e) {
+            log.error(e.getMessage());
+        };
+
+        return new int[]{-1,-1,-1,-1};
+    };
+
// Reset all internal data
private void _reset () {
@@ -574,10 +725,7 @@
// Get the list of spans for matches and highlighting
if (this.span == null || this.span.size() == 0) {
- if (!this._processHighlightSpans(
- leftTokenContext,
- rightTokenContext
- ))
+ if (!this._processHighlightSpans())
return false;
};
@@ -585,6 +733,10 @@
// (opening and closing elements)
ArrayList<int[]> stack = this._processHighlightStack();
+ if (DEBUG)
+ log.trace("The snippet is {}", this.tempSnippet);
+
+
// The temporary snippet is empty, nothing to do
if (this.tempSnippet == null) {
processed = true;
@@ -1127,20 +1279,16 @@
/**
* This will retrieve character offsets for all spans.
*/
- private boolean _processHighlightSpans (boolean leftTokenContext,
- boolean rightTokenContext) {
+ private boolean _processHighlightSpans () {
if (DEBUG)
log.trace("--- Process Highlight spans");
- int startOffsetChar,
- endOffsetChar,
- startPosChar,
- endPosChar;
-
// Local document ID
int ldid = this.localDocID;
+ int startPosChar = -1, endPosChar = -1;
+
// No positionsToOffset object found
if (this.positionsToOffset == null)
return false;
@@ -1154,8 +1302,8 @@
// Check potential differing start characters
// e.g. from element spans
if (potentialStartPosChar != -1 &&
- (startPosChar > potentialStartPosChar))
- startPosChar = potentialStartPosChar;
+ (startPosChar > this.potentialStartPosChar))
+ startPosChar = this.potentialStartPosChar;
endPosChar = this.positionsToOffset.end(ldid, this.endPos - 1);
@@ -1174,98 +1322,17 @@
startPosChar,
endPosChar);
- // left context
- if (leftTokenContext) {
- if (DEBUG)
- log.trace("PTO will retrieve {} (Left context)",
- this.startPos - this.leftContextOffset);
-
- startOffsetChar = this.positionsToOffset.start(
- ldid,
- this.startPos - this.leftContextOffset
- );
- }
- else {
- startOffsetChar = startPosChar - this.leftContextOffset;
- };
-
- // right context
- if (rightTokenContext) {
- if (DEBUG)
- log.trace("PTO will retrieve {} (Right context)",
- this.endPos + this.rightContextOffset - 1);
-
- endOffsetChar = this.positionsToOffset.end(
- ldid,
- this.endPos + this.rightContextOffset - 1
- );
- }
- else {
- if (endPosChar == -1) {
- endOffsetChar = -1;
- }
- else {
- endOffsetChar = endPosChar + this.rightContextOffset;
- };
- };
-
- // This can happen in case of non-token characters
- // in the match and null offsets
- if (startOffsetChar > startPosChar) {
- startOffsetChar = startPosChar;
- }
- else if (startOffsetChar < 0) {
- startOffsetChar = 0;
- };
-
- // No ... at the beginning
- if (startOffsetChar == 0) {
- startMore = false;
- };
-
- if (endOffsetChar != -1 && endOffsetChar < endPosChar)
- endOffsetChar = endPosChar;
-
- if (DEBUG)
- log.trace("The context spans from chars {}-{}",
- startOffsetChar, endOffsetChar);
-
- if (endOffsetChar > -1 &&
- (endOffsetChar < this.getPrimaryDataLength())) {
- this.tempSnippet = this.getPrimaryData(
- startOffsetChar,
- endOffsetChar
- );
- }
- else {
- this.tempSnippet = this.getPrimaryData(startOffsetChar);
- // endPosChar = this.tempSnippet.length() - 1 + startOffsetChar;
- endMore = false;
- };
-
- if (DEBUG)
- log.trace("Snippet: '" + this.tempSnippet + "'");
+ this.identifier = null;
// No spans yet
if (this.span == null)
this.span = new LinkedList<int[]>();
- this.identifier = null;
+ // Process offset char findings
+ int[] intArray = this._processOffsetChars(ldid, startPosChar, endPosChar);
- // TODO: Simplify
- int[] intArray = new int[]{
- startPosChar - startOffsetChar,
- endPosChar - startOffsetChar,
- -1,
- 0};
-
- if (DEBUG)
- log.trace("The match entry is {}-{} ({}-{}) with startOffsetChar {}",
- startPosChar - startOffsetChar,
- endPosChar - startOffsetChar,
- startPosChar,
- endPosChar,
- startOffsetChar);
+ // Recalculate startOffsetChar
+ int startOffsetChar = startPosChar - intArray[0];
// Add match span
this.span.add(intArray);
@@ -1313,6 +1380,135 @@
};
+ // Compute the character offsets of the context around the match, load the
+ // corresponding primary data into tempSnippet and return the match span
+ // relative to the start of that snippet.
+ //
+ // ldid         - local document id, passed to the PositionsToOffset lookups
+ // startPosChar - character position at which the match starts
+ // endPosChar   - character position at which the match ends (may be -1)
+ //
+ // Returns {startPosChar - startOffsetChar, endPosChar - startOffsetChar, -1, 0}.
+ // Side effects: sets tempSnippet and may set startMore / endMore to false.
+ private int[] _processOffsetChars (int ldid, int startPosChar, int endPosChar) {
+
+ int startOffsetChar = -1, endOffsetChar = -1;
+ int startOffset = -1, endOffset = -1;
+
+ // The offset is defined by a span
+ if (this.getContext().isSpanDefined()) {
+
+ if (DEBUG)
+ log.trace("Try to expand to <{}>",
+ this.context.getSpanContext());
+
+ // With a span based context no "more" markers are set on either side
+ this.startMore = false;
+ this.endMore = false;
+
+ int [] spanContext = this.expandContextToSpan(
+ this.positionsToOffset.getAtomicReader(),
+ (Bits) null,
+ "tokens",
+ this.context.getSpanContext()
+ );
+ // Result layout: token start, token end, char start, char end
+ startOffset = spanContext[0];
+ endOffset = spanContext[1];
+ startOffsetChar = spanContext[2];
+ endOffsetChar = spanContext[3];
+ if (DEBUG)
+ log.trace("Got context is based from span {}-{}/{}-{}",
+ startOffset, endOffset, startOffsetChar, endOffsetChar);
+ };
+
+ // The offset is defined by tokens or characters.
+ // This branch is also taken if the span expansion above
+ // did not yield an end position (endOffset is still -1).
+ if (endOffset == -1) {
+
+ PositionsToOffset pto = this.positionsToOffset;
+
+ // The left offset is defined by tokens
+ if (this.context.left.isToken()) {
+ startOffset = this.startPos - this.context.left.getLength();
+ if (DEBUG)
+ log.trace("PTO will retrieve {} (Left context)", startOffset);
+ // NOTE(review): presumably registers the position for the
+ // offset lookup done by pto.start() below - confirm
+ pto.add(ldid, startOffset);
+ }
+
+ // The left offset is defined by characters
+ else {
+ startOffsetChar = startPosChar - this.context.left.getLength();
+ };
+
+ // The right context is defined by tokens
+ if (this.context.right.isToken()) {
+ endOffset = this.endPos + this.context.right.getLength() -1;
+ if (DEBUG)
+ log.trace("PTO will retrieve {} (Right context)", endOffset);
+ pto.add(ldid, endOffset);
+
+ }
+
+ // The right context is defined by characters
+ else {
+ endOffsetChar = (endPosChar == -1) ? -1 :
+ endPosChar + this.context.right.getLength();
+ };
+
+ // Translate token positions to character offsets;
+ // -1 means that no token position was set for that side
+ if (startOffset != -1)
+ startOffsetChar = pto.start(ldid, startOffset);
+
+ if (endOffset != -1)
+ endOffsetChar = pto.end(ldid, endOffset);
+ };
+
+ if (DEBUG)
+ log.trace("Premature found offsets at {}-{}",
+ startOffsetChar,
+ endOffsetChar);
+
+
+ // This can happen in case of non-token characters
+ // in the match and null offsets
+ if (startOffsetChar > startPosChar)
+ startOffsetChar = startPosChar;
+ else if (startOffsetChar < 0)
+ startOffsetChar = 0;
+
+ // No "..." at the beginning
+ if (startOffsetChar == 0)
+ this.startMore = false;
+
+ // The context must not end before the match does
+ if (endOffsetChar != -1 && endOffsetChar < endPosChar)
+ endOffsetChar = endPosChar;
+
+ if (DEBUG)
+ log.trace("The context spans from chars {}-{}",
+ startOffsetChar, endOffsetChar);
+
+ // Get snippet information from the primary data
+ if (endOffsetChar > -1 &&
+ (endOffsetChar < this.getPrimaryDataLength())) {
+ this.tempSnippet = this.getPrimaryData(
+ startOffsetChar,
+ endOffsetChar
+ );
+ }
+ // No usable end offset: take everything from the start offset
+ // on, so there is nothing more to show at the end
+ else {
+ this.tempSnippet = this.getPrimaryData(startOffsetChar);
+ this.endMore = false;
+ };
+
+ if (DEBUG)
+ log.trace("Snippet: '" + this.tempSnippet + "'");
+
+ if (DEBUG)
+ log.trace("The match entry is {}-{} ({}-{}) with absolute offsetChars {}-{}",
+ startPosChar - startOffsetChar,
+ endPosChar - startOffsetChar,
+ startPosChar,
+ endPosChar,
+ startOffsetChar,
+ endOffsetChar);
+
+ // TODO: Simplify
+ // Match start and end, relative to the start of the snippet
+ return new int[]{
+ startPosChar - startOffsetChar,
+ endPosChar - startOffsetChar,
+ -1,
+ 0};
+ };
+
+
// Identical to KorapResult!
public String toJSON () {
ObjectNode json = (ObjectNode) mapper.valueToTree(this);
@@ -1321,18 +1517,7 @@
if (json.size() == 0)
return "{}";
- ArrayNode leftContext = mapper.createArrayNode();
- leftContext.add(this.leftTokenContext ? "token" : "char");
- leftContext.add(this.leftContextOffset);
-
- ArrayNode rightContext = mapper.createArrayNode();
- rightContext.add(this.rightTokenContext ? "token" : "char");
- rightContext.add(this.rightContextOffset);
-
- ObjectNode context = mapper.createObjectNode();
- context.put("left", leftContext);
- context.put("right", rightContext);
- json.put("context", context);
+ json.put("context", this.getContext().toJSON());
if (this.version != null)
json.put("version", this.getVersion());
diff --git a/src/main/java/de/ids_mannheim/korap/KorapResult.java b/src/main/java/de/ids_mannheim/korap/KorapResult.java
index 68ddac6..e53b7f4 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapResult.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapResult.java
@@ -3,6 +3,7 @@
import java.util.*;
import de.ids_mannheim.korap.KorapMatch;
import de.ids_mannheim.korap.index.PositionsToOffset;
+import de.ids_mannheim.korap.index.SearchContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -28,11 +29,9 @@
private int totalResults = 0;
private int startIndex = 0;
+ private SearchContext context;
+
private short itemsPerPage = ITEMS_PER_PAGE;
- private short leftContextOffset = 6,
- rightContextOffset = 6;
- private boolean leftTokenContext,
- rightTokenContext;
private String benchmarkSearchResults,
benchmarkHitCounter;
@@ -46,16 +45,13 @@
private final static Logger log = LoggerFactory.getLogger(KorapMatch.class);
// Empty result
- public KorapResult () {
- };
+ public KorapResult () {};
+
public KorapResult (String query,
int startIndex,
short itemsPerPage,
- boolean leftTokenContext,
- short leftContextOffset,
- boolean rightTokenContext,
- short rightContextOffset) {
+ SearchContext context) {
mapper.enable(SerializationFeature.INDENT_OUTPUT);
// mapper.disable(SerializationFeature.FAIL_ON_EMPTY_BEANS);
@@ -65,11 +61,7 @@
this.query = query;
this.startIndex = startIndex;
this.itemsPerPage = (itemsPerPage > 50 || itemsPerPage < 1) ? ITEMS_PER_PAGE : itemsPerPage;
- this.leftContextOffset = leftContextOffset;
- this.rightContextOffset = rightContextOffset;
-
- this.leftTokenContext = leftTokenContext;
- this.rightTokenContext = rightTokenContext;
+ this.context = context;
};
public void add (KorapMatch km) {
@@ -81,22 +73,23 @@
// Temporary - should use the same interface like results
// in the future:
- km.leftContextOffset = this.leftContextOffset;
- km.leftTokenContext = this.leftTokenContext;
- km.rightContextOffset = this.rightContextOffset;
- km.rightTokenContext = this.rightTokenContext;
+ km.setContext(this.context);
// Add pos for context
// That's not really a good position for it,
// to be honest ...
// But maybe it will make the offset
// information in the match be obsolete!
+
+ // TODO:
+ /*
if (km.leftTokenContext) {
pto.add(localDocID, startPos - this.leftContextOffset);
};
if (km.rightTokenContext) {
pto.add(localDocID, endPos + this.rightContextOffset - 1);
};
+ */
this.add(km);
return km;
@@ -194,22 +187,22 @@
return startIndex;
};
+
+    /**
+     * Set the context definition of the result.
+     *
+     * @param context The context definition to store.
+     * @return This result object, to allow chaining.
+     */
+    public KorapResult setContext (SearchContext context) {
+        this.context = context;
+        return this;
+    };
+
+    /**
+     * Get the context definition of the result.
+     *
+     * The empty constructor leaves the context unset, while toJSON()
+     * dereferences the value returned here. A default SearchContext is
+     * therefore created on first access, the same way
+     * KorapMatch.getContext() does it.
+     *
+     * @return The context definition, never null.
+     */
+    @JsonIgnore
+    public SearchContext getContext () {
+        if (this.context == null)
+            this.context = new SearchContext();
+        return this.context;
+    };
+
// Identical to KorapMatch!
public String toJSON () {
ObjectNode json = (ObjectNode) mapper.valueToTree(this);
- ArrayNode leftContext = mapper.createArrayNode();
- leftContext.add(this.leftTokenContext ? "token" : "char");
- leftContext.add(this.leftContextOffset);
-
- ArrayNode rightContext = mapper.createArrayNode();
- rightContext.add(this.rightTokenContext ? "token" : "char");
- rightContext.add(this.rightContextOffset);
-
- ObjectNode context = mapper.createObjectNode();
- context.put("left", leftContext);
- context.put("right", rightContext);
- json.put("context", context);
+ json.put("context", this.getContext().toJSON());
if (this.version != null)
json.put("version", this.version);
diff --git a/src/main/java/de/ids_mannheim/korap/KorapSearch.java b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
index 8843e31..0737415 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapSearch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapSearch.java
@@ -8,6 +8,7 @@
import de.ids_mannheim.korap.KorapIndex;
import de.ids_mannheim.korap.KorapResult;
import de.ids_mannheim.korap.util.QueryException;
+import de.ids_mannheim.korap.index.SearchContext;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.JsonNode;
@@ -36,68 +37,13 @@
private JsonNode request;
- public KorapSearchContext leftContext, rightContext;
+ public SearchContext context;
+ private String spanContext;
{
- leftContext = new KorapSearchContext();
- rightContext = new KorapSearchContext();
+ context = new SearchContext();
};
- public class KorapSearchContext {
- private boolean type = true;
- private short length = 6, maxLength = 300;
-
- public boolean isToken () {
- return this.type;
- };
-
- public boolean isCharacter () {
- return !(this.type);
- };
-
- public KorapSearchContext setToken (boolean value) {
- this.type = value;
- return this;
- };
-
- public KorapSearchContext setCharacter (boolean value) {
- this.type = !(value);
- return this;
- };
-
- public short getLength() {
- return this.length;
- };
-
- public KorapSearchContext setLength (short value) {
- if (value >= 0) {
- if (value <= maxLength) {
- this.length = value;
- }
- else {
- this.length = this.maxLength;
- };
- };
- return this;
- };
-
- public KorapSearchContext setLength (int value) {
- return this.setLength((short) value);
- };
-
- public void fromJSON (JsonNode json) {
- String type = json.get(0).asText();
- if (type.equals("token")) {
- this.setToken(true);
- }
- else if (type.equals("char")) {
- this.setCharacter(true);
- };
- this.setLength(json.get(1).asInt());
- };
- };
-
-
public KorapSearch (String jsonString) {
ObjectMapper mapper = new ObjectMapper();
try {
@@ -141,14 +87,8 @@
this.setCutOff(meta.get("cutOff").asBoolean());
// Defined contexts
- if (meta.has("context")) {
- JsonNode context = meta.get("context");
- if (context.has("left"))
- this.leftContext.fromJSON(context.get("left"));
-
- if (context.has("right"))
- this.rightContext.fromJSON(context.get("right"));
- };
+ if (meta.has("context"))
+ this.context.fromJSON(meta.get("context"));
};
};
}
@@ -194,6 +134,16 @@
return this;
};
+    /**
+     * Get the context definition of the search.
+     *
+     * @return The context definition stored in this search object.
+     */
+    public SearchContext getContext () {
+        return this.context;
+    };
+
+
+    /**
+     * Set the context definition of the search.
+     *
+     * @param context The context definition to store.
+     * @return This search object, to allow chaining.
+     */
+    public KorapSearch setContext (SearchContext context) {
+        this.context = context;
+        return this;
+    };
+
public int getStartIndex () {
return this.startIndex;
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
index eae79c1..57375d0 100644
--- a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
+++ b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
@@ -20,17 +20,18 @@
private AtomicReaderContext atomic;
private boolean processed = false;
private Integer[] pair;
- private static ByteBuffer bbOffset = ByteBuffer.allocate(8);
+ private static ByteBuffer bbOffset =
+ ByteBuffer.allocate(8);
HashSet<PositionsToOffsetArray> positions;
HashMap<PositionsToOffsetArray, Integer[]> offsets;
- private final static Logger log = LoggerFactory.getLogger(PositionsToOffset.class);
+ private final static Logger log =
+ LoggerFactory.getLogger(PositionsToOffset.class);
// This advices the java compiler to ignore all loggings
public static final boolean DEBUG = false;
-
private class PositionsToOffsetArray {
public int docID;
public int pos;
@@ -245,4 +246,8 @@
positions.clear();
return offsets;
};
+
+    /**
+     * Get the atomic reader context this object was created with.
+     *
+     * @return The stored AtomicReaderContext.
+     */
+    public AtomicReaderContext getAtomicReader () {
+        return this.atomic;
+    };
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/SearchContext.java b/src/main/java/de/ids_mannheim/korap/index/SearchContext.java
new file mode 100644
index 0000000..3c72456
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/index/SearchContext.java
@@ -0,0 +1,151 @@
+package de.ids_mannheim.korap.index;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.SerializationFeature;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.*;
+import com.fasterxml.jackson.annotation.*;
+
+
+/**
+ * Definition of the context shown around a match.
+ *
+ * The context is either defined by a surrounding span (e.g. "s")
+ * or by a left and a right side, each given as a number of tokens
+ * or characters.
+ */
+public class SearchContext {
+    ObjectMapper mapper = new ObjectMapper();
+
+    // True if the context is defined by a span
+    // instead of the left and right side
+    private boolean spanType = false;
+
+    @JsonIgnore
+    public SearchContextSide left, right;
+
+    @JsonIgnore
+    public String spanContext;
+
+    {
+        left  = new SearchContextSide();
+        right = new SearchContextSide();
+    };
+
+    /**
+     * Construct a context with default sides
+     * (6 tokens to the left and to the right).
+     */
+    public SearchContext () {};
+
+    /**
+     * Construct a span defined context.
+     * The name is stored as given - in contrast to
+     * setSpanContext(), "sentence" and "paragraph" are not mapped.
+     *
+     * @param spanContext The name of the surrounding element.
+     */
+    public SearchContext (String spanContext) {
+        this.spanType = true;
+        this.spanContext = spanContext;
+    };
+
+    /**
+     * Construct a context defined by a left and a right side.
+     *
+     * @param leftTokenContext  True if the left side is measured in
+     *                          tokens, false for characters.
+     * @param leftContext       The length of the left side.
+     * @param rightTokenContext True if the right side is measured in
+     *                          tokens, false for characters.
+     * @param rightContext      The length of the right side.
+     */
+    public SearchContext (boolean leftTokenContext,
+                          short leftContext,
+                          boolean rightTokenContext,
+                          short rightContext) {
+        this.spanType = false;
+        this.left.setToken(leftTokenContext);
+        this.left.setLength(leftContext);
+
+        // The right side has its own unit flag
+        this.right.setToken(rightTokenContext);
+        this.right.setLength(rightContext);
+    };
+
+    /**
+     * Check if the context is defined by a span.
+     */
+    public boolean isSpanDefined () {
+        return this.spanType;
+    };
+
+    /**
+     * Get the name of the element defining the context
+     * (null, if none was set).
+     */
+    public String getSpanContext () {
+        return this.spanContext;
+    };
+
+    /**
+     * Define the context by a surrounding span.
+     * The names "sentence" and "paragraph" are mapped
+     * to the elements "s" and "p".
+     *
+     * @param spanContext The name of the surrounding element.
+     * @return This context object, to allow chaining.
+     */
+    public SearchContext setSpanContext (String spanContext) {
+        this.spanType = true;
+
+        if (spanContext.equals("sentence")) {
+            spanContext = "s";
+        }
+        else if (spanContext.equals("paragraph")) {
+            spanContext = "p";
+        };
+
+        this.spanContext = spanContext;
+        return this;
+    };
+
+    /**
+     * One side (left or right) of a context, defined by a unit
+     * (tokens or characters) and a length.
+     */
+    public class SearchContextSide {
+        // True for tokens, false for characters
+        private boolean type = true;
+        private short length = 6;
+        private short maxLength = 500;
+
+        public boolean isToken () {
+            return this.type;
+        };
+
+        public boolean isCharacter () {
+            return !(this.type);
+        };
+
+        public SearchContextSide setToken (boolean value) {
+            this.type = value;
+            return this;
+        };
+
+        public SearchContextSide setCharacter (boolean value) {
+            this.type = !(value);
+            return this;
+        };
+
+        public short getLength() {
+            return this.length;
+        };
+
+        /**
+         * Set the length of the side. Negative values are ignored,
+         * values above the maximum length are cut to the maximum.
+         */
+        public SearchContextSide setLength (short value) {
+            if (value >= 0) {
+                if (value <= maxLength) {
+                    this.length = value;
+                }
+                else {
+                    this.length = this.maxLength;
+                };
+            };
+            return this;
+        };
+
+        /**
+         * Set the length of the side based on an int value.
+         * The value is limited before it is narrowed to short, so
+         * large values are cut to the maximum length instead of
+         * wrapping around; negative values are still ignored.
+         */
+        public SearchContextSide setLength (int value) {
+            int limited = Math.max(-1, Math.min(value, (int) this.maxLength));
+            return this.setLength((short) limited);
+        };
+
+        /**
+         * Read the side from a JSON array of the form [unit, length],
+         * with the unit being "token" or "char". Other units leave
+         * the current unit untouched; the current length is passed
+         * as the default value for the length.
+         */
+        public void fromJSON (JsonNode json) {
+            String type = json.get(0).asText();
+            if (type.equals("token")) {
+                this.setToken(true);
+            }
+            else if (type.equals("char")) {
+                this.setCharacter(true);
+            };
+            this.setLength(json.get(1).asInt(this.length));
+        };
+    };
+
+
+    /**
+     * Read the context from JSON. A container node may define the
+     * sides in "left" and "right", a value node is taken as the
+     * name of a span context.
+     */
+    public void fromJSON (JsonNode context) {
+        if (context.isContainerNode()) {
+            if (context.has("left"))
+                this.left.fromJSON(context.get("left"));
+
+            if (context.has("right"))
+                this.right.fromJSON(context.get("right"));
+        }
+        else if (context.isValueNode()) {
+            this.setSpanContext(context.asText());
+        };
+    };
+
+    /**
+     * Serialize the left and right side as
+     * {"left":[unit,length],"right":[unit,length]}.
+     * NOTE(review): a span context is not part of the output.
+     */
+    public ObjectNode toJSON () {
+        ArrayNode leftContext = mapper.createArrayNode();
+        leftContext.add(this.left.isToken() ? "token" : "char");
+        leftContext.add(this.left.getLength());
+
+        ArrayNode rightContext = mapper.createArrayNode();
+        rightContext.add(this.right.isToken() ? "token" : "char");
+        rightContext.add(this.right.getLength());
+
+        ObjectNode context = mapper.createObjectNode();
+        context.put("left", leftContext);
+        context.put("right", rightContext);
+
+        return context;
+    };
+
+};
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/ClassSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/ClassSpans.java
index 14ff683..6de3fb7 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/ClassSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/ClassSpans.java
@@ -31,7 +31,7 @@
private final static Logger log = LoggerFactory.getLogger(ClassSpans.class);
// This advices the java compiler to ignore all loggings
- public static final boolean DEBUG = true;
+ public static final boolean DEBUG = false;
public ClassSpans (SpanQuery highlight,
AtomicReaderContext context,
diff --git a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
index 02dfdcf..d1a8759 100644
--- a/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
+++ b/src/main/java/de/ids_mannheim/korap/query/spans/NextSpans.java
@@ -30,6 +30,10 @@
private boolean hasMoreFirstSpan;
private Logger log = LoggerFactory.getLogger(NextSpans.class);
+
+ // This advises the Java compiler to ignore all logging
+ public static final boolean DEBUG = false;
+
public NextSpans (SpanNextQuery spanNextQuery,
AtomicReaderContext context,
@@ -59,12 +63,13 @@
matchEndPosition = matchList.get(0).getEnd();
if (collectPayloads)
matchPayload.addAll( matchList.get(0).getPayloads() );
-
- log.trace("Match doc#: {}",matchDocNumber);
- log.trace("Match positions: {}-{}", matchStartPosition,
- matchEndPosition);
- matchList.remove(0);
- return true;
+ if (DEBUG) {
+ log.trace("Match doc#: {}",matchDocNumber);
+ log.trace("Match positions: {}-{}", matchStartPosition,
+ matchEndPosition);
+ };
+ matchList.remove(0);
+ return true;
}
// Forward firstspan
hasMoreFirstSpan = firstSpans.next();
diff --git a/src/main/resources/log4j.properties b/src/main/resources/log4j.properties
index af72c2d..5050e55 100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@ -9,8 +9,8 @@
#log4j.logger.de.ids_mannheim.korap.query.SpanNextQuery = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.NextSpans = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.query.spans.SimpleSpans = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.query.spans.ClassSpans = TRACE, stdout
-# log4j.logger.de.ids_mannheim.korap.query.spans.MatchSpans = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.query.spans.ClassSpans = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.query.spans.MatchSpans = TRACE, stdout
# Collections
#log4j.logger.de.ids_mannheim.korap.KorapFilter = TRACE, stdout
@@ -18,12 +18,11 @@
# Results:
-# log4j.logger.de.ids_mannheim.korap.KorapIndex = TRACE, stdout
+#log4j.logger.de.ids_mannheim.korap.KorapIndex = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.KorapMatch = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.index.PositionsToOffset = TRACE, stdout
#log4j.logger.de.ids_mannheim.korap.index.TestSegmentIndex = TRACE, stdout
-
#log4j.logger.de.ids_mannheim.korap.analysis.MultiTermTokenStream = TRACE, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender