Improved treatment and testibility of highlights (2 broken tests)
diff --git a/src/main/java/de/ids_mannheim/korap/KorapIndex.java b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
index edbeb5c..614302f 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapIndex.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapIndex.java
@@ -573,31 +573,12 @@
// Do not load all of this, in case the doc is the same!
Document doc = lreader.document(localDocID, fieldsToLoadLocal);
- KorapMatch match = kr.addMatch(pto); // new KorapMatch();
-
- match.startPos = spans.start();
- match.endPos = spans.end();
- match.localDocID = localDocID;
-
- pto.add(localDocID, match.startPos);
- pto.add(localDocID, match.endPos - 1);
-
- /*
- match.leftContext = leftContext;
- match.rightContext = rightContext;
-
- match.leftTokenContext = leftTokenContext;
- match.rightTokenContext = rightTokenContext;
- */
- // Add pos for context
- if (match.leftTokenContext) {
- pto.add(localDocID, match.startPos - match.leftContext);
- };
-
- // Add pos for context
- if (match.rightTokenContext) {
- pto.add(localDocID, match.endPos + match.rightContext - 1);
- };
+ KorapMatch match = kr.addMatch(
+ pto,
+ localDocID,
+ spans.start(),
+ spans.end()
+ ); // new KorapMatch();
if (spans.isPayloadAvailable()) {
@@ -629,11 +610,6 @@
byte number = bb.get();
log.trace("Have a payload: {}-{}", start, end);
-
- // Add this for offset search
- pto.add(localDocID, start);
- pto.add(localDocID, end);
-
match.addHighlight(start, end, number);
}
diff --git a/src/main/java/de/ids_mannheim/korap/KorapMatch.java b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
index ea271a3..fad0867 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapMatch.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapMatch.java
@@ -28,34 +28,57 @@
// Snippet information
@JsonIgnore
- public short leftContext, rightContext;
+ public short leftContext,
+ rightContext;
@JsonIgnore
- public int startPos, endPos;
+ public int startPos,
+ endPos;
@JsonIgnore
- public int potentialStartPosChar = -1, potentialEndPosChar = -1;
+ public int potentialStartPosChar = -1,
+ potentialEndPosChar = -1;
@JsonIgnore
- public boolean leftTokenContext, rightTokenContext;
+ public boolean leftTokenContext,
+ rightTokenContext;
- private String tempSnippet, snippetHTML, snippetBrackets;
+ private String tempSnippet,
+ snippetHTML,
+ snippetBrackets;
+
private HighlightCombinator snippetStack;
- private boolean startMore = true, endMore = true;
+
+ private boolean startMore = true,
+ endMore = true;
private Collection<byte[]> payload;
private ArrayList<int[]> highlight;
- @JsonIgnore
- public PositionsToOffset positionsToOffset;
-
+ private PositionsToOffset positionsToOffset;
private boolean processed = false;
// Logger
private final static Logger log = LoggerFactory.getLogger(KorapMatch.class);
- public KorapMatch (PositionsToOffset pto) {
+ /**
+ * Constructs a new KorapMatch object.
+ *
+ * @param pto The PositionsToOffset object, containing relevant
+ * positional information for highlighting
+ * @see #snippetHTML()
+ * @see #snippetBrackets()
+ * @see PositionsToOffset
+ */
+ public KorapMatch (PositionsToOffset pto, int localDocID, int startPos, int endPos) {
this.positionsToOffset = pto;
+ this.localDocID = localDocID;
+ this.startPos = startPos;
+ this.endPos = endPos;
+
+ // Preprocess matching
+ pto.add(localDocID, startPos);
+ pto.add(localDocID, endPos - 1);
};
/**
@@ -81,52 +104,25 @@
public void addHighlight (int start, int end, int number) {
if (this.highlight == null)
this.highlight = new ArrayList<int[]>();
-
log.trace("Add highlight of class {} from {} to {}", number, start, end);
+ this._reset();
+
+ // Add this for offset search
+ this.positionsToOffset.add(this.localDocID, start);
+ this.positionsToOffset.add(this.localDocID, end);
+
this.highlight.add(new int[]{ start, end, number});
};
- /*
- public JSONObject toJSON() {
- JSONObject json = new JSONObject();
- json.put("internalDocID", this.internalDocID);
-
- if (this.author != null)
- json.put("author", this.author);
-
- if (this.getPubDate() != null && this.getPubDate().year() > 0)
- json.put("pubDate", this.getPubDate().toDisplay());
-
- if (this.snippetHTML() != null)
- json.put("snippet", this.snippetHTML);
-
- // json.put("primary", this.primary);
-
- JSONArray pos = new JSONArray();
- pos.add(this.startPos);
- pos.add(this.endPos);
- json.put("position", pos);
-
- return json;
+ private void _reset () {
+ this.processed = false;
+ this.snippetHTML = null;
+ this.snippetBrackets = null;
};
- */
-
- /**
- * Generates a highlighted snippet for the mach, that can be
- * retrieved afterwards via snippetHTML() and snippetBrackets().
- * <p>
- * The information on offset positions has to be retrieved beforehand
- * by filling up the PositionsToOffset.
- *
- * @param pto The PositionsToOffset object, containing relevant
- * positional information for highlighting
- * @see #snippetHTML()
- * @see #snippetBrackets()
- * @see PositionsToOffset
- */
- public void processHighlight () {
+ // Start building highlighted snippets
+ private void _processHighlight () {
if (processed)
return;
@@ -134,41 +130,53 @@
log.trace("Start highlight processing ...");
// Get the list of spans for matches and highlighting
- LinkedList<int[]> spans = this._processHighlightSpans(leftTokenContext,rightTokenContext);
+ LinkedList<int[]> spans = this._processHighlightSpans(
+ leftTokenContext,
+ rightTokenContext
+ );
+ /*
for (int[] s : spans) {
- log.trace(" >> [Spans] Start: {}, End: {}, Class: {}, Dummy: {}", s[0], s[1], s[2], s[3]);
+ log.trace(" >> [Spans] Start: {}, End: {}, Class: {}, Dummy: {}",
+ s[0], s[1], s[2], s[3]);
};
+ */
+ // Create a stack for highlighted elements (opening and closing elements)
ArrayList<int[]> stack = this._processHighlightStack(spans);
+ /*
for (int[] s : stack) {
- log.trace(" >> [Stack] Start: {}, End: {}, Class: {}, Dummy: {}", s[0], s[1], s[2], s[3]);
+ log.trace(" >> [Stack] Start: {}, End: {}, Class: {}, Dummy: {}",
+ s[0], s[1], s[2], s[3]);
};
+ */
+ // The temparary snippet is empty, nothing to do
if (this.tempSnippet == null) {
processed = true;
return;
};
+ // Merge the element stack with the primary textual data
this._processHighlightSnippet(this.tempSnippet, stack);
+ // Match is processed - done
processed = true;
-
- /*
-
- Collection.sort(openList);
-with http://docs.oracle.com/javase/6/docs/api/java/util/Comparator.html
- */
};
+ /*
+ Comparator class for opening tags
+ */
private class OpeningTagComparator implements Comparator<int[]> {
@Override
public int compare (int[] arg0, int[] arg1) {
+ // Check start positions
if (arg0[0] > arg1[0]) {
return 1;
}
else if (arg0[0] == arg1[0]) {
+ // Check endpositions
if (arg0[1] > arg1[1])
return -1;
return 1;
@@ -177,13 +185,18 @@
};
};
+ /*
+ Comparator class for closing tags
+ */
private class ClosingTagComparator implements Comparator<int[]> {
@Override
public int compare (int[] arg0, int[] arg1) {
+ // Check end positions
if (arg0[1] > arg1[1]) {
return 1;
}
else if (arg0[1] == arg1[1]) {
+ // Check start positions
if (arg0[0] < arg1[0])
return 1;
return -1;
@@ -192,29 +205,44 @@
};
};
+ /*
+ Private class for elements with highlighting information
+ */
private class HighlightCombinatorElement {
- private short type;
- private int number;
+
+ // Type 0: Textual data
+ // Type 1: Opening
+ // Type 2: Closing
+ private byte type;
+
+ private int number = 0;
+ // TODO: Should be possibly a short (as for the -1)
+
private String characters;
private boolean terminal = true;
- public HighlightCombinatorElement (short type, int number) {
+ // Constructor for highlighting elements
+ public HighlightCombinatorElement (byte type, int number) {
this.type = type;
this.number = number;
};
- public HighlightCombinatorElement (short type, int number, boolean terminal) {
- // Type 1: Match,
- this.type = type;
- this.number = number;
+ // Constructor for highlighting elements,
+ // that may not be terminal, i.e. they were closed and will
+ // be reopened for overlapping issues.
+ public HighlightCombinatorElement (byte type, int number, boolean terminal) {
+ this.type = type;
+ this.number = number;
this.terminal = terminal;
};
+ // Constructor for textual data
public HighlightCombinatorElement (String characters) {
- this.type = 0;
+ this.type = (byte) 0;
this.characters = characters;
};
+ // Return html fragment for this combinator element
public String toHTML (FixedBitSet level, byte[] levelCache) {
// Opening
if (this.type == 1) {
@@ -250,9 +278,12 @@
level.set((int) levelCache[this.number]);
return "</em>";
};
+
+ // HTML encode primary data
return encodeHTML(this.characters);
};
+ // Return bracket fragment for this combinator element
public String toBrackets () {
if (this.type == 1) {
StringBuilder sb = new StringBuilder();
@@ -276,46 +307,56 @@
};
+ /*
+ Private class for combining highlighting elements
+ */
private class HighlightCombinator {
private LinkedList<HighlightCombinatorElement> combine;
private LinkedList<Integer> balanceStack = new LinkedList<>();
private ArrayList<Integer> tempStack = new ArrayList<>(32);
+ // Empty constructor
public HighlightCombinator () {
this.combine = new LinkedList<>();
};
+ // Return the combination stack
public LinkedList<HighlightCombinatorElement> stack () {
return this.combine;
};
+ // get the first element (without removing)
public HighlightCombinatorElement getFirst () {
return this.combine.getFirst();
};
+ // get the last element (without removing)
public HighlightCombinatorElement getLast () {
return this.combine.getLast();
};
+ // get an element by index (without removing)
public HighlightCombinatorElement get (int index) {
return this.combine.get(index);
};
+ // Get the size of te combinator stack
public short size () {
return (short) this.combine.size();
};
+ // Add primary data to the stack
public void addString (String characters) {
this.combine.add(new HighlightCombinatorElement(characters));
};
- // add opening highlight combinator
+ // Add opening highlight combinator to the stack
public void addOpen (int number) {
- this.combine.add(new HighlightCombinatorElement((short) 1, number));
+ this.combine.add(new HighlightCombinatorElement((byte) 1, number));
this.balanceStack.add(number);
};
- // add closing highlight combinator
+ // Add closing highlight combinator to the stack
public void addClose (int number) {
HighlightCombinatorElement lastComb;
this.tempStack.clear();
@@ -338,7 +379,7 @@
else {
// Add a closer for the old element (this has following elements)
- this.combine.add(new HighlightCombinatorElement((short) 2, eold, false));
+ this.combine.add(new HighlightCombinatorElement((byte) 2, eold, false));
};
// add this element number temporarily on the stack
@@ -358,16 +399,17 @@
}
else {
// Add a closer
- this.combine.add(new HighlightCombinatorElement((short) 2, number));
+ this.combine.add(new HighlightCombinatorElement((byte) 2, number));
};
// Fetch everything from the tempstack and reopen it
for (int e : tempStack) {
- combine.add(new HighlightCombinatorElement((short) 1, e));
+ combine.add(new HighlightCombinatorElement((byte) 1, e));
balanceStack.add(e);
};
};
+ // Get all combined elements as a string
public String toString () {
StringBuilder sb = new StringBuilder();
for (HighlightCombinatorElement e : combine) {
@@ -415,11 +457,13 @@
@JsonProperty("snippet")
public String getSnippetHTML () {
- this.processHighlight();
+ this._processHighlight();
- if (this.snippetHTML != null)
+ if (this.processed && this.snippetHTML != null)
return this.snippetHTML;
+ log.trace("Create HTML Snippet");
+
StringBuilder sb = new StringBuilder();
short start = (short) 0;
@@ -472,9 +516,9 @@
@JsonIgnore
public String getSnippetBrackets () {
- this.processHighlight();
+ this._processHighlight();
- if (this.snippetBrackets != null)
+ if (this.processed && this.snippetBrackets != null)
return this.snippetBrackets;
StringBuilder sb = new StringBuilder();
@@ -500,7 +544,6 @@
log.trace("Create Stack");
-
LinkedList<int[]> openList = new LinkedList<int[]>();
LinkedList<int[]> closeList = new LinkedList<int[]>();
@@ -622,26 +665,6 @@
// -- I'm not sure about this.
if (this.highlight != null) {
for (int[] highlight : this.highlight) {
-
- /*
-
- int start = pto.start(ldid, highlight[0]);
- int end = pto.end(ldid, highlight[1]);
-
- // Todo: Does this have to be and or or?
- if (start == -1 || end == -1)
- continue;
-
- if (start > startOffsetChar) {
- start -= startOffsetChar;
- }
- else {
- start = 0;
- };
-
- end -= startOffsetChar;
- */
-
int start = this.positionsToOffset.start(ldid, highlight[0]) - startOffsetChar;
int end = this.positionsToOffset.end(ldid, highlight[1]) - startOffsetChar;
diff --git a/src/main/java/de/ids_mannheim/korap/KorapResult.java b/src/main/java/de/ids_mannheim/korap/KorapResult.java
index c24ce3b..d74ae0d 100644
--- a/src/main/java/de/ids_mannheim/korap/KorapResult.java
+++ b/src/main/java/de/ids_mannheim/korap/KorapResult.java
@@ -66,14 +66,24 @@
this.matches.add(km);
};
- public KorapMatch addMatch (PositionsToOffset pto) {
- KorapMatch km = new KorapMatch(pto);
- // Temporary:
+ public KorapMatch addMatch (PositionsToOffset pto, int localDocID, int startPos, int endPos) {
+ KorapMatch km = new KorapMatch(pto, localDocID, startPos, endPos);
+ // Temporary - should use the same interface like results in the future:
km.leftContext = this.leftContextOffset;
km.leftTokenContext = this.leftTokenContext;
km.rightContext = this.rightContextOffset;
km.rightTokenContext = this.rightTokenContext;
- km.positionsToOffset = pto;
+
+ // Add pos for context
+ // That's not really a good position for it, to be honest ...
+ // But maybe it will make the offset information in the match be obsolete!
+ if (km.leftTokenContext) {
+ pto.add(localDocID, startPos - this.leftContextOffset);
+ };
+ if (km.rightTokenContext) {
+ pto.add(localDocID, endPos + this.rightContextOffset - 1);
+ };
+
this.add(km);
return km;
};
diff --git a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
index c893d53..c817449 100644
--- a/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
+++ b/src/main/java/de/ids_mannheim/korap/index/PositionsToOffset.java
@@ -15,7 +15,6 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
public class PositionsToOffset {
private String field;
private AtomicReaderContext atomic;
@@ -71,6 +70,10 @@
};
public void add (PositionsToOffsetArray ptoa) {
+ log.trace("Add positionsToOffsetArray {}/{}", ptoa.docID, ptoa.pos);
+ if (this.processed && this.exists(ptoa))
+ return;
+ log.trace("Reopen processing");
this.positions.add(ptoa);
this.processed = false;
};
@@ -135,13 +138,15 @@
if (processed)
return offsets;
+ log.trace("Process offsets");
+
StringBuilder sb = new StringBuilder().append('_');
try {
Terms terms = atomic.reader().fields().terms(field);
if (terms != null) {
- // Todo: Maybe reuse a termsEnum!
+ // TODO: Maybe reuse a termsEnum!
final TermsEnum termsEnum = terms.iterator(null);
diff --git a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
index c1ba519..69cbbb6 100644
--- a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
+++ b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
@@ -54,8 +54,60 @@
km = kr.getMatch(0);
assertEquals("<span class=\"context-left\">a</span><span class=\"match\"><em class=\"class-1 level-0\">b</em><em class=\"class-2 level-0\">c</em></span><span class=\"context-right\"></span>", km.getSnippetHTML());
- // Check {1a:{1b:huhu:1a}hihi:1b}
+ kr = ki.search((SpanQuery) kq.seq(kq._(1, kq.seg("s:b"))).append(kq._(2, kq.seg("s:c"))).toQuery());
+ km = kr.getMatch(0);
+ assertEquals("<span class=\"context-left\">a</span><span class=\"match\"><em class=\"class-1 level-0\">b</em><em class=\"class-2 level-0\">c</em></span><span class=\"context-right\"></span>", km.getSnippetHTML());
+
+ kr = ki.search((SpanQuery) kq.seq(kq._(1, kq.seq(kq.seg("s:a")).append(kq.seg("s:b")))).append(kq._(2, kq.seg("s:c"))).toQuery());
+ km = kr.getMatch(0);
+ assertEquals("<span class=\"context-left\"></span><span class=\"match\"><em class=\"class-1 level-0\">ab</em><em class=\"class-2 level-0\">c</em></span><span class=\"context-right\"></span>", km.getSnippetHTML());
+
+
+ kr = ki.search((SpanQuery) kq._(3, kq.seq(kq._(1, kq.seq(kq.seg("s:a")).append(kq.seg("s:b")))).append(kq._(2, kq.seg("s:c")))).toQuery());
+ km = kr.getMatch(0);
+ assertEquals("<span class=\"context-left\"></span><span class=\"match\"><em class=\"class-3 level-0\"><em class=\"class-1 level-1\">ab</em><em class=\"class-2 level-1\">c</em></em></span><span class=\"context-right\"></span>", km.getSnippetHTML());
+ };
+
+ @Test
+ public void checkHighlightsManually () throws IOException {
+
+ KorapIndex ki = new KorapIndex();
+ String json = new String(
+"{" +
+" \"fields\" : [" +
+" { "+
+" \"primaryData\" : \"abc\"" +
+" }," +
+" {" +
+" \"name\" : \"tokens\"," +
+" \"data\" : [" +
+" [ \"s:a\", \"i:a\", \"_0#0-1\", \"-:t$<i>3\"]," +
+" [ \"s:b\", \"i:b\", \"_1#1-2\" ]," +
+" [ \"s:c\", \"i:c\", \"_2#2-3\" ]" +
+" ]" +
+" }" +
+" ]" +
+"}");
+
+ FieldDocument fd = ki.addDoc(json);
+ ki.commit();
+
+ KorapQuery kq = new KorapQuery("tokens");
+
+ KorapResult kr = ki.search((SpanQuery) kq.seq(kq.seg("s:a")).append(kq.seg("s:b")).append(kq.seg("s:c")).toQuery());
+ KorapMatch km = kr.getMatch(0);
+ km.addHighlight(0, 1, (short) 7);
+ assertEquals("<span class=\"context-left\"></span><span class=\"match\"><em class=\"class-7 level-0\">ab</em>c</span><span class=\"context-right\"></span>", km.getSnippetHTML());
+
+ km.addHighlight(1, 2, (short) 6);
+ assertEquals("<span class=\"context-left\"></span><span class=\"match\"><em class=\"class-7 level-0\">a<em class=\"class-6 level-1\">b</em></em><em class=\"class-6 level-1\">c</em></span><span class=\"context-right\"></span>", km.getSnippetHTML());
+
+ km.addHighlight(0, 1, (short) 5);
+ assertEquals("<span class=\"context-left\"></span><span class=\"match\"><em class=\"class-7 level-0\"><em class=\"class-5 level-1\">a<em class=\"class-6 level-2\">b</em></em></em><em class=\"class-6 level-2\">c</em></span><span class=\"context-right\"></span>", km.getSnippetHTML());
+
+
+ // Check {1a:{1b:huhu:1a}hihi:1b}
};
};