Made token position semantics more explicit
Change-Id: Ib52f08558b00471cd4623ef5ddabe56c63d644a7
diff --git a/misc/payloads.md b/misc/payloads.md
index d4ac3e8..1839543 100644
--- a/misc/payloads.md
+++ b/misc/payloads.md
@@ -1,5 +1,11 @@
# Payload Handling in Krill
-Apache Lucene supports payloads as arbitrary byte sequences to store information for terms specific to any token position. Krill uses payloads to store various information in a compact way. This documents describes the payload information for index payloads (payloads stored in the index for different term concepts) and computed payloads (payloads created during the retrieval phase).
+Apache Lucene supports payloads as arbitrary byte sequences to store information for terms specific to any token position. Krill uses payloads to store various information in a compact way. This document describes the payload information for index payloads (payloads stored in the index for different term concepts) and computed payloads (payloads created on-the-fly during the retrieval phase).
+
+## Token positions
+Token positions mark the positions between tokens. All tokens are indexed
+at the start position. A simple token always implicitly has an end position of start position ```+ 1```.
+That means the end position of a span will always be the token position of the
+final token of the span ```+ 1```.
## Payload Type Identifier (PTI)
Payloads (both indexed and computed) have a leading byte indicating the type of the payload sequence. This is necessary because the origin (i.e. the requested term) of a payload is lost during the retrieval phase. Payload type identifiers range between 0 and 255 and have the length of a byte (\<b\>). In case a token has no payload, no payload type identifier is stored.
@@ -10,7 +16,7 @@
## Index Payloads
### Token position payloads
-A token always has a special character payload storing the start and end offset of the token. The special character is a reference symbol for this payload, which is an underscore followed by the corresponding token position. For example, the _1$\<i\>0\<i\>3 is the special character payload for the token in position 1 describing that the token ranges from 0 to 3. This offset information is stored in integer.
+A token always has a special character payload storing the start and end offset of the token. The special character is a reference symbol for this payload, which is an underscore followed by the corresponding token position. For example, _1$\<i\>0\<i\>3 is the special character payload for the token starting at position 1, describing that the token ranges from character 0 to 3. This offset information is stored as integers.
Token payloads are not retrieved via SpanQueries and therefore do not have a PTI.
### Token payloads
diff --git a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
index 6e24c18..666fd73 100644
--- a/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
+++ b/src/test/java/de/ids_mannheim/korap/highlight/TestHighlight.java
@@ -444,6 +444,50 @@
km.getSnippetHTML());
};
+
+ @Test
+ public void checkSpanHighlights () throws IOException, QueryException {
+ // Index a single document whose "base" field holds three tokens: a, b, c.
+ KrillIndex ki = new KrillIndex();
+ // NOTE(review): the second token's payload ends in "<b0>" while the first uses "<b>0" - confirm this is intended.
+ FieldDocument fd = new FieldDocument();
+ fd.addString("ID", "doc-1");
+ fd.addString("UID", "1");
+ fd.addString("textSigle", "c1/d1/1");
+ fd.addTV("base", "abc",
+ "[(0-1)s:a|i:a|_0#0-1|-:t$<i>3|<>:base/t:t$<b>64<i>0<i>3<i>3<b>0]" +
+ "[(1-2)s:b|i:b|base/l:B|_1#1-2|<>:corenlp/x:a$<b>64<i>1<i>2<i>2<b0>]" +
+ "[(2-3)s:c|i:c|base/l:C|_2#2-3]");
+ ki.addDoc(fd);
+ ki.commit();
+ // Search for the element span base/t:t and inspect the first match.
+ QueryBuilder kq = new QueryBuilder("base");
+ Result kr = ki
+ .search((SpanQuery) kq.tag("base/t:t").toQuery());
+ // Expected span: start 0, end 3 (one past the last token position 2).
+ Match km = kr.getMatch(0);
+ assertEquals(0, km.getStartPos());
+ assertEquals(3, km.getEndPos());
+ assertEquals("match-c1/d1/1-p0-3",km.getID());
+ // Re-fetch the same match by its ID and verify positions and the HTML snippet.
+ km = ki.getMatchInfo("match-c1/d1/1-p0-3", "base", true,
+ (ArrayList) null, (ArrayList) null, true, true, false);
+ assertEquals(0, km.getStartPos());
+ assertEquals(3, km.getEndPos());
+ assertEquals("<span class=\"context-left\"></span>" +
+ "<span class=\"match\">"+
+ "<mark>"+
+ "<span title=\"base/t:t\">a"+
+ "<span title=\"base/l:B\">"+
+ "<span title=\"corenlp/x:a\">b</span>"+
+ "</span>"+
+ "<span title=\"base/l:C\">c</span>"+
+ "</span>"+
+ "</mark>"+
+ "</span>"+
+ "<span class=\"context-right\"></span>", km.getSnippetHTML());
+ };
+
@Test
public void highlightEmptySpan () throws IOException, QueryException {
diff --git a/src/test/java/de/ids_mannheim/korap/index/TestElementIndex.java b/src/test/java/de/ids_mannheim/korap/index/TestElementIndex.java
index 5cd2d80..6d5d49a 100644
--- a/src/test/java/de/ids_mannheim/korap/index/TestElementIndex.java
+++ b/src/test/java/de/ids_mannheim/korap/index/TestElementIndex.java
@@ -4,7 +4,10 @@
import java.io.IOException;
+import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@@ -26,7 +29,7 @@
// <a>x<a>y<a>zhij</a>hij</a>hij</a>hij</a>
FieldDocument fd = new FieldDocument();
fd.addTV("base", "x y z h i j h i j h i j ",
- "[(0-3)s:x|<>:a$<b>64<i>0<i>3<i>12<b>0]"
+ "[(0-3)s:x|<>:a$<b>64<i>0<i>3<i>12<b>0||<>:b$<b>64<i>0<i>3<i>1<b>0]"
+ "[(3-6)s:y|<>:a$<b>64<i>3<i>6<i>9<b>0]"
+ "[(6-9)s:z|<>:a$<b>64<i>6<i>9<i>6]"
+ "[(9-12)s:h<b>0]" + "[(12-15)s:i]" + "[(15-18)s:j]"
@@ -70,7 +73,23 @@
assertEquals("StartPos (2)", 2, kr.getMatch(5).startPos);
assertEquals("EndPos (2)", 6, kr.getMatch(5).endPos);
- // System.err.println(kr.toJSON());
+ sq = new SpanTermQuery(new Term("base", "s:x"));
+
+ kr = ki.search(sq, (short) 10);
+
+ assertEquals("totalResults", kr.getTotalResults(), 2);
+
+ assertEquals("StartPos (0)", 0, kr.getMatch(0).startPos);
+ assertEquals("EndPos (0)", 1, kr.getMatch(0).endPos);
+
+ sq = new SpanElementQuery("base", "b");
+
+ kr = ki.search(sq, (short) 10);
+
+ assertEquals("totalResults", kr.getTotalResults(), 1);
+
+ assertEquals("StartPos (0)", 0, kr.getMatch(0).startPos);
+ assertEquals("EndPos (0)", 1, kr.getMatch(0).endPos);
};